From 2630fb196814c543f54e833633f4efdfbedf25cb Mon Sep 17 00:00:00 2001 From: florianmattana Date: Sun, 15 Mar 2026 13:02:42 +0100 Subject: [PATCH 1/3] remove timestep from Q --- kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu index bb5cddd..9c436bb 100644 --- a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu +++ b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu @@ -179,7 +179,7 @@ int8_attention_kernel( } float abs_max_Q = block_reduce_max(lqmax, warp_scr); - const float inv_Q = 127.f / fmaxf(abs_max_Q * ts, 1e-6f); + const float inv_Q = 127.f / fmaxf(abs_max_Q, 1e-6f); const float scl_Q = 1.f / inv_Q; // Quantize Q tile From 1ba12999e2990224c353d638668cb0a01f6ae28e Mon Sep 17 00:00:00 2001 From: florianmattana Date: Mon, 16 Mar 2026 17:54:22 +0100 Subject: [PATCH 2/3] use global K scale before tile loop for consistent quantization --- kernels-v1/attention-int8/CMakeLists.txt | 251 +++++++ .../attention_int8_cuda/attention_int8.cu | 15 +- kernels-v1/attention-int8/cmake/_ops.py.in | 9 + .../cmake/add_gpu_arch_metadata.py | 55 ++ .../attention-int8/cmake/build-variants.cmake | 298 +++++++++ .../attention-int8/cmake/compile-metal.cmake | 104 +++ .../attention-int8/cmake/get_gpu_lang.cmake | 17 + .../attention-int8/cmake/get_gpu_lang.py | 20 + kernels-v1/attention-int8/cmake/hipify.py | 76 +++ kernels-v1/attention-int8/cmake/kernel.cmake | 296 +++++++++ .../cmake/metallib_to_header.py | 73 +++ kernels-v1/attention-int8/cmake/utils.cmake | 620 ++++++++++++++++++ kernels-v1/attention-int8/compat.py | 26 + kernels-v1/attention-int8/metadata-cpu.json | 4 + kernels-v1/attention-int8/metadata-cuda.json | 4 + kernels-v1/attention-int8/metadata-metal.json | 4 + .../attention-int8/metadata-neuron.json | 4 + 
kernels-v1/attention-int8/metadata-rocm.json | 4 + kernels-v1/attention-int8/metadata-xpu.json | 4 + kernels-v1/attention-int8/pyproject.toml | 23 + kernels-v1/attention-int8/setup.py | 157 +++++ kernels-v1/attention-int8/test_simple.py | 25 + .../torch-ext/attention_int8/_ops.py | 9 + .../attention-int8/torch-ext/registration.h | 30 + 24 files changed, 2121 insertions(+), 7 deletions(-) create mode 100644 kernels-v1/attention-int8/CMakeLists.txt create mode 100644 kernels-v1/attention-int8/cmake/_ops.py.in create mode 100644 kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py create mode 100644 kernels-v1/attention-int8/cmake/build-variants.cmake create mode 100644 kernels-v1/attention-int8/cmake/compile-metal.cmake create mode 100644 kernels-v1/attention-int8/cmake/get_gpu_lang.cmake create mode 100644 kernels-v1/attention-int8/cmake/get_gpu_lang.py create mode 100644 kernels-v1/attention-int8/cmake/hipify.py create mode 100644 kernels-v1/attention-int8/cmake/kernel.cmake create mode 100644 kernels-v1/attention-int8/cmake/metallib_to_header.py create mode 100644 kernels-v1/attention-int8/cmake/utils.cmake create mode 100644 kernels-v1/attention-int8/compat.py create mode 100644 kernels-v1/attention-int8/metadata-cpu.json create mode 100644 kernels-v1/attention-int8/metadata-cuda.json create mode 100644 kernels-v1/attention-int8/metadata-metal.json create mode 100644 kernels-v1/attention-int8/metadata-neuron.json create mode 100644 kernels-v1/attention-int8/metadata-rocm.json create mode 100644 kernels-v1/attention-int8/metadata-xpu.json create mode 100644 kernels-v1/attention-int8/pyproject.toml create mode 100644 kernels-v1/attention-int8/setup.py create mode 100644 kernels-v1/attention-int8/test_simple.py create mode 100644 kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py create mode 100644 kernels-v1/attention-int8/torch-ext/registration.h diff --git a/kernels-v1/attention-int8/CMakeLists.txt b/kernels-v1/attention-int8/CMakeLists.txt new file 
mode 100644 index 0000000..09b1633 --- /dev/null +++ b/kernels-v1/attention-int8/CMakeLists.txt @@ -0,0 +1,251 @@ +cmake_minimum_required(VERSION 3.26) + +# Set Intel SYCL compiler before project() call +find_program(ICX_COMPILER icx) +find_program(ICPX_COMPILER icpx) + +if(ICX_COMPILER OR ICPX_COMPILER) + set(CMAKE_C_COMPILER ${ICX_COMPILER}) + + if(WIN32) + set(CMAKE_CXX_COMPILER ${ICX_COMPILER}) + else() + set(CMAKE_CXX_COMPILER ${ICPX_COMPILER}) + endif() +endif() + +project(attention-int8 LANGUAGES CXX) + +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + +include(FetchContent) +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists +message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") + +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/kernel.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/get_gpu_lang.cmake) + +if(DEFINED Python3_EXECUTABLE) + # Allow passing through the interpreter (e.g. from setup.py). 
+ find_package(Python3 COMPONENTS Development Development.SABIModule Interpreter) + if (NOT Python3_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() +else() + find_package(Python3 REQUIRED COMPONENTS Development Development.SABIModule Interpreter) +endif() + +get_gpu_lang(DETECTED_GPU_LANG) +set(GPU_LANG "${DETECTED_GPU_LANG}" CACHE STRING "GPU language") +gpu_lang_to_backend(BACKEND "${GPU_LANG}") +message(STATUS "Using backend: ${BACKEND}, GPU language: ${GPU_LANG}") + +set(KERNEL_REVISION "dba582b_dirty" CACHE STRING "Kernel revision, must be unique") +set(OPS_NAME "_attention_int8_${BACKEND}_dba582b_dirty") + +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + +find_package(Torch REQUIRED) + +run_python(TORCH_VERSION "import torch; print(torch.__version__.split('+')[0])" "Failed to get Torch version") + + + +option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off) + +if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX") +elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) + set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX") +else() + set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX") +endif() + +# Basic checks for each GPU language. +if(GPU_LANG STREQUAL "CUDA") + if(NOT CUDA_FOUND) + message(FATAL_ERROR "GPU language is set to CUDA, but cannot find CUDA toolkit") + endif() + + + + # This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need + # to set our own set of capabilities. + clear_gencode_flags() + + # Get the capabilities without +PTX suffixes, so that we can use them as + # the target archs in the loose intersection with a kernel's capabilities. 
+ cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}") + + if(BUILD_ALL_SUPPORTED_ARCHS) + set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + else() + try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")" "Failed to get CUDA capability") + if(NOT SUCCESS) + message(WARNING "Failed to detect CUDA capability, using default capabilities.") + set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + endif() + endif() + + message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}") + + if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA") + list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}") + endif() + + # TODO: deprecate one of these settings. + add_compile_definitions(USE_CUDA=1) + add_compile_definitions(CUDA_KERNEL) +elseif(GPU_LANG STREQUAL "HIP") + if(NOT HIP_FOUND) + message(FATAL_ERROR "GPU language is set to HIP, but cannot find ROCm toolkit") + endif() + + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. + enable_language(HIP) + + override_gpu_arches(GPU_ARCHES HIP ${HIP_SUPPORTED_ARCHS}) + set(ROCM_ARCHS ${GPU_ARCHES}) + message(STATUS "ROCM supported target architectures: ${ROCM_ARCHS}") + + # TODO: deprecate one of these settings. 
+ add_compile_definitions(USE_ROCM=1) + add_compile_definitions(ROCM_KERNEL) +elseif(GPU_LANG STREQUAL "CPU") + add_compile_definitions(CPU_KERNEL) + set(CMAKE_OSX_DEPLOYMENT_TARGET "15.0" CACHE STRING "Minimum macOS deployment version") +elseif(GPU_LANG STREQUAL "METAL") + set(CMAKE_OSX_DEPLOYMENT_TARGET "26.0" CACHE STRING "Minimum macOS deployment version") + enable_language(C OBJC OBJCXX) + + add_compile_definitions(METAL_KERNEL) + + # Initialize lists for Metal shader sources and their include directories + set(ALL_METAL_SOURCES) + set(METAL_INCLUDE_DIRS) +elseif(GPU_LANG STREQUAL "SYCL") + if(NOT ICX_COMPILER AND NOT ICPX_COMPILER) + message(FATAL_ERROR "Intel SYCL C++ compiler (icpx) and/or C compiler (icx) not found. Please install Intel oneAPI toolkit.") + endif() + + execute_process( + COMMAND ${ICPX_COMPILER} --version + OUTPUT_VARIABLE ICPX_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+" DPCPP_VERSION "${ICPX_VERSION_OUTPUT}") + set(DPCPP_VERSION "${DPCPP_VERSION}" CACHE STRING "DPCPP major.minor version") + + # On Windows, use icx (MSVC-compatible) for C++ to work with Ninja generator + # On Linux, use icpx (GNU-compatible) for C++ + if(WIN32) + message(STATUS "Using Intel SYCL C++ compiler: ${ICX_COMPILER} and C compiler: ${ICX_COMPILER} Version: ${DPCPP_VERSION} (Windows MSVC-compatible mode)") + else() + message(STATUS "Using Intel SYCL C++ compiler: ${ICPX_COMPILER} and C compiler: ${ICX_COMPILER} Version: ${DPCPP_VERSION}") + endif() + + + set(sycl_link_flags "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required';") + set(sycl_flags "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;") + set(GPU_FLAGS "${sycl_flags}") + 
set(GPU_ARCHES "") + + + add_compile_definitions(XPU_KERNEL) + add_compile_definitions(USE_XPU) +else() + message(FATAL_ERROR "Unsupported GPU language: ${GPU_LANG}") +endif() + +# Initialize SRC list for kernel and binding sources +set(SRC "") + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/build-variants.cmake) + +# Generate build variant name. +if(GPU_LANG STREQUAL "CUDA") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cuda" "${CUDA_VERSION}") +elseif(GPU_LANG STREQUAL "HIP") + run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "rocm" "${ROCM_VERSION}") +elseif(GPU_LANG STREQUAL "SYCL") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "xpu" "${DPCPP_VERSION}") +elseif(GPU_LANG STREQUAL "METAL") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "metal" "") +elseif(GPU_LANG STREQUAL "CPU") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cpu" "") +else() + message(FATAL_ERROR "Cannot generate build name for unknown GPU_LANG: ${GPU_LANG}") +endif() + +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/_ops.py.in + ${CMAKE_CURRENT_SOURCE_DIR}/torch-ext/attention_int8/_ops.py + @ONLY +) + +if(GPU_LANG STREQUAL "CUDA") + get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG}) + list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS}) +endif() + +set(TORCH_attention-int8_SRC + torch-ext/torch_binding.cpp torch-ext/torch_binding.h +) + + +list(APPEND SRC "${TORCH_attention-int8_SRC}") +cuda_kernel_component(SRC + SOURCES "attention_int8_cuda/attention_int8.cu" + ) +# Include Metal shader compilation utilities if needed +if(GPU_LANG STREQUAL "METAL") + include(${CMAKE_CURRENT_LIST_DIR}/cmake/compile-metal.cmake) +endif() + +# Define the extension target with unified parameters +define_gpu_extension_target( + ${OPS_NAME} + ${OPS_NAME} + DESTINATION ${OPS_NAME} + LANGUAGE 
${GPU_LANG} + SOURCES ${SRC} + COMPILE_FLAGS ${GPU_FLAGS} + ARCHITECTURES ${GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) + +if(NOT (MSVC OR GPU_LANG STREQUAL "SYCL")) + target_link_options(${OPS_NAME} PRIVATE -static-libstdc++) +endif() + +if(GPU_LANG STREQUAL "SYCL") + target_link_options(${OPS_NAME} PRIVATE ${sycl_link_flags}) + target_link_libraries(${OPS_NAME} PRIVATE dnnl) +endif() + +# Compile Metal shaders if any were found +if(GPU_LANG STREQUAL "METAL") + if(ALL_METAL_SOURCES) + compile_metal_shaders(${OPS_NAME} "${ALL_METAL_SOURCES}" "${METAL_INCLUDE_DIRS}") + endif() +endif() + + +# Add kernels_install target for huggingface/kernels library layout +add_kernels_install_target(${OPS_NAME} "attention_int8" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "" + GPU_ARCHS "${ALL_GPU_ARCHS}") + +# Add local_install target for local development with get_local_kernel() +add_local_install_target(${OPS_NAME} "attention_int8" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "" + GPU_ARCHS "${ALL_GPU_ARCHS}") diff --git a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu index 9c436bb..c05f352 100644 --- a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu +++ b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu @@ -204,17 +204,18 @@ int8_attention_kernel( const float inv_sqrt_d = rsqrtf((float)HEAD_DIM); + float lkmax_global = 0.f; + + for (int i = tid; i < N * HEAD_DIM; i += THREADS) lkmax_global = fmaxf(lkmax_global, fabsf(__half2float(K_head[i]))); + + float abs_max_K_global = block_reduce_max(lkmax_global, warp_scr); + const float inv_K = 127.f / fmaxf(abs_max_K_global * ts, 1e-6f); + const float scl_K = 1.f / inv_K; + // Stream K tiles for (int k_start = 0; k_start < N; k_start += BK) { const int k_size = min(BK, N - k_start); - float lkmax = 0.f; - for (int i = tid; i < k_size * HEAD_DIM; i += THREADS) - lkmax = fmaxf(lkmax, fabsf(__half2float(K_head[k_start * HEAD_DIM + 
i]))); - float abs_max_K = block_reduce_max(lkmax, warp_scr); - const float inv_K = 127.f / fmaxf(abs_max_K * ts, 1e-6f); - const float scl_K = 1.f / inv_K; - // [F5][G1] Fused quantize + transpose K load_and_quantize_K_transposed( K_head + k_start * HEAD_DIM, K_i8_T, k_size, tid, inv_K); diff --git a/kernels-v1/attention-int8/cmake/_ops.py.in b/kernels-v1/attention-int8/cmake/_ops.py.in new file mode 100644 index 0000000..736771e --- /dev/null +++ b/kernels-v1/attention-int8/cmake/_ops.py.in @@ -0,0 +1,9 @@ +import torch +from . import @OPS_NAME@ +ops = torch.ops.@OPS_NAME@ + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"@OPS_NAME@::{op_name}" diff --git a/kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py b/kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py new file mode 100644 index 0000000..93174db --- /dev/null +++ b/kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py @@ -0,0 +1,55 @@ +import argparse +import json +import sys + + +def main(): + parser = argparse.ArgumentParser( + description="Write a metadata JSON file with GPU architecture information, " + "reading from a source file and writing to a destination." 
+ ) + parser.add_argument( + "input", + help="Path to the source metadata JSON file to read from.", + ) + parser.add_argument( + "destination", + help="Path to write the output metadata JSON file to.", + ) + parser.add_argument( + "--backend", + required=True, + choices=["cuda", "rocm"], + help="GPU backend type.", + ) + parser.add_argument( + "--archs", + required=True, + help="Semicolon-separated list of GPU architectures/capabilities.", + ) + args = parser.parse_args() + + archs = sorted(set(a for a in args.archs.split(";") if a)) + + try: + with open(args.input) as f: + data = json.load(f) + except FileNotFoundError: + print(f"Error: input metadata file not found: {args.input}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: failed to parse input metadata JSON: {e}", file=sys.stderr) + sys.exit(1) + + data["backend"] = { + "type": args.backend, + "archs": archs, + } + + with open(args.destination, "w") as f: + json.dump(data, f, indent=2) + f.write("\n") + + +if __name__ == "__main__": + main() diff --git a/kernels-v1/attention-int8/cmake/build-variants.cmake b/kernels-v1/attention-int8/cmake/build-variants.cmake new file mode 100644 index 0000000..b889772 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/build-variants.cmake @@ -0,0 +1,298 @@ +# Generate a standardized build variant name following the pattern: +# torch-[cxx11-]-- +# +# Arguments: +# OUT_BUILD_NAME - Output variable name +# TORCH_VERSION - PyTorch version (e.g., "2.7.1") +# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu, cpu +# COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm) +# Optional for CPU-only builds (pass empty string or omit) +# Example output: torch271-cxx11-cu124-x86_64-linux (Linux) +# torch271-cu124-x86_64-windows (Windows) +# torch271-metal-aarch64-darwin (macOS) +# +function(generate_build_name OUT_BUILD_NAME TORCH_VERSION COMPUTE_FRAMEWORK COMPUTE_VERSION) + # Flatten version by removing dots and 
padding to 2 components + string(REPLACE "." ";" VERSION_LIST "${TORCH_VERSION}") + list(LENGTH VERSION_LIST VERSION_COMPONENTS) + + # Pad to at least 2 components + if(VERSION_COMPONENTS LESS 2) + list(APPEND VERSION_LIST "0") + endif() + + # Take first 2 components and join without dots + list(GET VERSION_LIST 0 MAJOR) + list(GET VERSION_LIST 1 MINOR) + set(FLATTENED_TORCH "${MAJOR}${MINOR}") + + # Generate compute string + if(COMPUTE_FRAMEWORK STREQUAL "cuda") + # Flatten CUDA version (e.g., "12.4" -> "124") + string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}") + list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS) + if(COMPUTE_COMPONENTS GREATER_EQUAL 2) + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR) + set(COMPUTE_STRING "cu${COMPUTE_MAJOR}${COMPUTE_MINOR}") + else() + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + set(COMPUTE_STRING "cu${COMPUTE_MAJOR}0") + endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "rocm") + # Flatten ROCm version (e.g., "6.0" -> "60") + string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}") + list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS) + if(COMPUTE_COMPONENTS GREATER_EQUAL 2) + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR) + set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}${COMPUTE_MINOR}") + else() + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}0") + endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "xpu") + # Flatten XPU version (e.g., "2025.2" -> "202552") + string(REPLACE "." 
";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}") + list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS) + if(COMPUTE_COMPONENTS GREATER_EQUAL 2) + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR) + set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}${COMPUTE_MINOR}") + else() + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}0") + endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "metal") + set(COMPUTE_STRING "metal") + elseif(COMPUTE_FRAMEWORK STREQUAL "cpu") + set(COMPUTE_STRING "cpu") + else() + message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}") + endif() + + # Detect from target system (CMAKE_SYSTEM_* variables refer to target, not host) + # Normalize architecture name + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64|AMD64)$") + set(CPU_ARCH "x86_64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64)$") + set(CPU_ARCH "aarch64") + else() + message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}") + endif() + + # Normalize OS name + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(OS_NAME "windows") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(OS_NAME "linux") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OS_NAME "darwin") + else() + message(WARNING "Unknown OS ${CMAKE_SYSTEM_NAME}, using as-is") + string(TOLOWER "${CMAKE_SYSTEM_NAME}" OS_NAME) + endif() + + set(ARCH_OS_STRING "${CPU_ARCH}-${OS_NAME}") + + # Assemble the final build name + # For Linux, include cxx11 ABI indicator for compatibility + if(ARCH_OS_STRING MATCHES "-linux$") + set(BUILD_NAME "torch${FLATTENED_TORCH}-cxx11-${COMPUTE_STRING}-${ARCH_OS_STRING}") + else() + set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-${ARCH_OS_STRING}") + endif() + + set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE) + message(STATUS "Generated build name: ${BUILD_NAME}") +endfunction() + +# +# Create a custom install target for the huggingface/kernels library layout. 
+# This installs the extension into a directory structure suitable for kernel hub discovery: +# / +# +# Arguments: +# TARGET_NAME - Name of the target to create the install rule for +# PACKAGE_NAME - Python package name (e.g., "activation") +# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux") +# INSTALL_PREFIX - Base installation directory (defaults to CMAKE_INSTALL_PREFIX) +# GPU_ARCHS - List of GPU architectures that were compiled +# (optional; when provided for CUDA/ROCm, metadata.json will include +# a "backend" key with the type and arch list) +# +function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) + set(oneValueArgs INSTALL_PREFIX) + set(multiValueArgs DATA_EXTENSIONS GPU_ARCHS) + cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT ARG_INSTALL_PREFIX) + set(ARG_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + endif() + + gpu_lang_to_backend(_BACKEND ${GPU_LANG}) + + # Always include 'py' extension for Python files + set(ALL_EXTENSIONS ${ARG_DATA_EXTENSIONS}) + list(APPEND ALL_EXTENSIONS "py") + + # Set the installation directory + set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}") + + message(STATUS "Using PACKAGE_NAME: ${PACKAGE_NAME}") + + # Install the compiled extension using CMake's install() command + # This will be triggered by the standard INSTALL target + install(TARGETS ${TARGET_NAME} + LIBRARY DESTINATION "${KERNEL_INSTALL_DIR}" + RUNTIME DESTINATION "${KERNEL_INSTALL_DIR}" + COMPONENT ${TARGET_NAME}) + + # Install data files with specified extensions + foreach(ext IN LISTS ALL_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + install(FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file}" + DESTINATION 
"${KERNEL_INSTALL_DIR}/${data_file_dir}" + COMPONENT ${TARGET_NAME}) + endforeach() + endforeach() + + message(STATUS "GPU archs: ${ARG_GPU_ARCHS}") + + # Add the GPU archs to matadata.json when applicable. + if((GPU_LANG STREQUAL "CUDA" OR GPU_LANG STREQUAL "HIP") AND ARG_GPU_ARCHS) + list(JOIN ARG_GPU_ARCHS ";" _GPU_ARCHS_STR) + install(CODE " + file(MAKE_DIRECTORY \"${KERNEL_INSTALL_DIR}\") + execute_process( + COMMAND \"${Python3_EXECUTABLE}\" + \"${CMAKE_CURRENT_LIST_DIR}/cmake/add_gpu_arch_metadata.py\" + \"${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json\" + \"${KERNEL_INSTALL_DIR}/metadata.json\" + --backend \"${_BACKEND}\" + --archs \"${_GPU_ARCHS_STR}\" + RESULT_VARIABLE _METADATA_RESULT + ERROR_VARIABLE _METADATA_ERROR + ) + if(NOT _METADATA_RESULT EQUAL 0) + message(WARNING \"Failed to add GPU arch metadata: \${_METADATA_ERROR}\") + endif() + " COMPONENT ${TARGET_NAME}) + else() + install(FILES ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + DESTINATION "${KERNEL_INSTALL_DIR}" + RENAME "metadata.json" + COMPONENT ${TARGET_NAME}) + endif() + + # Compatibility with older kernels and direct Python imports. + install(FILES ${CMAKE_SOURCE_DIR}/compat.py + DESTINATION "${KERNEL_INSTALL_DIR}/${PACKAGE_NAME}" + RENAME "__init__.py" + COMPONENT ${TARGET_NAME}) + + message(STATUS "Added install rules for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}") +endfunction() + +# +# Add install rules for local development with huggingface/kernels. +# This installs the extension into the layout expected by get_local_kernel(): +# ${CMAKE_SOURCE_DIR}/build// +# +# This allows developers to use get_local_kernel() from the kernels library to load +# locally built kernels without needing to publish to the hub. +# +# This uses the standard CMake install() command, so it works with the default +# "install" target that is always available. 
+# +# Arguments: +# TARGET_NAME - Name of the target to create the install rule for +# PACKAGE_NAME - Python package name (e.g., "activation") +# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux") +# GPU_ARCHS - List of GPU architectures that were compiled +# (optional; when provided for CUDA/ROCm, metadata.json will include +# a "backend" key with the type and arch list) +# +function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) + set(oneValueArgs) + set(multiValueArgs DATA_EXTENSIONS GPU_ARCHS) + cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # Always include 'py' extension for Python files + set(ALL_EXTENSIONS ${ARG_DATA_EXTENSIONS}) + list(APPEND ALL_EXTENSIONS "py") + + # Define your local, folder based, installation directory + set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") + # Variant directory is where metadata.json should go (for kernels upload discovery) + set(VARIANT_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") + + # Create a custom target for local installation + add_custom_target(local_install + COMMENT "Installing files to local directory..." 
+ ) + + gpu_lang_to_backend(_BACKEND ${GPU_LANG}) + + # Copy data files with specified extensions + foreach(ext IN LISTS ALL_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${LOCAL_INSTALL_DIR}/${data_file_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file} + ${LOCAL_INSTALL_DIR}/${data_file_dir}/ + COMMENT "Copying ${data_file} to ${LOCAL_INSTALL_DIR}/${data_file_dir}" + ) + endforeach() + endforeach() + + + # Add the GPU archs to matadata.json when applicable. + if((GPU_LANG STREQUAL "CUDA" OR GPU_LANG STREQUAL "HIP") AND ARG_GPU_ARCHS) + list(JOIN ARG_GPU_ARCHS ";" _GPU_ARCHS_STR) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${VARIANT_DIR} + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_LIST_DIR}/cmake/add_gpu_arch_metadata.py + ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + ${VARIANT_DIR}/metadata.json + --backend ${_BACKEND} + --archs "${_GPU_ARCHS_STR}" + COMMENT "Writing metadata.json with GPU arch info to ${VARIANT_DIR}" + ) + else() + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${VARIANT_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + ${VARIANT_DIR}/metadata.json + COMMENT "Copying metadata.json to ${VARIANT_DIR}" + ) + endif() + + add_custom_command(TARGET local_install POST_BUILD + # Copy the shared library + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${LOCAL_INSTALL_DIR}/ + + # Compatibility with older kernels and direct Python imports. 
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/compat.py + ${VARIANT_DIR}/${PACKAGE_NAME}/__init__.py + + COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}" + COMMAND_EXPAND_LISTS + ) + + # Create both directories: variant dir for metadata.json, package dir for binaries + file(MAKE_DIRECTORY ${VARIANT_DIR}) + file(MAKE_DIRECTORY ${LOCAL_INSTALL_DIR}) + message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}") +endfunction() diff --git a/kernels-v1/attention-int8/cmake/compile-metal.cmake b/kernels-v1/attention-int8/cmake/compile-metal.cmake new file mode 100644 index 0000000..50d44a2 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/compile-metal.cmake @@ -0,0 +1,104 @@ +# Metal shader compilation function +function(compile_metal_shaders TARGET_NAME METAL_SOURCES EXTRA_INCLUDE_DIRS) + if(NOT DEFINED METAL_TOOLCHAIN) + execute_process( + COMMAND "xcodebuild" "-showComponent" "MetalToolchain" + OUTPUT_VARIABLE FIND_METAL_OUT + RESULT_VARIABLE FIND_METAL_ERROR_CODE + ERROR_VARIABLE FIND_METAL_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT FIND_METAL_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${FIND_METAL_STDERR}") + endif() + + # Extract the Toolchain Search Path value and append Metal.xctoolchain + string(REGEX MATCH "Toolchain Search Path: ([^\n]+)" MATCH_RESULT "${FIND_METAL_OUT}") + set(METAL_TOOLCHAIN "${CMAKE_MATCH_1}/Metal.xctoolchain") + endif() + + # Set Metal compiler flags + set(METAL_FLAGS "-std=metal4.0" "-O2") + + # Output directory for compiled metallib + set(METALLIB_OUTPUT_DIR "${CMAKE_BINARY_DIR}/metallib") + file(MAKE_DIRECTORY ${METALLIB_OUTPUT_DIR}) + + foreach(INC ${EXTRA_INCLUDE_DIRS}) + list(APPEND METAL_FLAGS "-I${INC}") + endforeach() + + # Separate .metal files from .h files and compile .metal files to .air + set(AIR_FILES) + set(METAL_FILES) + set(HEADER_FILES) + + foreach(SOURCE_FILE ${METAL_SOURCES}) + if(SOURCE_FILE MATCHES "\\.metal$") + 
list(APPEND METAL_FILES ${SOURCE_FILE}) + elseif(SOURCE_FILE MATCHES "\\.h$") + list(APPEND HEADER_FILES ${SOURCE_FILE}) + endif() + endforeach() + + foreach(METAL_FILE ${METAL_FILES}) + get_filename_component(METAL_NAME ${METAL_FILE} NAME_WE) + set(AIR_FILE "${CMAKE_BINARY_DIR}/${METAL_NAME}.air") + + # Include header files as dependencies + set(ALL_DEPENDENCIES ${CMAKE_CURRENT_SOURCE_DIR}/${METAL_FILE}) + foreach(HEADER_FILE ${HEADER_FILES}) + list(APPEND ALL_DEPENDENCIES ${CMAKE_CURRENT_SOURCE_DIR}/${HEADER_FILE}) + endforeach() + + add_custom_command( + OUTPUT ${AIR_FILE} + COMMAND "${METAL_TOOLCHAIN}/usr/bin/metal" ${METAL_FLAGS} + -c ${CMAKE_CURRENT_SOURCE_DIR}/${METAL_FILE} + -o ${AIR_FILE} + DEPENDS ${ALL_DEPENDENCIES} + COMMENT "Compiling Metal shader ${METAL_FILE} to ${AIR_FILE}" + VERBATIM + ) + + list(APPEND AIR_FILES ${AIR_FILE}) + endforeach() + + # Link all .air files into a single .metallib + set(METALLIB_FILE "${METALLIB_OUTPUT_DIR}/${TARGET_NAME}.metallib") + add_custom_command( + OUTPUT ${METALLIB_FILE} + COMMAND "${METAL_TOOLCHAIN}/usr/bin/metallib" ${AIR_FILES} + -o ${METALLIB_FILE} + DEPENDS ${AIR_FILES} + COMMENT "Linking Metal library ${METALLIB_FILE}" + VERBATIM + ) + + # Generate C++ header with embedded metallib data + set(METALLIB_HEADER "${CMAKE_BINARY_DIR}/${TARGET_NAME}_metallib.h") + set(METALLIB_TO_HEADER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/metallib_to_header.py") + + add_custom_command( + OUTPUT ${METALLIB_HEADER} + COMMAND ${Python3_EXECUTABLE} ${METALLIB_TO_HEADER_SCRIPT} ${METALLIB_FILE} ${METALLIB_HEADER} ${TARGET_NAME} + DEPENDS ${METALLIB_FILE} ${METALLIB_TO_HEADER_SCRIPT} + COMMENT "Generating embedded Metal library header ${METALLIB_HEADER}" + VERBATIM + ) + + # Create a custom target for the metallib + add_custom_target(${TARGET_NAME}_metallib ALL DEPENDS ${METALLIB_FILE} ${METALLIB_HEADER}) + + # Add dependency to main target + add_dependencies(${TARGET_NAME} ${TARGET_NAME}_metallib) + + # Add the generated header 
to include directories + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_BINARY_DIR}) + + # Pass the metallib header and namespace as compile definitions + target_compile_definitions(${TARGET_NAME} PRIVATE + EMBEDDED_METALLIB_HEADER="${TARGET_NAME}_metallib.h" + EMBEDDED_METALLIB_NAMESPACE=${TARGET_NAME}_metal + ) +endfunction() diff --git a/kernels-v1/attention-int8/cmake/get_gpu_lang.cmake b/kernels-v1/attention-int8/cmake/get_gpu_lang.cmake new file mode 100644 index 0000000..004f219 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/get_gpu_lang.cmake @@ -0,0 +1,17 @@ +# +# Get the GPU language from Torch. +# +function(get_gpu_lang OUT) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/get_gpu_lang.py" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "Cannot detect GPU language: ${PYTHON_STDERR}") + endif() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() diff --git a/kernels-v1/attention-int8/cmake/get_gpu_lang.py b/kernels-v1/attention-int8/cmake/get_gpu_lang.py new file mode 100644 index 0000000..1eedff7 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/get_gpu_lang.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import sys + +try: + import torch +except ImportError: + print("Torch is required for configuring a kernel build.", file=sys.stderr) + sys.exit(1) + +if torch.version.cuda is not None: + print("CUDA") +elif torch.version.hip is not None: + print("HIP") +elif torch.backends.mps.is_available(): + print("METAL") +elif hasattr(torch.version, "xpu") and torch.version.xpu is not None: + print("SYCL") +else: + print("CPU") diff --git a/kernels-v1/attention-int8/cmake/hipify.py b/kernels-v1/attention-int8/cmake/hipify.py new file mode 100644 index 0000000..a1539c0 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/hipify.py @@ -0,0 +1,76 @@ +#!/usr/bin/env 
python3 +# SPDX-License-Identifier: Apache-2.0 + +# From vLLM: https://github.com/vllm-project/vllm/blob/main/cmake/hipify.py + +# +# A command line tool for running pytorch's hipify preprocessor on CUDA +# source files. +# +# See https://github.com/ROCm/hipify_torch +# and /utils/hipify/hipify_python.py +# + +import argparse +import os +import shutil + +from torch.utils.hipify.hipify_python import hipify + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Project directory where all the source + include files live. + parser.add_argument( + "-p", + "--project_dir", + help="The project directory.", + ) + + # Directory where hipified files are written. + parser.add_argument( + "-o", + "--output_dir", + help="The output directory.", + ) + + # Source files to convert. + parser.add_argument("sources", + help="Source files to hipify.", + nargs="*", + default=[]) + + args = parser.parse_args() + + # Limit include scope to project_dir only + includes = [os.path.join(args.project_dir, '*')] + + # Get absolute path for all source files. + extra_files = [os.path.abspath(s) for s in args.sources] + + # Copy sources from project directory to output directory. + # The directory might already exist to hold object files so we ignore that. + shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) + + hipify_result = hipify(project_directory=args.project_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) + + hipified_sources = [] + for source in args.sources: + s_abs = os.path.abspath(source) + hipified_s_abs = (hipify_result[s_abs].hipified_path if + (s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None) + else s_abs) + hipified_sources.append(hipified_s_abs) + + assert (len(hipified_sources) == len(args.sources)) + + # Print hipified source files. 
+ print("\n".join(hipified_sources)) diff --git a/kernels-v1/attention-int8/cmake/kernel.cmake b/kernels-v1/attention-int8/cmake/kernel.cmake new file mode 100644 index 0000000..454f3e0 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/kernel.cmake @@ -0,0 +1,296 @@ +function(accumulate_gpu_archs OUT_ACC ACC EXTRA_ARCHS) + list(APPEND ACC ${EXTRA_ARCHS}) + list(REMOVE_DUPLICATES ACC) + list(SORT ACC) + set(${OUT_ACC} ${ACC} PARENT_SCOPE) +endfunction() + +function(cuda_kernel_component SRC_VAR) + set(options SUPPORTS_HIPIFY) + set(oneValueArgs CUDA_MINVER) + set(multiValueArgs SOURCES INCLUDES CUDA_CAPABILITIES CUDA_FLAGS CXX_FLAGS HIP_FLAGS ROCM_ARCHS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "cuda_kernel_component: SOURCES argument is required") + endif() + + # Bail out if this component is not supported by the CUDA version. + if(KERNEL_CUDA_MINVER) + if(CUDA_VERSION VERSION_LESS ${KERNEL_CUDA_MINVER}) + return() + endif() + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + if(KERNEL_INCLUDES) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_KERNEL_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + if(GPU_LANG STREQUAL "CUDA") + # Determine CUDA architectures + if(KERNEL_CUDA_CAPABILITIES) + cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}") + else() + set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}") + endif() + message(STATUS "CUDA kernel capabilities: ${_KERNEL_ARCHS}") + set_gencode_flags_for_srcs(SRCS "${_KERNEL_SRC}" CUDA_ARCHS "${_KERNEL_ARCHS}") + + accumulate_gpu_archs(_ALL_GPU_ARCHS "${ALL_GPU_ARCHS}" "${_KERNEL_ARCHS}") + set(ALL_GPU_ARCHS ${_ALL_GPU_ARCHS} PARENT_SCOPE) + + # Apply CUDA-specific compile flags + if(KERNEL_CUDA_FLAGS) + set(_CUDA_FLAGS 
"${KERNEL_CUDA_FLAGS}") + # -static-global-template-stub is not supported on CUDA < 12.8. Remove this + # once we don't support CUDA 12.6 anymore. + if(CUDA_VERSION VERSION_LESS 12.8) + string(REGEX REPLACE "-static-global-template-stub=(true|false)" "" _CUDA_FLAGS "${_CUDA_FLAGS}") + endif() + + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.cu$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${_CUDA_FLAGS}>" + ) + endif() + endforeach() + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) + endforeach() + endif() + + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) + + elseif(GPU_LANG STREQUAL "HIP") + if(NOT KERNEL_SUPPORTS_HIPIFY) + message(WARNING "Kernel does not support HIP") + return() + endif() + + # Apply HIP-specific compile flags + if(KERNEL_HIP_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cu|hip)$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_HIP_FLAGS}>" + ) + endif() + endforeach() + endif() + + # Determine ROCm architectures + if(KERNEL_ROCM_ARCHS) + hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}") + else() + set(_KERNEL_ARCHS "${ROCM_ARCHS}") + endif() + message(STATUS "HIP kernel archs: ${_KERNEL_ARCHS}") + + accumulate_gpu_archs(_ALL_GPU_ARCHS "${ALL_GPU_ARCHS}" "${_KERNEL_ARCHS}") + set(ALL_GPU_ARCHS ${_ALL_GPU_ARCHS} PARENT_SCOPE) + + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cu|hip)$") + foreach(_ARCH ${_KERNEL_ARCHS}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:--offload-arch=${_ARCH}>" + ) + endforeach() + endif() + endforeach() + + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) + endif() +endfunction() + 
+function(xpu_kernel_component SRC_VAR) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES INCLUDES CXX_FLAGS SYCL_FLAGS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "xpu_kernel_component: SOURCES argument is required") + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + # Handle per-file include directories if specified + if(KERNEL_INCLUDES) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_KERNEL_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) + endforeach() + endif() + + # Add SYCL-specific compilation flags for XPU sources + if(KERNEL_SYCL_FLAGS) + # Use kernel-specific SYCL flags + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cpp|cxx|cc)$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_SYCL_FLAGS}>" + ) + endif() + endforeach() + else() + # Use default SYCL flags (from parent scope variable sycl_flags) + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cpp|cxx|cc)$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${sycl_flags}>" + ) + endif() + endforeach() + endif() + + # Append to parent scope SRC variable + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) +endfunction() + +function(cpu_kernel_component SRC_VAR) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES INCLUDES CXX_FLAGS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "cpu_kernel_component: SOURCES argument is 
required") + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + # Handle per-file include directories if specified + if(KERNEL_INCLUDES) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_KERNEL_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) + endforeach() + endif() + + # Append to parent scope SRC variable + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) +endfunction() + +function(metal_kernel_component SRC_VAR) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES INCLUDES CXX_FLAGS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "metal_kernel_component: SOURCES argument is required") + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + # Separate Metal shader files from other sources + set(_METAL_SRC) + set(_CPP_SRC) + + foreach(_SRC_FILE IN LISTS _KERNEL_SRC) + if(_SRC_FILE MATCHES "\\.(metal|h)$") + list(APPEND _METAL_SRC ${_SRC_FILE}) + else() + list(APPEND _CPP_SRC ${_SRC_FILE}) + endif() + endforeach() + + # Handle per-file include directories if specified (for C++ sources only) + if(KERNEL_INCLUDES AND _CPP_SRC) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_CPP_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS AND _CPP_SRC) + foreach(_SRC ${_CPP_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) 
+ endforeach() + endif() + + # Add C++ sources to main source list + if(_CPP_SRC) + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_CPP_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) + endif() + + # Keep track of Metal sources for later compilation + if(_METAL_SRC) + set(_TMP_METAL ${ALL_METAL_SOURCES}) + list(APPEND _TMP_METAL ${_METAL_SRC}) + set(ALL_METAL_SOURCES ${_TMP_METAL} PARENT_SCOPE) + endif() + + # Keep the includes directory for the Metal sources + if(KERNEL_INCLUDES AND _METAL_SRC) + set(_TMP_METAL_INCLUDES ${METAL_INCLUDE_DIRS}) + list(APPEND _TMP_METAL_INCLUDES ${KERNEL_INCLUDES}) + set(METAL_INCLUDE_DIRS ${_TMP_METAL_INCLUDES} PARENT_SCOPE) + endif() +endfunction() diff --git a/kernels-v1/attention-int8/cmake/metallib_to_header.py b/kernels-v1/attention-int8/cmake/metallib_to_header.py new file mode 100644 index 0000000..82bd252 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/metallib_to_header.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import sys +import os + +def convert_metallib_to_header(metallib_path: str, header_path: str, target_name: str) -> None: + """Convert a metallib binary file to a C++ header with embedded data.""" + + # Read the metallib binary data + with open(metallib_path, 'rb') as f: + data: bytes = f.read() + + # Generate the header content + header_content: str = """// Auto-generated file containing embedded Metal library +#pragma once +#include +#include + +namespace """ + target_name + """_metal { + static const unsigned char metallib_data[] = { +""" + + # Convert binary data to C array format + bytes_per_line: int = 16 + for i in range(0, len(data), bytes_per_line): + chunk: bytes = data[i:i + bytes_per_line] + hex_values: str = ', '.join('0x{:02x}'.format(b) for b in chunk) + header_content += " " + hex_values + "," + if i + bytes_per_line < len(data): + header_content += "\n" + + header_content += """ + }; + static const size_t metallib_data_len = """ + str(len(data)) + """; + + // Convenience function to 
create Metal library from embedded data + inline id createLibrary(id device, NSError** error = nullptr) { + dispatch_data_t libraryData = dispatch_data_create( + metallib_data, + metallib_data_len, + dispatch_get_main_queue(), + ^{ /* No cleanup needed for static data */ }); + + NSError* localError = nil; + id library = [device newLibraryWithData:libraryData error:&localError]; + + if (error) { + *error = localError; + } + + return library; + } +} // namespace """ + target_name + """_metal +""" + + # Write the header file + dir_path: str = os.path.dirname(header_path) + if dir_path: + os.makedirs(dir_path, exist_ok=True) + with open(header_path, 'w') as f: + f.write(header_content) + + print("Generated {} ({} bytes)".format(header_path, len(data))) + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: metallib_to_header.py ") + sys.exit(1) + + metallib_path: str = sys.argv[1] + header_path: str = sys.argv[2] + target_name: str = sys.argv[3] + + convert_metallib_to_header(metallib_path, header_path, target_name) \ No newline at end of file diff --git a/kernels-v1/attention-int8/cmake/utils.cmake b/kernels-v1/attention-int8/cmake/utils.cmake new file mode 100644 index 0000000..005c2e0 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/utils.cmake @@ -0,0 +1,620 @@ +# Vendored from vLLM: +# +# https://github.com/vllm-project/vllm/blob/main/cmake/utils.cmake +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. 
+# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python3_EXECUTABLE ${EXECUTABLE}) + find_package(Python3 COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python3_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(_VER "${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +function (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, `SUCCESS` is set to FALSE. If successful, `SUCCESS` is set to TRUE. 
+# +function (try_run_python OUT SUCCESS EXPR) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + set(${SUCCESS} FALSE PARENT_SCOPE) + set(${OUT} "" PARENT_SCOPE) + else() + set(${SUCCESS} TRUE PARENT_SCOPE) + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) + endif() +endfunction() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(_PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(NODUP_SRCS ${ORIG_SRCS}) + list(REMOVE_DUPLICATES NODUP_SRCS) + set(SRCS ${NODUP_SRCS}) + set(CXX_SRCS ${NODUP_SRCS}) + list(FILTER SRCS INCLUDE REGEX "\.cu$") + list(FILTER CXX_SRCS EXCLUDE REGEX "\.cu$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. + # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + get_source_file_property(include_dirs "${SRC}" INCLUDE_DIRECTORIES) + get_source_file_property(compile_options "${SRC}" COMPILE_OPTIONS) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + + if(include_dirs) + # Copy over include directories from the original CUDA file. 
+ set_source_files_properties( + ${SRC} + PROPERTIES INCLUDE_DIRECTORIES "${include_dirs}") + endif() + + if(compile_options) + set_source_files_properties( + ${SRC} + PROPERTIES COMPILE_OPTIONS "${compile_options}") + endif() + + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + add_custom_target( + hipify${NAME} + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR} -o ${CMAKE_CURRENT_BINARY_DIR} ${SRCS} + DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + list(APPEND HIP_SRCS ${CXX_SRCS}) + set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) +endfunction() + +# +# Get additional GPU compiler flags from torch. +# +function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) + if (${GPU_LANG} STREQUAL "CUDA") + # + # Get common NVCC flags from torch. + # + run_python(GPU_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND GPU_FLAGS "-DENABLE_FP8") + list(REMOVE_ITEM GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + endif() + + elseif(${GPU_LANG} STREQUAL "HIP") + # + # Get common HIP/HIPCC flags from torch. + # + run_python(GPU_FLAGS + "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + list(APPEND GPU_FLAGS + "-DUSE_ROCM" + "-DENABLE_FP8" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") + + endif() + set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) +endfunction() + +# Macro for converting a `gencode` version number to a cmake version number. 
+macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() + +# +# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`. +# +# Example: +# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" +# clear_gencode_flags() +# CMAKE_CUDA_FLAGS="-Wall" +# +macro(clear_gencode_flags) + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) +endmacro() + +# +# Extract unique CUDA architectures from a list of compute capabilities codes in +# the form `[]`, convert them to the form sort +# `.`, dedupes them and then sorts them in ascending order and +# stores them in `OUT_ARCHES`. +# +# Example: +# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" +# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS) +# OUT_ARCHES="7.5;...;9.0" +function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) + set(_CUDA_ARCHES) + foreach(_ARCH ${CUDA_ARCH_FLAGS}) + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) + if (_COMPUTE) + set(_COMPUTE ${CMAKE_MATCH_1}) + endif() + + string_to_ver(_COMPUTE_VER ${_COMPUTE}) + list(APPEND _CUDA_ARCHES ${_COMPUTE_VER}) + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHES) + list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING) + set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE) +endfunction() + +# +# For a specific file set the `-gencode` flag in compile options conditionally +# for the CUDA language. +# +# Example: +# set_gencode_flag_for_srcs( +# SRCS "foo.cu" +# ARCH "compute_75" +# CODE "sm_75") +# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for +# `foo.cu` (only for the CUDA language). 
+# +macro(set_gencode_flag_for_srcs) + set(options) + set(oneValueArgs ARCH CODE) + set(multiValueArgs SRCS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE}) + set_property( + SOURCE ${arg_SRCS} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${_FLAG}>" + ) + + message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}") +endmacro(set_gencode_flag_for_srcs) + +# +# For a list of source files set the `-gencode` flags in the files specific +# compile options (specifically for the CUDA language). +# +# arguments are: +# SRCS: list of source files +# CUDA_ARCHS: list of CUDA architectures in the form `.[letter]` +# BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built +# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS +# that is larger than BUILD_PTX_FOR_ARCH. +# +macro(set_gencode_flags_for_srcs) + set(options) + set(oneValueArgs BUILD_PTX_FOR_ARCH) + set(multiValueArgs SRCS CUDA_ARCHS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + foreach(_ARCH ${arg_CUDA_ARCHS}) + # handle +PTX suffix: generate both sm and ptx codes if requested + string(FIND "${_ARCH}" "+PTX" _HAS_PTX) + if(NOT _HAS_PTX EQUAL -1) + string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}") + string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "compute_${_STRIPPED_ARCH}") + else() + string(REPLACE "." 
"" _STRIPPED_ARCH "${_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + endif() + endforeach() + + if (${arg_BUILD_PTX_FOR_ARCH}) + list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH) + if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH}) + string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_PTX_ARCH}" + CODE "compute_${_PTX_ARCH}") + endif() + endif() +endmacro() + +# +# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form +# `.[letter]` compute the "loose intersection" with the +# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in +# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there +# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the +# architecture in `SRC_CUDA_ARCHS`. +# The loose intersection is defined as: +# { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } +# where `<=` is the version comparison operator. +# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version +# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. +# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is +# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add +# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). +# The result is stored in `OUT_CUDA_ARCHS`. 
#
# Example:
#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
#
# Example With PTX:
#   SRC_CUDA_ARCHS="8.0+PTX"
#   TGT_CUDA_ARCHS="9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
#   OUT_CUDA_ARCHS="8.0+PTX"
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
  set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
  set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})

  # Strip any "+PTX" suffix for matching purposes, remembering which base
  # architectures requested PTX so the suffix can be re-attached at the end.
  set(_PTX_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
    if(_arch MATCHES "\\+PTX$")
      string(REPLACE "+PTX" "" _base "${_arch}")
      list(APPEND _PTX_ARCHS "${_base}")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
      list(APPEND _SRC_CUDA_ARCHS "${_base}")
    endif()
  endforeach()
  list(REMOVE_DUPLICATES _PTX_ARCHS)
  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

  # Arch-specific variants (x.0a / x.0f): when the plain x.0 is among the
  # targets, emit the variant directly and exclude x.0 from further matching.
  set(_CUDA_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
    if(_arch MATCHES "[af]$")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
      if("${_base}" IN_LIST TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
      endif()
    endif()
  endforeach()

  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  # For each target arch, pick the highest source arch that is <= it while
  # sharing the same major version (SASS binary compatibility is only forward
  # compatible within a major version); PTX sources may match across majors.
  foreach(_tgt_arch ${_TGT_CUDA_ARCHS})
    set(_best_match)
    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" _tgt_major "${_tgt_arch}")
    foreach(_src_arch ${_SRC_CUDA_ARCHS})
      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" _src_major "${_src_arch}")
      if(_src_arch VERSION_LESS_EQUAL _tgt_arch)
        if(_src_arch IN_LIST _PTX_ARCHS OR _src_major STREQUAL _tgt_major)
          set(_best_match "${_src_arch}")
        endif()
      else()
        # Sources are sorted ascending, so no later entry can match either.
        break()
      endif()
    endforeach()

    if(_best_match)
      list(APPEND _CUDA_ARCHS "${_best_match}")
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHS)

  # Re-attach the "+PTX" suffix to architectures that requested PTX.
  set(_FINAL_ARCHS)
  foreach(_arch ${_CUDA_ARCHS})
    if(_arch IN_LIST _PTX_ARCHS)
      list(APPEND _FINAL_ARCHS "${_arch}+PTX")
    else()
      list(APPEND _FINAL_ARCHS "${_arch}")
    endif()
  endforeach()
  set(_CUDA_ARCHS ${_FINAL_ARCHS})

  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()

#
# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
# `<gfx-arch>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
# The loose intersection is defined as:
#   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
# where `<=` is the version comparison operator.
# In other words, for each version in `TGT_ROCM_ARCHS` find the highest version
# in `SRC_ROCM_ARCHS` that is less or equal to the version in `TGT_ROCM_ARCHS`.
# The result is stored in `OUT_ROCM_ARCHS`.
# NOTE(review): the current implementation computes an exact set intersection,
# not the loose intersection described above — confirm which is intended.
#
# Example:
#   SRC_ROCM_ARCHS="gfx900;gfx906;gfx908;gfx90a"
#   TGT_ROCM_ARCHS="gfx906;gfx908;gfx1030"
#   hip_archs_loose_intersection(OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
#   OUT_ROCM_ARCHS="gfx906;gfx908"
#
function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
  list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)

  # Sort lexicographically so the output order is deterministic. Note this is
  # purely cosmetic: the exact intersection below does not depend on order.
  # (String order is not version order for gfx names — e.g. "gfx1030" sorts
  # before "gfx900" — but no version comparison happens here.)
  list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)

  # Keep every source architecture that is also a target architecture.
  set(_matched)
  foreach(_candidate ${SRC_ROCM_ARCHS})
    if(_candidate IN_LIST TGT_ROCM_ARCHS)
      list(APPEND _matched ${_candidate})
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _matched)
  set(${OUT_ROCM_ARCHS} ${_matched} PARENT_SCOPE)
endfunction()

# Return `CUDA_ARCHS` with every "+PTX" suffix removed, deduplicated, and
# sorted in natural ascending order. The result is stored in `OUT_CUDA_ARCHS`.
function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS)
  set(_stripped "${CUDA_ARCHS}")

  # Replace each "+PTX"-suffixed entry with its bare base architecture.
  foreach(_entry ${CUDA_ARCHS})
    if(_entry MATCHES "\\+PTX$")
      string(REPLACE "+PTX" "" _bare "${_entry}")
      list(REMOVE_ITEM _stripped "${_entry}")
      list(APPEND _stripped "${_bare}")
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _stripped)
  list(SORT _stripped COMPARE NATURAL ORDER ASCENDING)

  set(${OUT_CUDA_ARCHS} ${_stripped} PARENT_SCOPE)
endfunction()



#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# the architectures on a per file basis.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
+# +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") + + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. + # + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) + foreach (_ARCH ${HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) + endif() + endforeach() + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") + endif() + endif() +endmacro() + +# +# Define a target named `GPU_MOD_NAME` for a single extension. The +# arguments are: +# +# DESTINATION - Module destination directory. +# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, +# etc. +# SOURCES - List of source files relative to CMakeLists.txt +# directory. +# +# Optional arguments: +# +# ARCHITECTURES - A list of target GPU architectures in cmake +# format. +# Refer `CMAKE_CUDA_ARCHITECTURES` documentation +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# ARCHITECTURES will use cmake's defaults if +# not provided. +# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - Extra include directories. +# LIBRARIES - Extra link libraries. 
+# WITH_SOABI - Generate library with python SOABI suffix name.
+# USE_SABI - Use python stable api
+#
+# Note: optimization level/debug info is set via cmake build type.
+#
+function (define_gpu_extension_target GPU_MOD_NAME)
+  cmake_parse_arguments(PARSE_ARGV 1
+    GPU
+    "WITH_SOABI"
+    "DESTINATION;LANGUAGE;USE_SABI"
+    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
+
+  # Add hipify preprocessing step when building with HIP/ROCm.
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  endif()
+
+  if (GPU_WITH_SOABI)
+    set(GPU_WITH_SOABI WITH_SOABI)
+  else()
+    set(GPU_WITH_SOABI)
+  endif()
+
+  if (GPU_USE_SABI)
+    Python3_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  else()
+    Python3_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  endif()
+
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    # Make this target dependent on the hipify preprocessor step.
+    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+  endif()
+
+  if (GPU_ARCHITECTURES)
+    if (GPU_LANGUAGE STREQUAL "HIP")
+      # Clear target architectures, we are passing arch flags per source file.
+      set_property(TARGET ${GPU_MOD_NAME} PROPERTY HIP_ARCHITECTURES off)
+    else()
+      set_target_properties(${GPU_MOD_NAME} PROPERTIES
+        ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+    endif()
+  endif()
+
+  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
+
+  target_compile_options(${GPU_MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+
+  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
+
+  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
+    ${GPU_INCLUDE_DIRECTORIES})
+
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+
+  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
+  # dependencies that are not necessary and may not be installed.
+ if (GPU_LANGUAGE STREQUAL "CUDA") + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart) + else() + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + endif() + + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) +endfunction() + +# Map a GPU language to its backend name. +# +# Arguments: +# OUT_BACKEND - Output variable name for the backend string +# GPU_LANG - The GPU language (CPU, CUDA, HIP, METAL, SYCL) +# +function(gpu_lang_to_backend OUT_BACKEND GPU_LANG) + if (${GPU_LANG} STREQUAL "CPU") + set(_BACKEND "cpu") + elseif (${GPU_LANG} STREQUAL "CUDA") + set(_BACKEND "cuda") + elseif (${GPU_LANG} STREQUAL "HIP") + set(_BACKEND "rocm") + elseif (${GPU_LANG} STREQUAL "METAL") + set(_BACKEND "metal") + elseif (${GPU_LANG} STREQUAL "SYCL") + set(_BACKEND "xpu") + else() + message(FATAL_ERROR "Unsupported GPU_LANG: ${GPU_LANG}") + endif() + + set(${OUT_BACKEND} "${_BACKEND}" PARENT_SCOPE) +endfunction() diff --git a/kernels-v1/attention-int8/compat.py b/kernels-v1/attention-int8/compat.py new file mode 100644 index 0000000..03dbc1a --- /dev/null +++ b/kernels-v1/attention-int8/compat.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/kernels-v1/attention-int8/metadata-cpu.json b/kernels-v1/attention-int8/metadata-cpu.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-cpu.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-cuda.json b/kernels-v1/attention-int8/metadata-cuda.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-cuda.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-metal.json b/kernels-v1/attention-int8/metadata-metal.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-metal.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-neuron.json b/kernels-v1/attention-int8/metadata-neuron.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-neuron.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-rocm.json b/kernels-v1/attention-int8/metadata-rocm.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ 
b/kernels-v1/attention-int8/metadata-rocm.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-xpu.json b/kernels-v1/attention-int8/metadata-xpu.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-xpu.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/pyproject.toml b/kernels-v1/attention-int8/pyproject.toml new file mode 100644 index 0000000..0a60c0a --- /dev/null +++ b/kernels-v1/attention-int8/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "attention_int8" +version = "0.1.0" +requires-python = ">=3.9" + +[build-system] +requires = [ + "cmake>=3.26", + "ninja", + "packaging", + "setuptools>=61", + "torch", + "wheel", + +] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] + +[tool.pytest.ini_options] +markers = [ + "kernels_ci: mark a test as a kernel CI test" +] \ No newline at end of file diff --git a/kernels-v1/attention-int8/setup.py b/kernels-v1/attention-int8/setup.py new file mode 100644 index 0000000..d9c82fe --- /dev/null +++ b/kernels-v1/attention-int8/setup.py @@ -0,0 +1,157 @@ +import logging +import os +from shutil import which, move +import subprocess +import sys +from pathlib import Path + +from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext + +logger = logging.getLogger(__name__) + + +def get_backend() -> str: + """Detect the backend by inspecting torch.""" + import torch + + if torch.version.cuda is not None: + return "cuda" + elif torch.version.hip is not None: + return "rocm" + elif torch.backends.mps.is_available(): + return "metal" + elif hasattr(torch.version, "xpu") and torch.version.xpu is not None: + return "xpu" + else: + return "cpu" + + +def is_sccache_available() -> bool: + return which("sccache") is not None + + +def 
is_ccache_available() -> bool: + return which("ccache") is not None + + +def is_ninja_available() -> bool: + return which("ninja") is not None + + +class CMakeExtension(Extension): + def __init__(self, name: str, sourcedir: str = "") -> None: + super().__init__(name, sources=[], py_limited_api=True) + self.sourcedir = os.fspath(Path(sourcedir).resolve()) + + +class CMakeBuild(build_ext): + def build_extension(self, ext: CMakeExtension) -> None: + ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name) + extdir = ext_fullpath.parent.resolve() + + debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug + cfg = "Debug" if debug else "Release" + + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + + # Set Python3_EXECUTABLE instead if you use PYBIND11_FINDPYTHON + # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code + # from Python. + cmake_args = [ + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", + f"-DPython3_EXECUTABLE={sys.executable}", + f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + ] + build_args = [] + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja + + ninja_executable_path = Path(ninja.BIN_DIR) / "ninja" + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + if is_sccache_available(): + cmake_args += [ + "-DCMAKE_C_COMPILER_LAUNCHER=sccache", + "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache", + "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache", + "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache", + "-DCMAKE_OBJC_COMPILER_LAUNCHER=sccache", + "-DCMAKE_OBJCXX_COMPILER_LAUNCHER=sccache", + ] + elif is_ccache_available(): + cmake_args += [ + "-DCMAKE_C_COMPILER_LAUNCHER=ccache", + "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache", + "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache", + "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache", + 
"-DCMAKE_OBJC_COMPILER_LAUNCHER=ccache", + "-DCMAKE_OBJCXX_COMPILER_LAUNCHER=ccache", + ] + + num_jobs = os.getenv("MAX_JOBS", None) + if num_jobs is not None: + num_jobs = int(num_jobs) + logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) + else: + try: + # os.sched_getaffinity() isn't universally available, so fall + # back to os.cpu_count() if we get an error here. + num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + + nvcc_threads = os.getenv("NVCC_THREADS", None) + if nvcc_threads is not None: + nvcc_threads = int(nvcc_threads) + logger.info( + "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads + ) + num_jobs = max(1, num_jobs // nvcc_threads) + cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)] + + build_args += [f"-j{num_jobs}"] + if sys.platform == "win32": + build_args += ["--config", cfg] + + build_temp = Path(self.build_temp) / ext.name + if not build_temp.exists(): + build_temp.mkdir(parents=True) + + subprocess.run( + ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True + ) + subprocess.run( + ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True + ) + + if sys.platform == "win32": + # Move the dylib one folder up for discovery. + for filename in os.listdir(extdir / cfg): + move(extdir / cfg / filename, extdir / filename) + + +backend = get_backend() +ops_name = f"_attention_int8_{backend}_dba582b_dirty" + +setup( + name="attention_int8", + # The version is just a stub, it's not used by the final build artefact. 
+ version="0.1.0", + ext_modules=[CMakeExtension(f"attention_int8.{ops_name}")], + cmdclass={"build_ext": CMakeBuild}, + packages=find_packages(where="torch-ext", include=["attention_int8*"]), + package_dir={"": "torch-ext"}, + zip_safe=False, + install_requires=["torch"], + python_requires=">=3.9", +) \ No newline at end of file diff --git a/kernels-v1/attention-int8/test_simple.py b/kernels-v1/attention-int8/test_simple.py new file mode 100644 index 0000000..18b98c2 --- /dev/null +++ b/kernels-v1/attention-int8/test_simple.py @@ -0,0 +1,25 @@ +import torch +torch.ops.load_library("./torch-ext/attention_int8/_attention_int8_cuda_dba582b_dirty.abi3.so") + +Q = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") +K = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") +V = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") + +# Parag's kernel +O = torch.ops.int8_attn.int8_attention_forward(Q, K, V) + +# PyTorch native attention (reference) +ref = torch.nn.functional.scaled_dot_product_attention(Q, K, V) + +# Compare +diff_mean = (O.float() - ref.float()).abs().mean().item() +diff_max = (O.float() - ref.float()).abs().max().item() + +print(f"Output shape: {O.shape}") +print(f"Mean difference: {diff_mean:.6f}") +print(f"Max difference: {diff_max:.6f}") + +if diff_mean < 0.05: + print("OK - results are correct") +else: + print("PROBLEM - difference too large") \ No newline at end of file diff --git a/kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py b/kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py new file mode 100644 index 0000000..a76c858 --- /dev/null +++ b/kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _attention_int8_cuda_dba582b_dirty +ops = torch.ops._attention_int8_cuda_dba582b_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_attention_int8_cuda_dba582b_dirty::{op_name}" diff --git a/kernels-v1/attention-int8/torch-ext/registration.h b/kernels-v1/attention-int8/torch-ext/registration.h new file mode 100644 index 0000000..19a82cc --- /dev/null +++ b/kernels-v1/attention-int8/torch-ext/registration.h @@ -0,0 +1,30 @@ +// Registration macros from vLLM: +// https://github.com/vllm-project/vllm/blob/main/csrc/core/registration.h + +#pragma once + +#include + +#define _CONCAT(A, B) A##B +#define CONCAT(A, B) _CONCAT(A, B) + +#define _STRINGIFY(A) #A +#define STRINGIFY(A) _STRINGIFY(A) + +// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) + +// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ + TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) + +// REGISTER_EXTENSION allows the shared library to be loaded and initialized +// via python's import statement. 
+#define REGISTER_EXTENSION(NAME) \ + PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ + static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ + STRINGIFY(NAME), nullptr, 0, nullptr}; \ + return PyModule_Create(&module); \ + } From 577bdee14cdff99de49333d66eccfc98a5ec03a4 Mon Sep 17 00:00:00 2001 From: florianmattana Date: Thu, 26 Mar 2026 09:42:32 +0100 Subject: [PATCH 3/3] remove test and metadata files from tracked files --- kernels-v1/attention-int8/compat.py | 26 ------------------- kernels-v1/attention-int8/metadata-cpu.json | 4 --- kernels-v1/attention-int8/metadata-cuda.json | 4 --- kernels-v1/attention-int8/metadata-metal.json | 4 --- .../attention-int8/metadata-neuron.json | 4 --- kernels-v1/attention-int8/metadata-rocm.json | 4 --- kernels-v1/attention-int8/metadata-xpu.json | 4 --- kernels-v1/attention-int8/test_simple.py | 25 ------------------ 8 files changed, 75 deletions(-) delete mode 100644 kernels-v1/attention-int8/compat.py delete mode 100644 kernels-v1/attention-int8/metadata-cpu.json delete mode 100644 kernels-v1/attention-int8/metadata-cuda.json delete mode 100644 kernels-v1/attention-int8/metadata-metal.json delete mode 100644 kernels-v1/attention-int8/metadata-neuron.json delete mode 100644 kernels-v1/attention-int8/metadata-rocm.json delete mode 100644 kernels-v1/attention-int8/metadata-xpu.json delete mode 100644 kernels-v1/attention-int8/test_simple.py diff --git a/kernels-v1/attention-int8/compat.py b/kernels-v1/attention-int8/compat.py deleted file mode 100644 index 03dbc1a..0000000 --- a/kernels-v1/attention-int8/compat.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. 
So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. - path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/kernels-v1/attention-int8/metadata-cpu.json b/kernels-v1/attention-int8/metadata-cpu.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-cpu.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-cuda.json b/kernels-v1/attention-int8/metadata-cuda.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-cuda.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-metal.json b/kernels-v1/attention-int8/metadata-metal.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-metal.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-neuron.json b/kernels-v1/attention-int8/metadata-neuron.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-neuron.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git 
a/kernels-v1/attention-int8/metadata-rocm.json b/kernels-v1/attention-int8/metadata-rocm.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-rocm.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-xpu.json b/kernels-v1/attention-int8/metadata-xpu.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-xpu.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/test_simple.py b/kernels-v1/attention-int8/test_simple.py deleted file mode 100644 index 18b98c2..0000000 --- a/kernels-v1/attention-int8/test_simple.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -torch.ops.load_library("./torch-ext/attention_int8/_attention_int8_cuda_dba582b_dirty.abi3.so") - -Q = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") -K = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") -V = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") - -# Parag's kernel -O = torch.ops.int8_attn.int8_attention_forward(Q, K, V) - -# PyTorch native attention (reference) -ref = torch.nn.functional.scaled_dot_product_attention(Q, K, V) - -# Compare -diff_mean = (O.float() - ref.float()).abs().mean().item() -diff_max = (O.float() - ref.float()).abs().max().item() - -print(f"Output shape: {O.shape}") -print(f"Mean difference: {diff_mean:.6f}") -print(f"Max difference: {diff_max:.6f}") - -if diff_mean < 0.05: - print("OK - results are correct") -else: - print("PROBLEM - difference too large") \ No newline at end of file