From 2630fb196814c543f54e833633f4efdfbedf25cb Mon Sep 17 00:00:00 2001 From: florianmattana Date: Sun, 15 Mar 2026 13:02:42 +0100 Subject: [PATCH 1/3] remove timestep from Q --- kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu index bb5cddd..9c436bb 100644 --- a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu +++ b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu @@ -179,7 +179,7 @@ int8_attention_kernel( } float abs_max_Q = block_reduce_max(lqmax, warp_scr); - const float inv_Q = 127.f / fmaxf(abs_max_Q * ts, 1e-6f); + const float inv_Q = 127.f / fmaxf(abs_max_Q, 1e-6f); const float scl_Q = 1.f / inv_Q; // Quantize Q tile From 1ba12999e2990224c353d638668cb0a01f6ae28e Mon Sep 17 00:00:00 2001 From: florianmattana Date: Mon, 16 Mar 2026 17:54:22 +0100 Subject: [PATCH 2/3] use global K scale before tile loop for consistent quantization --- kernels-v1/attention-int8/CMakeLists.txt | 251 +++++++ .../attention_int8_cuda/attention_int8.cu | 15 +- kernels-v1/attention-int8/cmake/_ops.py.in | 9 + .../cmake/add_gpu_arch_metadata.py | 55 ++ .../attention-int8/cmake/build-variants.cmake | 298 +++++++++ .../attention-int8/cmake/compile-metal.cmake | 104 +++ .../attention-int8/cmake/get_gpu_lang.cmake | 17 + .../attention-int8/cmake/get_gpu_lang.py | 20 + kernels-v1/attention-int8/cmake/hipify.py | 76 +++ kernels-v1/attention-int8/cmake/kernel.cmake | 296 +++++++++ .../cmake/metallib_to_header.py | 73 +++ kernels-v1/attention-int8/cmake/utils.cmake | 620 ++++++++++++++++++ kernels-v1/attention-int8/compat.py | 26 + kernels-v1/attention-int8/metadata-cpu.json | 4 + kernels-v1/attention-int8/metadata-cuda.json | 4 + kernels-v1/attention-int8/metadata-metal.json | 4 + .../attention-int8/metadata-neuron.json | 4 + 
kernels-v1/attention-int8/metadata-rocm.json | 4 + kernels-v1/attention-int8/metadata-xpu.json | 4 + kernels-v1/attention-int8/pyproject.toml | 23 + kernels-v1/attention-int8/setup.py | 157 +++++ kernels-v1/attention-int8/test_simple.py | 25 + .../torch-ext/attention_int8/_ops.py | 9 + .../attention-int8/torch-ext/registration.h | 30 + 24 files changed, 2121 insertions(+), 7 deletions(-) create mode 100644 kernels-v1/attention-int8/CMakeLists.txt create mode 100644 kernels-v1/attention-int8/cmake/_ops.py.in create mode 100644 kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py create mode 100644 kernels-v1/attention-int8/cmake/build-variants.cmake create mode 100644 kernels-v1/attention-int8/cmake/compile-metal.cmake create mode 100644 kernels-v1/attention-int8/cmake/get_gpu_lang.cmake create mode 100644 kernels-v1/attention-int8/cmake/get_gpu_lang.py create mode 100644 kernels-v1/attention-int8/cmake/hipify.py create mode 100644 kernels-v1/attention-int8/cmake/kernel.cmake create mode 100644 kernels-v1/attention-int8/cmake/metallib_to_header.py create mode 100644 kernels-v1/attention-int8/cmake/utils.cmake create mode 100644 kernels-v1/attention-int8/compat.py create mode 100644 kernels-v1/attention-int8/metadata-cpu.json create mode 100644 kernels-v1/attention-int8/metadata-cuda.json create mode 100644 kernels-v1/attention-int8/metadata-metal.json create mode 100644 kernels-v1/attention-int8/metadata-neuron.json create mode 100644 kernels-v1/attention-int8/metadata-rocm.json create mode 100644 kernels-v1/attention-int8/metadata-xpu.json create mode 100644 kernels-v1/attention-int8/pyproject.toml create mode 100644 kernels-v1/attention-int8/setup.py create mode 100644 kernels-v1/attention-int8/test_simple.py create mode 100644 kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py create mode 100644 kernels-v1/attention-int8/torch-ext/registration.h diff --git a/kernels-v1/attention-int8/CMakeLists.txt b/kernels-v1/attention-int8/CMakeLists.txt new file 
mode 100644 index 0000000..09b1633 --- /dev/null +++ b/kernels-v1/attention-int8/CMakeLists.txt @@ -0,0 +1,251 @@ +cmake_minimum_required(VERSION 3.26) + +# Set Intel SYCL compiler before project() call +find_program(ICX_COMPILER icx) +find_program(ICPX_COMPILER icpx) + +if(ICX_COMPILER OR ICPX_COMPILER) + set(CMAKE_C_COMPILER ${ICX_COMPILER}) + + if(WIN32) + set(CMAKE_CXX_COMPILER ${ICX_COMPILER}) + else() + set(CMAKE_CXX_COMPILER ${ICPX_COMPILER}) + endif() +endif() + +project(attention-int8 LANGUAGES CXX) + +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + +include(FetchContent) +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists +message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") + +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/kernel.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/get_gpu_lang.cmake) + +if(DEFINED Python3_EXECUTABLE) + # Allow passing through the interpreter (e.g. from setup.py). 
+ find_package(Python3 COMPONENTS Development Development.SABIModule Interpreter) + if (NOT Python3_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() +else() + find_package(Python3 REQUIRED COMPONENTS Development Development.SABIModule Interpreter) +endif() + +get_gpu_lang(DETECTED_GPU_LANG) +set(GPU_LANG "${DETECTED_GPU_LANG}" CACHE STRING "GPU language") +gpu_lang_to_backend(BACKEND "${GPU_LANG}") +message(STATUS "Using backend: ${BACKEND}, GPU language: ${GPU_LANG}") + +set(KERNEL_REVISION "dba582b_dirty" CACHE STRING "Kernel revision, must be unique") +set(OPS_NAME "_attention_int8_${BACKEND}_dba582b_dirty") + +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + +find_package(Torch REQUIRED) + +run_python(TORCH_VERSION "import torch; print(torch.__version__.split('+')[0])" "Failed to get Torch version") + + + +option(BUILD_ALL_SUPPORTED_ARCHS "Build all supported architectures" off) + +if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + set(CUDA_DEFAULT_KERNEL_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0+PTX") +elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) + set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0+PTX") +else() + set(CUDA_DEFAULT_KERNEL_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0+PTX") +endif() + +# Basic checks for each GPU language. +if(GPU_LANG STREQUAL "CUDA") + if(NOT CUDA_FOUND) + message(FATAL_ERROR "GPU language is set to CUDA, but cannot find CUDA toolkit") + endif() + + + + # This clears out -gencode arguments from `CMAKE_CUDA_FLAGS`, which we need + # to set our own set of capabilities. + clear_gencode_flags() + + # Get the capabilities without +PTX suffixes, so that we can use them as + # the target archs in the loose intersection with a kernel's capabilities. 
+ cuda_remove_ptx_suffixes(CUDA_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + message(STATUS "CUDA supported base architectures: ${CUDA_ARCHS}") + + if(BUILD_ALL_SUPPORTED_ARCHS) + set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + else() + try_run_python(CUDA_KERNEL_ARCHS SUCCESS "import torch; cc=torch.cuda.get_device_capability(); print(f\"{cc[0]}.{cc[1]}\")" "Failed to get CUDA capability") + if(NOT SUCCESS) + message(WARNING "Failed to detect CUDA capability, using default capabilities.") + set(CUDA_KERNEL_ARCHS "${CUDA_DEFAULT_KERNEL_ARCHS}") + endif() + endif() + + message(STATUS "CUDA supported kernel architectures: ${CUDA_KERNEL_ARCHS}") + + if(NVCC_THREADS AND GPU_LANG STREQUAL "CUDA") + list(APPEND GPU_FLAGS "--threads=${NVCC_THREADS}") + endif() + + # TODO: deprecate one of these settings. + add_compile_definitions(USE_CUDA=1) + add_compile_definitions(CUDA_KERNEL) +elseif(GPU_LANG STREQUAL "HIP") + if(NOT HIP_FOUND) + message(FATAL_ERROR "GPU language is set to HIP, but cannot find ROCm toolkit") + endif() + + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. + enable_language(HIP) + + override_gpu_arches(GPU_ARCHES HIP ${HIP_SUPPORTED_ARCHS}) + set(ROCM_ARCHS ${GPU_ARCHES}) + message(STATUS "ROCM supported target architectures: ${ROCM_ARCHS}") + + # TODO: deprecate one of these settings. 
+ add_compile_definitions(USE_ROCM=1) + add_compile_definitions(ROCM_KERNEL) +elseif(GPU_LANG STREQUAL "CPU") + add_compile_definitions(CPU_KERNEL) + set(CMAKE_OSX_DEPLOYMENT_TARGET "15.0" CACHE STRING "Minimum macOS deployment version") +elseif(GPU_LANG STREQUAL "METAL") + set(CMAKE_OSX_DEPLOYMENT_TARGET "26.0" CACHE STRING "Minimum macOS deployment version") + enable_language(C OBJC OBJCXX) + + add_compile_definitions(METAL_KERNEL) + + # Initialize lists for Metal shader sources and their include directories + set(ALL_METAL_SOURCES) + set(METAL_INCLUDE_DIRS) +elseif(GPU_LANG STREQUAL "SYCL") + if(NOT ICX_COMPILER AND NOT ICPX_COMPILER) + message(FATAL_ERROR "Intel SYCL C++ compiler (icpx) and/or C compiler (icx) not found. Please install Intel oneAPI toolkit.") + endif() + + execute_process( + COMMAND ${ICPX_COMPILER} --version + OUTPUT_VARIABLE ICPX_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+" DPCPP_VERSION "${ICPX_VERSION_OUTPUT}") + set(DPCPP_VERSION "${DPCPP_VERSION}" CACHE STRING "DPCPP major.minor version") + + # On Windows, use icx (MSVC-compatible) for C++ to work with Ninja generator + # On Linux, use icpx (GNU-compatible) for C++ + if(WIN32) + message(STATUS "Using Intel SYCL C++ compiler: ${ICX_COMPILER} and C compiler: ${ICX_COMPILER} Version: ${DPCPP_VERSION} (Windows MSVC-compatible mode)") + else() + message(STATUS "Using Intel SYCL C++ compiler: ${ICPX_COMPILER} and C compiler: ${ICX_COMPILER} Version: ${DPCPP_VERSION}") + endif() + + + set(sycl_link_flags "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required';") + set(sycl_flags "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;") + set(GPU_FLAGS "${sycl_flags}") + 
set(GPU_ARCHES "") + + + add_compile_definitions(XPU_KERNEL) + add_compile_definitions(USE_XPU) +else() + message(FATAL_ERROR "Unsupported GPU language: ${GPU_LANG}") +endif() + +# Initialize SRC list for kernel and binding sources +set(SRC "") + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/build-variants.cmake) + +# Generate build variant name. +if(GPU_LANG STREQUAL "CUDA") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cuda" "${CUDA_VERSION}") +elseif(GPU_LANG STREQUAL "HIP") + run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "rocm" "${ROCM_VERSION}") +elseif(GPU_LANG STREQUAL "SYCL") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "xpu" "${DPCPP_VERSION}") +elseif(GPU_LANG STREQUAL "METAL") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "metal" "") +elseif(GPU_LANG STREQUAL "CPU") + generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" "cpu" "") +else() + message(FATAL_ERROR "Cannot generate build name for unknown GPU_LANG: ${GPU_LANG}") +endif() + +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/_ops.py.in + ${CMAKE_CURRENT_SOURCE_DIR}/torch-ext/attention_int8/_ops.py + @ONLY +) + +if(GPU_LANG STREQUAL "CUDA") + get_torch_gpu_compiler_flags(TORCH_GPU_FLAGS ${GPU_LANG}) + list(APPEND GPU_FLAGS ${TORCH_GPU_FLAGS}) +endif() + +set(TORCH_attention-int8_SRC + torch-ext/torch_binding.cpp torch-ext/torch_binding.h +) + + +list(APPEND SRC "${TORCH_attention-int8_SRC}") +cuda_kernel_component(SRC + SOURCES "attention_int8_cuda/attention_int8.cu" + ) +# Include Metal shader compilation utilities if needed +if(GPU_LANG STREQUAL "METAL") + include(${CMAKE_CURRENT_LIST_DIR}/cmake/compile-metal.cmake) +endif() + +# Define the extension target with unified parameters +define_gpu_extension_target( + ${OPS_NAME} + ${OPS_NAME} + DESTINATION ${OPS_NAME} + LANGUAGE 
${GPU_LANG} + SOURCES ${SRC} + COMPILE_FLAGS ${GPU_FLAGS} + ARCHITECTURES ${GPU_ARCHES} + USE_SABI 3 + WITH_SOABI) + +if(NOT (MSVC OR GPU_LANG STREQUAL "SYCL")) + target_link_options(${OPS_NAME} PRIVATE -static-libstdc++) +endif() + +if(GPU_LANG STREQUAL "SYCL") + target_link_options(${OPS_NAME} PRIVATE ${sycl_link_flags}) + target_link_libraries(${OPS_NAME} PRIVATE dnnl) +endif() + +# Compile Metal shaders if any were found +if(GPU_LANG STREQUAL "METAL") + if(ALL_METAL_SOURCES) + compile_metal_shaders(${OPS_NAME} "${ALL_METAL_SOURCES}" "${METAL_INCLUDE_DIRS}") + endif() +endif() + + +# Add kernels_install target for huggingface/kernels library layout +add_kernels_install_target(${OPS_NAME} "attention_int8" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "" + GPU_ARCHS "${ALL_GPU_ARCHS}") + +# Add local_install target for local development with get_local_kernel() +add_local_install_target(${OPS_NAME} "attention_int8" "${BUILD_VARIANT_NAME}" + DATA_EXTENSIONS "" + GPU_ARCHS "${ALL_GPU_ARCHS}") diff --git a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu index 9c436bb..c05f352 100644 --- a/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu +++ b/kernels-v1/attention-int8/attention_int8_cuda/attention_int8.cu @@ -204,17 +204,18 @@ int8_attention_kernel( const float inv_sqrt_d = rsqrtf((float)HEAD_DIM); + float lkmax_global = 0.f; + + for (int i = tid; i < N * HEAD_DIM; i += THREADS) lkmax_global = fmaxf(lkmax_global, fabsf(__half2float(K_head[i]))); + + float abs_max_K_global = block_reduce_max(lkmax_global, warp_scr); + const float inv_K = 127.f / fmaxf(abs_max_K_global * ts, 1e-6f); + const float scl_K = 1.f / inv_K; + // Stream K tiles for (int k_start = 0; k_start < N; k_start += BK) { const int k_size = min(BK, N - k_start); - float lkmax = 0.f; - for (int i = tid; i < k_size * HEAD_DIM; i += THREADS) - lkmax = fmaxf(lkmax, fabsf(__half2float(K_head[k_start * HEAD_DIM + 
i]))); - float abs_max_K = block_reduce_max(lkmax, warp_scr); - const float inv_K = 127.f / fmaxf(abs_max_K * ts, 1e-6f); - const float scl_K = 1.f / inv_K; - // [F5][G1] Fused quantize + transpose K load_and_quantize_K_transposed( K_head + k_start * HEAD_DIM, K_i8_T, k_size, tid, inv_K); diff --git a/kernels-v1/attention-int8/cmake/_ops.py.in b/kernels-v1/attention-int8/cmake/_ops.py.in new file mode 100644 index 0000000..736771e --- /dev/null +++ b/kernels-v1/attention-int8/cmake/_ops.py.in @@ -0,0 +1,9 @@ +import torch +from . import @OPS_NAME@ +ops = torch.ops.@OPS_NAME@ + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"@OPS_NAME@::{op_name}" diff --git a/kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py b/kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py new file mode 100644 index 0000000..93174db --- /dev/null +++ b/kernels-v1/attention-int8/cmake/add_gpu_arch_metadata.py @@ -0,0 +1,55 @@ +import argparse +import json +import sys + + +def main(): + parser = argparse.ArgumentParser( + description="Write a metadata JSON file with GPU architecture information, " + "reading from a source file and writing to a destination." 
+ ) + parser.add_argument( + "input", + help="Path to the source metadata JSON file to read from.", + ) + parser.add_argument( + "destination", + help="Path to write the output metadata JSON file to.", + ) + parser.add_argument( + "--backend", + required=True, + choices=["cuda", "rocm"], + help="GPU backend type.", + ) + parser.add_argument( + "--archs", + required=True, + help="Semicolon-separated list of GPU architectures/capabilities.", + ) + args = parser.parse_args() + + archs = sorted(set(a for a in args.archs.split(";") if a)) + + try: + with open(args.input) as f: + data = json.load(f) + except FileNotFoundError: + print(f"Error: input metadata file not found: {args.input}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: failed to parse input metadata JSON: {e}", file=sys.stderr) + sys.exit(1) + + data["backend"] = { + "type": args.backend, + "archs": archs, + } + + with open(args.destination, "w") as f: + json.dump(data, f, indent=2) + f.write("\n") + + +if __name__ == "__main__": + main() diff --git a/kernels-v1/attention-int8/cmake/build-variants.cmake b/kernels-v1/attention-int8/cmake/build-variants.cmake new file mode 100644 index 0000000..b889772 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/build-variants.cmake @@ -0,0 +1,298 @@ +# Generate a standardized build variant name following the pattern: +# torch-[cxx11-]-- +# +# Arguments: +# OUT_BUILD_NAME - Output variable name +# TORCH_VERSION - PyTorch version (e.g., "2.7.1") +# COMPUTE_FRAMEWORK - One of: cuda, rocm, metal, xpu, cpu +# COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm) +# Optional for CPU-only builds (pass empty string or omit) +# Example output: torch271-cxx11-cu124-x86_64-linux (Linux) +# torch271-cu124-x86_64-windows (Windows) +# torch271-metal-aarch64-darwin (macOS) +# +function(generate_build_name OUT_BUILD_NAME TORCH_VERSION COMPUTE_FRAMEWORK COMPUTE_VERSION) + # Flatten version by removing dots and 
padding to 2 components + string(REPLACE "." ";" VERSION_LIST "${TORCH_VERSION}") + list(LENGTH VERSION_LIST VERSION_COMPONENTS) + + # Pad to at least 2 components + if(VERSION_COMPONENTS LESS 2) + list(APPEND VERSION_LIST "0") + endif() + + # Take first 2 components and join without dots + list(GET VERSION_LIST 0 MAJOR) + list(GET VERSION_LIST 1 MINOR) + set(FLATTENED_TORCH "${MAJOR}${MINOR}") + + # Generate compute string + if(COMPUTE_FRAMEWORK STREQUAL "cuda") + # Flatten CUDA version (e.g., "12.4" -> "124") + string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}") + list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS) + if(COMPUTE_COMPONENTS GREATER_EQUAL 2) + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR) + set(COMPUTE_STRING "cu${COMPUTE_MAJOR}${COMPUTE_MINOR}") + else() + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + set(COMPUTE_STRING "cu${COMPUTE_MAJOR}0") + endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "rocm") + # Flatten ROCm version (e.g., "6.0" -> "60") + string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}") + list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS) + if(COMPUTE_COMPONENTS GREATER_EQUAL 2) + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR) + set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}${COMPUTE_MINOR}") + else() + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + set(COMPUTE_STRING "rocm${COMPUTE_MAJOR}0") + endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "xpu") + # Flatten XPU version (e.g., "2025.2" -> "202552") + string(REPLACE "." 
";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}") + list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS) + if(COMPUTE_COMPONENTS GREATER_EQUAL 2) + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR) + set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}${COMPUTE_MINOR}") + else() + list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR) + set(COMPUTE_STRING "xpu${COMPUTE_MAJOR}0") + endif() + elseif(COMPUTE_FRAMEWORK STREQUAL "metal") + set(COMPUTE_STRING "metal") + elseif(COMPUTE_FRAMEWORK STREQUAL "cpu") + set(COMPUTE_STRING "cpu") + else() + message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}") + endif() + + # Detect from target system (CMAKE_SYSTEM_* variables refer to target, not host) + # Normalize architecture name + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64|AMD64)$") + set(CPU_ARCH "x86_64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64)$") + set(CPU_ARCH "aarch64") + else() + message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}") + endif() + + # Normalize OS name + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(OS_NAME "windows") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(OS_NAME "linux") + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OS_NAME "darwin") + else() + message(WARNING "Unknown OS ${CMAKE_SYSTEM_NAME}, using as-is") + string(TOLOWER "${CMAKE_SYSTEM_NAME}" OS_NAME) + endif() + + set(ARCH_OS_STRING "${CPU_ARCH}-${OS_NAME}") + + # Assemble the final build name + # For Linux, include cxx11 ABI indicator for compatibility + if(ARCH_OS_STRING MATCHES "-linux$") + set(BUILD_NAME "torch${FLATTENED_TORCH}-cxx11-${COMPUTE_STRING}-${ARCH_OS_STRING}") + else() + set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-${ARCH_OS_STRING}") + endif() + + set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE) + message(STATUS "Generated build name: ${BUILD_NAME}") +endfunction() + +# +# Create a custom install target for the huggingface/kernels library layout. 
+# This installs the extension into a directory structure suitable for kernel hub discovery: +# / +# +# Arguments: +# TARGET_NAME - Name of the target to create the install rule for +# PACKAGE_NAME - Python package name (e.g., "activation") +# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux") +# INSTALL_PREFIX - Base installation directory (defaults to CMAKE_INSTALL_PREFIX) +# GPU_ARCHS - List of GPU architectures that were compiled +# (optional; when provided for CUDA/ROCm, metadata.json will include +# a "backend" key with the type and arch list) +# +function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) + set(oneValueArgs INSTALL_PREFIX) + set(multiValueArgs DATA_EXTENSIONS GPU_ARCHS) + cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT ARG_INSTALL_PREFIX) + set(ARG_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + endif() + + gpu_lang_to_backend(_BACKEND ${GPU_LANG}) + + # Always include 'py' extension for Python files + set(ALL_EXTENSIONS ${ARG_DATA_EXTENSIONS}) + list(APPEND ALL_EXTENSIONS "py") + + # Set the installation directory + set(KERNEL_INSTALL_DIR "${ARG_INSTALL_PREFIX}/${BUILD_VARIANT_NAME}") + + message(STATUS "Using PACKAGE_NAME: ${PACKAGE_NAME}") + + # Install the compiled extension using CMake's install() command + # This will be triggered by the standard INSTALL target + install(TARGETS ${TARGET_NAME} + LIBRARY DESTINATION "${KERNEL_INSTALL_DIR}" + RUNTIME DESTINATION "${KERNEL_INSTALL_DIR}" + COMPONENT ${TARGET_NAME}) + + # Install data files with specified extensions + foreach(ext IN LISTS ALL_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + install(FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file}" + DESTINATION 
"${KERNEL_INSTALL_DIR}/${data_file_dir}" + COMPONENT ${TARGET_NAME}) + endforeach() + endforeach() + + message(STATUS "GPU archs: ${ARG_GPU_ARCHS}") + + # Add the GPU archs to matadata.json when applicable. + if((GPU_LANG STREQUAL "CUDA" OR GPU_LANG STREQUAL "HIP") AND ARG_GPU_ARCHS) + list(JOIN ARG_GPU_ARCHS ";" _GPU_ARCHS_STR) + install(CODE " + file(MAKE_DIRECTORY \"${KERNEL_INSTALL_DIR}\") + execute_process( + COMMAND \"${Python3_EXECUTABLE}\" + \"${CMAKE_CURRENT_LIST_DIR}/cmake/add_gpu_arch_metadata.py\" + \"${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json\" + \"${KERNEL_INSTALL_DIR}/metadata.json\" + --backend \"${_BACKEND}\" + --archs \"${_GPU_ARCHS_STR}\" + RESULT_VARIABLE _METADATA_RESULT + ERROR_VARIABLE _METADATA_ERROR + ) + if(NOT _METADATA_RESULT EQUAL 0) + message(WARNING \"Failed to add GPU arch metadata: \${_METADATA_ERROR}\") + endif() + " COMPONENT ${TARGET_NAME}) + else() + install(FILES ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + DESTINATION "${KERNEL_INSTALL_DIR}" + RENAME "metadata.json" + COMPONENT ${TARGET_NAME}) + endif() + + # Compatibility with older kernels and direct Python imports. + install(FILES ${CMAKE_SOURCE_DIR}/compat.py + DESTINATION "${KERNEL_INSTALL_DIR}/${PACKAGE_NAME}" + RENAME "__init__.py" + COMPONENT ${TARGET_NAME}) + + message(STATUS "Added install rules for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}") +endfunction() + +# +# Add install rules for local development with huggingface/kernels. +# This installs the extension into the layout expected by get_local_kernel(): +# ${CMAKE_SOURCE_DIR}/build// +# +# This allows developers to use get_local_kernel() from the kernels library to load +# locally built kernels without needing to publish to the hub. +# +# This uses the standard CMake install() command, so it works with the default +# "install" target that is always available. 
+# +# Arguments: +# TARGET_NAME - Name of the target to create the install rule for +# PACKAGE_NAME - Python package name (e.g., "activation") +# BUILD_VARIANT_NAME - Build variant name (e.g., "torch271-cxx11-cu124-x86_64-linux") +# GPU_ARCHS - List of GPU architectures that were compiled +# (optional; when provided for CUDA/ROCm, metadata.json will include +# a "backend" key with the type and arch list) +# +function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME) + set(oneValueArgs) + set(multiValueArgs DATA_EXTENSIONS GPU_ARCHS) + cmake_parse_arguments(ARG "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + # Always include 'py' extension for Python files + set(ALL_EXTENSIONS ${ARG_DATA_EXTENSIONS}) + list(APPEND ALL_EXTENSIONS "py") + + # Define your local, folder based, installation directory + set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") + # Variant directory is where metadata.json should go (for kernels upload discovery) + set(VARIANT_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}") + + # Create a custom target for local installation + add_custom_target(local_install + COMMENT "Installing files to local directory..." 
+ ) + + gpu_lang_to_backend(_BACKEND ${GPU_LANG}) + + # Copy data files with specified extensions + foreach(ext IN LISTS ALL_EXTENSIONS) + file(GLOB_RECURSE DATA_FILES RELATIVE "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.${ext}") + foreach(data_file IN LISTS DATA_FILES) + get_filename_component(data_file_dir "${data_file}" DIRECTORY) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${LOCAL_INSTALL_DIR}/${data_file_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/${data_file} + ${LOCAL_INSTALL_DIR}/${data_file_dir}/ + COMMENT "Copying ${data_file} to ${LOCAL_INSTALL_DIR}/${data_file_dir}" + ) + endforeach() + endforeach() + + + # Add the GPU archs to matadata.json when applicable. + if((GPU_LANG STREQUAL "CUDA" OR GPU_LANG STREQUAL "HIP") AND ARG_GPU_ARCHS) + list(JOIN ARG_GPU_ARCHS ";" _GPU_ARCHS_STR) + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${VARIANT_DIR} + COMMAND ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_LIST_DIR}/cmake/add_gpu_arch_metadata.py + ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + ${VARIANT_DIR}/metadata.json + --backend ${_BACKEND} + --archs "${_GPU_ARCHS_STR}" + COMMENT "Writing metadata.json with GPU arch info to ${VARIANT_DIR}" + ) + else() + add_custom_command(TARGET local_install POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${VARIANT_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/metadata-${_BACKEND}.json + ${VARIANT_DIR}/metadata.json + COMMENT "Copying metadata.json to ${VARIANT_DIR}" + ) + endif() + + add_custom_command(TARGET local_install POST_BUILD + # Copy the shared library + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + ${LOCAL_INSTALL_DIR}/ + + # Compatibility with older kernels and direct Python imports. 
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${CMAKE_SOURCE_DIR}/compat.py + ${VARIANT_DIR}/${PACKAGE_NAME}/__init__.py + + COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}" + COMMAND_EXPAND_LISTS + ) + + # Create both directories: variant dir for metadata.json, package dir for binaries + file(MAKE_DIRECTORY ${VARIANT_DIR}) + file(MAKE_DIRECTORY ${LOCAL_INSTALL_DIR}) + message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}") +endfunction() diff --git a/kernels-v1/attention-int8/cmake/compile-metal.cmake b/kernels-v1/attention-int8/cmake/compile-metal.cmake new file mode 100644 index 0000000..50d44a2 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/compile-metal.cmake @@ -0,0 +1,104 @@ +# Metal shader compilation function +function(compile_metal_shaders TARGET_NAME METAL_SOURCES EXTRA_INCLUDE_DIRS) + if(NOT DEFINED METAL_TOOLCHAIN) + execute_process( + COMMAND "xcodebuild" "-showComponent" "MetalToolchain" + OUTPUT_VARIABLE FIND_METAL_OUT + RESULT_VARIABLE FIND_METAL_ERROR_CODE + ERROR_VARIABLE FIND_METAL_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT FIND_METAL_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${FIND_METAL_STDERR}") + endif() + + # Extract the Toolchain Search Path value and append Metal.xctoolchain + string(REGEX MATCH "Toolchain Search Path: ([^\n]+)" MATCH_RESULT "${FIND_METAL_OUT}") + set(METAL_TOOLCHAIN "${CMAKE_MATCH_1}/Metal.xctoolchain") + endif() + + # Set Metal compiler flags + set(METAL_FLAGS "-std=metal4.0" "-O2") + + # Output directory for compiled metallib + set(METALLIB_OUTPUT_DIR "${CMAKE_BINARY_DIR}/metallib") + file(MAKE_DIRECTORY ${METALLIB_OUTPUT_DIR}) + + foreach(INC ${EXTRA_INCLUDE_DIRS}) + list(APPEND METAL_FLAGS "-I${INC}") + endforeach() + + # Separate .metal files from .h files and compile .metal files to .air + set(AIR_FILES) + set(METAL_FILES) + set(HEADER_FILES) + + foreach(SOURCE_FILE ${METAL_SOURCES}) + if(SOURCE_FILE MATCHES "\\.metal$") + 
list(APPEND METAL_FILES ${SOURCE_FILE}) + elseif(SOURCE_FILE MATCHES "\\.h$") + list(APPEND HEADER_FILES ${SOURCE_FILE}) + endif() + endforeach() + + foreach(METAL_FILE ${METAL_FILES}) + get_filename_component(METAL_NAME ${METAL_FILE} NAME_WE) + set(AIR_FILE "${CMAKE_BINARY_DIR}/${METAL_NAME}.air") + + # Include header files as dependencies + set(ALL_DEPENDENCIES ${CMAKE_CURRENT_SOURCE_DIR}/${METAL_FILE}) + foreach(HEADER_FILE ${HEADER_FILES}) + list(APPEND ALL_DEPENDENCIES ${CMAKE_CURRENT_SOURCE_DIR}/${HEADER_FILE}) + endforeach() + + add_custom_command( + OUTPUT ${AIR_FILE} + COMMAND "${METAL_TOOLCHAIN}/usr/bin/metal" ${METAL_FLAGS} + -c ${CMAKE_CURRENT_SOURCE_DIR}/${METAL_FILE} + -o ${AIR_FILE} + DEPENDS ${ALL_DEPENDENCIES} + COMMENT "Compiling Metal shader ${METAL_FILE} to ${AIR_FILE}" + VERBATIM + ) + + list(APPEND AIR_FILES ${AIR_FILE}) + endforeach() + + # Link all .air files into a single .metallib + set(METALLIB_FILE "${METALLIB_OUTPUT_DIR}/${TARGET_NAME}.metallib") + add_custom_command( + OUTPUT ${METALLIB_FILE} + COMMAND "${METAL_TOOLCHAIN}/usr/bin/metallib" ${AIR_FILES} + -o ${METALLIB_FILE} + DEPENDS ${AIR_FILES} + COMMENT "Linking Metal library ${METALLIB_FILE}" + VERBATIM + ) + + # Generate C++ header with embedded metallib data + set(METALLIB_HEADER "${CMAKE_BINARY_DIR}/${TARGET_NAME}_metallib.h") + set(METALLIB_TO_HEADER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/metallib_to_header.py") + + add_custom_command( + OUTPUT ${METALLIB_HEADER} + COMMAND ${Python3_EXECUTABLE} ${METALLIB_TO_HEADER_SCRIPT} ${METALLIB_FILE} ${METALLIB_HEADER} ${TARGET_NAME} + DEPENDS ${METALLIB_FILE} ${METALLIB_TO_HEADER_SCRIPT} + COMMENT "Generating embedded Metal library header ${METALLIB_HEADER}" + VERBATIM + ) + + # Create a custom target for the metallib + add_custom_target(${TARGET_NAME}_metallib ALL DEPENDS ${METALLIB_FILE} ${METALLIB_HEADER}) + + # Add dependency to main target + add_dependencies(${TARGET_NAME} ${TARGET_NAME}_metallib) + + # Add the generated header 
to include directories + target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_BINARY_DIR}) + + # Pass the metallib header and namespace as compile definitions + target_compile_definitions(${TARGET_NAME} PRIVATE + EMBEDDED_METALLIB_HEADER="${TARGET_NAME}_metallib.h" + EMBEDDED_METALLIB_NAMESPACE=${TARGET_NAME}_metal + ) +endfunction() diff --git a/kernels-v1/attention-int8/cmake/get_gpu_lang.cmake b/kernels-v1/attention-int8/cmake/get_gpu_lang.cmake new file mode 100644 index 0000000..004f219 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/get_gpu_lang.cmake @@ -0,0 +1,17 @@ +# +# Get the GPU language from Torch. +# +function(get_gpu_lang OUT) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/get_gpu_lang.py" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "Cannot detect GPU language: ${PYTHON_STDERR}") + endif() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() diff --git a/kernels-v1/attention-int8/cmake/get_gpu_lang.py b/kernels-v1/attention-int8/cmake/get_gpu_lang.py new file mode 100644 index 0000000..1eedff7 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/get_gpu_lang.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import sys + +try: + import torch +except ImportError: + print("Torch is required for configuring a kernel build.", file=sys.stderr) + sys.exit(1) + +if torch.version.cuda is not None: + print("CUDA") +elif torch.version.hip is not None: + print("HIP") +elif torch.backends.mps.is_available(): + print("METAL") +elif hasattr(torch.version, "xpu") and torch.version.xpu is not None: + print("SYCL") +else: + print("CPU") diff --git a/kernels-v1/attention-int8/cmake/hipify.py b/kernels-v1/attention-int8/cmake/hipify.py new file mode 100644 index 0000000..a1539c0 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/hipify.py @@ -0,0 +1,76 @@ +#!/usr/bin/env 
python3 +# SPDX-License-Identifier: Apache-2.0 + +# From vLLM: https://github.com/vllm-project/vllm/blob/main/cmake/hipify.py + +# +# A command line tool for running pytorch's hipify preprocessor on CUDA +# source files. +# +# See https://github.com/ROCm/hipify_torch +# and /utils/hipify/hipify_python.py +# + +import argparse +import os +import shutil + +from torch.utils.hipify.hipify_python import hipify + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Project directory where all the source + include files live. + parser.add_argument( + "-p", + "--project_dir", + help="The project directory.", + ) + + # Directory where hipified files are written. + parser.add_argument( + "-o", + "--output_dir", + help="The output directory.", + ) + + # Source files to convert. + parser.add_argument("sources", + help="Source files to hipify.", + nargs="*", + default=[]) + + args = parser.parse_args() + + # Limit include scope to project_dir only + includes = [os.path.join(args.project_dir, '*')] + + # Get absolute path for all source files. + extra_files = [os.path.abspath(s) for s in args.sources] + + # Copy sources from project directory to output directory. + # The directory might already exist to hold object files so we ignore that. + shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) + + hipify_result = hipify(project_directory=args.project_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) + + hipified_sources = [] + for source in args.sources: + s_abs = os.path.abspath(source) + hipified_s_abs = (hipify_result[s_abs].hipified_path if + (s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None) + else s_abs) + hipified_sources.append(hipified_s_abs) + + assert (len(hipified_sources) == len(args.sources)) + + # Print hipified source files. 
+ print("\n".join(hipified_sources)) diff --git a/kernels-v1/attention-int8/cmake/kernel.cmake b/kernels-v1/attention-int8/cmake/kernel.cmake new file mode 100644 index 0000000..454f3e0 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/kernel.cmake @@ -0,0 +1,296 @@ +function(accumulate_gpu_archs OUT_ACC ACC EXTRA_ARCHS) + list(APPEND ACC ${EXTRA_ARCHS}) + list(REMOVE_DUPLICATES ACC) + list(SORT ACC) + set(${OUT_ACC} ${ACC} PARENT_SCOPE) +endfunction() + +function(cuda_kernel_component SRC_VAR) + set(options SUPPORTS_HIPIFY) + set(oneValueArgs CUDA_MINVER) + set(multiValueArgs SOURCES INCLUDES CUDA_CAPABILITIES CUDA_FLAGS CXX_FLAGS HIP_FLAGS ROCM_ARCHS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "cuda_kernel_component: SOURCES argument is required") + endif() + + # Bail out if this component is not supported by the CUDA version. + if(KERNEL_CUDA_MINVER) + if(CUDA_VERSION VERSION_LESS ${KERNEL_CUDA_MINVER}) + return() + endif() + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + if(KERNEL_INCLUDES) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_KERNEL_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + if(GPU_LANG STREQUAL "CUDA") + # Determine CUDA architectures + if(KERNEL_CUDA_CAPABILITIES) + cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}") + else() + set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}") + endif() + message(STATUS "CUDA kernel capabilities: ${_KERNEL_ARCHS}") + set_gencode_flags_for_srcs(SRCS "${_KERNEL_SRC}" CUDA_ARCHS "${_KERNEL_ARCHS}") + + accumulate_gpu_archs(_ALL_GPU_ARCHS "${ALL_GPU_ARCHS}" "${_KERNEL_ARCHS}") + set(ALL_GPU_ARCHS ${_ALL_GPU_ARCHS} PARENT_SCOPE) + + # Apply CUDA-specific compile flags + if(KERNEL_CUDA_FLAGS) + set(_CUDA_FLAGS 
"${KERNEL_CUDA_FLAGS}") + # -static-global-template-stub is not supported on CUDA < 12.8. Remove this + # once we don't support CUDA 12.6 anymore. + if(CUDA_VERSION VERSION_LESS 12.8) + string(REGEX REPLACE "-static-global-template-stub=(true|false)" "" _CUDA_FLAGS "${_CUDA_FLAGS}") + endif() + + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.cu$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${_CUDA_FLAGS}>" + ) + endif() + endforeach() + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) + endforeach() + endif() + + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) + + elseif(GPU_LANG STREQUAL "HIP") + if(NOT KERNEL_SUPPORTS_HIPIFY) + message(WARNING "Kernel does not support HIP") + return() + endif() + + # Apply HIP-specific compile flags + if(KERNEL_HIP_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cu|hip)$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_HIP_FLAGS}>" + ) + endif() + endforeach() + endif() + + # Determine ROCm architectures + if(KERNEL_ROCM_ARCHS) + hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}") + else() + set(_KERNEL_ARCHS "${ROCM_ARCHS}") + endif() + message(STATUS "HIP kernel archs: ${_KERNEL_ARCHS}") + + accumulate_gpu_archs(_ALL_GPU_ARCHS "${ALL_GPU_ARCHS}" "${_KERNEL_ARCHS}") + set(ALL_GPU_ARCHS ${_ALL_GPU_ARCHS} PARENT_SCOPE) + + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cu|hip)$") + foreach(_ARCH ${_KERNEL_ARCHS}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:--offload-arch=${_ARCH}>" + ) + endforeach() + endif() + endforeach() + + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) + endif() +endfunction() + 
+function(xpu_kernel_component SRC_VAR) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES INCLUDES CXX_FLAGS SYCL_FLAGS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "xpu_kernel_component: SOURCES argument is required") + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + # Handle per-file include directories if specified + if(KERNEL_INCLUDES) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_KERNEL_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) + endforeach() + endif() + + # Add SYCL-specific compilation flags for XPU sources + if(KERNEL_SYCL_FLAGS) + # Use kernel-specific SYCL flags + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cpp|cxx|cc)$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_SYCL_FLAGS}>" + ) + endif() + endforeach() + else() + # Use default SYCL flags (from parent scope variable sycl_flags) + foreach(_SRC ${_KERNEL_SRC}) + if(_SRC MATCHES ".*\\.(cpp|cxx|cc)$") + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${sycl_flags}>" + ) + endif() + endforeach() + endif() + + # Append to parent scope SRC variable + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) +endfunction() + +function(cpu_kernel_component SRC_VAR) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES INCLUDES CXX_FLAGS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "cpu_kernel_component: SOURCES argument is 
required") + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + # Handle per-file include directories if specified + if(KERNEL_INCLUDES) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_KERNEL_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS) + foreach(_SRC ${_KERNEL_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) + endforeach() + endif() + + # Append to parent scope SRC variable + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_KERNEL_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) +endfunction() + +function(metal_kernel_component SRC_VAR) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES INCLUDES CXX_FLAGS) + cmake_parse_arguments(KERNEL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT KERNEL_SOURCES) + message(FATAL_ERROR "metal_kernel_component: SOURCES argument is required") + endif() + + set(_KERNEL_SRC ${KERNEL_SOURCES}) + + # Separate Metal shader files from other sources + set(_METAL_SRC) + set(_CPP_SRC) + + foreach(_SRC_FILE IN LISTS _KERNEL_SRC) + if(_SRC_FILE MATCHES "\\.(metal|h)$") + list(APPEND _METAL_SRC ${_SRC_FILE}) + else() + list(APPEND _CPP_SRC ${_SRC_FILE}) + endif() + endforeach() + + # Handle per-file include directories if specified (for C++ sources only) + if(KERNEL_INCLUDES AND _CPP_SRC) + # TODO: check if CLion support this: + # https://youtrack.jetbrains.com/issue/CPP-16510/CLion-does-not-handle-per-file-include-directories + set_source_files_properties( + ${_CPP_SRC} + PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDES}") + endif() + + # Apply CXX-specific compile flags + if(KERNEL_CXX_FLAGS AND _CPP_SRC) + foreach(_SRC ${_CPP_SRC}) + set_property( + SOURCE ${_SRC} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${KERNEL_CXX_FLAGS}>" + ) 
+ endforeach() + endif() + + # Add C++ sources to main source list + if(_CPP_SRC) + set(_TMP_SRC ${${SRC_VAR}}) + list(APPEND _TMP_SRC ${_CPP_SRC}) + set(${SRC_VAR} ${_TMP_SRC} PARENT_SCOPE) + endif() + + # Keep track of Metal sources for later compilation + if(_METAL_SRC) + set(_TMP_METAL ${ALL_METAL_SOURCES}) + list(APPEND _TMP_METAL ${_METAL_SRC}) + set(ALL_METAL_SOURCES ${_TMP_METAL} PARENT_SCOPE) + endif() + + # Keep the includes directory for the Metal sources + if(KERNEL_INCLUDES AND _METAL_SRC) + set(_TMP_METAL_INCLUDES ${METAL_INCLUDE_DIRS}) + list(APPEND _TMP_METAL_INCLUDES ${KERNEL_INCLUDES}) + set(METAL_INCLUDE_DIRS ${_TMP_METAL_INCLUDES} PARENT_SCOPE) + endif() +endfunction() diff --git a/kernels-v1/attention-int8/cmake/metallib_to_header.py b/kernels-v1/attention-int8/cmake/metallib_to_header.py new file mode 100644 index 0000000..82bd252 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/metallib_to_header.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import sys +import os + +def convert_metallib_to_header(metallib_path: str, header_path: str, target_name: str) -> None: + """Convert a metallib binary file to a C++ header with embedded data.""" + + # Read the metallib binary data + with open(metallib_path, 'rb') as f: + data: bytes = f.read() + + # Generate the header content + header_content: str = """// Auto-generated file containing embedded Metal library +#pragma once +#include +#include + +namespace """ + target_name + """_metal { + static const unsigned char metallib_data[] = { +""" + + # Convert binary data to C array format + bytes_per_line: int = 16 + for i in range(0, len(data), bytes_per_line): + chunk: bytes = data[i:i + bytes_per_line] + hex_values: str = ', '.join('0x{:02x}'.format(b) for b in chunk) + header_content += " " + hex_values + "," + if i + bytes_per_line < len(data): + header_content += "\n" + + header_content += """ + }; + static const size_t metallib_data_len = """ + str(len(data)) + """; + + // Convenience function to 
create Metal library from embedded data + inline id createLibrary(id device, NSError** error = nullptr) { + dispatch_data_t libraryData = dispatch_data_create( + metallib_data, + metallib_data_len, + dispatch_get_main_queue(), + ^{ /* No cleanup needed for static data */ }); + + NSError* localError = nil; + id library = [device newLibraryWithData:libraryData error:&localError]; + + if (error) { + *error = localError; + } + + return library; + } +} // namespace """ + target_name + """_metal +""" + + # Write the header file + dir_path: str = os.path.dirname(header_path) + if dir_path: + os.makedirs(dir_path, exist_ok=True) + with open(header_path, 'w') as f: + f.write(header_content) + + print("Generated {} ({} bytes)".format(header_path, len(data))) + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: metallib_to_header.py ") + sys.exit(1) + + metallib_path: str = sys.argv[1] + header_path: str = sys.argv[2] + target_name: str = sys.argv[3] + + convert_metallib_to_header(metallib_path, header_path, target_name) \ No newline at end of file diff --git a/kernels-v1/attention-int8/cmake/utils.cmake b/kernels-v1/attention-int8/cmake/utils.cmake new file mode 100644 index 0000000..005c2e0 --- /dev/null +++ b/kernels-v1/attention-int8/cmake/utils.cmake @@ -0,0 +1,620 @@ +# Vendored from vLLM: +# +# https://github.com/vllm-project/vllm/blob/main/cmake/utils.cmake +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. 
+# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python3_EXECUTABLE ${EXECUTABLE}) + find_package(Python3 COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python3_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(_VER "${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +function (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, `SUCCESS` is set to FALSE. If successful, `SUCCESS` is set to TRUE. 
+# +function (try_run_python OUT SUCCESS EXPR) + execute_process( + COMMAND + "${Python3_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + set(${SUCCESS} FALSE PARENT_SCOPE) + set(${OUT} "" PARENT_SCOPE) + else() + set(${SUCCESS} TRUE PARENT_SCOPE) + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) + endif() +endfunction() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(_PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(NODUP_SRCS ${ORIG_SRCS}) + list(REMOVE_DUPLICATES NODUP_SRCS) + set(SRCS ${NODUP_SRCS}) + set(CXX_SRCS ${NODUP_SRCS}) + list(FILTER SRCS INCLUDE REGEX "\.cu$") + list(FILTER CXX_SRCS EXCLUDE REGEX "\.cu$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. + # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + get_source_file_property(include_dirs "${SRC}" INCLUDE_DIRECTORIES) + get_source_file_property(compile_options "${SRC}" COMPILE_OPTIONS) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + + if(include_dirs) + # Copy over include directories from the original CUDA file. 
+ set_source_files_properties( + ${SRC} + PROPERTIES INCLUDE_DIRECTORIES "${include_dirs}") + endif() + + if(compile_options) + set_source_files_properties( + ${SRC} + PROPERTIES COMPILE_OPTIONS "${compile_options}") + endif() + + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + add_custom_target( + hipify${NAME} + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR} -o ${CMAKE_CURRENT_BINARY_DIR} ${SRCS} + DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + list(APPEND HIP_SRCS ${CXX_SRCS}) + set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) +endfunction() + +# +# Get additional GPU compiler flags from torch. +# +function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) + if (${GPU_LANG} STREQUAL "CUDA") + # + # Get common NVCC flags from torch. + # + run_python(GPU_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND GPU_FLAGS "-DENABLE_FP8") + list(REMOVE_ITEM GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + endif() + + elseif(${GPU_LANG} STREQUAL "HIP") + # + # Get common HIP/HIPCC flags from torch. + # + run_python(GPU_FLAGS + "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + list(APPEND GPU_FLAGS + "-DUSE_ROCM" + "-DENABLE_FP8" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") + + endif() + set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) +endfunction() + +# Macro for converting a `gencode` version number to a cmake version number. 
+macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() + +# +# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS`. +# +# Example: +# CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" +# clear_gencode_flags() +# CMAKE_CUDA_FLAGS="-Wall" +# +macro(clear_gencode_flags) + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) +endmacro() + +# +# Extract unique CUDA architectures from a list of compute capabilities codes in +# the form `[]`, convert them to the form sort +# `.`, dedupes them and then sorts them in ascending order and +# stores them in `OUT_ARCHES`. +# +# Example: +# CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" +# extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS) +# OUT_ARCHES="7.5;...;9.0" +function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS) + set(_CUDA_ARCHES) + foreach(_ARCH ${CUDA_ARCH_FLAGS}) + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) + if (_COMPUTE) + set(_COMPUTE ${CMAKE_MATCH_1}) + endif() + + string_to_ver(_COMPUTE_VER ${_COMPUTE}) + list(APPEND _CUDA_ARCHES ${_COMPUTE_VER}) + endforeach() + + list(REMOVE_DUPLICATES _CUDA_ARCHES) + list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING) + set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE) +endfunction() + +# +# For a specific file set the `-gencode` flag in compile options conditionally +# for the CUDA language. +# +# Example: +# set_gencode_flag_for_srcs( +# SRCS "foo.cu" +# ARCH "compute_75" +# CODE "sm_75") +# adds: "-gencode arch=compute_75,code=sm_75" to the compile options for +# `foo.cu` (only for the CUDA language). 
+# +macro(set_gencode_flag_for_srcs) + set(options) + set(oneValueArgs ARCH CODE) + set(multiValueArgs SRCS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE}) + set_property( + SOURCE ${arg_SRCS} + APPEND PROPERTY + COMPILE_OPTIONS "$<$:${_FLAG}>" + ) + + message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}") +endmacro(set_gencode_flag_for_srcs) + +# +# For a list of source files set the `-gencode` flags in the files specific +# compile options (specifically for the CUDA language). +# +# arguments are: +# SRCS: list of source files +# CUDA_ARCHS: list of CUDA architectures in the form `.[letter]` +# BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built +# for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS +# that is larger than BUILD_PTX_FOR_ARCH. +# +macro(set_gencode_flags_for_srcs) + set(options) + set(oneValueArgs BUILD_PTX_FOR_ARCH) + set(multiValueArgs SRCS CUDA_ARCHS) + cmake_parse_arguments(arg "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + foreach(_ARCH ${arg_CUDA_ARCHS}) + # handle +PTX suffix: generate both sm and ptx codes if requested + string(FIND "${_ARCH}" "+PTX" _HAS_PTX) + if(NOT _HAS_PTX EQUAL -1) + string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}") + string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "compute_${_STRIPPED_ARCH}") + else() + string(REPLACE "." 
"" _STRIPPED_ARCH "${_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + endif() + endforeach() + + if (${arg_BUILD_PTX_FOR_ARCH}) + list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH) + if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH}) + string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_PTX_ARCH}" + CODE "compute_${_PTX_ARCH}") + endif() + endif() +endmacro() + +# +# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form +# `.[letter]` compute the "loose intersection" with the +# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in +# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there +# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the +# architecture in `SRC_CUDA_ARCHS`. +# The loose intersection is defined as: +# { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } +# where `<=` is the version comparison operator. +# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version +# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. +# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is +# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add +# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). +# The result is stored in `OUT_CUDA_ARCHS`. 
#
# Example:
#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
#
# Example With PTX:
#   SRC_CUDA_ARCHS="8.0+PTX"
#   TGT_CUDA_ARCHS="9.0"
#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
#   OUT_CUDA_ARCHS="8.0+PTX"
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
  set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}")
  set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS})

  # Strip any "+PTX" suffix for matching purposes, remembering which base
  # architectures requested PTX so the suffix can be re-attached at the end.
  set(_PTX_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
    if(_arch MATCHES "\\+PTX$")
      string(REPLACE "+PTX" "" _base "${_arch}")
      list(APPEND _PTX_ARCHS "${_base}")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
      list(APPEND _SRC_CUDA_ARCHS "${_base}")
    endif()
  endforeach()
  list(REMOVE_DUPLICATES _PTX_ARCHS)
  list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

  # Arch-specific variants (x.0a / x.0f): when the plain x.0 is among the
  # targets, emit the variant directly and exclude x.0 from further matching.
  set(_CUDA_ARCHS)
  foreach(_arch ${_SRC_CUDA_ARCHS})
    if(_arch MATCHES "[af]$")
      list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
      if("${_base}" IN_LIST TGT_CUDA_ARCHS)
        list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
        list(APPEND _CUDA_ARCHS "${_arch}")
      endif()
    endif()
  endforeach()

  list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  # For each target arch, pick the highest source arch that is <= it while
  # sharing the same major version (SASS binary compatibility is only forward
  # compatible within a major version); PTX sources may match across majors.
  foreach(_tgt_arch ${_TGT_CUDA_ARCHS})
    set(_best_match)
    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" _tgt_major "${_tgt_arch}")
    foreach(_src_arch ${_SRC_CUDA_ARCHS})
      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" _src_major "${_src_arch}")
      if(_src_arch VERSION_LESS_EQUAL _tgt_arch)
        if(_src_arch IN_LIST _PTX_ARCHS OR _src_major STREQUAL _tgt_major)
          set(_best_match "${_src_arch}")
        endif()
      else()
        # Sources are sorted ascending, so no later entry can match either.
        break()
      endif()
    endforeach()

    if(_best_match)
      list(APPEND _CUDA_ARCHS "${_best_match}")
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHS)

  # Re-attach the "+PTX" suffix to architectures that requested PTX.
  set(_FINAL_ARCHS)
  foreach(_arch ${_CUDA_ARCHS})
    if(_arch IN_LIST _PTX_ARCHS)
      list(APPEND _FINAL_ARCHS "${_arch}+PTX")
    else()
      list(APPEND _FINAL_ARCHS "${_arch}")
    endif()
  endforeach()
  set(_CUDA_ARCHS ${_FINAL_ARCHS})

  list(SORT _CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
endfunction()

#
# For the given `SRC_ROCM_ARCHS` list of architecture versions in the form
# `<gfx-arch>` compute the "loose intersection" with the `TGT_ROCM_ARCHS` list.
# The loose intersection is defined as:
#   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
# where `<=` is the version comparison operator.
# In other words, for each version in `TGT_ROCM_ARCHS` find the highest version
# in `SRC_ROCM_ARCHS` that is less or equal to the version in `TGT_ROCM_ARCHS`.
# The result is stored in `OUT_ROCM_ARCHS`.
# NOTE(review): the current implementation computes an exact set intersection,
# not the loose intersection described above — confirm which is intended.
#
# Example:
#   SRC_ROCM_ARCHS="gfx900;gfx906;gfx908;gfx90a"
#   TGT_ROCM_ARCHS="gfx906;gfx908;gfx1030"
#   hip_archs_loose_intersection(OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
#   OUT_ROCM_ARCHS="gfx906;gfx908"
#
function(hip_archs_loose_intersection OUT_ROCM_ARCHS SRC_ROCM_ARCHS TGT_ROCM_ARCHS)
  list(REMOVE_DUPLICATES SRC_ROCM_ARCHS)

  # Sort lexicographically so the output order is deterministic. Note this is
  # purely cosmetic: the exact intersection below does not depend on order.
  # (String order is not version order for gfx names — e.g. "gfx1030" sorts
  # before "gfx900" — but no version comparison happens here.)
  list(SORT SRC_ROCM_ARCHS COMPARE STRING ORDER ASCENDING)

  # Keep every source architecture that is also a target architecture.
  set(_matched)
  foreach(_candidate ${SRC_ROCM_ARCHS})
    if(_candidate IN_LIST TGT_ROCM_ARCHS)
      list(APPEND _matched ${_candidate})
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _matched)
  set(${OUT_ROCM_ARCHS} ${_matched} PARENT_SCOPE)
endfunction()

# Return `CUDA_ARCHS` with every "+PTX" suffix removed, deduplicated, and
# sorted in natural ascending order. The result is stored in `OUT_CUDA_ARCHS`.
function(cuda_remove_ptx_suffixes OUT_CUDA_ARCHS CUDA_ARCHS)
  set(_stripped "${CUDA_ARCHS}")

  # Replace each "+PTX"-suffixed entry with its bare base architecture.
  foreach(_entry ${CUDA_ARCHS})
    if(_entry MATCHES "\\+PTX$")
      string(REPLACE "+PTX" "" _bare "${_entry}")
      list(REMOVE_ITEM _stripped "${_entry}")
      list(APPEND _stripped "${_bare}")
    endif()
  endforeach()

  list(REMOVE_DUPLICATES _stripped)
  list(SORT _stripped COMPARE NATURAL ORDER ASCENDING)

  set(${OUT_CUDA_ARCHS} ${_stripped} PARENT_SCOPE)
endfunction()



#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set
# the architectures on a per file basis.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
+# +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") + + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. + # + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) + foreach (_ARCH ${HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) + endif() + endforeach() + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") + endif() + endif() +endmacro() + +# +# Define a target named `GPU_MOD_NAME` for a single extension. The +# arguments are: +# +# DESTINATION - Module destination directory. +# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, +# etc. +# SOURCES - List of source files relative to CMakeLists.txt +# directory. +# +# Optional arguments: +# +# ARCHITECTURES - A list of target GPU architectures in cmake +# format. +# Refer `CMAKE_CUDA_ARCHITECTURES` documentation +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# ARCHITECTURES will use cmake's defaults if +# not provided. +# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - Extra include directories. +# LIBRARIES - Extra link libraries. 
+# WITH_SOABI - Generate library with python SOABI suffix name.
+# USE_SABI - Use python stable api
+#
+# Note: optimization level/debug info is set via cmake build type.
+#
+function (define_gpu_extension_target GPU_MOD_NAME)
+  cmake_parse_arguments(PARSE_ARGV 1
+    GPU
+    "WITH_SOABI"
+    "DESTINATION;LANGUAGE;USE_SABI"
+    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
+
+  # Add hipify preprocessing step when building with HIP/ROCm.
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  endif()
+
+  if (GPU_WITH_SOABI)
+    set(GPU_WITH_SOABI WITH_SOABI)
+  else()
+    set(GPU_WITH_SOABI)
+  endif()
+
+  if (GPU_USE_SABI)
+    Python3_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  else()
+    Python3_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  endif()
+
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    # Make this target dependent on the hipify preprocessor step.
+    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+  endif()
+
+  if (GPU_ARCHITECTURES)
+    if (GPU_LANGUAGE STREQUAL "HIP")
+      # Clear target architectures, we are passing arch flags per source file.
+      set_property(TARGET ${GPU_MOD_NAME} PROPERTY HIP_ARCHITECTURES off)
+    else()
+      set_target_properties(${GPU_MOD_NAME} PROPERTIES
+        ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+    endif()
+  endif()
+
+  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
+
+  target_compile_options(${GPU_MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+
+  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
+
+  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
+    ${GPU_INCLUDE_DIRECTORIES})
+
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+
+  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
+  # dependencies that are not necessary and may not be installed.
+ if (GPU_LANGUAGE STREQUAL "CUDA") + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart) + else() + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + endif() + + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) +endfunction() + +# Map a GPU language to its backend name. +# +# Arguments: +# OUT_BACKEND - Output variable name for the backend string +# GPU_LANG - The GPU language (CPU, CUDA, HIP, METAL, SYCL) +# +function(gpu_lang_to_backend OUT_BACKEND GPU_LANG) + if (${GPU_LANG} STREQUAL "CPU") + set(_BACKEND "cpu") + elseif (${GPU_LANG} STREQUAL "CUDA") + set(_BACKEND "cuda") + elseif (${GPU_LANG} STREQUAL "HIP") + set(_BACKEND "rocm") + elseif (${GPU_LANG} STREQUAL "METAL") + set(_BACKEND "metal") + elseif (${GPU_LANG} STREQUAL "SYCL") + set(_BACKEND "xpu") + else() + message(FATAL_ERROR "Unsupported GPU_LANG: ${GPU_LANG}") + endif() + + set(${OUT_BACKEND} "${_BACKEND}" PARENT_SCOPE) +endfunction() diff --git a/kernels-v1/attention-int8/compat.py b/kernels-v1/attention-int8/compat.py new file mode 100644 index 0000000..03dbc1a --- /dev/null +++ b/kernels-v1/attention-int8/compat.py @@ -0,0 +1,26 @@ +import ctypes +import sys + +import importlib +from pathlib import Path +from types import ModuleType + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/kernels-v1/attention-int8/metadata-cpu.json b/kernels-v1/attention-int8/metadata-cpu.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-cpu.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-cuda.json b/kernels-v1/attention-int8/metadata-cuda.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-cuda.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-metal.json b/kernels-v1/attention-int8/metadata-metal.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-metal.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-neuron.json b/kernels-v1/attention-int8/metadata-neuron.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-neuron.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-rocm.json b/kernels-v1/attention-int8/metadata-rocm.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ 
b/kernels-v1/attention-int8/metadata-rocm.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-xpu.json b/kernels-v1/attention-int8/metadata-xpu.json new file mode 100644 index 0000000..9cf5dee --- /dev/null +++ b/kernels-v1/attention-int8/metadata-xpu.json @@ -0,0 +1,4 @@ +{ + "version": 1, + "python-depends": [] +} \ No newline at end of file diff --git a/kernels-v1/attention-int8/pyproject.toml b/kernels-v1/attention-int8/pyproject.toml new file mode 100644 index 0000000..0a60c0a --- /dev/null +++ b/kernels-v1/attention-int8/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "attention_int8" +version = "0.1.0" +requires-python = ">=3.9" + +[build-system] +requires = [ + "cmake>=3.26", + "ninja", + "packaging", + "setuptools>=61", + "torch", + "wheel", + +] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] + +[tool.pytest.ini_options] +markers = [ + "kernels_ci: mark a test as a kernel CI test" +] \ No newline at end of file diff --git a/kernels-v1/attention-int8/setup.py b/kernels-v1/attention-int8/setup.py new file mode 100644 index 0000000..d9c82fe --- /dev/null +++ b/kernels-v1/attention-int8/setup.py @@ -0,0 +1,157 @@ +import logging +import os +from shutil import which, move +import subprocess +import sys +from pathlib import Path + +from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext + +logger = logging.getLogger(__name__) + + +def get_backend() -> str: + """Detect the backend by inspecting torch.""" + import torch + + if torch.version.cuda is not None: + return "cuda" + elif torch.version.hip is not None: + return "rocm" + elif torch.backends.mps.is_available(): + return "metal" + elif hasattr(torch.version, "xpu") and torch.version.xpu is not None: + return "xpu" + else: + return "cpu" + + +def is_sccache_available() -> bool: + return which("sccache") is not None + + +def 
is_ccache_available() -> bool: + return which("ccache") is not None + + +def is_ninja_available() -> bool: + return which("ninja") is not None + + +class CMakeExtension(Extension): + def __init__(self, name: str, sourcedir: str = "") -> None: + super().__init__(name, sources=[], py_limited_api=True) + self.sourcedir = os.fspath(Path(sourcedir).resolve()) + + +class CMakeBuild(build_ext): + def build_extension(self, ext: CMakeExtension) -> None: + ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name) + extdir = ext_fullpath.parent.resolve() + + debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug + cfg = "Debug" if debug else "Release" + + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + + # Set Python3_EXECUTABLE instead if you use PYBIND11_FINDPYTHON + # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code + # from Python. + cmake_args = [ + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", + f"-DPython3_EXECUTABLE={sys.executable}", + f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + ] + build_args = [] + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja + + ninja_executable_path = Path(ninja.BIN_DIR) / "ninja" + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + if is_sccache_available(): + cmake_args += [ + "-DCMAKE_C_COMPILER_LAUNCHER=sccache", + "-DCMAKE_CXX_COMPILER_LAUNCHER=sccache", + "-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache", + "-DCMAKE_HIP_COMPILER_LAUNCHER=sccache", + "-DCMAKE_OBJC_COMPILER_LAUNCHER=sccache", + "-DCMAKE_OBJCXX_COMPILER_LAUNCHER=sccache", + ] + elif is_ccache_available(): + cmake_args += [ + "-DCMAKE_C_COMPILER_LAUNCHER=ccache", + "-DCMAKE_CXX_COMPILER_LAUNCHER=ccache", + "-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache", + "-DCMAKE_HIP_COMPILER_LAUNCHER=ccache", + 
"-DCMAKE_OBJC_COMPILER_LAUNCHER=ccache", + "-DCMAKE_OBJCXX_COMPILER_LAUNCHER=ccache", + ] + + num_jobs = os.getenv("MAX_JOBS", None) + if num_jobs is not None: + num_jobs = int(num_jobs) + logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) + else: + try: + # os.sched_getaffinity() isn't universally available, so fall + # back to os.cpu_count() if we get an error here. + num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + + nvcc_threads = os.getenv("NVCC_THREADS", None) + if nvcc_threads is not None: + nvcc_threads = int(nvcc_threads) + logger.info( + "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads + ) + num_jobs = max(1, num_jobs // nvcc_threads) + cmake_args += ["-DNVCC_THREADS={}".format(nvcc_threads)] + + build_args += [f"-j{num_jobs}"] + if sys.platform == "win32": + build_args += ["--config", cfg] + + build_temp = Path(self.build_temp) / ext.name + if not build_temp.exists(): + build_temp.mkdir(parents=True) + + subprocess.run( + ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True + ) + subprocess.run( + ["cmake", "--build", ".", *build_args], cwd=build_temp, check=True + ) + + if sys.platform == "win32": + # Move the dylib one folder up for discovery. + for filename in os.listdir(extdir / cfg): + move(extdir / cfg / filename, extdir / filename) + + +backend = get_backend() +ops_name = f"_attention_int8_{backend}_dba582b_dirty" + +setup( + name="attention_int8", + # The version is just a stub, it's not used by the final build artefact. 
+ version="0.1.0", + ext_modules=[CMakeExtension(f"attention_int8.{ops_name}")], + cmdclass={"build_ext": CMakeBuild}, + packages=find_packages(where="torch-ext", include=["attention_int8*"]), + package_dir={"": "torch-ext"}, + zip_safe=False, + install_requires=["torch"], + python_requires=">=3.9", +) \ No newline at end of file diff --git a/kernels-v1/attention-int8/test_simple.py b/kernels-v1/attention-int8/test_simple.py new file mode 100644 index 0000000..18b98c2 --- /dev/null +++ b/kernels-v1/attention-int8/test_simple.py @@ -0,0 +1,25 @@ +import torch +torch.ops.load_library("./torch-ext/attention_int8/_attention_int8_cuda_dba582b_dirty.abi3.so") + +Q = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") +K = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") +V = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") + +# Parag's kernel +O = torch.ops.int8_attn.int8_attention_forward(Q, K, V) + +# PyTorch native attention (reference) +ref = torch.nn.functional.scaled_dot_product_attention(Q, K, V) + +# Compare +diff_mean = (O.float() - ref.float()).abs().mean().item() +diff_max = (O.float() - ref.float()).abs().max().item() + +print(f"Output shape: {O.shape}") +print(f"Mean difference: {diff_mean:.6f}") +print(f"Max difference: {diff_max:.6f}") + +if diff_mean < 0.05: + print("OK - results are correct") +else: + print("PROBLEM - difference too large") \ No newline at end of file diff --git a/kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py b/kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py new file mode 100644 index 0000000..a76c858 --- /dev/null +++ b/kernels-v1/attention-int8/torch-ext/attention_int8/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _attention_int8_cuda_dba582b_dirty +ops = torch.ops._attention_int8_cuda_dba582b_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_attention_int8_cuda_dba582b_dirty::{op_name}" diff --git a/kernels-v1/attention-int8/torch-ext/registration.h b/kernels-v1/attention-int8/torch-ext/registration.h new file mode 100644 index 0000000..19a82cc --- /dev/null +++ b/kernels-v1/attention-int8/torch-ext/registration.h @@ -0,0 +1,30 @@ +// Registration macros from vLLM: +// https://github.com/vllm-project/vllm/blob/main/csrc/core/registration.h + +#pragma once + +#include + +#define _CONCAT(A, B) A##B +#define CONCAT(A, B) _CONCAT(A, B) + +#define _STRINGIFY(A) #A +#define STRINGIFY(A) _STRINGIFY(A) + +// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) + +// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ + TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) + +// REGISTER_EXTENSION allows the shared library to be loaded and initialized +// via python's import statement. 
+#define REGISTER_EXTENSION(NAME) \ + PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ + static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ + STRINGIFY(NAME), nullptr, 0, nullptr}; \ + return PyModule_Create(&module); \ + } From 577bdee14cdff99de49333d66eccfc98a5ec03a4 Mon Sep 17 00:00:00 2001 From: florianmattana Date: Thu, 26 Mar 2026 09:42:32 +0100 Subject: [PATCH 3/3] remove test and metadata files from tracked files --- kernels-v1/attention-int8/compat.py | 26 ------------------- kernels-v1/attention-int8/metadata-cpu.json | 4 --- kernels-v1/attention-int8/metadata-cuda.json | 4 --- kernels-v1/attention-int8/metadata-metal.json | 4 --- .../attention-int8/metadata-neuron.json | 4 --- kernels-v1/attention-int8/metadata-rocm.json | 4 --- kernels-v1/attention-int8/metadata-xpu.json | 4 --- kernels-v1/attention-int8/test_simple.py | 25 ------------------ 8 files changed, 75 deletions(-) delete mode 100644 kernels-v1/attention-int8/compat.py delete mode 100644 kernels-v1/attention-int8/metadata-cpu.json delete mode 100644 kernels-v1/attention-int8/metadata-cuda.json delete mode 100644 kernels-v1/attention-int8/metadata-metal.json delete mode 100644 kernels-v1/attention-int8/metadata-neuron.json delete mode 100644 kernels-v1/attention-int8/metadata-rocm.json delete mode 100644 kernels-v1/attention-int8/metadata-xpu.json delete mode 100644 kernels-v1/attention-int8/test_simple.py diff --git a/kernels-v1/attention-int8/compat.py b/kernels-v1/attention-int8/compat.py deleted file mode 100644 index 03dbc1a..0000000 --- a/kernels-v1/attention-int8/compat.py +++ /dev/null @@ -1,26 +0,0 @@ -import ctypes -import sys - -import importlib -from pathlib import Path -from types import ModuleType - -def _import_from_path(file_path: Path) -> ModuleType: - # We cannot use the module name as-is, after adding it to `sys.modules`, - # it would also be used for other imports. 
So, we make a module name that - # depends on the path for it to be unique using the hex-encoded hash of - # the path. - path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) - module_name = path_hash - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ImportError(f"Cannot load spec for {module_name} from {file_path}") - module = importlib.util.module_from_spec(spec) - if module is None: - raise ImportError(f"Cannot load module {module_name} from spec") - sys.modules[module_name] = module - spec.loader.exec_module(module) # type: ignore - return module - - -globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/kernels-v1/attention-int8/metadata-cpu.json b/kernels-v1/attention-int8/metadata-cpu.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-cpu.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-cuda.json b/kernels-v1/attention-int8/metadata-cuda.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-cuda.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-metal.json b/kernels-v1/attention-int8/metadata-metal.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-metal.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-neuron.json b/kernels-v1/attention-int8/metadata-neuron.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-neuron.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git 
a/kernels-v1/attention-int8/metadata-rocm.json b/kernels-v1/attention-int8/metadata-rocm.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-rocm.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/metadata-xpu.json b/kernels-v1/attention-int8/metadata-xpu.json deleted file mode 100644 index 9cf5dee..0000000 --- a/kernels-v1/attention-int8/metadata-xpu.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "version": 1, - "python-depends": [] -} \ No newline at end of file diff --git a/kernels-v1/attention-int8/test_simple.py b/kernels-v1/attention-int8/test_simple.py deleted file mode 100644 index 18b98c2..0000000 --- a/kernels-v1/attention-int8/test_simple.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -torch.ops.load_library("./torch-ext/attention_int8/_attention_int8_cuda_dba582b_dirty.abi3.so") - -Q = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") -K = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") -V = torch.randn(1, 8, 2048, 64, dtype=torch.float16, device="cuda") - -# Parag's kernel -O = torch.ops.int8_attn.int8_attention_forward(Q, K, V) - -# PyTorch native attention (reference) -ref = torch.nn.functional.scaled_dot_product_attention(Q, K, V) - -# Compare -diff_mean = (O.float() - ref.float()).abs().mean().item() -diff_max = (O.float() - ref.float()).abs().max().item() - -print(f"Output shape: {O.shape}") -print(f"Mean difference: {diff_mean:.6f}") -print(f"Max difference: {diff_max:.6f}") - -if diff_mean < 0.05: - print("OK - results are correct") -else: - print("PROBLEM - difference too large") \ No newline at end of file