vectorch-ai · guocuimi · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
@@ -12,7 +12,7 @@ else
   arch_path='sbsa'
 fi
 
-NVSHMEM_VERSION=3.3.24
+NVSHMEM_VERSION=3.4.5
 
 function install_cuda {
   version=$1
@@ -178,10 +178,10 @@ function install_130 {
   NCCL_VERSION=v2.27.7-1
   CUSPARSELT_VERSION=0.8.0.4_cuda13
 
-  echo "Installing CUDA 12.8.1, cuDNN ${CUDNN_VERSION}, NCCL ${NCCL_VERSION}, NVSHMEM ${NVSHMEM_VERSION} and cuSparseLt ${CUSPARSELT_VERSION}"
+  echo "Installing CUDA 13.0.2, cuDNN ${CUDNN_VERSION}, NCCL ${NCCL_VERSION}, NVSHMEM ${NVSHMEM_VERSION} and cuSparseLt ${CUSPARSELT_VERSION}"
 
   # install CUDA 13.0 in the same container
-  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
+  install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
 
   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
   install_cudnn 13 $CUDNN_VERSION

diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
@@ -24,7 +24,7 @@ jobs:
       fail-fast: false
       matrix:
         python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
-        cuda: ["12.6", "12.8", "12.9"]
+        cuda: ["12.6", "12.8", "13.0"]
         torch: ["2.8.0"]
     runs-on: [self-hosted, linux]
     env:

diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml
@@ -21,7 +21,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda: ["12.6", "12.8", "12.9", "13.0"]
+        cuda: ["12.6", "12.8", "13.0"]
         gcc: ["12"]
     runs-on: [self-hosted, linux]
     steps:

diff --git a/.github/workflows/publish_manylinux_2_28_image.yml b/.github/workflows/publish_manylinux_2_28_image.yml
@@ -21,7 +21,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda: ["12.6", "12.8", "12.9"]
+        cuda: ["12.6", "12.8", "13.0"]
     runs-on: [self-hosted, linux]
     steps:
       - name: Checkout repository

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,6 +7,8 @@ option(USE_MANYLINUX "Build for manylinux" OFF)
 option(BUILD_NVBENCH "Build the nvbench binary" OFF)
 option(INSTALL_PY_MODULE "Install python module to scalellm directory" OFF)
 
+option(BUILD_KERNEL_ONLY "Build only the CUDA kernel library" OFF)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
@@ -92,6 +94,9 @@ message(STATUS "TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}")
 
 # configure vcpkg
 # have to set CMAKE_TOOLCHAIN_FILE before first project call.
+if (NOT BUILD_KERNEL_ONLY)
+  set(VCPKG_MANIFEST_FEATURES service)
+endif()
 if (DEFINED ENV{VCPKG_ROOT} AND NOT DEFINED CMAKE_TOOLCHAIN_FILE)
   set(CMAKE_TOOLCHAIN_FILE "$ENV{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake"
       CACHE STRING "Vcpkg toolchain file")
@@ -121,64 +126,71 @@ project(
   LANGUAGES C CXX CUDA
 )
 
-find_package(CUDAToolkit REQUIRED)
-
 # setup CMake module path, defines path for include() and find_package()
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
-enable_language(Rust)
-find_package(Rust REQUIRED)
-
-# include custom cmake modules
-include(static_analyzers)
-# TODO: can't use sanitizers with CUDA for now.
-# include(sanitizers)
-
-if(UNIX)
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og")
-endif()
+# include current and third_party paths
+list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src)
+list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
 
-find_package(Boost REQUIRED)
-find_package(Threads REQUIRED)
-# find all dependencies from vcpkg
-find_package(fmt CONFIG REQUIRED GLOBAL)
+# find required common packages
+find_package(CUDAToolkit REQUIRED)
 find_package(glog CONFIG REQUIRED)
-find_package(gflags CONFIG REQUIRED)
 find_package(absl CONFIG REQUIRED)
-find_package(Protobuf CONFIG REQUIRED)
-find_package(gRPC CONFIG REQUIRED)
-find_package(re2 CONFIG REQUIRED)
-find_package(folly CONFIG REQUIRED)
 find_package(GTest CONFIG REQUIRED)
-find_package(benchmark CONFIG REQUIRED)
-find_package(nlohmann_json CONFIG REQUIRED)
-find_package(prometheus-cpp CONFIG REQUIRED)
-find_package(RapidJSON CONFIG REQUIRED)
+
+# find packages for service build
+if (NOT BUILD_KERNEL_ONLY)
+  enable_language(Rust)
+  find_package(Rust REQUIRED)
+
+  # include custom cmake modules
+  include(static_analyzers)
+  # TODO: can't use sanitizers with CUDA for now.
+  # include(sanitizers)
+
+  if(UNIX)
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og")
+  endif()
+
+  find_package(Boost REQUIRED)
+  find_package(Threads REQUIRED)
+  # find all dependencies from vcpkg
+  find_package(fmt CONFIG REQUIRED GLOBAL)
+  find_package(gflags CONFIG REQUIRED)
+  find_package(Protobuf CONFIG REQUIRED)
+  find_package(gRPC CONFIG REQUIRED)
+  find_package(re2 CONFIG REQUIRED)
+  find_package(folly CONFIG REQUIRED)
+  find_package(benchmark CONFIG REQUIRED)
+  find_package(nlohmann_json CONFIG REQUIRED)
+  find_package(prometheus-cpp CONFIG REQUIRED)
+  find_package(RapidJSON CONFIG REQUIRED)
+
+  find_package(NCCL REQUIRED)
+
+  find_package(Jemalloc)
+  if(Jemalloc_FOUND)
+    link_libraries(Jemalloc::jemalloc)
+  endif()
+endif()
 
 if (USE_MANYLINUX)
   # manylinux doesn't ship Development.Embed
   find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
 else()
   find_package(Python REQUIRED COMPONENTS Interpreter Development)
 endif()
-
-find_package(NCCL REQUIRED)
-
-find_package(Jemalloc)
-if(Jemalloc_FOUND)
-  link_libraries(Jemalloc::jemalloc)
-endif()
-
 # Important Note: Always invoke find_package for other dependencies
 # before including libtorch, as doing so afterwards may lead to
 # unexpected linker errors.
 if (DEFINED ENV{LIBTORCH_ROOT})
   find_package(Torch REQUIRED HINTS "$ENV{LIBTORCH_ROOT}")
   message(STATUS "Using libtorch at $ENV{LIBTORCH_ROOT}")
 else()
-  SET(TORCH_VERSION "2.8.0")
+  SET(TORCH_VERSION "2.9.0")
   include(FetchContent)
-  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
-    set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu129/libtorch-shared-with-deps-${TORCH_VERSION}%2Bcu129.zip")
+  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu130/libtorch-shared-with-deps-${TORCH_VERSION}%2Bcu130.zip")
   elseif (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
     set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu128/libtorch-shared-with-deps-${TORCH_VERSION}%2Bcu128.zip")
   elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.6)
@@ -235,12 +247,10 @@ message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
 include(CTest)
 include(GoogleTest)
 
-# include current path
-list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src)
-list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
-
 # add subdirectories
-add_subdirectory(proto)
-add_subdirectory(src)
 add_subdirectory(third_party)
-add_subdirectory(scalellm)
+add_subdirectory(src)
+if (NOT BUILD_KERNEL_ONLY)
+  add_subdirectory(proto)
+  add_subdirectory(scalellm)
+endif()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -1,18 +1,22 @@
-add_subdirectory(chat_template)
-add_subdirectory(common)
-add_subdirectory(handlers)
+add_subdirectory(gtest_main)
 add_subdirectory(kernels)
-add_subdirectory(tokenizer)
-add_subdirectory(layers)
-add_subdirectory(models)
-add_subdirectory(model_loader)
-add_subdirectory(model_parallel)
-add_subdirectory(sampling)
-add_subdirectory(request)
-add_subdirectory(memory)
-add_subdirectory(scheduler)
-add_subdirectory(speculative)
-add_subdirectory(engine)
-add_subdirectory(server)
-add_subdirectory(benchmark)
-add_subdirectory(huggingface)
+
+if (NOT BUILD_KERNEL_ONLY)
+  add_subdirectory(chat_template)
+  add_subdirectory(common)
+  add_subdirectory(handlers)
+  add_subdirectory(tokenizer)
+  add_subdirectory(layers)
+  add_subdirectory(models)
+  add_subdirectory(model_loader)
+  add_subdirectory(model_parallel)
+  add_subdirectory(sampling)
+  add_subdirectory(request)
+  add_subdirectory(memory)
+  add_subdirectory(scheduler)
+  add_subdirectory(speculative)
+  add_subdirectory(engine)
+  add_subdirectory(server)
+  add_subdirectory(benchmark)
+  add_subdirectory(huggingface)
+endif()
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
@@ -2,7 +2,7 @@ include(cc_library)
 include(cc_test)
 
 cc_library(
-  NAME 
+  NAME
     common
   HDRS
     macros.h
@@ -28,21 +28,8 @@ cc_library(
     glog::glog
 )
 
-cc_library(
-  TESTONLY
-  NAME
-    gtest_main
-  SRCS
-    gtest_main.cpp
-  DEPS
-    GTest::gtest
-    CUDA::toolkit
-  LINKOPTS
-    cudart
-)
-
 cc_test(
-  NAME 
+  NAME
     common_test
   SRCS
     range_test.cpp
@@ -55,4 +42,3 @@ cc_test(
     :gtest_main
     torch
 )
-
diff --git a/src/gtest_main/CMakeLists.txt b/src/gtest_main/CMakeLists.txt
@@ -0,0 +1,14 @@
+include(cc_library)
+
+cc_library(
+  TESTONLY
+  NAME
+    gtest_main
+  SRCS
+    gtest_main.cpp
+  DEPS
+    GTest::gtest
+    CUDA::toolkit
+  LINKOPTS
+    cudart
+)
diff --git a/src/common/gtest_main.cpp → src/gtest_main/gtest_main.cpp b/src/common/gtest_main.cpp → src/gtest_main/gtest_main.cpp
diff --git a/src/huggingface/src/lib.rs b/src/huggingface/src/lib.rs
@@ -1,6 +1,10 @@
 // A simple C wrapper around the safetensors and tokenizers libraries
 // adapted from https://github.com/huggingface/safetensors/tree/c_bindings
 
+// #[repr(C)]: lay the type out in memory using the C language's data layout
+// extern "C": make the function use the C ABI calling convention
+// #[no_mangle]: keep the function's symbol name exactly as written (no Rust name mangling)
+
 // Import the needed libraries
 use core::ffi::c_uint;
 use core::str::Utf8Error;
@@ -415,7 +419,6 @@ unsafe fn _get_tensor(
     Ok(())
 }
 
-
 // A simple C wrapper of the hf-tokenizers library
 // ported from https://github.com/mlc-ai/tokenizers-cpp
 
@@ -433,12 +436,8 @@ pub struct TokenizerWrapper {
 impl TokenizerWrapper {
     pub fn encode(&mut self, text: &str, add_special_tokens: bool) {
         // Encode the text and store the ids
-        self.encode_ids = Vec::from(
-            self.tokenizer
-                .encode(text, add_special_tokens)
-                .unwrap()
-                .get_ids(),
-        );
+        let encoded = self.tokenizer.encode(text, add_special_tokens).unwrap();
+        self.encode_ids = encoded.get_ids().to_vec();
     }
 
     pub fn decode(&mut self, ids: Vec<u32>, skip_special_tokens: bool) {
@@ -453,11 +452,8 @@ impl TokenizerWrapper {
 
 #[no_mangle]
 extern "C" fn tokenizer_from_file(path: *const c_char) -> *mut TokenizerWrapper {
-    let c_str = unsafe { CStr::from_ptr(path) };
-    let path_str = match c_str.to_str() {
-        Ok(s) => s,
-        Err(_) => panic!("Failed to convert C string to Rust string"),
-    };
+    let c_str = unsafe { CStr::from_ptr(path) }; // borrowed C string
+    let path_str = c_str.to_str().unwrap(); // convert CString to &str
 
     let boxed = Box::new(TokenizerWrapper {
         tokenizer: Tokenizer::from_file(path_str).unwrap().into(),
@@ -466,6 +462,7 @@ extern "C" fn tokenizer_from_file(path: *const c_char) -> *mut TokenizerWrapper
         id_to_token_result: String::new(),
     });
 
+    // Convert into a raw pointer: *mut TokenizerWrapper
     Box::into_raw(boxed)
 }
 
@@ -527,11 +524,7 @@ extern "C" fn tokenizer_free(wrapper: *mut TokenizerWrapper) {
 }
 
 #[no_mangle]
-extern "C" fn tokenizer_token_to_id(
-    handle: *mut TokenizerWrapper,
-    token: *const u8,
-    len: usize
-) {
+extern "C" fn tokenizer_token_to_id(handle: *mut TokenizerWrapper, token: *const u8, len: usize) {
     unsafe {
         let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap();
         let id = (*handle).tokenizer.token_to_id(token);
@@ -564,8 +557,7 @@ extern "C" fn tokenizer_id_to_token(
 #[no_mangle]
 extern "C" fn tokenizer_get_vocab_size(
     handle: *mut TokenizerWrapper,
-    with_added_tokens: bool) -> usize {
-    unsafe {
-        (*handle).get_vocab_size(with_added_tokens)
-    }
+    with_added_tokens: bool,
+) -> usize {
+    unsafe { (*handle).get_vocab_size(with_added_tokens) }
 }
diff --git a/src/kernels/attention/CMakeLists.txt b/src/kernels/attention/CMakeLists.txt
@@ -53,6 +53,7 @@ cc_library(
   DEPS
     :attention.template
     glog::glog
+    torch
 )
 
 cc_test(
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,6 +53,7 @@ cc_library( @@
       DEPS
         :attention.template
         glog::glog
+        torch
     )
     cc_test(
@@ Expand Down @@