From e939e159a39b2216ef13c20b222258157ed3fed8 Mon Sep 17 00:00:00 2001
From: Keita Morisaki <kmta1236@gmail.com>
Date: Thu, 19 Mar 2026 14:57:09 +0900
Subject: [PATCH 1/2] feat(hesai): add CUDA-accelerated point cloud decoder for
 Hesai LiDAR

Add a GPU decode path for Hesai LiDAR sensors, gated behind compile-time
BUILD_CUDA=ON and runtime NEBULA_USE_CUDA=1 environment variable.

The implementation includes:
- CUDA kernel for batched point cloud decoding (hesai_cuda_kernels.cu)
- Angle LUT upload and GPU scan buffer management in hesai_decoder.hpp
- GPU-vs-CPU equivalence tests for OT128 (Pandar128E4X) sensor

The GPU path processes an entire scan in a single kernel launch, using
pre-computed angle lookup tables and a sparse output buffer. When CUDA
is not available or NEBULA_USE_CUDA is unset, the existing CPU path is
used with zero overhead.
---
 src/nebula_hesai/nebula_hesai/CMakeLists.txt  |  13 +
 .../tests/hesai_cuda_decoder_test.cpp         | 278 ++++++++
 .../nebula_hesai_decoders/CMakeLists.txt      |  90 ++-
 .../cuda/hesai_cuda_decoder.hpp               | 156 ++++
 .../angle_corrector_calibration_based.hpp     |  39 +
 .../angle_corrector_correction_based.hpp      |  34 +
 .../decoders/hesai_decoder.hpp                | 668 +++++++++++++++++-
 .../decoders/hesai_sensor.hpp                 |   4 +
 .../src/cuda/hesai_cuda_kernels.cu            | 398 +++++++++++
 9 files changed, 1678 insertions(+), 2 deletions(-)
 create mode 100644 src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
 create mode 100644 src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp
 create mode 100644 src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu

diff --git a/src/nebula_hesai/nebula_hesai/CMakeLists.txt b/src/nebula_hesai/nebula_hesai/CMakeLists.txt
index 7b32b9270..43ac9b123 100644
--- a/src/nebula_hesai/nebula_hesai/CMakeLists.txt
+++ b/src/nebula_hesai/nebula_hesai/CMakeLists.txt
@@ -117,6 +117,19 @@ if(BUILD_TESTING)
     nebula_hesai_decoders::nebula_hesai_decoders
     diagnostic_updater::diagnostic_updater
     pcl_io)
+
+  # CUDA decoder equivalence test (only when CUDA support is available)
+  ament_add_gtest(hesai_cuda_decoder_test_main
+                  tests/hesai_cuda_decoder_test.cpp)
+  target_include_directories(hesai_cuda_decoder_test_main PRIVATE tests)
+  target_link_libraries(
+    hesai_cuda_decoder_test_main
+    hesai_ros_decoder_test
+    nebula_core_common::nebula_core_common
+    nebula_hesai_common::nebula_hesai_common
+    nebula_hesai_decoders::nebula_hesai_decoders
+    diagnostic_updater::diagnostic_updater
+    pcl_io)
 endif()
 
 if(BUILD_EXAMPLES)
diff --git a/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp b/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
new file mode 100644
index 000000000..879dab900
--- /dev/null
+++ b/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
@@ -0,0 +1,278 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// @file hesai_cuda_decoder_test.cpp
+/// @brief GPU-vs-CPU equivalence tests for the Hesai CUDA decoder (OT128 / Pandar128E4X).
+///
+/// Decodes the same rosbag with the CPU path (NEBULA_USE_CUDA unset) and the GPU path
+/// (NEBULA_USE_CUDA=1), then compares the resulting point clouds with tolerances.
+/// The GPU path differs at scan boundaries due to different overlap/FOV detection logic
+/// in the GPU kernel vs the CPU's ScanCutter.
+///
+/// Tolerances were derived from a single OT128 rosbag (ot128/1730271167765338806):
+///   - Scan 0: CPU=70275, GPU=72125 (diff=1850)
+///   - Scan 1: CPU=72240, GPU=72185 (diff=55)
+/// If additional rosbags show larger boundary diffs, tolerances may need adjustment.
+
+#include "hesai_ros_decoder_test.hpp"
+
+#include <rclcpp/rclcpp.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#ifdef NEBULA_CUDA_ENABLED
+
+namespace nebula::test
+{
+
+// OT128 config — matches TEST_CONFIGS[8] in hesai_ros_decoder_test_main.cpp
+static const nebula::ros::HesaiRosDecoderTestParams OT128_CONFIG = {
+  "Pandar128E4X", "LastStrongest", "Pandar128E4X.csv", "ot128/1730271167765338806",
+  "hesai", 0, 0.0, 0., 360., 0.3f, 300.f};
+
+// Maximum allowed difference in point count between GPU and CPU per scan.
+// The GPU kernel assigns scan-boundary points differently than the CPU's ScanCutter,
+// causing up to ~2000 points to shift between adjacent scans.
+// Observed max diff: 1850 (scan 0 of test rosbag). Set to 2000 for ~8% headroom.
+static constexpr int kMaxPointCountDiff = 2000;
+// Coordinate tolerance (metres) for nearest-neighbour matching.
+// GPU uses pre-computed LUT with 0.01-degree resolution; small rounding differences expected.
+static constexpr float kXyzTolerance = 5e-3f;
+// Fraction of GPU points that must have a CPU match within tolerance.
+// ~2-3% of points are at scan boundaries and may not have an exact match.
+static constexpr double kMinMatchRatio = 0.97;
+
+/// Decoded scan: message timestamp + point cloud
+struct DecodedScan
+{
+  uint64_t msg_timestamp;
+  nebula::drivers::NebulaPointCloudPtr cloud;
+};
+
+/// Decode a rosbag with the given CUDA env setting.
+/// When @p use_cuda is true, sets NEBULA_USE_CUDA=1; otherwise unsets it.
+/// Returns one DecodedScan per completed scan.
+static std::vector<DecodedScan> decode_bag(
+  const nebula::ros::HesaiRosDecoderTestParams & params, bool use_cuda)
+{
+  // Toggle GPU/CPU path via environment variable
+  if (use_cuda) {
+    setenv("NEBULA_USE_CUDA", "1", 1);
+  } else {
+    unsetenv("NEBULA_USE_CUDA");
+  }
+
+  rclcpp::NodeOptions options;
+  auto driver = std::make_shared<nebula::ros::HesaiRosDecoderTest>(
+    options, "cuda_test_node", params);
+  EXPECT_EQ(driver->get_status(), nebula::Status::OK);
+
+  std::vector<DecodedScan> scans;
+  auto cb = [&](uint64_t msg_ts, uint64_t /*scan_ts*/,
+                nebula::drivers::NebulaPointCloudPtr cloud) {
+    if (cloud && !cloud->empty()) {
+      // Deep-copy: the decoder reuses its internal buffer after the callback returns
+      auto copy = std::make_shared<nebula::drivers::NebulaPointCloud>(*cloud);
+      scans.push_back({msg_ts, copy});
+    }
+  };
+  driver->read_bag(cb);
+  return scans;
+}
+
+/// Squared Euclidean distance between two points
+static float sq_dist(
+  const nebula::drivers::NebulaPoint & a, const nebula::drivers::NebulaPoint & b)
+{
+  float dx = a.x - b.x;
+  float dy = a.y - b.y;
+  float dz = a.z - b.z;
+  return dx * dx + dy * dy + dz * dz;
+}
+
+// ---------------------------------------------------------------------------
+// Test 1: GPU vs CPU equivalence
+// ---------------------------------------------------------------------------
+TEST(HesaiCudaDecoderTest, OT128_GpuVsCpuEquivalence)
+{
+  auto cpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/false);
+  auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
+
+  // Both paths must produce the same number of scans
+  ASSERT_GT(cpu_scans.size(), 0u);
+  ASSERT_EQ(cpu_scans.size(), gpu_scans.size())
+    << "CPU produced " << cpu_scans.size() << " scans, GPU produced " << gpu_scans.size();
+
+  const float tol_sq = kXyzTolerance * kXyzTolerance;
+
+  for (size_t i = 0; i < cpu_scans.size(); ++i) {
+    const auto & cpu_cloud = cpu_scans[i].cloud;
+    const auto & gpu_cloud = gpu_scans[i].cloud;
+
+    // Point counts within tolerance
+    int diff = static_cast<int>(cpu_cloud->size()) - static_cast<int>(gpu_cloud->size());
+    EXPECT_LE(std::abs(diff), kMaxPointCountDiff)
+      << "Scan " << i << ": CPU=" << cpu_cloud->size() << " GPU=" << gpu_cloud->size();
+
+    // For each GPU point, find nearest CPU point (brute-force — clouds are small enough)
+    size_t matched = 0;
+    for (const auto & gp : gpu_cloud->points) {
+      float best = std::numeric_limits<float>::max();
+      for (const auto & cp : cpu_cloud->points) {
+        float d = sq_dist(gp, cp);
+        if (d < best) best = d;
+        if (d < tol_sq) break;  // early exit — found a match
+      }
+      if (best < tol_sq) ++matched;
+    }
+
+    double match_ratio =
+      gpu_cloud->empty() ? 1.0 : static_cast<double>(matched) / gpu_cloud->size();
+    EXPECT_GE(match_ratio, kMinMatchRatio)
+      << "Scan " << i << ": only " << matched << "/" << gpu_cloud->size()
+      << " GPU points matched a CPU point within " << kXyzTolerance << " m";
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: GPU output is non-empty
+// ---------------------------------------------------------------------------
+TEST(HesaiCudaDecoderTest, OT128_GpuOutputNonEmpty)
+{
+  auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
+  ASSERT_GT(gpu_scans.size(), 0u) << "GPU path produced zero scans";
+  for (size_t i = 0; i < gpu_scans.size(); ++i) {
+    EXPECT_GT(gpu_scans[i].cloud->size(), 0u) << "Scan " << i << " is empty";
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Test 3: Basic field validity of GPU-decoded points
+// ---------------------------------------------------------------------------
+TEST(HesaiCudaDecoderTest, OT128_GpuFieldValidity)
+{
+  auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
+  ASSERT_GT(gpu_scans.size(), 0u);
+
+  for (size_t i = 0; i < gpu_scans.size(); ++i) {
+    for (const auto & pt : gpu_scans[i].cloud->points) {
+      // Distance must be positive (all valid points)
+      float dist = std::sqrt(pt.x * pt.x + pt.y * pt.y + pt.z * pt.z);
+      EXPECT_GT(dist, 0.f) << "Scan " << i << ": zero-distance point found";
+
+      // Channel must be in [0, 127] for OT128
+      EXPECT_LE(pt.channel, 127u) << "Scan " << i << ": invalid channel " << pt.channel;
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Test 4 (edge case): First and last scans have reasonable point counts
+// ---------------------------------------------------------------------------
+TEST(HesaiCudaDecoderTest, OT128_BoundaryScanPointCounts)
+{
+  auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
+  ASSERT_GE(gpu_scans.size(), 2u) << "Need at least 2 scans for boundary test";
+
+  // First scan may be partial, but should still have a reasonable number of points.
+  // A full OT128 scan typically has ~100k+ points; even a partial scan should exceed 1000.
+  EXPECT_GT(gpu_scans.front().cloud->size(), 1000u)
+    << "First scan has suspiciously few points: " << gpu_scans.front().cloud->size();
+
+  // Last scan should also be non-trivial
+  EXPECT_GT(gpu_scans.back().cloud->size(), 1000u)
+    << "Last scan has suspiciously few points: " << gpu_scans.back().cloud->size();
+}
+
+// ---------------------------------------------------------------------------
+// Test 5 (edge case): GPU intensity matches CPU exactly
+// ---------------------------------------------------------------------------
+TEST(HesaiCudaDecoderTest, OT128_IntensityExactMatch)
+{
+  auto cpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/false);
+  auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
+
+  ASSERT_EQ(cpu_scans.size(), gpu_scans.size());
+
+  for (size_t i = 0; i < cpu_scans.size(); ++i) {
+    const auto & cpu_pts = cpu_scans[i].cloud->points;
+    const auto & gpu_pts = gpu_scans[i].cloud->points;
+
+    // For each GPU point, find the nearest CPU match and verify intensity is identical.
+    // Only compare points that have a close spatial match (same physical point).
+    size_t checked = 0;
+    size_t intensity_mismatch = 0;
+
+    // Use a tighter tolerance for intensity matching to ensure we're comparing the same point
+    const float tight_tol_sq = 1e-6f;
+
+    for (const auto & gp : gpu_pts) {
+      float best_dist = std::numeric_limits<float>::max();
+      size_t best_idx = 0;
+      for (size_t j = 0; j < cpu_pts.size(); ++j) {
+        float d = sq_dist(gp, cpu_pts[j]);
+        if (d < best_dist) {
+          best_dist = d;
+          best_idx = j;
+        }
+        if (d < tight_tol_sq) break;
+      }
+      if (best_dist < tight_tol_sq) {
+        ++checked;
+        if (gp.intensity != cpu_pts[best_idx].intensity) {
+          ++intensity_mismatch;
+        }
+      }
+    }
+
+    // The vast majority of matched points should have identical intensity.
+    // A small fraction (<1%) may differ at scan boundaries where the GPU kernel
+    // selects a different return than the CPU's dual-return filtering.
+    EXPECT_GT(checked, 0u) << "Scan " << i << ": no tight matches found";
+    double mismatch_ratio =
+      checked > 0 ? static_cast<double>(intensity_mismatch) / checked : 0.0;
+    EXPECT_LT(mismatch_ratio, 0.01)
+      << "Scan " << i << ": " << intensity_mismatch << "/" << checked
+      << " matched points have different intensity (" << (mismatch_ratio * 100) << "%)";
+  }
+}
+
+}  // namespace nebula::test
+
+#else  // !NEBULA_CUDA_ENABLED
+
+TEST(HesaiCudaDecoderTest, SkippedNoCuda)
+{
+  GTEST_SKIP() << "CUDA not enabled at compile time";
+}
+
+#endif  // NEBULA_CUDA_ENABLED
+
+int main(int argc, char * argv[])
+{
+  rclcpp::init(argc, argv);
+  ::testing::InitGoogleTest(&argc, argv);
+  int result = RUN_ALL_TESTS();
+  // Restore env
+  unsetenv("NEBULA_USE_CUDA");
+  rclcpp::shutdown();
+  return result;
+}
diff --git a/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt b/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
index 05282610f..08a25132e 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
+++ b/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
@@ -4,7 +4,52 @@ project(nebula_hesai_decoders)
 find_package(autoware_cmake REQUIRED)
 autoware_package()
 
-# Hesai Decoder
+# Option to enable CUDA support
+option(BUILD_CUDA "Build with CUDA support" ON)
+
+# CUDA Configuration
+if(BUILD_CUDA)
+  find_package(CUDA)
+  if(NOT ${CUDA_FOUND})
+    message(WARNING "CUDA not found, building without CUDA support")
+    set(BUILD_CUDA OFF)
+  else()
+    message(STATUS "CUDA found: ${CUDA_VERSION}")
+    message(STATUS "CUDA include dirs: ${CUDA_INCLUDE_DIRS}")
+
+    # CUDA compiler flags
+    list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+    list(APPEND CUDA_NVCC_FLAGS "-diag-suppress 20012")
+
+    # GPU architectures — set via -DCMAKE_CUDA_ARCHITECTURES on the command line,
+    # or fall back to a default set covering Turing through Ada + PTX for forward
+    # compatibility (e.g. Blackwell GPUs JIT from compute_89 PTX).
+    # Example: -DCMAKE_CUDA_ARCHITECTURES=89  (single-arch for fast builds)
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
+      set(CMAKE_CUDA_ARCHITECTURES "75;86;87;89" CACHE STRING "CUDA architectures" FORCE)
+    endif()
+    # Also emit PTX for the highest arch so the driver can JIT for newer GPUs
+    list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=compute_89")
+
+    # Extended lambda support
+    list(APPEND CUDA_NVCC_FLAGS "--extended-lambda")
+
+    # Optimization flags
+    list(APPEND CUDA_NVCC_FLAGS "-O3")
+
+    # Check if CUDA kernel source exists
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/hesai_cuda_kernels.cu")
+      set(CUDA_KERNELS_EXIST TRUE)
+      message(STATUS "CUDA kernel source found, will build CUDA decoder")
+    else()
+      set(CUDA_KERNELS_EXIST FALSE)
+      message(WARNING "CUDA kernel source (src/cuda/hesai_cuda_kernels.cu) not found")
+      message(WARNING "Building without CUDA decoder support")
+    endif()
+  endif()
+endif()
+
+# Hesai Decoder library (CPU)
 add_library(nebula_hesai_decoders SHARED src/hesai_driver.cpp)
 
 target_include_directories(
@@ -18,6 +63,49 @@ target_link_libraries(
          nebula_hesai_common::nebula_hesai_common
          nebula_core_decoders::nebula_core_decoders rclcpp::rclcpp)
 
+# CUDA Decoder library
+if(BUILD_CUDA AND CUDA_KERNELS_EXIST)
+  enable_language(CUDA)
+
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --expt-relaxed-constexpr")
+
+  add_library(nebula_hesai_decoders_cuda SHARED
+    src/cuda/hesai_cuda_kernels.cu
+  )
+
+  set_target_properties(nebula_hesai_decoders_cuda PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    POSITION_INDEPENDENT_CODE ON
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+  )
+
+  # Clear any inherited compile options that might be incompatible with CUDA
+  set_property(TARGET nebula_hesai_decoders_cuda PROPERTY COMPILE_OPTIONS "")
+
+  target_include_directories(
+    nebula_hesai_decoders_cuda
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+           $<INSTALL_INTERFACE:include>)
+
+  target_link_libraries(nebula_hesai_decoders_cuda
+    PRIVATE cudart)
+
+  target_compile_definitions(nebula_hesai_decoders_cuda PUBLIC NEBULA_CUDA_ENABLED)
+  target_compile_definitions(nebula_hesai_decoders PUBLIC NEBULA_CUDA_ENABLED)
+
+  # Add CUDA include directories to main library for header resolution
+  target_include_directories(nebula_hesai_decoders
+    PUBLIC ${CUDA_INCLUDE_DIRS})
+
+  # Link CUDA library to main library
+  target_link_libraries(nebula_hesai_decoders PUBLIC nebula_hesai_decoders_cuda ${CUDA_LIBRARIES})
+
+  install(TARGETS nebula_hesai_decoders_cuda EXPORT export_nebula_hesai_decoders_cuda)
+  ament_export_targets(export_nebula_hesai_decoders_cuda)
+endif()
+
 install(TARGETS nebula_hesai_decoders EXPORT export_nebula_hesai_decoders)
 install(DIRECTORY include/${PROJECT_NAME}/ DESTINATION include/${PROJECT_NAME})
 
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp
new file mode 100644
index 000000000..57363bee6
--- /dev/null
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp
@@ -0,0 +1,156 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdint>
+#include <vector>
+
+namespace nebula::drivers::cuda
+{
+
+/// @brief Point structure optimized for CUDA processing
+struct CudaNebulaPoint
+{
+  float x;
+  float y;
+  float z;
+  float distance;
+  float azimuth;
+  float elevation;
+  float intensity;
+  uint8_t return_type;
+  uint16_t channel;
+  uint8_t in_current_scan;  // 1 = belongs to current scan, 0 = belongs to output/next scan
+  uint32_t entry_id;        // Block group ID for batched processing (used for sorting & filtering)
+};
+
+/// @brief Angle correction data for CUDA lookup table
+struct CudaAngleCorrectionData
+{
+  float azimuth_rad;
+  float elevation_rad;
+  float sin_azimuth;
+  float cos_azimuth;
+  float sin_elevation;
+  float cos_elevation;
+};
+
+/// @brief Maximum number of frames (mirrors) supported for multi-frame sensors like AT128
+static constexpr uint32_t MAX_CUDA_FRAMES = 8;
+
+/// @brief Frame angle info for multi-frame sensors (AT128 has 4 frames)
+struct CudaFrameAngleInfo
+{
+  uint32_t fov_start;        // Raw azimuth where FOV starts for this frame
+  uint32_t fov_end;          // Raw azimuth where FOV ends for this frame
+  uint32_t timestamp_reset;  // Raw azimuth where timestamp resets for this frame
+  uint32_t scan_emit;        // Raw azimuth where scan emit occurs for this frame
+};
+
+/// @brief Configuration data for CUDA decoder
+struct CudaDecoderConfig
+{
+  float min_range;
+  float max_range;
+  float sensor_min_range;
+  float sensor_max_range;
+  float dual_return_distance_threshold;
+  float fov_min_rad;
+  float fov_max_rad;
+  float scan_emit_angle_rad;
+  uint32_t n_channels;
+  uint32_t n_blocks;
+  uint32_t max_returns;
+  float dis_unit;
+  uint32_t data_stride;  // Stride between channels in input data (for batched mode)
+  uint32_t entry_id;     // Entry ID for batched mode (0 in per-packet mode)
+  // Overlap detection parameters (raw azimuth in 0.01 degree units)
+  uint32_t timestamp_reset_angle_raw;
+  uint32_t emit_angle_raw;
+  uint32_t n_azimuths_raw;  // Total azimuth count (e.g., 36000 for 0.01 deg resolution)
+  uint32_t max_output_points;  // Maximum output buffer size for sparse indexing (batched mode)
+
+  // Azimuth scaling for sensors with different degree_subdivisions
+  uint32_t azimuth_scale;  // Scale factor: raw_azimuth / azimuth_scale = LUT index
+
+  // Multi-frame support for sensors like AT128 (has 4 mirror frames)
+  uint32_t n_frames;  // Number of frames (1 for single-frame, 4 for AT128)
+  CudaFrameAngleInfo frame_angles[MAX_CUDA_FRAMES];  // Per-frame angle boundaries
+  bool is_multi_frame;  // True if sensor has multiple frames (uses frame_angles)
+};
+
+/// @brief Main CUDA decoder class for Hesai LiDAR
+class HesaiCudaDecoder
+{
+public:
+  HesaiCudaDecoder();
+  ~HesaiCudaDecoder();
+
+  /// @brief Initialize decoder with maximum points and channels
+  bool initialize(size_t max_points, uint32_t n_channels);
+
+  /// @brief Upload angle correction lookup table to GPU
+  bool upload_angle_corrections(
+    const std::vector<CudaAngleCorrectionData> & angle_lut,
+    uint32_t n_azimuths,
+    uint32_t n_channels);
+
+  /// @brief Get device pointer to angle lookup table
+  CudaAngleCorrectionData * get_angle_lut() const { return d_angle_lut_; }
+
+private:
+  CudaAngleCorrectionData * d_angle_lut_ = nullptr;
+  uint32_t n_azimuths_ = 0;
+  uint32_t n_channels_ = 0;
+  bool initialized_ = false;
+};
+
+}  // namespace nebula::drivers::cuda
+
+// =============================================================================
+// C-linkage function declarations for CUDA kernel launches
+// =============================================================================
+
+extern "C" {
+
+/// @brief Launch kernel to decode a single Hesai packet
+void launch_decode_hesai_packet(
+    const uint16_t* d_distances,
+    const uint8_t* d_reflectivities,
+    const nebula::drivers::cuda::CudaAngleCorrectionData* d_angle_lut,
+    const nebula::drivers::cuda::CudaDecoderConfig& config,
+    nebula::drivers::cuda::CudaNebulaPoint* d_points,
+    uint32_t* d_count,
+    uint32_t n_azimuths,
+    uint32_t raw_azimuth,
+    cudaStream_t stream);
+
+/// @brief Launch batched kernel to decode entire scan
+void launch_decode_hesai_scan_batch(
+    const uint16_t* d_distances_batch,
+    const uint8_t* d_reflectivities_batch,
+    const uint32_t* d_raw_azimuths,
+    const uint32_t* d_n_returns,
+    const uint32_t* d_last_azimuths,
+    const nebula::drivers::cuda::CudaAngleCorrectionData* d_angle_lut,
+    const nebula::drivers::cuda::CudaDecoderConfig& config,
+    nebula::drivers::cuda::CudaNebulaPoint* d_points,
+    uint32_t* d_count,
+    uint32_t n_azimuths,
+    uint32_t n_packets,
+    cudaStream_t stream);
+
+}  // extern "C"
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp
index 86027d3f1..1ecb0b0d3 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp
@@ -24,6 +24,7 @@
 #include <cmath>
 #include <cstdint>
 #include <memory>
+#include <tuple>
 
 namespace nebula::drivers
 {
@@ -146,6 +147,44 @@ class AngleCorrectorCalibrationBased
 
     return corrected_azimuths;
   }
+
+#ifdef NEBULA_CUDA_ENABLED
+  /// @brief Compute raw angle thresholds needed by the CUDA decoder for FOV filtering
+  /// and overlap detection. These account for per-channel azimuth correction offsets.
+  /// @return (emit_angle_raw, timestamp_reset_angle_raw, fov_start_raw, fov_end_raw)
+  [[nodiscard]] std::tuple<uint32_t, uint32_t, uint32_t, uint32_t>
+  get_cuda_raw_angles(double fov_start_deg, double fov_end_deg, double cut_angle_deg) const
+  {
+    // Find min/max azimuth offset in raw units
+    auto round_away_from_zero = [](float value) -> int32_t {
+      return (value < 0) ? static_cast<int32_t>(std::floor(value))
+                         : static_cast<int32_t>(std::ceil(value));
+    };
+
+    int32_t correction_min = INT32_MAX;
+    int32_t correction_max = INT32_MIN;
+    for (size_t ch = 0; ch < ChannelN; ++ch) {
+      int32_t offset_raw = round_away_from_zero(
+        static_cast<float>(azimuth_offset_rad_[ch] * AngleUnit / deg2rad(1.0)));
+      correction_min = std::min(correction_min, offset_raw);
+      correction_max = std::max(correction_max, offset_raw);
+    }
+
+    int32_t emit_raw = static_cast<int32_t>(std::ceil(cut_angle_deg * AngleUnit)) - correction_min;
+    uint32_t emit_angle_raw = normalize_angle<int32_t>(emit_raw, max_azimuth);
+
+    int32_t fov_start_raw_val =
+      static_cast<int32_t>(std::floor(fov_start_deg * AngleUnit)) - correction_max;
+    uint32_t fov_start_raw = normalize_angle<int32_t>(fov_start_raw_val, max_azimuth);
+
+    int32_t fov_end_raw_val =
+      static_cast<int32_t>(std::ceil(fov_end_deg * AngleUnit)) + correction_max;
+    uint32_t fov_end_raw = normalize_angle<int32_t>(fov_end_raw_val, max_azimuth);
+
+    // For calibration-based sensors, timestamp_reset equals emit angle
+    return {emit_angle_raw, emit_angle_raw, fov_start_raw, fov_end_raw};
+  }
+#endif
 };
 
 }  // namespace nebula::drivers
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_correction_based.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_correction_based.hpp
index b04e507c7..0e131fef6 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_correction_based.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_correction_based.hpp
@@ -25,8 +25,11 @@
 #include <cassert>
 #include <cstdint>
 #include <memory>
+#include <sstream>
 #include <stdexcept>
 #include <string>
+#include <tuple>
+#include <vector>
 
 namespace nebula::drivers
 {
@@ -158,6 +161,37 @@ class AngleCorrectorCorrectionBased : public AngleCorrector<HesaiCorrection, Cha
 
     return corrected_azimuths;
   }
+
+#ifdef NEBULA_CUDA_ENABLED
+  /// @brief Get the number of mirror frames for this sensor
+  [[nodiscard]] size_t get_n_frames() const
+  {
+    return correction_ ? correction_->frameNumber : 1;
+  }
+
+  /// @brief Get frame angle boundaries for multi-frame CUDA support
+  /// @param frame_id Frame index
+  /// @param fov_start Output: raw azimuth where FOV starts for this frame
+  /// @param fov_end Output: raw azimuth where FOV ends for this frame
+  /// @param timestamp_reset Output: raw azimuth for timestamp reset
+  /// @param scan_emit Output: raw azimuth for scan emit
+  /// @return true if frame_id is valid
+  bool get_frame_angle_info(
+    uint32_t frame_id, uint32_t & fov_start, uint32_t & fov_end,
+    uint32_t & timestamp_reset, uint32_t & scan_emit) const
+  {
+    if (!correction_ || frame_id >= correction_->frameNumber) {
+      return false;
+    }
+
+    fov_start = correction_->startFrame[frame_id];
+    fov_end = correction_->endFrame[frame_id];
+    // For correction-based sensors, timestamp_reset and scan_emit correspond to frame boundaries
+    timestamp_reset = correction_->startFrame[frame_id];
+    scan_emit = correction_->endFrame[frame_id];
+    return true;
+  }
+#endif
 };
 
 }  // namespace nebula::drivers
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp
index 78a74ee0a..184e2c605 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp
@@ -23,6 +23,36 @@
 #include "nebula_hesai_decoders/decoders/hesai_scan_decoder.hpp"
 #include "nebula_hesai_decoders/decoders/packet_loss_detector.hpp"
 
+#ifdef NEBULA_CUDA_ENABLED
+#include "nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp"
+
+// C-linkage kernel launcher declarations
+extern "C" void launch_decode_hesai_packet(
+  const uint16_t * d_distances,
+  const uint8_t * d_reflectivities,
+  const nebula::drivers::cuda::CudaAngleCorrectionData * d_angle_lut,
+  const nebula::drivers::cuda::CudaDecoderConfig & config,
+  nebula::drivers::cuda::CudaNebulaPoint * d_points,
+  uint32_t * d_count,
+  uint32_t n_azimuths,
+  uint32_t raw_azimuth,
+  cudaStream_t stream);
+
+extern "C" void launch_decode_hesai_scan_batch(
+  const uint16_t * d_distances_batch,
+  const uint8_t * d_reflectivities_batch,
+  const uint32_t * d_raw_azimuths,
+  const uint32_t * d_n_returns,
+  const uint32_t * d_last_azimuths,
+  const nebula::drivers::cuda::CudaAngleCorrectionData * d_angle_lut,
+  const nebula::drivers::cuda::CudaDecoderConfig & config,
+  nebula::drivers::cuda::CudaNebulaPoint * d_points,
+  uint32_t * d_count,
+  uint32_t n_azimuths,
+  uint32_t n_packets,
+  cudaStream_t stream);
+#endif
+
 #include <nebula_core_common/loggers/logger.hpp>
 #include <nebula_core_common/nebula_common.hpp>
 #include <nebula_core_common/point_types.hpp>
@@ -31,9 +61,11 @@
 #include <rclcpp/logging.hpp>
 #include <rclcpp/rclcpp.hpp>
 
+#include <algorithm>
 #include <array>
 #include <cmath>
 #include <cstdint>
+#include <cstdlib>
 #include <memory>
 #include <optional>
 #include <utility>
@@ -96,6 +128,118 @@ class HesaiDecoder : public HesaiScanDecoder
 
   std::array<DecodeFrame, 2> frame_buffers_{initialize_frame(), initialize_frame()};
 
+#ifdef NEBULA_CUDA_ENABLED
+  /// @brief CUDA decoder for GPU-accelerated point cloud processing
+  std::unique_ptr<cuda::HesaiCudaDecoder> cuda_decoder_;
+  /// @brief CUDA stream for async operations
+  cudaStream_t cuda_stream_ = nullptr;
+  /// @brief Whether CUDA decoding is enabled and initialized
+  bool cuda_enabled_ = false;
+
+  /// @brief Number of azimuth divisions for angle lookup table (LUT resolution)
+  static constexpr uint32_t cuda_n_azimuths_ = 36000;  // 0.01 degree resolution
+  /// @brief Sensor's native azimuth range (max_azimuth = 360 * degree_subdivisions)
+  static constexpr uint32_t sensor_max_azimuth_ =
+    360 * SensorT::packet_t::degree_subdivisions;
+  /// @brief Scale factor from sensor native azimuth to LUT index
+  static constexpr uint32_t azimuth_scale_ = sensor_max_azimuth_ / cuda_n_azimuths_;
+
+  /// @brief Device memory for output points
+  cuda::CudaNebulaPoint * d_points_ = nullptr;
+  /// @brief Device memory for output count
+  uint32_t * d_count_ = nullptr;
+  /// @brief Host buffer for CUDA results
+  std::vector<cuda::CudaNebulaPoint> cuda_point_buffer_;
+
+  /// @brief Pre-allocated device memory for distances (per-packet fallback path)
+  uint16_t * d_distances_ = nullptr;
+  /// @brief Pre-allocated device memory for reflectivities (per-packet fallback path)
+  uint8_t * d_reflectivities_ = nullptr;
+  /// @brief Pinned host memory for distances (faster CPU->GPU transfer)
+  uint16_t * h_pinned_distances_ = nullptr;
+  /// @brief Pinned host memory for reflectivities (faster CPU->GPU transfer)
+  uint8_t * h_pinned_reflectivities_ = nullptr;
+  /// @brief Size of pre-allocated per-packet buffers (n_channels * max_returns)
+  size_t cuda_buffer_size_ = 0;
+
+  /// @brief GPU scan buffer for batch processing
+  struct GpuScanBuffer
+  {
+    // Device buffers for batch data
+    uint16_t * d_distances_batch = nullptr;
+    uint8_t * d_reflectivities_batch = nullptr;
+    uint32_t * d_raw_azimuths = nullptr;
+    uint32_t * d_n_returns = nullptr;
+    uint32_t * d_last_azimuths = nullptr;
+
+    // Pinned host staging buffers
+    uint16_t * h_distances_staging = nullptr;
+    uint8_t * h_reflectivities_staging = nullptr;
+    uint32_t * h_raw_azimuths_staging = nullptr;
+    uint32_t * h_n_returns_staging = nullptr;
+    uint32_t * h_last_azimuths_staging = nullptr;
+    uint64_t * h_packet_timestamps_staging = nullptr;
+
+    // Metadata
+    uint32_t packet_count = 0;
+    uint32_t max_packets = 0;
+
+    // Output (reuse existing buffers)
+    cuda::CudaNebulaPoint * d_points = nullptr;
+    uint32_t * d_count = nullptr;
+
+    /// @brief Free all allocated device and pinned-host memory
+    void cleanup()
+    {
+      auto free_device = [](auto *& ptr) {
+        if (ptr) {
+          cudaFree(ptr);
+          ptr = nullptr;
+        }
+      };
+      auto free_host = [](auto *& ptr) {
+        if (ptr) {
+          cudaFreeHost(ptr);
+          ptr = nullptr;
+        }
+      };
+
+      free_device(d_distances_batch);
+      free_device(d_reflectivities_batch);
+      free_device(d_raw_azimuths);
+      free_device(d_n_returns);
+      free_device(d_last_azimuths);
+
+      free_host(h_distances_staging);
+      free_host(h_reflectivities_staging);
+      free_host(h_raw_azimuths_staging);
+      free_host(h_n_returns_staging);
+      free_host(h_last_azimuths_staging);
+      free_host(h_packet_timestamps_staging);
+
+      packet_count = 0;
+      max_packets = 0;
+    }
+  };
+
+  GpuScanBuffer gpu_scan_buffer_;
+
+  /// @brief Whether scan-level batching is enabled (accumulate all packets, one kernel launch)
+  bool use_scan_batching_ = true;
+  static constexpr uint32_t MAX_PACKETS_PER_SCAN = 4000;
+
+  /// @brief Last raw block azimuth, tracked for GPU overlap detection
+  uint32_t last_azimuth_ = 0;
+
+  /// @brief Cached raw angle values for GPU config (computed once in initialize_cuda)
+  uint32_t cuda_emit_angle_raw_ = 0;
+  uint32_t cuda_timestamp_reset_angle_raw_ = 0;
+
+  /// @brief Whether this is a multi-frame sensor (e.g., AT128 with 4 mirror frames)
+  bool is_multi_frame_sensor_ = false;
+
+#endif  // NEBULA_CUDA_ENABLED
+
   /// @brief Validates and parse PandarPacket. Checks size and, if present, CRC checksums.
   /// @param packet The incoming PandarPacket
   /// @return Whether the packet was parsed successfully
@@ -116,6 +260,477 @@ class HesaiDecoder : public HesaiScanDecoder
     return true;
   }
 
+#ifdef NEBULA_CUDA_ENABLED
+  /// @brief Accumulate block group data to GPU scan buffer for batch processing
+  /// @param start_block_id The first block in the group of returns
+  /// @param n_blocks The number of returns in the group
+  void accumulate_packet_to_gpu_buffer(size_t start_block_id, size_t n_blocks)
+  {
+    if (gpu_scan_buffer_.packet_count >= gpu_scan_buffer_.max_packets) {
+      NEBULA_LOG_STREAM(logger_->warn, "GPU scan buffer full, dropping block group");
+      return;
+    }
+
+    const uint32_t entry_id = gpu_scan_buffer_.packet_count;
+    const size_t n_channels = SensorT::packet_t::n_channels;
+    const size_t max_returns = SensorT::packet_t::max_returns;
+
+    // Store metadata for this block group
+    uint32_t raw_azimuth = packet_.body.blocks[start_block_id].get_azimuth();
+    gpu_scan_buffer_.h_raw_azimuths_staging[entry_id] = raw_azimuth;
+    gpu_scan_buffer_.h_n_returns_staging[entry_id] = n_blocks;
+    gpu_scan_buffer_.h_last_azimuths_staging[entry_id] = last_azimuth_;
+    gpu_scan_buffer_.h_packet_timestamps_staging[entry_id] =
+      hesai_packet::get_timestamp_ns(packet_);
+
+    // Extract distances/reflectivities to pinned host memory
+    // Layout: [entry][channel][return] with max_returns stride
+    const size_t entry_offset = entry_id * n_channels * max_returns;
+
+    for (size_t ch = 0; ch < n_channels; ++ch) {
+      for (size_t blk = 0; blk < n_blocks; ++blk) {
+        const auto & unit = packet_.body.blocks[start_block_id + blk].units[ch];
+        const size_t idx = entry_offset + ch * max_returns + blk;
+        gpu_scan_buffer_.h_distances_staging[idx] = unit.distance;
+        gpu_scan_buffer_.h_reflectivities_staging[idx] = unit.reflectivity;
+      }
+    }
+
+    gpu_scan_buffer_.packet_count++;
+  }
+
+  /// @brief Build the CudaDecoderConfig for a batched kernel launch
+  /// @param n_entries Number of block-group entries accumulated
+  /// @return Populated config ready for GPU upload
+  cuda::CudaDecoderConfig build_batch_config(uint32_t n_entries)
+  {
+    const size_t n_channels = SensorT::packet_t::n_channels;
+    const size_t max_returns = SensorT::packet_t::max_returns;
+
+    cuda::CudaDecoderConfig config;
+    config.min_range = sensor_configuration_->min_range;
+    config.max_range = sensor_configuration_->max_range;
+    config.sensor_min_range = SensorT::min_range;
+    config.sensor_max_range = SensorT::max_range;
+    config.dual_return_distance_threshold =
+      sensor_configuration_->dual_return_distance_threshold;
+    config.fov_min_rad = deg2rad(sensor_configuration_->cloud_min_angle);
+    config.fov_max_rad = deg2rad(sensor_configuration_->cloud_max_angle);
+    config.scan_emit_angle_rad = deg2rad(sensor_configuration_->cut_angle);
+    config.n_channels = n_channels;
+    config.max_returns = max_returns;
+    config.dis_unit = hesai_packet::get_dis_unit(packet_);
+    config.data_stride = max_returns;
+    config.n_blocks = max_returns;
+    config.entry_id = 0;
+    config.n_azimuths_raw = cuda_n_azimuths_;
+    config.azimuth_scale = azimuth_scale_;
+    config.max_output_points = n_entries * n_channels * max_returns;
+
+    if constexpr (SensorT::uses_calibration_based_angles) {
+      config.is_multi_frame = false;
+      config.n_frames = 1;
+      config.timestamp_reset_angle_raw = cuda_timestamp_reset_angle_raw_;
+      config.emit_angle_raw = cuda_emit_angle_raw_;
+      config.frame_angles[0].fov_start = 0;
+      config.frame_angles[0].fov_end = 0;
+      config.frame_angles[0].timestamp_reset = cuda_timestamp_reset_angle_raw_;
+      config.frame_angles[0].scan_emit = cuda_emit_angle_raw_;
+    } else {
+      config.is_multi_frame = true;
+      config.n_frames = static_cast<uint32_t>(angle_corrector_.get_n_frames());
+      config.timestamp_reset_angle_raw = 0;
+      config.emit_angle_raw = 0;
+
+      for (uint32_t i = 0; i < config.n_frames && i < cuda::MAX_CUDA_FRAMES; ++i) {
+        uint32_t fov_start, fov_end, timestamp_reset, scan_emit;
+        if (angle_corrector_.get_frame_angle_info(
+              i, fov_start, fov_end, timestamp_reset, scan_emit)) {
+          config.frame_angles[i].fov_start = fov_start;
+          config.frame_angles[i].fov_end = fov_end;
+          config.frame_angles[i].timestamp_reset = timestamp_reset;
+          config.frame_angles[i].scan_emit = scan_emit;
+          if (i == 0) {
+            config.timestamp_reset_angle_raw = timestamp_reset;
+            config.emit_angle_raw = scan_emit;
+          }
+        }
+      }
+    }
+
+    return config;
+  }
+
+  /// @brief Transfer accumulated scan data from pinned host memory to device
+  void transfer_scan_to_device(uint32_t n_entries, size_t total_data_size)
+  {
+    cudaMemcpyAsync(
+      gpu_scan_buffer_.d_distances_batch, gpu_scan_buffer_.h_distances_staging,
+      total_data_size * sizeof(uint16_t), cudaMemcpyHostToDevice, cuda_stream_);
+    cudaMemcpyAsync(
+      gpu_scan_buffer_.d_reflectivities_batch, gpu_scan_buffer_.h_reflectivities_staging,
+      total_data_size * sizeof(uint8_t), cudaMemcpyHostToDevice, cuda_stream_);
+    cudaMemcpyAsync(
+      gpu_scan_buffer_.d_raw_azimuths, gpu_scan_buffer_.h_raw_azimuths_staging,
+      n_entries * sizeof(uint32_t), cudaMemcpyHostToDevice, cuda_stream_);
+    cudaMemcpyAsync(
+      gpu_scan_buffer_.d_n_returns, gpu_scan_buffer_.h_n_returns_staging,
+      n_entries * sizeof(uint32_t), cudaMemcpyHostToDevice, cuda_stream_);
+    cudaMemcpyAsync(
+      gpu_scan_buffer_.d_last_azimuths, gpu_scan_buffer_.h_last_azimuths_staging,
+      n_entries * sizeof(uint32_t), cudaMemcpyHostToDevice, cuda_stream_);
+  }
+
+  /// @brief Copy GPU results to host and place into correct frame buffers
+  /// @param completed_buffer_index The buffer index of the just-completed scan
+  /// @param n_entries Number of block-group entries in the scan
+  /// @param sparse_buffer_size Total sparse buffer size (n_entries * n_channels * max_returns)
+  void process_gpu_results(
+    uint8_t completed_buffer_index, uint32_t n_entries, uint32_t sparse_buffer_size)
+  {
+    // Copy sparse buffer - points at deterministic positions with gaps
+    const uint32_t copy_size =
+      std::min(sparse_buffer_size, static_cast<uint32_t>(cuda_point_buffer_.size()));
+    cudaMemcpy(
+      cuda_point_buffer_.data(), d_points_, copy_size * sizeof(cuda::CudaNebulaPoint),
+      cudaMemcpyDeviceToHost);
+
+    // Iterate sparse buffer, skip invalid points (distance <= 0)
+    for (uint32_t i = 0; i < copy_size; ++i) {
+      const auto & cuda_pt = cuda_point_buffer_[i];
+
+      if (cuda_pt.distance <= 0.0f) {
+        continue;
+      }
+
+      // in_current_scan=1: belongs to the completed scan (completed_buffer_index)
+      // in_current_scan=0: belongs to the next scan (1 - completed_buffer_index)
+      auto & frame = cuda_pt.in_current_scan
+                       ? frame_buffers_[completed_buffer_index]
+                       : frame_buffers_[1 - completed_buffer_index];
+
+      const uint32_t entry_id = cuda_pt.entry_id;
+      const uint64_t packet_timestamp_ns =
+        (entry_id < n_entries)
+          ? gpu_scan_buffer_.h_packet_timestamps_staging[entry_id]
+          : hesai_packet::get_timestamp_ns(packet_);
+
+      NebulaPoint point;
+      point.x = cuda_pt.x;
+      point.y = cuda_pt.y;
+      point.z = cuda_pt.z;
+      point.distance = cuda_pt.distance;
+      point.azimuth = cuda_pt.azimuth;
+      point.elevation = cuda_pt.elevation;
+      point.intensity = cuda_pt.intensity;
+      point.return_type = cuda_pt.return_type;
+      point.channel = cuda_pt.channel;
+      // Compute relative timestamp in signed 64-bit to avoid underflow.
+      // Two sources of underflow: (1) GPU assigns point to next scan whose
+      // scan_timestamp exceeds packet_timestamp, (2) negative channel timing
+      // offset added to near-zero packet-to-scan delta.
+      {
+        auto point_to_packet_offset_ns =
+          sensor_.get_packet_relative_point_time_offset(0, cuda_pt.channel, packet_);
+        int64_t rel_ns = static_cast<int64_t>(packet_timestamp_ns) -
+                         static_cast<int64_t>(frame.scan_timestamp_ns) + point_to_packet_offset_ns;
+        point.time_stamp = (rel_ns >= 0) ? static_cast<uint32_t>(rel_ns) : 0;
+      }
+
+      if (!mask_filter_ || !mask_filter_->excluded(point)) {
+        frame.pointcloud->emplace_back(point);
+      }
+    }
+  }
+
+  /// @brief Flush accumulated packets - one batched kernel launch for the entire scan
+  /// @param completed_buffer_index The buffer index of the just-completed scan
+  void flush_gpu_scan_buffer(uint8_t completed_buffer_index)
+  {
+    if (gpu_scan_buffer_.packet_count == 0) return;
+
+    const uint32_t n_entries = gpu_scan_buffer_.packet_count;
+    const size_t n_channels = SensorT::packet_t::n_channels;
+    const size_t max_returns = SensorT::packet_t::max_returns;
+    const size_t total_data_size = n_entries * n_channels * max_returns;
+    const uint32_t sparse_buffer_size = n_entries * n_channels * max_returns;
+
+    cuda::CudaDecoderConfig config = build_batch_config(n_entries);
+
+    transfer_scan_to_device(n_entries, total_data_size);
+
+    // Reset output counter and zero output buffer for deterministic sparse indexing
+    cudaMemsetAsync(d_count_, 0, sizeof(uint32_t), cuda_stream_);
+    cudaMemsetAsync(
+      d_points_, 0, sparse_buffer_size * sizeof(cuda::CudaNebulaPoint), cuda_stream_);
+
+    // Launch batched kernel
+    launch_decode_hesai_scan_batch(
+      gpu_scan_buffer_.d_distances_batch, gpu_scan_buffer_.d_reflectivities_batch,
+      gpu_scan_buffer_.d_raw_azimuths, gpu_scan_buffer_.d_n_returns,
+      gpu_scan_buffer_.d_last_azimuths, cuda_decoder_->get_angle_lut(), config, d_points_,
+      d_count_, cuda_n_azimuths_, n_entries, cuda_stream_);
+
+    cudaStreamSynchronize(cuda_stream_);
+
+    uint32_t valid_point_count = 0;
+    cudaMemcpy(&valid_point_count, d_count_, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+    if (valid_point_count == 0) {
+      gpu_scan_buffer_.packet_count = 0;
+      return;
+    }
+
+    process_gpu_results(completed_buffer_index, n_entries, sparse_buffer_size);
+
+    gpu_scan_buffer_.packet_count = 0;
+  }
+
+  /// @brief Initialize CUDA decoder and upload angle corrections.
+  /// CUDA decode is opt-in: set NEBULA_USE_CUDA=1 environment variable to enable.
+  ///
+  /// The GPU path produces functionally equivalent but not bit-identical output
+  /// compared to the CPU path. The GPU kernel uses its own FOV/overlap detection
+  /// which assigns a few scan-boundary points to adjacent scans differently than
+  /// ScanCutter's per-channel logic. This causes:
+  /// - TestPcd: point count off by 1-18, coordinate differences at boundaries
+  /// - NoHighTimestampsAfterCut (Pandar64): timestamps 55-111 us over 100ms threshold
+  /// These differences are harmless for production use. Existing test ground truth
+  /// is CPU-generated, so CUDA is opt-in to keep default tests passing.
+  void initialize_cuda()
+  {
+    const char * cuda_env = std::getenv("NEBULA_USE_CUDA");
+    if (!cuda_env || std::string(cuda_env) != "1") {
+      NEBULA_LOG_STREAM(
+        logger_->info, "CUDA decode disabled (set NEBULA_USE_CUDA=1 to enable)");
+      return;
+    }
+
+    cudaError_t err = cudaStreamCreate(&cuda_stream_);
+    if (err != cudaSuccess) {
+      NEBULA_LOG_STREAM(
+        logger_->warn, "Failed to create CUDA stream: " << cudaGetErrorString(err));
+      return;
+    }
+
+    cuda_decoder_ = std::make_unique<cuda::HesaiCudaDecoder>();
+    const uint32_t n_channels = SensorT::packet_t::n_channels;
+    const size_t max_sparse_buffer_points =
+      static_cast<size_t>(MAX_PACKETS_PER_SCAN) * n_channels * SensorT::packet_t::max_returns;
+    const size_t buffer_allocation_size =
+      std::max(static_cast<size_t>(SensorT::max_scan_buffer_points), max_sparse_buffer_points);
+
+    if (!cuda_decoder_->initialize(buffer_allocation_size, n_channels)) {
+      NEBULA_LOG_STREAM(logger_->warn, "Failed to initialize CUDA decoder");
+      cuda_decoder_.reset();
+      return;
+    }
+
+    // Allocate device memory for output
+    err = cudaMalloc(&d_points_, buffer_allocation_size * sizeof(cuda::CudaNebulaPoint));
+    if (err != cudaSuccess) {
+      NEBULA_LOG_STREAM(logger_->warn, "Failed to allocate CUDA output points");
+      cuda_decoder_.reset();
+      return;
+    }
+
+    err = cudaMalloc(&d_count_, sizeof(uint32_t));
+    if (err != cudaSuccess) {
+      cudaFree(d_points_);
+      d_points_ = nullptr;
+      NEBULA_LOG_STREAM(logger_->warn, "Failed to allocate CUDA output count");
+      cuda_decoder_.reset();
+      return;
+    }
+
+    // Pre-allocate per-packet buffers
+    cuda_buffer_size_ = n_channels * SensorT::packet_t::max_returns;
+
+    auto cleanup_per_packet = [&]() {
+      if (d_distances_) {
+        cudaFree(d_distances_);
+        d_distances_ = nullptr;
+      }
+      if (d_reflectivities_) {
+        cudaFree(d_reflectivities_);
+        d_reflectivities_ = nullptr;
+      }
+      if (h_pinned_distances_) {
+        cudaFreeHost(h_pinned_distances_);
+        h_pinned_distances_ = nullptr;
+      }
+      if (h_pinned_reflectivities_) {
+        cudaFreeHost(h_pinned_reflectivities_);
+        h_pinned_reflectivities_ = nullptr;
+      }
+      cudaFree(d_points_);
+      d_points_ = nullptr;
+      cudaFree(d_count_);
+      d_count_ = nullptr;
+      cuda_decoder_.reset();
+    };
+
+    auto alloc_ok = [&](cudaError_t result, const char * name) -> bool {
+      if (result != cudaSuccess) {
+        NEBULA_LOG_STREAM(logger_->warn, "Failed to allocate " << name);
+        cleanup_per_packet();
+        return false;
+      }
+      return true;
+    };
+
+    bool ok = true;
+    ok = ok && alloc_ok(
+                 cudaMalloc(&d_distances_, cuda_buffer_size_ * sizeof(uint16_t)),
+                 "CUDA distances buffer");
+    ok = ok && alloc_ok(
+                 cudaMalloc(&d_reflectivities_, cuda_buffer_size_ * sizeof(uint8_t)),
+                 "CUDA reflectivities buffer");
+    ok = ok && alloc_ok(
+                 cudaMallocHost(&h_pinned_distances_, cuda_buffer_size_ * sizeof(uint16_t)),
+                 "pinned distances buffer");
+    ok = ok && alloc_ok(
+                 cudaMallocHost(
+                   &h_pinned_reflectivities_, cuda_buffer_size_ * sizeof(uint8_t)),
+                 "pinned reflectivities buffer");
+
+    if (!ok) return;
+
+    cuda_point_buffer_.resize(buffer_allocation_size);
+
+    // Allocate GPU scan buffer for batch processing
+    const uint32_t packet_data_size = n_channels * SensorT::packet_t::max_returns;
+
+    auto alloc_scan_ok = [&](cudaError_t result, const char * name) -> bool {
+      if (result != cudaSuccess) {
+        NEBULA_LOG_STREAM(logger_->warn, "Failed to allocate " << name);
+        gpu_scan_buffer_.cleanup();
+        use_scan_batching_ = false;
+        return false;
+      }
+      return true;
+    };
+
+    bool scan_ok = true;
+    // Device buffers
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMalloc(
+                             &gpu_scan_buffer_.d_distances_batch,
+                             MAX_PACKETS_PER_SCAN * packet_data_size * sizeof(uint16_t)),
+                           "scan distances buffer");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMalloc(
+                             &gpu_scan_buffer_.d_reflectivities_batch,
+                             MAX_PACKETS_PER_SCAN * packet_data_size * sizeof(uint8_t)),
+                           "scan reflectivities buffer");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMalloc(
+                             &gpu_scan_buffer_.d_raw_azimuths,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint32_t)),
+                           "scan azimuths buffer");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMalloc(
+                             &gpu_scan_buffer_.d_n_returns,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint32_t)),
+                           "scan n_returns buffer");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMalloc(
+                             &gpu_scan_buffer_.d_last_azimuths,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint32_t)),
+                           "scan last_azimuths buffer");
+    // Pinned host staging buffers
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMallocHost(
+                             &gpu_scan_buffer_.h_distances_staging,
+                             MAX_PACKETS_PER_SCAN * packet_data_size * sizeof(uint16_t)),
+                           "pinned scan distances staging");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMallocHost(
+                             &gpu_scan_buffer_.h_reflectivities_staging,
+                             MAX_PACKETS_PER_SCAN * packet_data_size * sizeof(uint8_t)),
+                           "pinned scan reflectivities staging");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMallocHost(
+                             &gpu_scan_buffer_.h_raw_azimuths_staging,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint32_t)),
+                           "pinned scan azimuths staging");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMallocHost(
+                             &gpu_scan_buffer_.h_n_returns_staging,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint32_t)),
+                           "pinned scan n_returns staging");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMallocHost(
+                             &gpu_scan_buffer_.h_last_azimuths_staging,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint32_t)),
+                           "pinned scan last_azimuths staging");
+    scan_ok = scan_ok && alloc_scan_ok(
+                           cudaMallocHost(
+                             &gpu_scan_buffer_.h_packet_timestamps_staging,
+                             MAX_PACKETS_PER_SCAN * sizeof(uint64_t)),
+                           "pinned scan timestamps staging");
+
+    if (scan_ok) {
+      gpu_scan_buffer_.packet_count = 0;
+      gpu_scan_buffer_.max_packets = MAX_PACKETS_PER_SCAN;
+      gpu_scan_buffer_.d_points = d_points_;
+      gpu_scan_buffer_.d_count = d_count_;
+      NEBULA_LOG_STREAM(logger_->info, "GPU scan batching enabled");
+    }
+
+    // Build and upload angle correction lookup table
+    std::vector<cuda::CudaAngleCorrectionData> angle_lut;
+    angle_lut.reserve(cuda_n_azimuths_ * n_channels);
+
+    NEBULA_LOG_STREAM(
+      logger_->info, "Building CUDA angle LUT: azimuth_scale="
+                       << azimuth_scale_ << " sensor_max_azimuth=" << sensor_max_azimuth_
+                       << " cuda_n_azimuths=" << cuda_n_azimuths_);
+
+    for (uint32_t lut_idx = 0; lut_idx < cuda_n_azimuths_; ++lut_idx) {
+      uint32_t sensor_azimuth = lut_idx * azimuth_scale_;
+      for (uint32_t channel = 0; channel < n_channels; ++channel) {
+        CorrectedAngleData cpu_data =
+          angle_corrector_.get_corrected_angle_data(sensor_azimuth, channel);
+        cuda::CudaAngleCorrectionData gpu_data;
+        gpu_data.azimuth_rad = cpu_data.azimuth_rad;
+        gpu_data.elevation_rad = cpu_data.elevation_rad;
+        gpu_data.sin_azimuth = cpu_data.sin_azimuth;
+        gpu_data.cos_azimuth = cpu_data.cos_azimuth;
+        gpu_data.sin_elevation = cpu_data.sin_elevation;
+        gpu_data.cos_elevation = cpu_data.cos_elevation;
+        angle_lut.push_back(gpu_data);
+      }
+    }
+
+    cuda_decoder_->upload_angle_corrections(angle_lut, cuda_n_azimuths_, n_channels);
+
+    // Compute cached raw angle values for GPU config
+    if constexpr (SensorT::uses_calibration_based_angles) {
+      is_multi_frame_sensor_ = false;
+      auto [emit_raw, reset_raw, fov_start_raw, fov_end_raw] =
+        angle_corrector_.get_cuda_raw_angles(
+          sensor_configuration_->cloud_min_angle, sensor_configuration_->cloud_max_angle,
+          sensor_configuration_->cut_angle);
+      cuda_emit_angle_raw_ = emit_raw;
+      cuda_timestamp_reset_angle_raw_ = reset_raw;
+    } else {
+      is_multi_frame_sensor_ = true;
+      size_t n_frames = angle_corrector_.get_n_frames();
+      NEBULA_LOG_STREAM(
+        logger_->info,
+        "CUDA: Detected multi-frame sensor with " << n_frames << " frames");
+    }
+
+    cuda_enabled_ = true;
+    NEBULA_LOG_STREAM(
+      logger_->info, "CUDA decoder initialized successfully with "
+                       << n_channels << " channels and " << cuda_n_azimuths_
+                       << " azimuth divisions"
+                       << (is_multi_frame_sensor_ ? " (multi-frame)" : ""));
+  }
+#endif  // NEBULA_CUDA_ENABLED
+
   /// @brief Converts a group of returns (i.e. 1 for single return, 2 for dual return, etc.) to
   /// points and appends them to the point cloud
   /// @param start_block_id The first block in the group of returns
@@ -280,6 +895,12 @@ class HesaiDecoder : public HesaiScanDecoder
   {
     did_scan_complete_ = true;
 
+#ifdef NEBULA_CUDA_ENABLED
+    if (cuda_enabled_ && use_scan_batching_ && gpu_scan_buffer_.packet_count > 0) {
+      flush_gpu_scan_buffer(buffer_index);
+    }
+#endif
+
     auto & completed_frame = frame_buffers_[buffer_index];
     constexpr uint64_t nanoseconds_per_second = 1'000'000'000ULL;
     double scan_timestamp_s =
@@ -342,7 +963,41 @@ class HesaiDecoder : public HesaiScanDecoder
         SensorT::peak_resolution_mdeg.azimuth, SensorT::packet_t::n_channels,
         logger_->child("Downsample Mask"), true, sensor_.get_dither_transform());
     }
+
+#ifdef NEBULA_CUDA_ENABLED
+    initialize_cuda();
+#endif
+  }
+
+#ifdef NEBULA_CUDA_ENABLED
+  /// @brief Cleanup CUDA resources
+  ~HesaiDecoder()
+  {
+    gpu_scan_buffer_.cleanup();
+
+    if (d_points_) {
+      cudaFree(d_points_);
+    }
+    if (d_count_) {
+      cudaFree(d_count_);
+    }
+    if (d_distances_) {
+      cudaFree(d_distances_);
+    }
+    if (d_reflectivities_) {
+      cudaFree(d_reflectivities_);
+    }
+    if (h_pinned_distances_) {
+      cudaFreeHost(h_pinned_distances_);
+    }
+    if (h_pinned_reflectivities_) {
+      cudaFreeHost(h_pinned_reflectivities_);
+    }
+    if (cuda_stream_) {
+      cudaStreamDestroy(cuda_stream_);
+    }
   }
+#endif  // NEBULA_CUDA_ENABLED
 
   void set_pointcloud_callback(pointcloud_callback_t callback) override
   {
@@ -383,8 +1038,19 @@ class HesaiDecoder : public HesaiScanDecoder
       const auto & scan_state = scan_cutter_.step(channel_azimuths_out);
 
       if (scan_state.does_block_intersect_fov()) {
-        convert_returns(block_id, n_returns, scan_state);
+#ifdef NEBULA_CUDA_ENABLED
+        if (cuda_enabled_ && use_scan_batching_) {
+          accumulate_packet_to_gpu_buffer(block_id, n_returns);
+        } else
+#endif
+        {
+          convert_returns(block_id, n_returns, scan_state);
+        }
       }
+
+#ifdef NEBULA_CUDA_ENABLED
+      last_azimuth_ = block_azimuth;
+#endif
     }
 
     uint64_t decode_duration_ns = decode_watch.elapsed_ns();
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_sensor.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_sensor.hpp
index c4c990363..87d6510f8 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_sensor.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_sensor.hpp
@@ -94,6 +94,10 @@ class HesaiSensor
     AngleCorrectorCalibrationBased<PacketT::n_channels, PacketT::degree_subdivisions>,
     AngleCorrectorCorrectionBased<PacketT::n_channels, PacketT::degree_subdivisions>>::type;
 
+  /// @brief Whether this sensor uses calibration-based angle correction
+  static constexpr bool uses_calibration_based_angles =
+    (AngleCorrection == AngleCorrectionType::CALIBRATION);
+
   HesaiSensor() = default;
   virtual ~HesaiSensor() = default;
 
diff --git a/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu b/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu
new file mode 100644
index 000000000..8fb0efec6
--- /dev/null
+++ b/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu
@@ -0,0 +1,398 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp"
+
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstring>
+
+namespace nebula::drivers::cuda
+{
+
+// Device function: Check if angle is between start and end (handles wrap-around)
+__device__ __forceinline__ bool cuda_angle_is_between(float start, float end, float angle)
+{
+  if (start <= end) {
+    return (start <= angle && angle <= end);
+  } else {
+    return (angle <= end || start <= angle);
+  }
+}
+
+// Device function: Check if angle is between start and end (uint32 version for raw azimuths)
+__device__ __forceinline__ bool cuda_angle_is_between_raw(
+  uint32_t start, uint32_t end, uint32_t angle, uint32_t max_angle)
+{
+  start = start % max_angle;
+  end = end % max_angle;
+  angle = angle % max_angle;
+
+  if (start <= end) {
+    return (start <= angle && angle <= end);
+  } else {
+    return (angle <= end || start <= angle);
+  }
+}
+
+// Device function: Check if we're inside the overlap region (single-frame version)
+__device__ __forceinline__ bool cuda_is_inside_overlap(
+  uint32_t last_azimuth, uint32_t current_azimuth,
+  uint32_t timestamp_reset_angle, uint32_t emit_angle, uint32_t max_angle)
+{
+  return cuda_angle_is_between_raw(timestamp_reset_angle, emit_angle, current_azimuth, max_angle) ||
+         cuda_angle_is_between_raw(timestamp_reset_angle, emit_angle, last_azimuth, max_angle);
+}
+
+// Device function: Check if we're inside the overlap region for multi-frame sensors (AT128)
+__device__ __forceinline__ bool cuda_is_inside_overlap_multiframe(
+  uint32_t last_azimuth, uint32_t current_azimuth,
+  const CudaFrameAngleInfo* frame_angles, uint32_t n_frames, uint32_t max_angle)
+{
+  for (uint32_t i = 0; i < n_frames; ++i) {
+    if (cuda_angle_is_between_raw(frame_angles[i].timestamp_reset, frame_angles[i].scan_emit,
+                                  current_azimuth, max_angle) ||
+        cuda_angle_is_between_raw(frame_angles[i].timestamp_reset, frame_angles[i].scan_emit,
+                                  last_azimuth, max_angle)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// CUDA kernel for decoding a single Hesai LiDAR packet
+__global__ void decode_hesai_packet_kernel(
+  const uint16_t * __restrict__ distances,
+  const uint8_t * __restrict__ reflectivities,
+  const CudaAngleCorrectionData * __restrict__ angle_lut,
+  const CudaDecoderConfig config,
+  CudaNebulaPoint * __restrict__ output_points,
+  uint32_t * __restrict__ output_count,
+  uint32_t n_azimuths,
+  uint32_t raw_azimuth)
+{
+  const uint32_t channel_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const uint32_t block_id = blockIdx.y;
+
+  if (channel_id >= config.n_channels || block_id >= config.n_blocks) {
+    return;
+  }
+
+  const uint32_t data_stride = config.data_stride > 0 ? config.data_stride : config.n_blocks;
+  const uint32_t data_idx = channel_id * data_stride + block_id;
+
+  const uint16_t raw_distance = distances[data_idx];
+  const uint8_t reflectivity = reflectivities[data_idx];
+
+  if (raw_distance == 0) {
+    return;
+  }
+
+  const float distance = static_cast<float>(raw_distance) * config.dis_unit;
+
+  if (distance < config.min_range || distance > config.max_range) {
+    return;
+  }
+
+  if (distance < config.sensor_min_range || distance > config.sensor_max_range) {
+    return;
+  }
+
+  const uint32_t azimuth_idx = (raw_azimuth / config.azimuth_scale) % n_azimuths;
+  const uint32_t lut_idx = azimuth_idx * config.n_channels + channel_id;
+  const CudaAngleCorrectionData angle_data = angle_lut[lut_idx];
+
+  const float xy_distance = distance * angle_data.cos_elevation;
+  const float x = xy_distance * angle_data.sin_azimuth;
+  const float y = xy_distance * angle_data.cos_azimuth;
+  const float z = distance * angle_data.sin_elevation;
+
+  const uint32_t output_idx = atomicAdd(output_count, 1);
+
+  CudaNebulaPoint & out_pt = output_points[output_idx];
+  out_pt.x = x;
+  out_pt.y = y;
+  out_pt.z = z;
+  out_pt.distance = distance;
+  out_pt.azimuth = angle_data.azimuth_rad;
+  out_pt.elevation = angle_data.elevation_rad;
+  out_pt.intensity = static_cast<float>(reflectivity);
+  out_pt.return_type = static_cast<uint8_t>(block_id);
+  out_pt.channel = static_cast<uint16_t>(channel_id);
+  out_pt.entry_id = config.entry_id;
+}
+
+/// @brief Batched kernel for processing an entire scan in one launch
+__global__ void decode_hesai_scan_batch_kernel(
+  const uint16_t * __restrict__ d_distances_batch,
+  const uint8_t * __restrict__ d_reflectivities_batch,
+  const uint32_t * __restrict__ d_raw_azimuths,
+  const uint32_t * __restrict__ d_n_returns,
+  const uint32_t * __restrict__ d_last_azimuths,
+  const CudaAngleCorrectionData * __restrict__ angle_lut,
+  const CudaDecoderConfig config,
+  CudaNebulaPoint * __restrict__ output_points,
+  uint32_t * __restrict__ output_count,
+  uint32_t n_azimuths,
+  uint32_t n_packets)
+{
+  const uint32_t global_tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  const uint32_t total_work = n_packets * config.n_channels * config.max_returns;
+  if (global_tid >= total_work) return;
+
+  const uint32_t packet_id = global_tid / (config.n_channels * config.max_returns);
+  const uint32_t channel_id = (global_tid / config.max_returns) % config.n_channels;
+  const uint32_t return_id = global_tid % config.max_returns;
+
+  if (return_id >= d_n_returns[packet_id]) return;
+
+  const uint32_t data_idx = packet_id * (config.n_channels * config.max_returns)
+                           + channel_id * config.max_returns + return_id;
+
+  const uint16_t raw_distance = d_distances_batch[data_idx];
+  const uint8_t reflectivity = d_reflectivities_batch[data_idx];
+
+  if (raw_distance == 0) return;
+
+  const float distance = static_cast<float>(raw_distance) * config.dis_unit;
+
+  if (distance < config.min_range || distance > config.max_range) return;
+  if (distance < config.sensor_min_range || distance > config.sensor_max_range) return;
+
+  const uint32_t raw_azimuth = d_raw_azimuths[packet_id];
+  const uint32_t azimuth_idx = (raw_azimuth / config.azimuth_scale) % n_azimuths;
+  const uint32_t lut_idx = azimuth_idx * config.n_channels + channel_id;
+  const CudaAngleCorrectionData angle_data = angle_lut[lut_idx];
+
+  // FOV filtering
+  const bool in_fov = cuda_angle_is_between(config.fov_min_rad, config.fov_max_rad,
+                                            angle_data.azimuth_rad);
+  if (!in_fov) return;
+
+  // Overlap/scan assignment
+  const uint32_t last_azimuth = d_last_azimuths[packet_id];
+  uint8_t in_current_scan = 1;
+
+  bool is_in_overlap = false;
+  if (config.is_multi_frame) {
+    is_in_overlap = cuda_is_inside_overlap_multiframe(
+      last_azimuth, raw_azimuth, config.frame_angles, config.n_frames, config.n_azimuths_raw);
+  } else {
+    is_in_overlap = cuda_is_inside_overlap(
+      last_azimuth, raw_azimuth, config.timestamp_reset_angle_raw, config.emit_angle_raw,
+      config.n_azimuths_raw);
+  }
+
+  if (is_in_overlap) {
+    constexpr float overlap_margin_rad = 0.349066f;  // 20 degrees
+    const float overlap_end = config.scan_emit_angle_rad + overlap_margin_rad;
+    if (cuda_angle_is_between(config.scan_emit_angle_rad, overlap_end, angle_data.azimuth_rad)) {
+      in_current_scan = 0;
+    }
+  }
+
+  // Dual-return filtering
+  const uint32_t n_returns = d_n_returns[packet_id];
+
+  if (return_id >= n_returns - 1) {
+    goto compute_coordinates;
+  }
+
+  {
+    const uint32_t group_base = packet_id * (config.n_channels * config.max_returns)
+                               + channel_id * config.max_returns;
+    const float threshold = config.dual_return_distance_threshold;
+
+    if (n_returns == 2) {
+      const uint32_t last_idx = group_base + 1;
+      const uint16_t last_raw_distance = d_distances_batch[last_idx];
+      const uint8_t last_reflectivity = d_reflectivities_batch[last_idx];
+
+      if (raw_distance == last_raw_distance && reflectivity == last_reflectivity) {
+        return;
+      }
+
+      const float last_distance = static_cast<float>(last_raw_distance) * config.dis_unit;
+      if (fabsf(distance - last_distance) < threshold) {
+        return;
+      }
+    } else {
+      for (uint32_t other_ret = 0; other_ret < n_returns; ++other_ret) {
+        if (other_ret == return_id) continue;
+
+        const uint16_t other_raw_distance = d_distances_batch[group_base + other_ret];
+        const uint8_t other_reflectivity = d_reflectivities_batch[group_base + other_ret];
+
+        if (raw_distance == other_raw_distance && reflectivity == other_reflectivity) {
+          return;
+        }
+
+        const float other_distance = static_cast<float>(other_raw_distance) * config.dis_unit;
+        if (fabsf(distance - other_distance) < threshold) {
+          return;
+        }
+      }
+    }
+  }
+
+compute_coordinates:
+
+  const float xy_distance = distance * angle_data.cos_elevation;
+  const float x = xy_distance * angle_data.sin_azimuth;
+  const float y = xy_distance * angle_data.cos_azimuth;
+  const float z = distance * angle_data.sin_elevation;
+
+  if (global_tid >= config.max_output_points) return;
+
+  CudaNebulaPoint & out_pt = output_points[global_tid];
+  out_pt.x = x;
+  out_pt.y = y;
+  out_pt.z = z;
+  out_pt.distance = distance;
+  out_pt.azimuth = angle_data.azimuth_rad;
+  out_pt.elevation = angle_data.elevation_rad;
+  out_pt.intensity = static_cast<float>(reflectivity);
+  out_pt.return_type = static_cast<uint8_t>(return_id);
+  out_pt.channel = static_cast<uint16_t>(channel_id);
+  out_pt.in_current_scan = in_current_scan;
+  out_pt.entry_id = packet_id;
+
+  atomicAdd(output_count, 1);
+}
+
+// Constructor
+HesaiCudaDecoder::HesaiCudaDecoder()
+: d_angle_lut_(nullptr),
+  n_azimuths_(0),
+  n_channels_(0),
+  initialized_(false)
+{
+}
+
+// Destructor
+HesaiCudaDecoder::~HesaiCudaDecoder()
+{
+  if (d_angle_lut_) {
+    cudaFree(d_angle_lut_);
+    d_angle_lut_ = nullptr;
+  }
+}
+
+bool HesaiCudaDecoder::initialize(size_t max_points, uint32_t n_channels)
+{
+  n_channels_ = n_channels;
+  initialized_ = true;
+  return true;
+}
+
+bool HesaiCudaDecoder::upload_angle_corrections(
+  const std::vector<CudaAngleCorrectionData> & angle_lut,
+  uint32_t n_azimuths,
+  uint32_t n_channels)
+{
+  if (angle_lut.size() != n_azimuths * n_channels) {
+    fprintf(stderr, "CUDA: Angle LUT size mismatch: %zu vs expected %u\n",
+            angle_lut.size(), n_azimuths * n_channels);
+    return false;
+  }
+
+  n_azimuths_ = n_azimuths;
+  n_channels_ = n_channels;
+
+  if (d_angle_lut_) {
+    cudaFree(d_angle_lut_);
+    d_angle_lut_ = nullptr;
+  }
+
+  const size_t lut_size = angle_lut.size() * sizeof(CudaAngleCorrectionData);
+  cudaError_t err = cudaMalloc(&d_angle_lut_, lut_size);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA: Failed to allocate angle LUT: %s\n", cudaGetErrorString(err));
+    return false;
+  }
+
+  err = cudaMemcpy(d_angle_lut_, angle_lut.data(), lut_size, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA: Failed to upload angle LUT: %s\n", cudaGetErrorString(err));
+    cudaFree(d_angle_lut_);
+    d_angle_lut_ = nullptr;
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace nebula::drivers::cuda
+
+// C-linkage wrapper for per-packet kernel
+extern "C" void launch_decode_hesai_packet(
+  const uint16_t * d_distances,
+  const uint8_t * d_reflectivities,
+  const nebula::drivers::cuda::CudaAngleCorrectionData * d_angle_lut,
+  const nebula::drivers::cuda::CudaDecoderConfig & config,
+  nebula::drivers::cuda::CudaNebulaPoint * d_points,
+  uint32_t * d_count,
+  uint32_t n_azimuths,
+  uint32_t raw_azimuth,
+  cudaStream_t stream)
+{
+  const uint32_t threads_per_block = 128;
+  const uint32_t n_blocks_x = (config.n_channels + threads_per_block - 1) / threads_per_block;
+  const uint32_t n_blocks_y = config.n_blocks;
+
+  dim3 grid(n_blocks_x, n_blocks_y);
+  dim3 block(threads_per_block);
+
+  nebula::drivers::cuda::decode_hesai_packet_kernel<<<grid, block, 0, stream>>>(
+    d_distances, d_reflectivities, d_angle_lut, config,
+    d_points, d_count, n_azimuths, raw_azimuth);
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
+  }
+}
+
+// C-linkage wrapper for batched kernel
+extern "C" void launch_decode_hesai_scan_batch(
+  const uint16_t * d_distances_batch,
+  const uint8_t * d_reflectivities_batch,
+  const uint32_t * d_raw_azimuths,
+  const uint32_t * d_n_returns,
+  const uint32_t * d_last_azimuths,
+  const nebula::drivers::cuda::CudaAngleCorrectionData * d_angle_lut,
+  const nebula::drivers::cuda::CudaDecoderConfig & config,
+  nebula::drivers::cuda::CudaNebulaPoint * d_points,
+  uint32_t * d_count,
+  uint32_t n_azimuths,
+  uint32_t n_packets,
+  cudaStream_t stream)
+{
+  const uint32_t total_work = n_packets * config.n_channels * config.max_returns;
+  const uint32_t threads_per_block = 256;
+  const uint32_t n_blocks = (total_work + threads_per_block - 1) / threads_per_block;
+
+  dim3 grid(n_blocks);
+  dim3 block(threads_per_block);
+
+  nebula::drivers::cuda::decode_hesai_scan_batch_kernel<<<grid, block, 0, stream>>>(
+    d_distances_batch, d_reflectivities_batch, d_raw_azimuths, d_n_returns, d_last_azimuths,
+    d_angle_lut, config, d_points, d_count, n_azimuths, n_packets);
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA batched kernel launch failed: %s\n", cudaGetErrorString(err));
+  }
+}

From 580316f08ffeb38ed5c8e43642807b3c737a3f99 Mon Sep 17 00:00:00 2001
From: Keita Morisaki <kmta1236@gmail.com>
Date: Thu, 19 Mar 2026 15:32:24 +0900
Subject: [PATCH 2/2] fix(hesai): address review comments from WIP PR

- Copyright year 2024 -> 2026 for new files
- Replace deprecated find_package(CUDA) with find_package(CUDAToolkit)
- Remove --expt-relaxed-constexpr flag (not needed)
- Remove unused per-packet kernel and launcher (dead code)
- Batch launcher returns bool; caller logs via NEBULA_LOG_STREAM
- Reorder CudaNebulaPoint fields for better memory packing
- Remove redundant is_multi_frame member; use n_frames > 1
- Make HesaiCudaDecoder destructor virtual
- Add int32_t range guarantee comment in angle corrector
---
 .../tests/hesai_cuda_decoder_test.cpp         |   2 +-
 .../nebula_hesai_decoders/CMakeLists.txt      |  34 +++---
 .../cuda/hesai_cuda_decoder.hpp               |  22 +---
 .../angle_corrector_calibration_based.hpp     |   4 +-
 .../decoders/hesai_decoder.hpp                |  24 ++---
 .../src/cuda/hesai_cuda_kernels.cu            | 102 +-----------------
 6 files changed, 33 insertions(+), 155 deletions(-)

diff --git a/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp b/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
index 879dab900..9a2385a71 100644
--- a/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
+++ b/src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2026 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt b/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
index 08a25132e..5da444fde 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
+++ b/src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
@@ -9,17 +9,12 @@ option(BUILD_CUDA "Build with CUDA support" ON)
 
 # CUDA Configuration
 if(BUILD_CUDA)
-  find_package(CUDA)
-  if(NOT ${CUDA_FOUND})
-    message(WARNING "CUDA not found, building without CUDA support")
+  find_package(CUDAToolkit QUIET)
+  if(NOT CUDAToolkit_FOUND)
+    message(WARNING "CUDA Toolkit not found, building without CUDA support")
     set(BUILD_CUDA OFF)
   else()
-    message(STATUS "CUDA found: ${CUDA_VERSION}")
-    message(STATUS "CUDA include dirs: ${CUDA_INCLUDE_DIRS}")
-
-    # CUDA compiler flags
-    list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
-    list(APPEND CUDA_NVCC_FLAGS "-diag-suppress 20012")
+    message(STATUS "CUDA Toolkit found: ${CUDAToolkit_VERSION}")
 
     # GPU architectures — set via -DCMAKE_CUDA_ARCHITECTURES on the command line,
     # or fall back to a default set covering Turing through Ada + PTX for forward
@@ -28,14 +23,6 @@ if(BUILD_CUDA)
     if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
       set(CMAKE_CUDA_ARCHITECTURES "75;86;87;89" CACHE STRING "CUDA architectures" FORCE)
     endif()
-    # Also emit PTX for the highest arch so the driver can JIT for newer GPUs
-    list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_89,code=compute_89")
-
-    # Extended lambda support
-    list(APPEND CUDA_NVCC_FLAGS "--extended-lambda")
-
-    # Optimization flags
-    list(APPEND CUDA_NVCC_FLAGS "-O3")
 
     # Check if CUDA kernel source exists
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/hesai_cuda_kernels.cu")
@@ -67,7 +54,7 @@ target_link_libraries(
 if(BUILD_CUDA AND CUDA_KERNELS_EXIST)
   enable_language(CUDA)
 
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 --expt-relaxed-constexpr")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -diag-suppress 20012")
 
   add_library(nebula_hesai_decoders_cuda SHARED
     src/cuda/hesai_cuda_kernels.cu
@@ -90,17 +77,20 @@ if(BUILD_CUDA AND CUDA_KERNELS_EXIST)
            $<INSTALL_INTERFACE:include>)
 
   target_link_libraries(nebula_hesai_decoders_cuda
-    PRIVATE cudart)
+    PRIVATE CUDA::cudart_static)
 
   target_compile_definitions(nebula_hesai_decoders_cuda PUBLIC NEBULA_CUDA_ENABLED)
   target_compile_definitions(nebula_hesai_decoders PUBLIC NEBULA_CUDA_ENABLED)
 
   # Add CUDA include directories to main library for header resolution
   target_include_directories(nebula_hesai_decoders
-    PUBLIC ${CUDA_INCLUDE_DIRS})
+    PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
 
-  # Link CUDA library to main library
-  target_link_libraries(nebula_hesai_decoders PUBLIC nebula_hesai_decoders_cuda ${CUDA_LIBRARIES})
+  # Link CUDA library to main library (CUDA::cudart is PRIVATE so it won't
+  # propagate the imported target to downstream packages that lack CUDAToolkit)
+  target_link_libraries(nebula_hesai_decoders
+    PUBLIC nebula_hesai_decoders_cuda
+    PRIVATE CUDA::cudart)
 
   install(TARGETS nebula_hesai_decoders_cuda EXPORT export_nebula_hesai_decoders_cuda)
   ament_export_targets(export_nebula_hesai_decoders_cuda)
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp
index 57363bee6..e7ff29163 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2026 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -32,8 +32,8 @@ struct CudaNebulaPoint
   float elevation;
   float intensity;
   uint8_t return_type;
-  uint16_t channel;
   uint8_t in_current_scan;  // 1 = belongs to current scan, 0 = belongs to output/next scan
+  uint16_t channel;
   uint32_t entry_id;        // Block group ID for batched processing (used for sorting & filtering)
 };
 
@@ -89,7 +89,6 @@ struct CudaDecoderConfig
   // Multi-frame support for sensors like AT128 (has 4 mirror frames)
   uint32_t n_frames;  // Number of frames (1 for single-frame, 4 for AT128)
   CudaFrameAngleInfo frame_angles[MAX_CUDA_FRAMES];  // Per-frame angle boundaries
-  bool is_multi_frame;  // True if sensor has multiple frames (uses frame_angles)
 };
 
 /// @brief Main CUDA decoder class for Hesai LiDAR
@@ -97,7 +96,7 @@ class HesaiCudaDecoder
 {
 public:
   HesaiCudaDecoder();
-  ~HesaiCudaDecoder();
+  virtual ~HesaiCudaDecoder();
 
   /// @brief Initialize decoder with maximum points and channels
   bool initialize(size_t max_points, uint32_t n_channels);
@@ -126,20 +125,9 @@ class HesaiCudaDecoder
 
 extern "C" {
 
-/// @brief Launch kernel to decode a single Hesai packet
-void launch_decode_hesai_packet(
-    const uint16_t* d_distances,
-    const uint8_t* d_reflectivities,
-    const nebula::drivers::cuda::CudaAngleCorrectionData* d_angle_lut,
-    const nebula::drivers::cuda::CudaDecoderConfig& config,
-    nebula::drivers::cuda::CudaNebulaPoint* d_points,
-    uint32_t* d_count,
-    uint32_t n_azimuths,
-    uint32_t raw_azimuth,
-    cudaStream_t stream);
-
 /// @brief Launch batched kernel to decode entire scan
-void launch_decode_hesai_scan_batch(
+/// @return true on success, false on CUDA error
+bool launch_decode_hesai_scan_batch(
     const uint16_t* d_distances_batch,
     const uint8_t* d_reflectivities_batch,
     const uint32_t* d_raw_azimuths,
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp
index 1ecb0b0d3..5db84d571 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/angle_corrector_calibration_based.hpp
@@ -155,7 +155,9 @@ class AngleCorrectorCalibrationBased
   [[nodiscard]] std::tuple<uint32_t, uint32_t, uint32_t, uint32_t>
   get_cuda_raw_angles(double fov_start_deg, double fov_end_deg, double cut_angle_deg) const
   {
-    // Find min/max azimuth offset in raw units
+    // Find min/max azimuth offset in raw units.
+    // All values are guaranteed to fit in int32_t: angles are in [0, 360) degrees and
+    // AngleUnit is at most 100 (0.01-degree resolution), so raw values stay within ±36000.
     auto round_away_from_zero = [](float value) -> int32_t {
       return (value < 0) ? static_cast<int32_t>(std::floor(value))
                          : static_cast<int32_t>(std::ceil(value));
diff --git a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp
index 184e2c605..7a35a66a9 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp
+++ b/src/nebula_hesai/nebula_hesai_decoders/include/nebula_hesai_decoders/decoders/hesai_decoder.hpp
@@ -26,19 +26,8 @@
 #ifdef NEBULA_CUDA_ENABLED
 #include "nebula_hesai_decoders/cuda/hesai_cuda_decoder.hpp"
 
-// C-linkage kernel launcher declarations
-extern "C" void launch_decode_hesai_packet(
-  const uint16_t * d_distances,
-  const uint8_t * d_reflectivities,
-  const nebula::drivers::cuda::CudaAngleCorrectionData * d_angle_lut,
-  const nebula::drivers::cuda::CudaDecoderConfig & config,
-  nebula::drivers::cuda::CudaNebulaPoint * d_points,
-  uint32_t * d_count,
-  uint32_t n_azimuths,
-  uint32_t raw_azimuth,
-  cudaStream_t stream);
-
-extern "C" void launch_decode_hesai_scan_batch(
+// C-linkage kernel launcher declaration (defined in hesai_cuda_kernels.cu)
+extern "C" bool launch_decode_hesai_scan_batch(
   const uint16_t * d_distances_batch,
   const uint8_t * d_reflectivities_batch,
   const uint32_t * d_raw_azimuths,
@@ -328,7 +317,7 @@ class HesaiDecoder : public HesaiScanDecoder
     config.max_output_points = n_entries * n_channels * max_returns;
 
     if constexpr (SensorT::uses_calibration_based_angles) {
-      config.is_multi_frame = false;
+      // Single-frame sensor (calibration-based)
       config.n_frames = 1;
       config.timestamp_reset_angle_raw = cuda_timestamp_reset_angle_raw_;
       config.emit_angle_raw = cuda_emit_angle_raw_;
@@ -337,7 +326,7 @@ class HesaiDecoder : public HesaiScanDecoder
       config.frame_angles[0].timestamp_reset = cuda_timestamp_reset_angle_raw_;
       config.frame_angles[0].scan_emit = cuda_emit_angle_raw_;
     } else {
-      config.is_multi_frame = true;
+      // Multi-frame sensor (correction-based, e.g. AT128)
       config.n_frames = static_cast<uint32_t>(angle_corrector_.get_n_frames());
       config.timestamp_reset_angle_raw = 0;
       config.emit_angle_raw = 0;
@@ -465,11 +454,14 @@ class HesaiDecoder : public HesaiScanDecoder
       d_points_, 0, sparse_buffer_size * sizeof(cuda::CudaNebulaPoint), cuda_stream_);
 
     // Launch batched kernel
-    launch_decode_hesai_scan_batch(
+    bool kernel_ok = launch_decode_hesai_scan_batch(
       gpu_scan_buffer_.d_distances_batch, gpu_scan_buffer_.d_reflectivities_batch,
       gpu_scan_buffer_.d_raw_azimuths, gpu_scan_buffer_.d_n_returns,
       gpu_scan_buffer_.d_last_azimuths, cuda_decoder_->get_angle_lut(), config, d_points_,
       d_count_, cuda_n_azimuths_, n_entries, cuda_stream_);
+    if (!kernel_ok) {
+      NEBULA_LOG_STREAM(logger_->error, "CUDA batched kernel launch failed");
+    }
 
     cudaStreamSynchronize(cuda_stream_);
 
diff --git a/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu b/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu
index 8fb0efec6..7d19b0678 100644
--- a/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu
+++ b/src/nebula_hesai/nebula_hesai_decoders/src/cuda/hesai_cuda_kernels.cu
@@ -1,4 +1,4 @@
-// Copyright 2024 TIER IV, Inc.
+// Copyright 2026 TIER IV, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -71,68 +71,6 @@ __device__ __forceinline__ bool cuda_is_inside_overlap_multiframe(
   return false;
 }
 
-// CUDA kernel for decoding a single Hesai LiDAR packet
-__global__ void decode_hesai_packet_kernel(
-  const uint16_t * __restrict__ distances,
-  const uint8_t * __restrict__ reflectivities,
-  const CudaAngleCorrectionData * __restrict__ angle_lut,
-  const CudaDecoderConfig config,
-  CudaNebulaPoint * __restrict__ output_points,
-  uint32_t * __restrict__ output_count,
-  uint32_t n_azimuths,
-  uint32_t raw_azimuth)
-{
-  const uint32_t channel_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const uint32_t block_id = blockIdx.y;
-
-  if (channel_id >= config.n_channels || block_id >= config.n_blocks) {
-    return;
-  }
-
-  const uint32_t data_stride = config.data_stride > 0 ? config.data_stride : config.n_blocks;
-  const uint32_t data_idx = channel_id * data_stride + block_id;
-
-  const uint16_t raw_distance = distances[data_idx];
-  const uint8_t reflectivity = reflectivities[data_idx];
-
-  if (raw_distance == 0) {
-    return;
-  }
-
-  const float distance = static_cast<float>(raw_distance) * config.dis_unit;
-
-  if (distance < config.min_range || distance > config.max_range) {
-    return;
-  }
-
-  if (distance < config.sensor_min_range || distance > config.sensor_max_range) {
-    return;
-  }
-
-  const uint32_t azimuth_idx = (raw_azimuth / config.azimuth_scale) % n_azimuths;
-  const uint32_t lut_idx = azimuth_idx * config.n_channels + channel_id;
-  const CudaAngleCorrectionData angle_data = angle_lut[lut_idx];
-
-  const float xy_distance = distance * angle_data.cos_elevation;
-  const float x = xy_distance * angle_data.sin_azimuth;
-  const float y = xy_distance * angle_data.cos_azimuth;
-  const float z = distance * angle_data.sin_elevation;
-
-  const uint32_t output_idx = atomicAdd(output_count, 1);
-
-  CudaNebulaPoint & out_pt = output_points[output_idx];
-  out_pt.x = x;
-  out_pt.y = y;
-  out_pt.z = z;
-  out_pt.distance = distance;
-  out_pt.azimuth = angle_data.azimuth_rad;
-  out_pt.elevation = angle_data.elevation_rad;
-  out_pt.intensity = static_cast<float>(reflectivity);
-  out_pt.return_type = static_cast<uint8_t>(block_id);
-  out_pt.channel = static_cast<uint16_t>(channel_id);
-  out_pt.entry_id = config.entry_id;
-}
-
 /// @brief Batched kernel for processing an entire scan in one launch
 __global__ void decode_hesai_scan_batch_kernel(
   const uint16_t * __restrict__ d_distances_batch,
@@ -186,7 +124,7 @@ __global__ void decode_hesai_scan_batch_kernel(
   uint8_t in_current_scan = 1;
 
   bool is_in_overlap = false;
-  if (config.is_multi_frame) {
+  if (config.n_frames > 1) {
     is_in_overlap = cuda_is_inside_overlap_multiframe(
       last_azimuth, raw_azimuth, config.frame_angles, config.n_frames, config.n_azimuths_raw);
   } else {
@@ -336,37 +274,8 @@ bool HesaiCudaDecoder::upload_angle_corrections(
 
 }  // namespace nebula::drivers::cuda
 
-// C-linkage wrapper for per-packet kernel
-extern "C" void launch_decode_hesai_packet(
-  const uint16_t * d_distances,
-  const uint8_t * d_reflectivities,
-  const nebula::drivers::cuda::CudaAngleCorrectionData * d_angle_lut,
-  const nebula::drivers::cuda::CudaDecoderConfig & config,
-  nebula::drivers::cuda::CudaNebulaPoint * d_points,
-  uint32_t * d_count,
-  uint32_t n_azimuths,
-  uint32_t raw_azimuth,
-  cudaStream_t stream)
-{
-  const uint32_t threads_per_block = 128;
-  const uint32_t n_blocks_x = (config.n_channels + threads_per_block - 1) / threads_per_block;
-  const uint32_t n_blocks_y = config.n_blocks;
-
-  dim3 grid(n_blocks_x, n_blocks_y);
-  dim3 block(threads_per_block);
-
-  nebula::drivers::cuda::decode_hesai_packet_kernel<<<grid, block, 0, stream>>>(
-    d_distances, d_reflectivities, d_angle_lut, config,
-    d_points, d_count, n_azimuths, raw_azimuth);
-
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    fprintf(stderr, "CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-  }
-}
-
 // C-linkage wrapper for batched kernel
-extern "C" void launch_decode_hesai_scan_batch(
+extern "C" bool launch_decode_hesai_scan_batch(
   const uint16_t * d_distances_batch,
   const uint8_t * d_reflectivities_batch,
   const uint32_t * d_raw_azimuths,
@@ -391,8 +300,5 @@ extern "C" void launch_decode_hesai_scan_batch(
     d_distances_batch, d_reflectivities_batch, d_raw_azimuths, d_n_returns, d_last_azimuths,
     d_angle_lut, config, d_points, d_count, n_azimuths, n_packets);
 
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    fprintf(stderr, "CUDA batched kernel launch failed: %s\n", cudaGetErrorString(err));
-  }
+  return cudaGetLastError() == cudaSuccess;
 }