Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/nebula_hesai/nebula_hesai/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,19 @@ if(BUILD_TESTING)
nebula_hesai_decoders::nebula_hesai_decoders
diagnostic_updater::diagnostic_updater
pcl_io)

# CUDA decoder equivalence test (only when CUDA support is available)
ament_add_gtest(hesai_cuda_decoder_test_main
tests/hesai_cuda_decoder_test.cpp)
target_include_directories(hesai_cuda_decoder_test_main PRIVATE tests)
target_link_libraries(
hesai_cuda_decoder_test_main
hesai_ros_decoder_test
nebula_core_common::nebula_core_common
nebula_hesai_common::nebula_hesai_common
nebula_hesai_decoders::nebula_hesai_decoders
diagnostic_updater::diagnostic_updater
pcl_io)
endif()

if(BUILD_EXAMPLES)
Expand Down
278 changes: 278 additions & 0 deletions src/nebula_hesai/nebula_hesai/tests/hesai_cuda_decoder_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
// Copyright 2026 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/// @file hesai_cuda_decoder_test.cpp
/// @brief GPU-vs-CPU equivalence tests for the Hesai CUDA decoder (OT128 / Pandar128E4X).
///
/// Decodes the same rosbag with the CPU path (NEBULA_USE_CUDA unset) and the GPU path
/// (NEBULA_USE_CUDA=1), then compares the resulting point clouds with tolerances.
/// The GPU path differs at scan boundaries due to different overlap/FOV detection logic
/// in the GPU kernel vs the CPU's ScanCutter.
///
/// Tolerances were derived from a single OT128 rosbag (ot128/1730271167765338806):
/// - Scan 0: CPU=70275, GPU=72125 (diff=1850)
/// - Scan 1: CPU=72240, GPU=72185 (diff=55)
/// If additional rosbags show larger boundary diffs, tolerances may need adjustment.

#include "hesai_ros_decoder_test.hpp"

#include <rclcpp/rclcpp.hpp>

#include <gtest/gtest.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

#ifdef NEBULA_CUDA_ENABLED

namespace nebula::test
{

// OT128 config — matches TEST_CONFIGS[8] in hesai_ros_decoder_test_main.cpp
static const nebula::ros::HesaiRosDecoderTestParams OT128_CONFIG = {
"Pandar128E4X", "LastStrongest", "Pandar128E4X.csv", "ot128/1730271167765338806",
"hesai", 0, 0.0, 0., 360., 0.3f, 300.f};

// Maximum allowed difference in point count between GPU and CPU per scan.
// The GPU kernel assigns scan-boundary points differently than the CPU's ScanCutter,
// causing up to ~2000 points to shift between adjacent scans.
// Observed max diff: 1850 (scan 0 of test rosbag). Set to 2000 for ~8% headroom.
static constexpr int kMaxPointCountDiff = 2000;
// Coordinate tolerance (metres) for nearest-neighbour matching.
// GPU uses pre-computed LUT with 0.01-degree resolution; small rounding differences expected.
static constexpr float kXyzTolerance = 5e-3f;
// Fraction of GPU points that must have a CPU match within tolerance.
// ~2-3% of points are at scan boundaries and may not have an exact match.
static constexpr double kMinMatchRatio = 0.97;

/// Decoded scan: message timestamp + point cloud
struct DecodedScan
{
uint64_t msg_timestamp;
nebula::drivers::NebulaPointCloudPtr cloud;
};

/// Decode a rosbag with the given CUDA env setting.
/// When @p use_cuda is true, sets NEBULA_USE_CUDA=1; otherwise unsets it.
/// Returns one DecodedScan per completed scan.
static std::vector<DecodedScan> decode_bag(
const nebula::ros::HesaiRosDecoderTestParams & params, bool use_cuda)
{
// Toggle GPU/CPU path via environment variable
if (use_cuda) {
setenv("NEBULA_USE_CUDA", "1", 1);
} else {
unsetenv("NEBULA_USE_CUDA");
}

rclcpp::NodeOptions options;
auto driver = std::make_shared<nebula::ros::HesaiRosDecoderTest>(
options, "cuda_test_node", params);
EXPECT_EQ(driver->get_status(), nebula::Status::OK);

std::vector<DecodedScan> scans;
auto cb = [&](uint64_t msg_ts, uint64_t /*scan_ts*/,
nebula::drivers::NebulaPointCloudPtr cloud) {
if (cloud && !cloud->empty()) {
// Deep-copy: the decoder reuses its internal buffer after the callback returns
auto copy = std::make_shared<nebula::drivers::NebulaPointCloud>(*cloud);
scans.push_back({msg_ts, copy});
}
};
driver->read_bag(cb);
return scans;
}

/// Squared Euclidean distance between two points
static float sq_dist(
const nebula::drivers::NebulaPoint & a, const nebula::drivers::NebulaPoint & b)
{
float dx = a.x - b.x;
float dy = a.y - b.y;
float dz = a.z - b.z;
return dx * dx + dy * dy + dz * dz;
}

// ---------------------------------------------------------------------------
// Test 1: GPU vs CPU equivalence
// ---------------------------------------------------------------------------
TEST(HesaiCudaDecoderTest, OT128_GpuVsCpuEquivalence)
{
auto cpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/false);
auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);

// Both paths must produce the same number of scans
ASSERT_GT(cpu_scans.size(), 0u);
ASSERT_EQ(cpu_scans.size(), gpu_scans.size())
<< "CPU produced " << cpu_scans.size() << " scans, GPU produced " << gpu_scans.size();

const float tol_sq = kXyzTolerance * kXyzTolerance;

for (size_t i = 0; i < cpu_scans.size(); ++i) {
const auto & cpu_cloud = cpu_scans[i].cloud;
const auto & gpu_cloud = gpu_scans[i].cloud;

// Point counts within tolerance
int diff = static_cast<int>(cpu_cloud->size()) - static_cast<int>(gpu_cloud->size());
EXPECT_LE(std::abs(diff), kMaxPointCountDiff)
<< "Scan " << i << ": CPU=" << cpu_cloud->size() << " GPU=" << gpu_cloud->size();

// For each GPU point, find nearest CPU point (brute-force — clouds are small enough)
size_t matched = 0;
for (const auto & gp : gpu_cloud->points) {
float best = std::numeric_limits<float>::max();
for (const auto & cp : cpu_cloud->points) {
float d = sq_dist(gp, cp);
if (d < best) best = d;
if (d < tol_sq) break; // early exit — found a match
}
if (best < tol_sq) ++matched;
}

double match_ratio =
gpu_cloud->empty() ? 1.0 : static_cast<double>(matched) / gpu_cloud->size();
EXPECT_GE(match_ratio, kMinMatchRatio)
<< "Scan " << i << ": only " << matched << "/" << gpu_cloud->size()
<< " GPU points matched a CPU point within " << kXyzTolerance << " m";
}
}

// ---------------------------------------------------------------------------
// Test 2: GPU output is non-empty
// ---------------------------------------------------------------------------
TEST(HesaiCudaDecoderTest, OT128_GpuOutputNonEmpty)
{
auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
ASSERT_GT(gpu_scans.size(), 0u) << "GPU path produced zero scans";
for (size_t i = 0; i < gpu_scans.size(); ++i) {
EXPECT_GT(gpu_scans[i].cloud->size(), 0u) << "Scan " << i << " is empty";
}
}

// ---------------------------------------------------------------------------
// Test 3: Basic field validity of GPU-decoded points
// ---------------------------------------------------------------------------
TEST(HesaiCudaDecoderTest, OT128_GpuFieldValidity)
{
auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
ASSERT_GT(gpu_scans.size(), 0u);

for (size_t i = 0; i < gpu_scans.size(); ++i) {
for (const auto & pt : gpu_scans[i].cloud->points) {
// Distance must be positive (all valid points)
float dist = std::sqrt(pt.x * pt.x + pt.y * pt.y + pt.z * pt.z);
EXPECT_GT(dist, 0.f) << "Scan " << i << ": zero-distance point found";

// Channel must be in [0, 127] for OT128
EXPECT_LE(pt.channel, 127u) << "Scan " << i << ": invalid channel " << pt.channel;
}
}
}

// ---------------------------------------------------------------------------
// Test 4 (edge case): First and last scans have reasonable point counts
// ---------------------------------------------------------------------------
TEST(HesaiCudaDecoderTest, OT128_BoundaryScanPointCounts)
{
auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);
ASSERT_GE(gpu_scans.size(), 2u) << "Need at least 2 scans for boundary test";

// First scan may be partial, but should still have a reasonable number of points.
// A full OT128 scan typically has ~100k+ points; even a partial scan should exceed 1000.
EXPECT_GT(gpu_scans.front().cloud->size(), 1000u)
<< "First scan has suspiciously few points: " << gpu_scans.front().cloud->size();

// Last scan should also be non-trivial
EXPECT_GT(gpu_scans.back().cloud->size(), 1000u)
<< "Last scan has suspiciously few points: " << gpu_scans.back().cloud->size();
}

// ---------------------------------------------------------------------------
// Test 5 (edge case): GPU intensity matches CPU exactly
// ---------------------------------------------------------------------------
TEST(HesaiCudaDecoderTest, OT128_IntensityExactMatch)
{
auto cpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/false);
auto gpu_scans = decode_bag(OT128_CONFIG, /*use_cuda=*/true);

ASSERT_EQ(cpu_scans.size(), gpu_scans.size());

for (size_t i = 0; i < cpu_scans.size(); ++i) {
const auto & cpu_pts = cpu_scans[i].cloud->points;
const auto & gpu_pts = gpu_scans[i].cloud->points;

// For each GPU point, find the nearest CPU match and verify intensity is identical.
// Only compare points that have a close spatial match (same physical point).
size_t checked = 0;
size_t intensity_mismatch = 0;

// Use a tighter tolerance for intensity matching to ensure we're comparing the same point
const float tight_tol_sq = 1e-6f;

for (const auto & gp : gpu_pts) {
float best_dist = std::numeric_limits<float>::max();
size_t best_idx = 0;
for (size_t j = 0; j < cpu_pts.size(); ++j) {
float d = sq_dist(gp, cpu_pts[j]);
if (d < best_dist) {
best_dist = d;
best_idx = j;
}
if (d < tight_tol_sq) break;
}
if (best_dist < tight_tol_sq) {
++checked;
if (gp.intensity != cpu_pts[best_idx].intensity) {
++intensity_mismatch;
}
}
}

// The vast majority of matched points should have identical intensity.
// A small fraction (<1%) may differ at scan boundaries where the GPU kernel
// selects a different return than the CPU's dual-return filtering.
EXPECT_GT(checked, 0u) << "Scan " << i << ": no tight matches found";
double mismatch_ratio =
checked > 0 ? static_cast<double>(intensity_mismatch) / checked : 0.0;
EXPECT_LT(mismatch_ratio, 0.01)
<< "Scan " << i << ": " << intensity_mismatch << "/" << checked
<< " matched points have different intensity (" << (mismatch_ratio * 100) << "%)";
}
}

} // namespace nebula::test

#else // !NEBULA_CUDA_ENABLED

TEST(HesaiCudaDecoderTest, SkippedNoCuda)
{
GTEST_SKIP() << "CUDA not enabled at compile time";
}

#endif // NEBULA_CUDA_ENABLED

int main(int argc, char * argv[])
{
rclcpp::init(argc, argv);
::testing::InitGoogleTest(&argc, argv);
int result = RUN_ALL_TESTS();
// Restore env
unsetenv("NEBULA_USE_CUDA");
rclcpp::shutdown();
return result;
}
80 changes: 79 additions & 1 deletion src/nebula_hesai/nebula_hesai_decoders/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,39 @@ project(nebula_hesai_decoders)
find_package(autoware_cmake REQUIRED)
autoware_package()

# Hesai Decoder
# Option to enable CUDA support
option(BUILD_CUDA "Build with CUDA support" ON)

# CUDA Configuration
if(BUILD_CUDA)
find_package(CUDAToolkit QUIET)
if(NOT CUDAToolkit_FOUND)
message(WARNING "CUDA Toolkit not found, building without CUDA support")
set(BUILD_CUDA OFF)
else()
message(STATUS "CUDA Toolkit found: ${CUDAToolkit_VERSION}")

# GPU architectures — set via -DCMAKE_CUDA_ARCHITECTURES on the command line,
# or fall back to a default set covering Turing through Ada + PTX for forward
# compatibility (e.g. Blackwell GPUs JIT from compute_89 PTX).
# Example: -DCMAKE_CUDA_ARCHITECTURES=89 (single-arch for fast builds)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
set(CMAKE_CUDA_ARCHITECTURES "75;86;87;89" CACHE STRING "CUDA architectures" FORCE)
endif()

# Check if CUDA kernel source exists
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/hesai_cuda_kernels.cu")
set(CUDA_KERNELS_EXIST TRUE)
message(STATUS "CUDA kernel source found, will build CUDA decoder")
else()
set(CUDA_KERNELS_EXIST FALSE)
message(WARNING "CUDA kernel source (src/cuda/hesai_cuda_kernels.cu) not found")
message(WARNING "Building without CUDA decoder support")
endif()
endif()
endif()

# Hesai Decoder library (CPU)
add_library(nebula_hesai_decoders SHARED src/hesai_driver.cpp)

target_include_directories(
Expand All @@ -18,6 +50,52 @@ target_link_libraries(
nebula_hesai_common::nebula_hesai_common
nebula_core_decoders::nebula_core_decoders rclcpp::rclcpp)

# CUDA Decoder library
if(BUILD_CUDA AND CUDA_KERNELS_EXIST)
enable_language(CUDA)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -diag-suppress 20012")

add_library(nebula_hesai_decoders_cuda SHARED
src/cuda/hesai_cuda_kernels.cu
)

set_target_properties(nebula_hesai_decoders_cuda PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
POSITION_INDEPENDENT_CODE ON
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
)

# Clear any inherited compile options that might be incompatible with CUDA
set_property(TARGET nebula_hesai_decoders_cuda PROPERTY COMPILE_OPTIONS "")

target_include_directories(
nebula_hesai_decoders_cuda
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)

target_link_libraries(nebula_hesai_decoders_cuda
PRIVATE CUDA::cudart_static)

target_compile_definitions(nebula_hesai_decoders_cuda PUBLIC NEBULA_CUDA_ENABLED)
target_compile_definitions(nebula_hesai_decoders PUBLIC NEBULA_CUDA_ENABLED)

# Add CUDA include directories to main library for header resolution
target_include_directories(nebula_hesai_decoders
PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

# Link CUDA library to main library (CUDA::cudart is PRIVATE so it won't
# propagate the imported target to downstream packages that lack CUDAToolkit)
target_link_libraries(nebula_hesai_decoders
PUBLIC nebula_hesai_decoders_cuda
PRIVATE CUDA::cudart)

install(TARGETS nebula_hesai_decoders_cuda EXPORT export_nebula_hesai_decoders_cuda)
ament_export_targets(export_nebula_hesai_decoders_cuda)
endif()

install(TARGETS nebula_hesai_decoders EXPORT export_nebula_hesai_decoders)
install(DIRECTORY include/${PROJECT_NAME}/ DESTINATION include/${PROJECT_NAME})

Expand Down
Loading
Loading