diff --git a/.apollo.bazelrc b/.apollo.bazelrc
old mode 100644
new mode 100755
index 3792c50..2f28c99
--- a/.apollo.bazelrc
+++ b/.apollo.bazelrc
@@ -1,4 +1,24 @@
-build --action_env PYTHON_BIN_PATH="/usr/bin/python3"
-build --action_env PYTHON_LIB_PATH="/usr/lib/python3/dist-packages"
-build --python_path="/usr/bin/python3"
-# build --config=tensorrt
+startup --output_user_root="/apollo/.cache/bazel"
+common --distdir="/apollo/.cache/distdir"
+
+build --action_env PYTHON_BIN_PATH="/usr/bin/python3.6"
+build --action_env PYTHON_LIB_PATH="/usr/local/lib/python3.6/dist-packages"
+build --python_path="/usr/bin/python3.6"
+build:gpu --config=cuda
+build:gpu --config=tensorrt
+
+build --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-10.2"
+build --action_env TF_CUDA_COMPUTE_CAPABILITIES="3.7,5.2,6.0,6.1,7.0,7.2,7.5"
+build --action_env GCC_HOST_COMPILER_PATH="/usr/bin/x86_64-linux-gnu-gcc-7"
+
+# This config refers to building with CUDA available.
+build:using_cuda --define=using_cuda=true
+build:using_cuda --action_env TF_NEED_CUDA=1
+build:using_cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
+
+# This config refers to building CUDA with nvcc.
+build:cuda --config=using_cuda
+build:cuda --define=using_cuda_nvcc=true
+
+build:tensorrt --action_env TF_NEED_TENSORRT=1
+build:teleop --define WITH_TELEOP=true
diff --git a/.cache/distdir/README.md b/.cache/distdir/README.md
deleted file mode 100644
index 45013f3..0000000
--- a/.cache/distdir/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# Directory used as bazel `--distdir` destination
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 459ae80..f6c183c 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -21,4 +21,4 @@ jobs:
- name: Pull and Create Container
run: bash docker/scripts/cyber_start.sh
- name: Build
- run: docker exec -u $USER apollo_cyber_$USER bash -c "./apollo.sh build"
+ run: docker exec -u $USER apollo_cyber_$USER bash -c "./apollo.sh build_gpu"
diff --git a/README.md b/README.md
index 45d24b3..d33a0f7 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ build all modules
```bash
-bash apollo.sh build
+bash apollo.sh build_gpu
```
diff --git a/docker/scripts/cyber_start.sh b/docker/scripts/cyber_start.sh
index 9b41d12..a416b61 100755
--- a/docker/scripts/cyber_start.sh
+++ b/docker/scripts/cyber_start.sh
@@ -20,13 +20,15 @@ source "${APOLLO_ROOT_DIR}/scripts/apollo.bashrc"
# CACHE_ROOT_DIR="${APOLLO_ROOT_DIR}/.cache"
-VERSION_X86_64="diamond-auto-x86_64-18.04-20201125_1739"
+VERSION_X86_64="diamond-auto-x86_64-18.04-20210119_1643"
VERSION_AARCH64="diamond-auto-aarch64-18.04-20200925_1309"
VERSION_LOCAL_CYBER="local_cyber_dev"
CYBER_CONTAINER="apollo_cyber_${USER}"
CYBER_INSIDE="in_cyber_docker"
DOCKER_REPO="hotzoneauto2020/diamond-auto"
+#DOCKER_REPO="sunzengpeng/ubuntu_docker"
+#DOCKER_REPO="apolloauto/apollo"
DOCKER_RUN_CMD="docker run"
DOCKER_PULL_CMD="docker pull"
diff --git a/modules/control/BUILD b/modules/control/BUILD
index 15092ab..0a7ecf5 100644
--- a/modules/control/BUILD
+++ b/modules/control/BUILD
@@ -30,6 +30,7 @@ cc_library(
"//modules/drivers/proto:wheelangle_cc_proto",
"//modules/drivers/proto:parking_cc_proto",
"//modules/common/adapters:adapter_gflags",
+ "//modules/perception/proto:obst_box_cc_proto",
"@com_github_gflags_gflags//:gflags",
],
)
diff --git a/modules/drivers/robosense/BUILD b/modules/drivers/robosense/BUILD
index ebe9f64..33f69e8 100755
--- a/modules/drivers/robosense/BUILD
+++ b/modules/drivers/robosense/BUILD
@@ -5,6 +5,7 @@ package(default_visibility = ["//visibility:public"])
cc_binary(
name = "librobosense_driver_component.so",
linkopts = ["-shared"],
+ linkshared = True,
linkstatic = False,
deps = [":driver"],
)
diff --git a/modules/drivers/velodyne/driver/BUILD b/modules/drivers/velodyne/driver/BUILD
index a925855..7c7ca00 100644
--- a/modules/drivers/velodyne/driver/BUILD
+++ b/modules/drivers/velodyne/driver/BUILD
@@ -6,6 +6,7 @@ package(default_visibility = ["//visibility:public"])
cc_binary(
name = "libvelodyne_driver_component.so",
linkopts = ["-shared"],
+ linkshared = True,
linkstatic = False,
deps = [":velodyne_driver_component_lib"],
)
diff --git a/modules/perception/conf/lidar_poindcloud_conf.pb.txt b/modules/perception/conf/lidar_poindcloud_conf.pb.txt
new file mode 100644
index 0000000..51c6d5f
--- /dev/null
+++ b/modules/perception/conf/lidar_poindcloud_conf.pb.txt
@@ -0,0 +1,4 @@
+obst_output_channel: "/diamond/perception/Obstacles"
+save_pcd_file: false
+save_pcd_path_front: "/apollo/modules/perception/data/lidar_front.pcd"
+save_pcd_path_rear: "/apollo/modules/perception/data/lidar_rear.pcd"
\ No newline at end of file
diff --git a/modules/perception/conf/tracker_config.pb.txt b/modules/perception/conf/tracker_config.pb.txt
new file mode 100644
index 0000000..bf98a70
--- /dev/null
+++ b/modules/perception/conf/tracker_config.pb.txt
@@ -0,0 +1,24 @@
+name: "HmObjectTracker"
+version: "1.1.0"
+matcher_method: HUNGARIAN_MATCHER
+filter_method: KALMAN_FILTER
+track_cached_history_size_maximum: 5
+track_consecutive_invisible_maximum: 1
+track_visible_ratio_minimum: 0.6
+collect_age_minimum: 0
+collect_consecutive_invisible_maximum: 0
+acceleration_noise_maximum: 5
+speed_noise_maximum: 0.4
+match_distance_maximum: 4.0
+location_distance_weight: 0.6
+direction_distance_weight: 0.2
+bbox_size_distance_weight: 0.1
+point_num_distance_weight: 0.1
+histogram_distance_weight: 0.5
+histogram_bin_size: 10
+use_adaptive: true
+measurement_noise: 0.4
+initial_velocity_noise: 5.0
+xy_propagation_noise: 10.0
+z_propagation_noise: 10.0
+breakdown_threshold_maximum: 10.0
\ No newline at end of file
diff --git a/modules/perception/dag/lidar_point_pillars.dag b/modules/perception/dag/lidar_point_pillars.dag
new file mode 100755
index 0000000..673ced5
--- /dev/null
+++ b/modules/perception/dag/lidar_point_pillars.dag
@@ -0,0 +1,17 @@
+# Define all coms in DAG streaming.
+module_config {
+ module_library : "/apollo/bazel-bin/modules/perception/lidar_point_pillars/point_pillars_detection.so"
+ components {
+ class_name : "PointPillarsDetection"
+ config {
+ name : "point_detect"
+ config_file_path: "/apollo/modules/perception/conf/lidar_poindcloud_conf.pb.txt"
+ readers {
+ channel: "/diamond/sensor/lidar/front/PointCloud2"
+ }
+ readers {
+ channel: "/diamond/sensor/lidar/rear/PointCloud2"
+ }
+ }
+ }
+}
diff --git a/modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.dag b/modules/perception/dag/lidar_pointcloudcluster.dag
similarity index 76%
rename from modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.dag
rename to modules/perception/dag/lidar_pointcloudcluster.dag
index adf7ee9..8289558 100644
--- a/modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.dag
+++ b/modules/perception/dag/lidar_pointcloudcluster.dag
@@ -2,9 +2,10 @@
module_config {
module_library : "/apollo/bazel-bin/modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.so"
components {
- class_name : "lidar_pointcloudcluster"
+ class_name : "Lidar_pointcloudcluster"
config {
name : "lidar_front"
+ config_file_path: "/apollo/modules/perception/conf/lidar_poindcloud_conf.pb.txt"
readers {
channel: "/diamond/sensor/lidar/front/PointCloud2"
}
diff --git a/modules/perception/data/obst_record.00000 b/modules/perception/data/obst_record.00000
new file mode 100644
index 0000000..ac6d93e
Binary files /dev/null and b/modules/perception/data/obst_record.00000 differ
diff --git a/modules/perception/data/Screenshot from 2020-09-30 10-22-45.png b/modules/perception/data/pointcloudcluster_screenshot1.png
old mode 100644
new mode 100755
similarity index 100%
rename from modules/perception/data/Screenshot from 2020-09-30 10-22-45.png
rename to modules/perception/data/pointcloudcluster_screenshot1.png
diff --git a/modules/perception/data/Screenshot from 2020-09-30 10-22-57.png b/modules/perception/data/pointcloudcluster_screenshot2.png
old mode 100644
new mode 100755
similarity index 100%
rename from modules/perception/data/Screenshot from 2020-09-30 10-22-57.png
rename to modules/perception/data/pointcloudcluster_screenshot2.png
diff --git a/modules/perception/data/Screenshot from 2020-09-30 10-23-18.png b/modules/perception/data/pointcloudcluster_screenshot3.png
old mode 100644
new mode 100755
similarity index 100%
rename from modules/perception/data/Screenshot from 2020-09-30 10-23-18.png
rename to modules/perception/data/pointcloudcluster_screenshot3.png
diff --git a/modules/perception/launch/lidar_point_pillars.launch b/modules/perception/launch/lidar_point_pillars.launch
new file mode 100755
index 0000000..92969ee
--- /dev/null
+++ b/modules/perception/launch/lidar_point_pillars.launch
@@ -0,0 +1,7 @@
+<cyber>
+    <module>
+        <name>common</name>
+        <dag_conf>/apollo/modules/perception/dag/lidar_point_pillars.dag</dag_conf>
+        <process_name>lidar_point_pillars</process_name>
+    </module>
+</cyber>
diff --git a/modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.launch b/modules/perception/launch/lidar_pointcloudcluster.launch
similarity index 55%
rename from modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.launch
rename to modules/perception/launch/lidar_pointcloudcluster.launch
index af3ee4c..6731192 100644
--- a/modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.launch
+++ b/modules/perception/launch/lidar_pointcloudcluster.launch
@@ -1,7 +1,7 @@
         <name>common</name>
-        <dag_conf>/apollo/modules/perception/lidar_pointcloudcluster/lidar_pointcloudcluster.dag</dag_conf>
+        <dag_conf>/apollo/modules/perception/dag/lidar_pointcloudcluster.dag</dag_conf>
         <process_name>lidar_pointcloudcluster</process_name>
diff --git a/modules/perception/lidar_point_pillars/BUILD b/modules/perception/lidar_point_pillars/BUILD
new file mode 100755
index 0000000..280ab22
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/BUILD
@@ -0,0 +1,139 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_binary")
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
+load("//tools:cpplint.bzl", "cpplint")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_binary(
+ name = "point_pillars_detection.so",
+ linkopts = ["-shared"],
+ linkshared = True,
+ linkstatic = False,
+ deps = [":point_pillars_detection"],
+)
+
+cc_library(
+ name = "point_pillars_detection",
+ srcs = [
+ "point_pillars_detection.cc",
+ ],
+ hdrs = [
+ "point_pillars_detection.h",
+ ],
+ deps = [
+ ":point_pillars",
+ "//cyber",
+ "//modules/drivers/proto:pointcloud_cc_proto",
+ "//modules/perception/proto:obst_box_cc_proto",
+ "//modules/perception/proto:lidar_pointcloud_conf_cc_proto",
+ "//modules/perception/lidar_pointcloud_tracking:hm_tracker",
+ "//modules/perception/lidar_pointcloud_tracking/common:object",
+ "//modules/perception/lidar_pointcloud_tracking:min_box",
+ "//modules/common/time",
+ "@eigen",
+ "@pcl",
+ "@local_config_vtk//:vtk",
+ ],
+ alwayslink = True,
+)
+
+cc_library(
+ name = "params",
+ hdrs = ["params.h"],
+)
+
+cc_library(
+ name = "point_pillars",
+ srcs = [
+ "point_pillars.cc",
+ ],
+ hdrs = [
+ "point_pillars.h",
+ ],
+ deps = [
+ ":anchor_mask_cuda",
+ ":common",
+ ":nms_cuda",
+ ":params",
+ ":postprocess_cuda",
+ ":preprocess_points",
+ ":scatter_cuda",
+ "//cyber/common",
+ "//third_party:libtorch",
+ "@local_config_cuda//cuda:cudart",
+ "@local_config_tensorrt//:tensorrt",
+ ],
+ alwayslink = True,
+)
+
+cc_library(
+ name = "preprocess_points",
+ srcs = [
+ "preprocess_points.cc",
+ ],
+ hdrs = [
+ "preprocess_points.h",
+ ],
+ deps = [
+ ":common",
+ ":preprocess_points_cuda",
+ ],
+)
+
+cc_library(
+ name = "common",
+ hdrs = ["common.h"],
+)
+
+cuda_library(
+ name = "anchor_mask_cuda",
+ srcs = ["anchor_mask_cuda.cu"],
+ hdrs = ["anchor_mask_cuda.h"],
+ deps = [
+ ":common",
+ "@local_config_cuda//cuda:cudart",
+ ],
+)
+
+cuda_library(
+ name = "nms_cuda",
+ srcs = ["nms_cuda.cu"],
+ hdrs = ["nms_cuda.h"],
+ deps = [
+ ":common",
+ "@local_config_cuda//cuda:cudart",
+ ],
+)
+
+cuda_library(
+ name = "postprocess_cuda",
+ srcs = ["postprocess_cuda.cu"],
+ hdrs = ["postprocess_cuda.h"],
+ deps = [
+ ":common",
+ ":nms_cuda",
+ "@local_config_cuda//cuda:cudart",
+ ],
+)
+
+cuda_library(
+ name = "preprocess_points_cuda",
+ srcs = ["preprocess_points_cuda.cu"],
+ hdrs = ["preprocess_points_cuda.h"],
+ deps = [
+ ":common",
+ "@local_config_cuda//cuda:cudart",
+ ],
+)
+
+cuda_library(
+ name = "scatter_cuda",
+ srcs = ["scatter_cuda.cu"],
+ hdrs = ["scatter_cuda.h"],
+ deps = [
+ ":common",
+ "@local_config_cuda//cuda:cudart",
+ ],
+)
+
+cpplint()
diff --git a/modules/perception/lidar_point_pillars/anchor_mask_cuda.cu b/modules/perception/lidar_point_pillars/anchor_mask_cuda.cu
new file mode 100755
index 0000000..116e60e
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/anchor_mask_cuda.cu
@@ -0,0 +1,184 @@
+#include
+
+// headers in local files
+#include "modules/perception/lidar_point_pillars/anchor_mask_cuda.h"
+#include "modules/perception/lidar_point_pillars/common.h"
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+// modified prefix sum code from https://www.mimuw.edu.pl/~ps209291/kgkp/slides/scan.pdf
+__global__ void scan_x(int* g_odata, int* g_idata, int n) {
+ extern __shared__ int temp[]; // allocated on invocation
+ int thid = threadIdx.x;
+ int bid = blockIdx.x;
+ int bdim = blockDim.x;
+ int offset = 1;
+ temp[2 * thid] =
+ g_idata[bid * bdim * 2 + 2 * thid]; // load input into shared memory
+ temp[2 * thid + 1] = g_idata[bid * bdim * 2 + 2 * thid + 1];
+ for (int d = n >> 1; d > 0; d >>= 1) { // build sum in place up the tree
+ __syncthreads();
+ if (thid < d) {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+ temp[bi] += temp[ai];
+ }
+ offset *= 2;
+ }
+ if (thid == 0) {
+ temp[n - 1] = 0;
+ } // clear the last element
+ for (int d = 1; d < n; d *= 2) { // traverse down tree & build scan
+ offset >>= 1;
+ __syncthreads();
+ if (thid < d) {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+ }
+ __syncthreads();
+ g_odata[bid * bdim * 2 + 2 * thid] =
+ temp[2 * thid + 1]; // write results to device memory
+ int second_ind = 2 * thid + 2;
+ if (second_ind == bdim * 2) {
+ g_odata[bid * bdim * 2 + 2 * thid + 1] =
+ temp[2 * thid + 1] + g_idata[bid * bdim * 2 + 2 * thid + 1];
+ } else {
+ g_odata[bid * bdim * 2 + 2 * thid + 1] = temp[2 * thid + 2];
+ }
+}
+
+// modified prefix sum code from
+// https://www.mimuw.edu.pl/~ps209291/kgkp/slides/scan.pdf
+__global__ void scan_y(int* g_odata, int* g_idata, int n) {
+ extern __shared__ int temp[]; // allocated on invocation
+ int thid = threadIdx.x;
+ int bid = blockIdx.x;
+ int bdim = blockDim.x;
+ int gdim = gridDim.x;
+ int offset = 1;
+ temp[2 * thid] =
+ g_idata[bid + 2 * thid * gdim]; // load input into shared memory
+ temp[2 * thid + 1] = g_idata[bid + 2 * thid * gdim + gdim];
+ for (int d = n >> 1; d > 0; d >>= 1) { // build sum in place up the tree
+ __syncthreads();
+ if (thid < d) {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+ temp[bi] += temp[ai];
+ }
+ offset *= 2;
+ }
+ if (thid == 0) {
+ temp[n - 1] = 0;
+ } // clear the last element
+ for (int d = 1; d < n; d *= 2) { // traverse down tree & build scan
+ offset >>= 1;
+ __syncthreads();
+ if (thid < d) {
+ int ai = offset * (2 * thid + 1) - 1;
+ int bi = offset * (2 * thid + 2) - 1;
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+ }
+ __syncthreads();
+ g_odata[bid + 2 * thid * gdim] =
+ temp[2 * thid + 1]; // write results to device memory
+ int second_ind = 2 * thid + 2;
+ if (second_ind == bdim * 2) {
+ g_odata[bid + 2 * thid * gdim + gdim] =
+ temp[2 * thid + 1] + g_idata[bid + 2 * thid * gdim + gdim];
+ } else {
+ g_odata[bid + 2 * thid * gdim + gdim] = temp[2 * thid + 2];
+ }
+}
+
+__global__ void make_anchor_mask_kernel(
+ const float* dev_box_anchors_min_x, const float* dev_box_anchors_min_y,
+ const float* dev_box_anchors_max_x, const float* dev_box_anchors_max_y,
+ int* dev_sparse_pillar_map, int* dev_anchor_mask, const float min_x_range,
+ const float min_y_range, const float pillar_x_size,
+ const float pillar_y_size, const int grid_x_size, const int grid_y_size,
+ const int num_inds_for_scan) {
+ int tid = threadIdx.x + blockIdx.x * blockDim.x;
+ int anchor_coor[NUM_2D_BOX_CORNERS_MACRO] = {0};
+ const int grid_x_size_1 = grid_x_size - 1; // grid_x_size - 1
+ const int grid_y_size_1 = grid_y_size - 1; // grid_y_size - 1
+
+ anchor_coor[0] =
+ floor((dev_box_anchors_min_x[tid] - min_x_range) / pillar_x_size);
+ anchor_coor[1] =
+ floor((dev_box_anchors_min_y[tid] - min_y_range) / pillar_y_size);
+ anchor_coor[2] =
+ floor((dev_box_anchors_max_x[tid] - min_x_range) / pillar_x_size);
+ anchor_coor[3] =
+ floor((dev_box_anchors_max_y[tid] - min_y_range) / pillar_y_size);
+ anchor_coor[0] = max(anchor_coor[0], 0);
+ anchor_coor[1] = max(anchor_coor[1], 0);
+ anchor_coor[2] = min(anchor_coor[2], grid_x_size_1);
+ anchor_coor[3] = min(anchor_coor[3], grid_y_size_1);
+
+ int right_top = dev_sparse_pillar_map[anchor_coor[3] * num_inds_for_scan +
+ anchor_coor[2]];
+ int left_bottom = dev_sparse_pillar_map[anchor_coor[1] * num_inds_for_scan +
+ anchor_coor[0]];
+ int left_top = dev_sparse_pillar_map[anchor_coor[3] * num_inds_for_scan +
+ anchor_coor[0]];
+ int right_bottom = dev_sparse_pillar_map[anchor_coor[1] * num_inds_for_scan +
+ anchor_coor[2]];
+
+ int area = right_top - left_top - right_bottom + left_bottom;
+ if (area > 1) {
+ dev_anchor_mask[tid] = 1;
+ } else {
+ dev_anchor_mask[tid] = 0;
+ }
+}
+
+AnchorMaskCuda::AnchorMaskCuda(
+ const int num_threads, const int num_inds_for_scan, const int num_anchor,
+ const float min_x_range, const float min_y_range, const float pillar_x_size,
+ const float pillar_y_size, const int grid_x_size, const int grid_y_size)
+ : num_threads_(num_threads),
+ num_inds_for_scan_(num_inds_for_scan),
+ num_anchor_(num_anchor),
+ min_x_range_(min_x_range),
+ min_y_range_(min_y_range),
+ pillar_x_size_(pillar_x_size),
+ pillar_y_size_(pillar_y_size),
+ grid_x_size_(grid_x_size),
+ grid_y_size_(grid_y_size) {}
+
+void AnchorMaskCuda::DoAnchorMaskCuda(
+ int* dev_sparse_pillar_map, int* dev_cumsum_along_x,
+ int* dev_cumsum_along_y, const float* dev_box_anchors_min_x,
+ const float* dev_box_anchors_min_y, const float* dev_box_anchors_max_x,
+ const float* dev_box_anchors_max_y, int* dev_anchor_mask) {
+  scan_x<<<num_inds_for_scan_, num_inds_for_scan_ / 2, num_inds_for_scan_ * sizeof(int)>>>(
+      dev_cumsum_along_x, dev_sparse_pillar_map, num_inds_for_scan_);
+  scan_y<<<num_inds_for_scan_, num_inds_for_scan_ / 2, num_inds_for_scan_ * sizeof(int)>>>(
+      dev_cumsum_along_y, dev_cumsum_along_x, num_inds_for_scan_);
+ GPU_CHECK(cudaMemcpy(dev_sparse_pillar_map, dev_cumsum_along_y,
+ num_inds_for_scan_ * num_inds_for_scan_ * sizeof(int),
+ cudaMemcpyDeviceToDevice));
+
+ int num_blocks = DIVUP(num_anchor_, num_threads_);
+  make_anchor_mask_kernel<<<num_blocks, num_threads_>>>(
+ dev_box_anchors_min_x, dev_box_anchors_min_y, dev_box_anchors_max_x,
+ dev_box_anchors_max_y, dev_sparse_pillar_map, dev_anchor_mask,
+ min_x_range_, min_y_range_, pillar_x_size_, pillar_y_size_, grid_x_size_,
+ grid_y_size_, num_inds_for_scan_);
+}
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/anchor_mask_cuda.h b/modules/perception/lidar_point_pillars/anchor_mask_cuda.h
new file mode 100755
index 0000000..b90108f
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/anchor_mask_cuda.h
@@ -0,0 +1,72 @@
+#pragma once
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+class AnchorMaskCuda {
+ private:
+ const int num_threads_;
+ const int num_inds_for_scan_;
+ const int num_anchor_;
+ const float min_x_range_;
+ const float min_y_range_;
+ const float pillar_x_size_;
+ const float pillar_y_size_;
+ const int grid_x_size_;
+ const int grid_y_size_;
+
+ public:
+ /**
+ * @brief Constructor
+ * @param[in] num_threads Number of threads per block
+ * @param[in] num_inds_for_scan Number of indexes for scan(cumsum)
+ * @param[in] num_anchor Number of anchors in total
+ * @param[in] min_x_range Minimum x value for point cloud
+ * @param[in] min_y_range Minimum y value for point cloud
+ * @param[in] pillar_x_size Size of x-dimension for a pillar
+ * @param[in] pillar_y_size Size of y-dimension for a pillar
+ * @param[in] grid_x_size Number of pillars in x-coordinate
+ * @param[in] grid_y_size Number of pillars in y-coordinate
+ * @details Captital variables never change after the compile
+ */
+ AnchorMaskCuda(const int num_threads, const int num_inds_for_scan,
+ const int num_anchor, const float min_x_range,
+ const float min_y_range, const float pillar_x_size,
+ const float pillar_y_size, const int grid_x_size,
+ const int grid_y_size);
+
+ /**
+ * @brief call cuda code for making anchor mask
+ * @param[in] dev_sparse_pillar_map
+ * Grid map representation for pillar occupancy
+ * @param[in] dev_cumsum_along_x
+ * Array for storing cumsum-ed dev_sparse_pillar_map values
+ * @param[in] dev_cumsum_along_y
+ * Array for storing cumsum-ed dev_cumsum_along_y values
+ * @param[in] dev_box_anchors_min_x
+ * Array for storing min x value for each anchor
+ * @param[in] dev_box_anchors_min_y
+ * Array for storing min y value for each anchor
+ * @param[in] dev_box_anchors_max_x
+ * Array for storing max x value for each anchor
+ * @param[in] dev_box_anchors_max_y
+ * Array for storing max y value for each anchor
+ * @param[in] dev_box_anchors_max_y
+ * Array for storing max y value for each anchor
+ * @param[out] dev_anchor_mask Anchor mask for filtering the network output
+ * @details dev_* means device memory. Make a mask for filtering pillar
+ * occupancy area
+ */
+ void DoAnchorMaskCuda(int* dev_sparse_pillar_map, int* dev_cumsum_along_x,
+ int* dev_cumsum_along_y,
+ const float* dev_box_anchors_min_x,
+ const float* dev_box_anchors_min_y,
+ const float* dev_box_anchors_max_x,
+ const float* dev_box_anchors_max_y,
+ int* dev_anchor_mask);
+};
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/common.h b/modules/perception/lidar_point_pillars/common.h
new file mode 100755
index 0000000..f452f07
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/common.h
@@ -0,0 +1,33 @@
+#pragma once
+
+// headers in STL
+#include <stdio.h>
+
+// headers in CUDA
+#include <cuda_runtime_api.h>
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+// using MACRO to allocate memory inside CUDA kernel
+#define NUM_3D_BOX_CORNERS_MACRO 8
+#define NUM_2D_BOX_CORNERS_MACRO 4
+#define NUM_THREADS_MACRO 64 // need to be changed when num_threads_ is changed
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+#define GPU_CHECK(ans) \
+ { GPUAssert((ans), __FILE__, __LINE__); }
+inline void GPUAssert(cudaError_t code, const char* file, int line,
+ bool abort = true) {
+ if (code != cudaSuccess) {
+ fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+ line);
+ if (abort) exit(code);
+ }
+}
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/nms_cuda.cu b/modules/perception/lidar_point_pillars/nms_cuda.cu
new file mode 100755
index 0000000..369f7b8
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/nms_cuda.cu
@@ -0,0 +1,118 @@
+#include <vector>
+
+// headers in local files
+#include "modules/perception/lidar_point_pillars/nms_cuda.h"
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+__device__ inline float devIoU(float const *const a, float const *const b) {
+ float left = max(a[0], b[0]), right = min(a[2], b[2]);
+ float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+ float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+ float interS = width * height;
+ float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+ float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+ return interS / (Sa + Sb - interS);
+}
+
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+ const float *dev_boxes, uint64_t *dev_mask,
+ const int num_box_corners) {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ const int block_threads = blockDim.x;
+
+ const int row_size = min(n_boxes - row_start * block_threads, block_threads);
+ const int col_size = min(n_boxes - col_start * block_threads, block_threads);
+
+ __shared__ float block_boxes[NUM_THREADS_MACRO * NUM_2D_BOX_CORNERS_MACRO];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * num_box_corners + 0] =
+ dev_boxes[(block_threads * col_start + threadIdx.x) * num_box_corners +
+ 0];
+ block_boxes[threadIdx.x * num_box_corners + 1] =
+ dev_boxes[(block_threads * col_start + threadIdx.x) * num_box_corners +
+ 1];
+ block_boxes[threadIdx.x * num_box_corners + 2] =
+ dev_boxes[(block_threads * col_start + threadIdx.x) * num_box_corners +
+ 2];
+ block_boxes[threadIdx.x * num_box_corners + 3] =
+ dev_boxes[(block_threads * col_start + threadIdx.x) * num_box_corners +
+ 3];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = block_threads * row_start + threadIdx.x;
+ const float cur_box[NUM_2D_BOX_CORNERS_MACRO] = {
+ dev_boxes[cur_box_idx * num_box_corners + 0],
+ dev_boxes[cur_box_idx * num_box_corners + 1],
+ dev_boxes[cur_box_idx * num_box_corners + 2],
+ dev_boxes[cur_box_idx * num_box_corners + 3]};
+ uint64_t t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (int i = start; i < col_size; ++i) {
+ if (devIoU(cur_box, block_boxes + i * num_box_corners) >
+ nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = DIVUP(n_boxes, block_threads);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+}
+
+NmsCuda::NmsCuda(const int num_threads, const int num_box_corners,
+ const float nms_overlap_threshold)
+ : num_threads_(num_threads),
+ num_box_corners_(num_box_corners),
+ nms_overlap_threshold_(nms_overlap_threshold) {}
+
+void NmsCuda::DoNmsCuda(const int host_filter_count,
+ float *dev_sorted_box_for_nms, int *out_keep_inds,
+ int *out_num_to_keep) {
+ const int col_blocks = DIVUP(host_filter_count, num_threads_);
+ dim3 blocks(DIVUP(host_filter_count, num_threads_),
+ DIVUP(host_filter_count, num_threads_));
+ dim3 threads(num_threads_);
+
+ uint64_t *dev_mask = NULL;
+ GPU_CHECK(
+ cudaMalloc(&dev_mask, host_filter_count * col_blocks * sizeof(uint64_t)));
+
+  nms_kernel<<<blocks, threads>>>(host_filter_count, nms_overlap_threshold_,
+                                  dev_sorted_box_for_nms, dev_mask,
+                                  num_box_corners_);
+
+ // postprocess for nms output
+  std::vector<uint64_t> host_mask(host_filter_count * col_blocks);
+ GPU_CHECK(cudaMemcpy(&host_mask[0], dev_mask,
+ sizeof(uint64_t) * host_filter_count * col_blocks,
+ cudaMemcpyDeviceToHost));
+  std::vector<uint64_t> remv(col_blocks);
+ memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
+
+ for (int i = 0; i < host_filter_count; ++i) {
+ int nblock = i / num_threads_;
+ int inblock = i % num_threads_;
+
+ if (!(remv[nblock] & (1ULL << inblock))) {
+ out_keep_inds[(*out_num_to_keep)++] = i;
+ uint64_t *p = &host_mask[0] + i * col_blocks;
+ for (int j = nblock; j < col_blocks; ++j) {
+ remv[j] |= p[j];
+ }
+ }
+ }
+ GPU_CHECK(cudaFree(dev_mask));
+}
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/nms_cuda.h b/modules/perception/lidar_point_pillars/nms_cuda.h
new file mode 100755
index 0000000..a228aef
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/nms_cuda.h
@@ -0,0 +1,46 @@
+#pragma once
+
+// headers in STL
+#include <memory>
+#include <vector>
+
+// headers in local files
+#include "modules/perception/lidar_point_pillars/common.h"
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+class NmsCuda {
+ private:
+ const int num_threads_;
+ const int num_box_corners_;
+ const float nms_overlap_threshold_;
+
+ public:
+ /**
+ * @brief Constructor
+ * @param[in] num_threads Number of threads when launching cuda kernel
+ * @param[in] num_box_corners Number of corners for 2D box
+ * @param[in] nms_overlap_threshold IOU threshold for NMS
+ * @details Captital variables never change after the compile, Non-captital
+ * variables could be chaned through rosparam
+ */
+ NmsCuda(const int num_threads, const int num_box_corners,
+ const float nms_overlap_threshold);
+
+ /**
+ * @brief GPU Non-Maximum Suppresion for network output
+ * @param[in] host_filter_count Number of filtered output
+ * @param[in] dev_sorted_box_for_nms Bounding box output sorted by score
+ * @param[out] out_keep_inds Indexes of selected bounding box
+ * @param[out] out_num_to_keep Number of kept bounding boxes
+ * @details NMS in GPU and postprocessing for selecting box in CPU
+ */
+ void DoNmsCuda(const int host_filter_count, float* dev_sorted_box_for_nms,
+ int* out_keep_inds, int* out_num_to_keep);
+};
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/params.h b/modules/perception/lidar_point_pillars/params.h
new file mode 100755
index 0000000..6b16847
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/params.h
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <vector>
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+class Params {
+ public:
+ static constexpr float kPillarXSize = 0.25f;
+ static constexpr float kPillarYSize = 0.25f;
+ static constexpr float kPillarZSize = 8.0f;
+ static constexpr float kMinXRange = -100.0f;
+ static constexpr float kMinYRange = -70.0f;
+ static constexpr float kMinZRange = -5.0f;
+ static constexpr float kMaxXRange = 100.0f;
+ static constexpr float kMaxYRange = 70.0f;
+ static constexpr float kMaxZRange = 3.0f;
+ static constexpr int kNumClass = 9;
+ static constexpr int kMaxNumPillars = 30000;
+ static constexpr int kMaxNumPointsPerPillar = 60;
+ static constexpr int kNumPointFeature = 5; // x, y, z, i, delta of time
+ static constexpr int kNumAnchor = 200 * 140 * 8 + 220 * 140 * 8;
+ static constexpr int kNumOutputBoxFeature = 7;
+ static constexpr int kBatchSize = 1;
+ static constexpr int kNumIndsForScan = 1024;
+ static constexpr int kNumThreads = 64;
+ static constexpr int kNumBoxCorners = 4;
+
+  static std::vector<int> AnchorStrides() { return std::vector<int>{4, 2}; }
+
+  static std::vector<int> NumAnchorSets() { return std::vector<int>{8, 8}; }
+
+  static std::vector<std::vector<float>> AnchorDxSizes() {
+    return std::vector<std::vector<float>>{
+        std::vector<float>{2.94046906f, 1.95017717f, 2.73050468f, 2.4560939f},
+        std::vector<float>{2.49008838f, 0.60058911f, 0.76279481f, 0.66344886f,
+                           0.39694519f}};
+  }
+
+  static std::vector<std::vector<float>> AnchorDySizes() {
+    return std::vector<std::vector<float>>{
+        std::vector<float>{11.1885991, 4.60718145f, 6.38352896f, 6.73778078f},
+        std::vector<float>{0.48578221f, 1.68452161f, 2.09973778f, 0.7256437f,
+                           0.40359262f}};
+  }
+
+  static std::vector<std::vector<float>> AnchorDzSizes() {
+    return std::vector<std::vector<float>>{
+        std::vector<float>{3.47030982f, 1.72270761f, 3.13312415f, 2.73004906f},
+        std::vector<float>{0.98297065f, 1.27192197f, 1.44403034f, 1.75748069f,
+                           1.06232151f}};
+  }
+
+  static std::vector<std::vector<float>> AnchorZCoors() {
+    return std::vector<std::vector<float>>{
+        std::vector<float>{-0.0715754f, -0.93897414f, -0.08168083f,
+                           -0.37937912f},
+        std::vector<float>{-1.27247968f, -1.03743013f, -0.99194854f,
+                           -0.73911038f, -1.27868911f}};
+  }
+
+  static std::vector<std::vector<int>> NumAnchorRo() {
+    return std::vector<std::vector<int>>{std::vector<int>{2, 2, 2, 2},
+                                         std::vector<int>{2, 2, 2, 1, 1}};
+  }
+
+  static std::vector<std::vector<float>> AnchorRo() {
+    return std::vector<std::vector<float>>{
+        std::vector<float>{0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2},
+        std::vector<float>{0, M_PI / 2, 0, M_PI / 2, 0, M_PI / 2, 0, 0}};
+  }
+
+ private:
+ Params() = default;
+ ~Params() = default;
+}; // class Params
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/point_pillars.cc b/modules/perception/lidar_point_pillars/point_pillars.cc
new file mode 100755
index 0000000..6d75654
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/point_pillars.cc
@@ -0,0 +1,656 @@
+// headers in STL
+// NOTE(review): the header names after these #include directives appear to
+// have been stripped by extraction (likely <limits>, <iostream> or similar);
+// restore them before compiling.
+#include
+#include
+
+#include "cyber/common/log.h"
+
+// headers in local files
+#include "modules/perception/lidar_point_pillars/point_pillars.h"
+// (file-scope "using namespace std;" removed: every std name in this
+// translation unit is already fully qualified, and a file-scope
+// using-directive is disallowed by the style guide.)
+
+namespace apollo {
+namespace perception {
+namespace lidar {
+
+// Out-of-class definitions for the static constant members declared in
+// point_pillars.h (needed for ODR-use prior to C++17 inline variables).
+// All values are sourced from Params so the two stay consistent.
+// NOTE(review): template argument lists appear stripped below (e.g.
+// "static_cast(...)" and "std::vector" without <...>); restore them against
+// the original source before compiling.
+const float PointPillars::kPillarXSize = Params::kPillarXSize;
+const float PointPillars::kPillarYSize = Params::kPillarYSize;
+const float PointPillars::kPillarZSize = Params::kPillarZSize;
+const float PointPillars::kMinXRange = Params::kMinXRange;
+const float PointPillars::kMinYRange = Params::kMinYRange;
+const float PointPillars::kMinZRange = Params::kMinZRange;
+const float PointPillars::kMaxXRange = Params::kMaxXRange;
+const float PointPillars::kMaxYRange = Params::kMaxYRange;
+const float PointPillars::kMaxZRange = Params::kMaxZRange;
+const int PointPillars::kNumClass = Params::kNumClass;
+const int PointPillars::kMaxNumPillars = Params::kMaxNumPillars;
+const int PointPillars::kMaxNumPointsPerPillar = Params::kMaxNumPointsPerPillar;
+const int PointPillars::kNumPointFeature = Params::kNumPointFeature;
+// Grid dimensions derived from the detection range and pillar size
+// (truncating division -- range is assumed to be a multiple of pillar size).
+const int PointPillars::kGridXSize =
+ static_cast((kMaxXRange - kMinXRange) / kPillarXSize);
+const int PointPillars::kGridYSize =
+ static_cast((kMaxYRange - kMinYRange) / kPillarYSize);
+const int PointPillars::kGridZSize =
+ static_cast((kMaxZRange - kMinZRange) / kPillarZSize);
+// 64 is presumably the PFE output channel count -- TODO confirm against the
+// PFE model; kNumThreads happens to share the value (see DeviceMemoryMalloc).
+const int PointPillars::kRpnInputSize = 64 * kGridXSize * kGridYSize;
+const int PointPillars::kNumAnchor = Params::kNumAnchor;
+const int PointPillars::kNumOutputBoxFeature = Params::kNumOutputBoxFeature;
+const int PointPillars::kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
+const int PointPillars::kRpnClsOutputSize = kNumAnchor * kNumClass;
+// 2 direction bins (front/back) per anchor.
+const int PointPillars::kRpnDirOutputSize = kNumAnchor * 2;
+const int PointPillars::kBatchSize = Params::kBatchSize;
+const int PointPillars::kNumIndsForScan = Params::kNumIndsForScan;
+const int PointPillars::kNumThreads = Params::kNumThreads;
+// if you change kNumThreads, need to modify NUM_THREADS_MACRO in
+// common.h
+const int PointPillars::kNumBoxCorners = Params::kNumBoxCorners;
+const std::vector PointPillars::kAnchorStrides = Params::AnchorStrides();
+// Per-head anchor generation ranges in grid-index units, 4 entries per head
+// (x_start, x_end, y_start, y_end): head 0 spans the full grid, head 1 the
+// central 50% of each axis.
+const std::vector PointPillars::kAnchorRanges{
+ 0,
+ kGridXSize,
+ 0,
+ kGridYSize,
+ static_cast(kGridXSize * 0.25),
+ static_cast(kGridXSize * 0.75),
+ static_cast(kGridYSize * 0.25),
+ static_cast(kGridYSize * 0.75)};
+const std::vector PointPillars::kNumAnchorSets = Params::NumAnchorSets();
+const std::vector> PointPillars::kAnchorDxSizes =
+ Params::AnchorDxSizes();
+const std::vector> PointPillars::kAnchorDySizes =
+ Params::AnchorDySizes();
+const std::vector> PointPillars::kAnchorDzSizes =
+ Params::AnchorDzSizes();
+const std::vector> PointPillars::kAnchorZCoors =
+ Params::AnchorZCoors();
+const std::vector> PointPillars::kNumAnchorRo =
+ Params::NumAnchorRo();
+const std::vector> PointPillars::kAnchorRo =
+ Params::AnchorRo();
+
+// Constructs the full PointPillars pipeline.
+// @param reproduce_result_mode when true, preprocessing runs on the CPU
+//        (bit-reproducible path); otherwise the CUDA preprocessor is used.
+// @param score_threshold minimum detection confidence kept by postprocessing.
+// @param nms_overlap_threshold IoU threshold for non-maximum suppression.
+// @param pfe_torch_file path to the TorchScript pillar-feature-extractor.
+// @param rpn_onnx_file path to the RPN ONNX model (built into a TRT engine).
+// Allocates device memory, loads both networks, and precomputes anchors;
+// all resources are released in the destructor.
+PointPillars::PointPillars(const bool reproduce_result_mode,
+                           const float score_threshold,
+                           const float nms_overlap_threshold,
+                           const std::string pfe_torch_file,
+                           const std::string rpn_onnx_file)
+    : reproduce_result_mode_(reproduce_result_mode),
+      score_threshold_(score_threshold),
+      nms_overlap_threshold_(nms_overlap_threshold),
+      pfe_torch_file_(pfe_torch_file),
+      rpn_onnx_file_(rpn_onnx_file) {
+  // Only one of the two preprocessors is ever constructed.
+  if (reproduce_result_mode_) {
+    preprocess_points_ptr_.reset(new PreprocessPoints(
+        kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature, kGridXSize,
+        kGridYSize, kGridZSize, kPillarXSize, kPillarYSize, kPillarZSize,
+        kMinXRange, kMinYRange, kMinZRange, kNumIndsForScan));
+  } else {
+    preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
+        kNumThreads, kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature,
+        kNumIndsForScan, kGridXSize, kGridYSize, kGridZSize, kPillarXSize,
+        kPillarYSize, kPillarZSize, kMinXRange, kMinYRange, kMinZRange));
+  }
+
+  anchor_mask_cuda_ptr_.reset(new AnchorMaskCuda(
+      kNumThreads, kNumIndsForScan, kNumAnchor, kMinXRange, kMinYRange,
+      kPillarXSize, kPillarYSize, kGridXSize, kGridYSize));
+
+  scatter_cuda_ptr_.reset(
+      new ScatterCuda(kNumThreads, kMaxNumPillars, kGridXSize, kGridYSize));
+
+  const float float_min = std::numeric_limits::lowest();
+  const float float_max = std::numeric_limits::max();
+  postprocess_cuda_ptr_.reset(
+      new PostprocessCuda(float_min, float_max, kNumAnchor, kNumClass,
+                          score_threshold_, kNumThreads, nms_overlap_threshold_,
+                          kNumBoxCorners, kNumOutputBoxFeature));
+
+  // Order matters: device buffers must exist before anchors are copied to
+  // them (InitAnchors -> PutAnchorsInDeviceMemory).
+  DeviceMemoryMalloc();
+  InitTorch();
+  InitTRT();
+  InitAnchors();
+}
+
+// Releases every host array allocated in InitAnchors, every device buffer
+// allocated in DeviceMemoryMalloc, and the TensorRT objects from InitTRT.
+// NOTE(review): if the constructor throws partway through, these pointers
+// may be null/uninitialized -- TODO confirm the members are default-
+// initialized in the header.
+PointPillars::~PointPillars() {
+  // Host-side anchor arrays (new[] in InitAnchors).
+  delete[] anchors_px_;
+  delete[] anchors_py_;
+  delete[] anchors_pz_;
+  delete[] anchors_dx_;
+  delete[] anchors_dy_;
+  delete[] anchors_dz_;
+  delete[] anchors_ro_;
+  delete[] box_anchors_min_x_;
+  delete[] box_anchors_min_y_;
+  delete[] box_anchors_max_x_;
+  delete[] box_anchors_max_y_;
+
+  // Device buffers (cudaMalloc in DeviceMemoryMalloc), freed in the same
+  // grouping they were allocated.
+  GPU_CHECK(cudaFree(dev_x_coors_));
+  GPU_CHECK(cudaFree(dev_y_coors_));
+  GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
+  GPU_CHECK(cudaFree(dev_sparse_pillar_map_));
+  GPU_CHECK(cudaFree(dev_pillar_point_feature_));
+  GPU_CHECK(cudaFree(dev_pillar_coors_));
+
+  GPU_CHECK(cudaFree(dev_cumsum_along_x_));
+  GPU_CHECK(cudaFree(dev_cumsum_along_y_));
+
+  GPU_CHECK(cudaFree(dev_box_anchors_min_x_));
+  GPU_CHECK(cudaFree(dev_box_anchors_min_y_));
+  GPU_CHECK(cudaFree(dev_box_anchors_max_x_));
+  GPU_CHECK(cudaFree(dev_box_anchors_max_y_));
+  GPU_CHECK(cudaFree(dev_anchor_mask_));
+
+  GPU_CHECK(cudaFree(pfe_buffers_[0]));
+  GPU_CHECK(cudaFree(pfe_buffers_[1]));
+  GPU_CHECK(cudaFree(pfe_buffers_[2]));
+
+  GPU_CHECK(cudaFree(rpn_buffers_[0]));
+  GPU_CHECK(cudaFree(rpn_buffers_[1]));
+  GPU_CHECK(cudaFree(rpn_buffers_[2]));
+  GPU_CHECK(cudaFree(rpn_buffers_[3]));
+
+  GPU_CHECK(cudaFree(dev_scattered_feature_));
+
+  GPU_CHECK(cudaFree(dev_anchors_px_));
+  GPU_CHECK(cudaFree(dev_anchors_py_));
+  GPU_CHECK(cudaFree(dev_anchors_pz_));
+  GPU_CHECK(cudaFree(dev_anchors_dx_));
+  GPU_CHECK(cudaFree(dev_anchors_dy_));
+  GPU_CHECK(cudaFree(dev_anchors_dz_));
+  GPU_CHECK(cudaFree(dev_anchors_ro_));
+  GPU_CHECK(cudaFree(dev_filtered_box_));
+  GPU_CHECK(cudaFree(dev_filtered_score_));
+  GPU_CHECK(cudaFree(dev_filtered_label_));
+  GPU_CHECK(cudaFree(dev_filtered_dir_));
+  GPU_CHECK(cudaFree(dev_box_for_nms_));
+  GPU_CHECK(cudaFree(dev_filter_count_));
+
+  // TensorRT teardown in reverse creation order.
+  // NOTE(review): destroy() is deprecated in TensorRT >= 8 in favor of
+  // `delete` -- confirm the TRT version this targets.
+  rpn_context_->destroy();
+  rpn_runtime_->destroy();
+  rpn_engine_->destroy();
+}
+
+// Allocates every device buffer used by the pipeline; each cudaMalloc is
+// paired with a cudaFree in the destructor.
+// NOTE(review): template argument lists on reinterpret_cast appear stripped
+// (should be reinterpret_cast<void**>); restore before compiling.
+void PointPillars::DeviceMemoryMalloc() {
+  // Per-pillar coordinates and point counts produced by preprocessing.
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_x_coors_),
+                       kMaxNumPillars * sizeof(int)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_y_coors_),
+                       kMaxNumPillars * sizeof(int)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_num_points_per_pillar_),
+                       kMaxNumPillars * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_sparse_pillar_map_),
+                       kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_pillar_point_feature_),
+                       kMaxNumPillars * kMaxNumPointsPerPillar *
+                           kNumPointFeature * sizeof(float)));
+  // 4 floats per pillar coordinate entry (matches pfe_buffers_[2] below).
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_pillar_coors_),
+                       kMaxNumPillars * 4 * sizeof(float)));
+
+  // Prefix-sum workspaces for the anchor-mask scan.
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_cumsum_along_x_),
+                       kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_cumsum_along_y_),
+                       kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+
+  // for make anchor mask kernel
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_box_anchors_min_x_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_box_anchors_min_y_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_box_anchors_max_x_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_box_anchors_max_y_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchor_mask_),
+                       kNumAnchor * sizeof(int)));
+
+  // for trt inference
+  // create GPU buffers and a stream
+  GPU_CHECK(
+      cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
+                                       kNumPointFeature * sizeof(float)));
+  GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * sizeof(float)));
+  GPU_CHECK(cudaMalloc(&pfe_buffers_[2], kMaxNumPillars * 4 * sizeof(float)));
+
+  GPU_CHECK(cudaMalloc(&rpn_buffers_[0], kRpnInputSize * sizeof(float)));
+  GPU_CHECK(cudaMalloc(&rpn_buffers_[1], kRpnBoxOutputSize * sizeof(float)));
+  GPU_CHECK(cudaMalloc(&rpn_buffers_[2], kRpnClsOutputSize * sizeof(float)));
+  GPU_CHECK(cudaMalloc(&rpn_buffers_[3], kRpnDirOutputSize * sizeof(float)));
+
+  // for scatter kernel
+  // NOTE(review): sized kNumThreads * grid, which only equals kRpnInputSize
+  // because kNumThreads (64) coincides with the PFE channel count -- a
+  // fragile coupling if either constant changes.
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_scattered_feature_),
+                       kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+
+  // for filter
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_px_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_py_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_pz_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_dx_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_dy_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_dz_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_anchors_ro_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_filtered_box_),
+                       kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_filtered_score_),
+                       kNumAnchor * sizeof(float)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_filtered_label_),
+                       kNumAnchor * sizeof(int)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_filtered_dir_),
+                       kNumAnchor * sizeof(int)));
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_box_for_nms_),
+                       kNumAnchor * kNumBoxCorners * sizeof(float)));
+  GPU_CHECK(
+      cudaMalloc(reinterpret_cast(&dev_filter_count_), sizeof(int)));
+}
+
+// Allocates host anchor arrays, generates all anchors, converts them to
+// axis-aligned boxes for the anchor mask, and uploads everything to the GPU.
+// NOTE(review): raw new[]/delete[] ownership (freed in the destructor) --
+// std::vector members would be the idiomatic replacement.
+void PointPillars::InitAnchors() {
+  // allocate memory for anchors
+  // "()" value-initializes, so every element starts at 0.
+  anchors_px_ = new float[kNumAnchor]();
+  anchors_py_ = new float[kNumAnchor]();
+  anchors_pz_ = new float[kNumAnchor]();
+  anchors_dx_ = new float[kNumAnchor]();
+  anchors_dy_ = new float[kNumAnchor]();
+  anchors_dz_ = new float[kNumAnchor]();
+  anchors_ro_ = new float[kNumAnchor]();
+  box_anchors_min_x_ = new float[kNumAnchor]();
+  box_anchors_min_y_ = new float[kNumAnchor]();
+  box_anchors_max_x_ = new float[kNumAnchor]();
+  box_anchors_max_y_ = new float[kNumAnchor]();
+  // deallocate these memories in destructor
+
+  GenerateAnchors(anchors_px_, anchors_py_, anchors_pz_, anchors_dx_,
+                  anchors_dy_, anchors_dz_, anchors_ro_);
+
+  ConvertAnchors2BoxAnchors(anchors_px_, anchors_py_, box_anchors_min_x_,
+                            box_anchors_min_y_, box_anchors_max_x_,
+                            box_anchors_max_y_);
+
+  PutAnchorsInDeviceMemory();
+}
+
+// Fills the seven anchor attribute arrays for both detection heads.
+// For each head: iterate the (strided) grid cells in its kAnchorRanges
+// window and, per cell, emit one anchor per (size class, rotation) pair.
+// NOTE(review): the parameter names (anchors_px_, ...) shadow the members
+// of the same names, and the zeroing loop below also writes the
+// box_anchors_* MEMBERS directly even though they are not parameters --
+// confusing but harmless since InitAnchors passes the members in.
+void PointPillars::GenerateAnchors(float* anchors_px_, float* anchors_py_,
+                                   float* anchors_pz_, float* anchors_dx_,
+                                   float* anchors_dy_, float* anchors_dz_,
+                                   float* anchors_ro_) {
+  // Zero everything (redundant with the value-initializing new[]() in
+  // InitAnchors, but keeps this function self-contained).
+  for (int i = 0; i < kNumAnchor; ++i) {
+    anchors_px_[i] = 0;
+    anchors_py_[i] = 0;
+    anchors_pz_[i] = 0;
+    anchors_dx_[i] = 0;
+    anchors_dy_[i] = 0;
+    anchors_dz_[i] = 0;
+    anchors_ro_[i] = 0;
+    box_anchors_min_x_[i] = 0;
+    box_anchors_min_y_[i] = 0;
+    box_anchors_max_x_[i] = 0;
+    box_anchors_max_y_[i] = 0;
+  }
+
+  int ind = 0;
+  for (size_t head = 0; head < kNumAnchorSets.size(); ++head) {
+    // Anchor spacing in meters for this head.
+    float x_stride = kPillarXSize * kAnchorStrides[head];
+    float y_stride = kPillarYSize * kAnchorStrides[head];
+    // Grid-index window this head covers (see kAnchorRanges).
+    int x_ind_start = kAnchorRanges[head * 4 + 0] / kAnchorStrides[head];
+    int x_ind_end = kAnchorRanges[head * 4 + 1] / kAnchorStrides[head];
+    int y_ind_start = kAnchorRanges[head * 4 + 2] / kAnchorStrides[head];
+    int y_ind_end = kAnchorRanges[head * 4 + 3] / kAnchorStrides[head];
+    // coors of first anchor's center
+    float x_offset = kMinXRange + x_stride / 2.0;
+    float y_offset = kMinYRange + y_stride / 2.0;
+
+    // Precompute the center coordinate of every column / row once.
+    std::vector anchor_x_count, anchor_y_count;
+    for (int i = x_ind_start; i < x_ind_end; ++i) {
+      float anchor_coor_x = static_cast(i) * x_stride + x_offset;
+      anchor_x_count.push_back(anchor_coor_x);
+    }
+    for (int i = y_ind_start; i < y_ind_end; ++i) {
+      float anchor_coor_y = static_cast(i) * y_stride + y_offset;
+      anchor_y_count.push_back(anchor_coor_y);
+    }
+
+    // Emit anchors: y-major over cells, then size class c, then each of
+    // that class's rotation variants.
+    for (int y = 0; y < y_ind_end - y_ind_start; ++y) {
+      for (int x = 0; x < x_ind_end - x_ind_start; ++x) {
+        int ro_count = 0;
+        for (size_t c = 0; c < kNumAnchorRo[head].size(); ++c) {
+          for (int i = 0; i < kNumAnchorRo[head][c]; ++i) {
+            anchors_px_[ind] = anchor_x_count[x];
+            anchors_py_[ind] = anchor_y_count[y];
+            anchors_ro_[ind] = kAnchorRo[head][ro_count];
+            anchors_pz_[ind] = kAnchorZCoors[head][c];
+            anchors_dx_[ind] = kAnchorDxSizes[head][c];
+            anchors_dy_[ind] = kAnchorDySizes[head][c];
+            anchors_dz_[ind] = kAnchorDzSizes[head][c];
+            ro_count++;
+            ind++;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Uploads the precomputed host anchor arrays (box corners for the anchor
+// mask plus the seven anchor attribute arrays) to their device buffers.
+// Called once from InitAnchors after generation/conversion.
+void PointPillars::PutAnchorsInDeviceMemory() {
+  GPU_CHECK(cudaMemcpy(dev_box_anchors_min_x_, box_anchors_min_x_,
+                       kNumAnchor * sizeof(float), cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_box_anchors_min_y_, box_anchors_min_y_,
+                       kNumAnchor * sizeof(float), cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_box_anchors_max_x_, box_anchors_max_x_,
+                       kNumAnchor * sizeof(float), cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_box_anchors_max_y_, box_anchors_max_y_,
+                       kNumAnchor * sizeof(float), cudaMemcpyHostToDevice));
+
+  GPU_CHECK(cudaMemcpy(dev_anchors_px_, anchors_px_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_anchors_py_, anchors_py_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_anchors_pz_, anchors_pz_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_anchors_dx_, anchors_dx_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_anchors_dy_, anchors_dy_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_anchors_dz_, anchors_dz_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_anchors_ro_, anchors_ro_, kNumAnchor * sizeof(float),
+                       cudaMemcpyHostToDevice));
+}
+
+// Converts each oriented anchor into an axis-aligned box (min/max corners)
+// consumed by the anchor-mask kernel.
+// @param anchors_px / anchors_py anchor center coordinates (size kNumAnchor).
+// @param box_anchors_min_x_ .. box_anchors_max_y_ output corner arrays
+//        (size kNumAnchor); these trailing-underscore parameter names shadow
+//        the identically named members they are called with from InitAnchors.
+void PointPillars::ConvertAnchors2BoxAnchors(float* anchors_px,
+                                             float* anchors_py,
+                                             float* box_anchors_min_x_,
+                                             float* box_anchors_min_y_,
+                                             float* box_anchors_max_x_,
+                                             float* box_anchors_max_y_) {
+  // flipping box's dimension
+  // Anchors rotated ~90 deg (ro > 0.78 rad) swap dx/dy so the axis-aligned
+  // box matches the rotated footprint. kNumAnchor is ~470k, so the previous
+  // float[kNumAnchor] stack arrays (~1.8 MB each, ~3.7 MB total) risked a
+  // stack overflow; allocate them on the heap instead.
+  std::vector<float> flipped_anchors_dx(kNumAnchor, 0.0f);
+  std::vector<float> flipped_anchors_dy(kNumAnchor, 0.0f);
+  int ind = 0;
+  for (size_t head = 0; head < kNumAnchorSets.size(); ++head) {
+    int num_x_inds =
+        (kAnchorRanges[head * 4 + 1] - kAnchorRanges[head * 4 + 0]) /
+        kAnchorStrides[head];
+    int num_y_inds =
+        (kAnchorRanges[head * 4 + 3] - kAnchorRanges[head * 4 + 2]) /
+        kAnchorStrides[head];
+    int base_ind = ind;
+    int ro_count = 0;
+    // First pass: record the (possibly swapped) dx/dy of every anchor in
+    // this head.
+    for (int x = 0; x < num_x_inds; ++x) {
+      for (int y = 0; y < num_y_inds; ++y) {
+        ro_count = 0;
+        for (size_t c = 0; c < kNumAnchorRo[head].size(); ++c) {
+          for (int i = 0; i < kNumAnchorRo[head][c]; ++i) {
+            if (kAnchorRo[head][ro_count] <= 0.78) {
+              flipped_anchors_dx[base_ind] = kAnchorDxSizes[head][c];
+              flipped_anchors_dy[base_ind] = kAnchorDySizes[head][c];
+            } else {
+              flipped_anchors_dx[base_ind] = kAnchorDySizes[head][c];
+              flipped_anchors_dy[base_ind] = kAnchorDxSizes[head][c];
+            }
+            ro_count++;
+            base_ind++;
+          }
+        }
+      }
+    }
+    // Second pass: expand centers +/- half extents into the corner arrays.
+    for (int x = 0; x < num_x_inds; ++x) {
+      for (int y = 0; y < num_y_inds; ++y) {
+        for (size_t i = 0; i < kAnchorRo[head].size(); ++i) {
+          box_anchors_min_x_[ind] =
+              anchors_px[ind] - flipped_anchors_dx[ind] / 2.0f;
+          box_anchors_min_y_[ind] =
+              anchors_py[ind] - flipped_anchors_dy[ind] / 2.0f;
+          box_anchors_max_x_[ind] =
+              anchors_px[ind] + flipped_anchors_dx[ind] / 2.0f;
+          box_anchors_max_y_[ind] =
+              anchors_py[ind] + flipped_anchors_dy[ind] / 2.0f;
+          ind++;
+        }
+      }
+    }
+  }
+}
+
+// Loads the TorchScript PFE model onto the GPU selected by gpu_id_ (set
+// elsewhere; not visible in this file -- TODO confirm its default).
+// NOTE(review): on the CPU branch device_id_ is never assigned here, yet it
+// is passed to torch::Device below and checked in DoInference -- confirm it
+// has a sane in-class default.
+void PointPillars::InitTorch() {
+  if (gpu_id_ >= 0) {
+    device_type_ = torch::kCUDA;
+    device_id_ = gpu_id_;
+  } else {
+    device_type_ = torch::kCPU;
+  }
+
+  // Init torch net
+  torch::Device device(device_type_, device_id_);
+  pfe_net_ = torch::jit::load(pfe_torch_file_, device);
+}
+
+// Builds a TensorRT engine from the RPN ONNX model and creates the
+// execution context used by DoInference.
+// NOTE(review): every failure branch below only logs to std::cerr and falls
+// through, so a failed load dereferences a null pointer on the next line --
+// consider early-exit or throwing (and AERROR for consistency with the rest
+// of the module, since cyber/common/log.h is already included).
+void PointPillars::InitTRT() {
+  // create a TensorRT model from the onnx model and serialize it to a stream
+  nvinfer1::IHostMemory* rpn_trt_model_stream{nullptr};
+  OnnxToTRTModel(rpn_onnx_file_, &rpn_trt_model_stream);
+  if (rpn_trt_model_stream == nullptr) {
+    std::cerr << "Failed to load ONNX file " << std::endl;
+  }
+
+  // deserialize the engine
+  rpn_runtime_ = nvinfer1::createInferRuntime(g_logger_);
+  if (rpn_runtime_ == nullptr) {
+    std::cerr << "Failed to create TensorRT Runtime object." << std::endl;
+  }
+  rpn_engine_ = rpn_runtime_->deserializeCudaEngine(
+      rpn_trt_model_stream->data(), rpn_trt_model_stream->size(), nullptr);
+  if (rpn_engine_ == nullptr) {
+    std::cerr << "Failed to create TensorRT Engine." << std::endl;
+  }
+  // The serialized stream is no longer needed once deserialized.
+  rpn_trt_model_stream->destroy();
+  rpn_context_ = rpn_engine_->createExecutionContext();
+  if (rpn_context_ == nullptr) {
+    std::cerr << "Failed to create TensorRT Execution Context." << std::endl;
+  }
+}
+
+// Parses an ONNX model and builds + serializes a TensorRT engine.
+// @param model_file path to the ONNX file.
+// @param trt_model_stream receives the serialized engine; caller owns it
+//        and must destroy() it (InitTRT does).
+// NOTE(review): exit(EXIT_FAILURE) on parse error terminates the whole
+// process without destroying builder/network -- acceptable only if a
+// missing model is considered fatal at startup.
+void PointPillars::OnnxToTRTModel(
+    const std::string& model_file,  // name of the onnx model
+    nvinfer1::IHostMemory**
+        trt_model_stream) {  // output buffer for the TensorRT model
+  int verbosity = static_cast(nvinfer1::ILogger::Severity::kWARNING);
+
+  // create the builder
+  // Explicit-batch flag is required for ONNX-parsed networks.
+  const auto explicit_batch =
+      static_cast(kBatchSize) << static_cast(
+          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
+  nvinfer1::INetworkDefinition* network =
+      builder->createNetworkV2(explicit_batch);
+
+  auto parser = nvonnxparser::createParser(*network, g_logger_);
+  if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
+    std::string msg("failed to parse onnx file");
+    g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+    exit(EXIT_FAILURE);
+  }
+
+  // Build the engine
+  builder->setMaxBatchSize(kBatchSize);
+  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
+  // 1 MiB workspace -- presumably enough for this RPN; raise if TRT reports
+  // insufficient workspace during tactic selection.
+  config->setMaxWorkspaceSize(1 << 20);
+  nvinfer1::ICudaEngine* engine =
+      builder->buildEngineWithConfig(*network, *config);
+
+  parser->destroy();
+
+  // serialize the engine, then close everything down
+  *trt_model_stream = engine->serialize();
+  engine->destroy();
+  network->destroy();
+  config->destroy();
+  builder->destroy();
+}
+
+// Reproducible CPU preprocessing path: pillarizes the point cloud on the
+// host, then zeroes and refills the corresponding device buffers.
+// @param in_points_array flat host array, kNumPointFeature floats per point.
+// @param in_num_points number of points in in_points_array.
+// NOTE(review): the host scratch buffers are heap-allocated and freed on
+// every call -- member buffers or std::vector would avoid the churn; raw
+// new[]/delete[] also leaks if Preprocess() throws in between.
+void PointPillars::PreprocessCPU(const float* in_points_array,
+                                 const int in_num_points) {
+  int x_coors[kMaxNumPillars] = {};
+  int y_coors[kMaxNumPillars] = {};
+  float num_points_per_pillar[kMaxNumPillars] = {};
+
+  float* pillar_point_feature =
+      new float[kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature];
+  float* pillar_coors = new float[kMaxNumPillars * 4];
+  float* sparse_pillar_map = new float[kNumIndsForScan * kNumIndsForScan];
+
+  preprocess_points_ptr_->Preprocess(in_points_array, in_num_points, x_coors,
+                                     y_coors, num_points_per_pillar,
+                                     pillar_point_feature, pillar_coors,
+                                     sparse_pillar_map, host_pillar_count_);
+
+  // Clear device buffers before upload (only host_pillar_count_ pillars are
+  // valid, but the full buffers are copied below).
+  GPU_CHECK(cudaMemset(dev_x_coors_, 0, kMaxNumPillars * sizeof(int)));
+  GPU_CHECK(cudaMemset(dev_y_coors_, 0, kMaxNumPillars * sizeof(int)));
+  GPU_CHECK(cudaMemset(dev_pillar_point_feature_, 0,
+                       kMaxNumPillars * kMaxNumPointsPerPillar *
+                           kNumPointFeature * sizeof(float)));
+  GPU_CHECK(
+      cudaMemset(dev_pillar_coors_, 0, kMaxNumPillars * 4 * sizeof(float)));
+  GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0,
+                       kMaxNumPillars * sizeof(float)));
+  GPU_CHECK(cudaMemset(dev_sparse_pillar_map_, 0,
+                       kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+
+  GPU_CHECK(cudaMemcpy(dev_x_coors_, x_coors, kMaxNumPillars * sizeof(int),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_y_coors_, y_coors, kMaxNumPillars * sizeof(int),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_pillar_point_feature_, pillar_point_feature,
+                       kMaxNumPillars * kMaxNumPointsPerPillar *
+                           kNumPointFeature * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_pillar_coors_, pillar_coors,
+                       kMaxNumPillars * 4 * sizeof(float),
+                       cudaMemcpyHostToDevice));
+  GPU_CHECK(cudaMemcpy(dev_num_points_per_pillar_, num_points_per_pillar,
+                       kMaxNumPillars * sizeof(float), cudaMemcpyHostToDevice));
+  // NOTE(review): the host map is float[] but the device buffer was
+  // allocated/memset with sizeof(int); the byte counts match only because
+  // sizeof(int) == sizeof(float) on supported targets -- the element types
+  // should agree. TODO confirm the intended element type of
+  // dev_sparse_pillar_map_.
+  GPU_CHECK(cudaMemcpy(dev_sparse_pillar_map_, sparse_pillar_map,
+                       kNumIndsForScan * kNumIndsForScan * sizeof(float),
+                       cudaMemcpyHostToDevice));
+
+  delete[] pillar_point_feature;
+  delete[] pillar_coors;
+  delete[] sparse_pillar_map;
+}
+
+// GPU preprocessing path: uploads the raw point cloud, clears all
+// per-frame device buffers, and runs the CUDA pillarization kernels.
+// @param in_points_array flat host array, kNumPointFeature floats per point.
+// @param in_num_points number of points in in_points_array.
+// NOTE(review): dev_points is cudaMalloc'd/cudaFree'd every frame --
+// a persistent (grow-only) buffer would avoid per-frame allocation cost.
+void PointPillars::PreprocessGPU(const float* in_points_array,
+                                 const int in_num_points) {
+  float* dev_points;
+  GPU_CHECK(cudaMalloc(reinterpret_cast(&dev_points),
+                       in_num_points * kNumPointFeature * sizeof(float)));
+  GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
+                       in_num_points * kNumPointFeature * sizeof(float),
+                       cudaMemcpyHostToDevice));
+
+  // Reset every per-frame buffer the kernels accumulate into.
+  GPU_CHECK(cudaMemset(dev_x_coors_, 0, kMaxNumPillars * sizeof(int)));
+  GPU_CHECK(cudaMemset(dev_y_coors_, 0, kMaxNumPillars * sizeof(int)));
+  GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0,
+                       kMaxNumPillars * sizeof(float)));
+  GPU_CHECK(cudaMemset(dev_pillar_point_feature_, 0,
+                       kMaxNumPillars * kMaxNumPointsPerPillar *
+                           kNumPointFeature * sizeof(float)));
+  GPU_CHECK(
+      cudaMemset(dev_pillar_coors_, 0, kMaxNumPillars * 4 * sizeof(float)));
+  GPU_CHECK(cudaMemset(dev_sparse_pillar_map_, 0,
+                       kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+  host_pillar_count_[0] = 0;
+
+  GPU_CHECK(cudaMemset(dev_anchor_mask_, 0, kNumAnchor * sizeof(int)));
+
+  preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
+      dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
+      dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
+      dev_sparse_pillar_map_, host_pillar_count_);
+
+  GPU_CHECK(cudaFree(dev_points));
+}
+
+// Dispatches to the CPU (bit-reproducible) or CUDA preprocessing path
+// according to the mode chosen at construction.
+void PointPillars::Preprocess(const float* in_points_array,
+                              const int in_num_points) {
+  if (reproduce_result_mode_) {
+    PreprocessCPU(in_points_array, in_num_points);
+  } else {
+    PreprocessGPU(in_points_array, in_num_points);
+  }
+}
+
+// Runs the full pipeline on one point cloud:
+// preprocess -> anchor mask -> PFE (Torch) -> scatter -> RPN (TensorRT)
+// -> postprocess/NMS.
+// @param in_points_array flat host array, kNumPointFeature floats per point.
+// @param in_num_points number of points.
+// @param out_detections appended with kNumOutputBoxFeature floats per box.
+// @param out_labels appended with one class label per box.
+void PointPillars::DoInference(const float* in_points_array,
+                               const int in_num_points,
+                               std::vector* out_detections,
+                               std::vector* out_labels) {
+  if (device_id_ < 0) {
+    AERROR << "Torch is not using GPU!";
+    return;
+  }
+
+  Preprocess(in_points_array, in_num_points);
+  anchor_mask_cuda_ptr_->DoAnchorMaskCuda(
+      dev_sparse_pillar_map_, dev_cumsum_along_x_, dev_cumsum_along_y_,
+      dev_box_anchors_min_x_, dev_box_anchors_min_y_, dev_box_anchors_max_x_,
+      dev_box_anchors_max_y_, dev_anchor_mask_);
+
+  cudaStream_t stream;
+  GPU_CHECK(cudaStreamCreate(&stream));
+  // Stage PFE inputs into the TRT-style device buffers (device-to-device).
+  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pillar_point_feature_,
+                            kMaxNumPillars * kMaxNumPointsPerPillar *
+                                kNumPointFeature * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[1], dev_num_points_per_pillar_,
+                            kMaxNumPillars * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[2], dev_pillar_coors_,
+                            kMaxNumPillars * 4 * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+  // Wrap the device buffers as CUDA tensors without copying.
+  torch::Tensor tensor_pillar_point_feature = torch::from_blob(pfe_buffers_[0],
+      {kMaxNumPillars, kMaxNumPointsPerPillar, kNumPointFeature},
+      torch::kCUDA);
+  torch::Tensor tensor_num_points_per_pillar = torch::from_blob(
+      pfe_buffers_[1], {kMaxNumPillars}, torch::kCUDA);
+  torch::Tensor tensor_pillar_coors = torch::from_blob(pfe_buffers_[2],
+      {kMaxNumPillars, 4}, torch::kCUDA);
+  torch::Device device(device_type_, device_id_);
+  // Tensor::to() returns a new tensor; the previous code discarded the
+  // result, making these calls no-ops. Assign the results back.
+  tensor_pillar_point_feature = tensor_pillar_point_feature.to(device);
+  tensor_num_points_per_pillar = tensor_num_points_per_pillar.to(device);
+  tensor_pillar_coors = tensor_pillar_coors.to(device);
+
+//***********************************************************************
+//bug:operation failed in the TorchScript interpreter
+  auto pfe_output = pfe_net_.forward({tensor_pillar_point_feature,
+                                      tensor_num_points_per_pillar,
+                                      tensor_pillar_coors}).toTensor();
+
+  // Scatter per-pillar features back onto the dense BEV grid for the RPN.
+  GPU_CHECK(
+      cudaMemset(dev_scattered_feature_, 0, kRpnInputSize * sizeof(float)));
+  scatter_cuda_ptr_->DoScatterCuda(
+      host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
+      pfe_output.data_ptr(), dev_scattered_feature_);
+  GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
+                            kBatchSize * kRpnInputSize * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+  rpn_context_->enqueueV2(rpn_buffers_, stream, nullptr);
+  GPU_CHECK(cudaMemset(dev_filter_count_, 0, sizeof(int)));
+//bug:radix_sort: failed on 2nd step: cudaErrorInvalidValue: invalid argument
+  postprocess_cuda_ptr_->DoPostprocessCuda(
+      reinterpret_cast(rpn_buffers_[1]),
+      reinterpret_cast(rpn_buffers_[2]),
+      reinterpret_cast(rpn_buffers_[3]), dev_anchor_mask_,
+      dev_anchors_px_, dev_anchors_py_, dev_anchors_pz_, dev_anchors_dx_,
+      dev_anchors_dy_, dev_anchors_dz_, dev_anchors_ro_, dev_filtered_box_,
+      dev_filtered_score_, dev_filtered_label_, dev_filtered_dir_,
+      dev_box_for_nms_, dev_filter_count_, out_detections, out_labels);
+//***************************************************************************
+
+  // release the stream and the buffers
+  // (now error-checked like every other CUDA call in this file)
+  GPU_CHECK(cudaStreamDestroy(stream));
+}
+
+} // namespace lidar
+} // namespace perception
+} // namespace apollo
diff --git a/modules/perception/lidar_point_pillars/point_pillars.h b/modules/perception/lidar_point_pillars/point_pillars.h
new file mode 100755
index 0000000..728e4f9
--- /dev/null
+++ b/modules/perception/lidar_point_pillars/point_pillars.h
@@ -0,0 +1,321 @@
+#pragma once
+
+// headers in STL
+#include
+#include
+#include
+#include
+#include