From e4db6b22e83edd9a152403e22d69228bbe16cf20 Mon Sep 17 00:00:00 2001 From: Danil An Date: Thu, 26 Feb 2026 20:03:19 +0400 Subject: [PATCH 1/4] build fix (with UTL_PROFILER_DISABLE=OFF) --- .../lidar_odometry_utils_optimizers.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp index d13e97ae..c273ac3f 100644 --- a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp +++ b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp @@ -2335,6 +2335,8 @@ bool process_worker_step_1( std::atomic& loProgress, double& ts_failure) { + UTL_PROFILER_SCOPE("process_worker_step_1"); + Eigen::Vector3d mean_shift(0.0, 0.0, 0.0); if (i > 1 && params.use_motion_from_previous_step) { @@ -2427,6 +2429,8 @@ bool process_worker_step_2( double& ts_failure, std::vector& intermediate_points) { + UTL_PROFILER_SCOPE("process_worker_step_2"); + bool add_pitch_roll_constraint = false; spdlog::stopwatch stopwatch_worker; @@ -2583,8 +2587,7 @@ bool process_worker_step_lidar_odometry_core( { spdlog::stopwatch stopwatch_realtime; - UTL_PROFILER_END(before_iter); - UTL_PROFILER_BEGIN(iter_loop, "iteration_loop"); + UTL_PROFILER_SCOPE("process_worker_step_lidar_odometry_core"); for (int iter = 0; iter < params.nr_iter; iter++) { @@ -2646,7 +2649,6 @@ bool process_worker_step_lidar_odometry_core( break; } } - UTL_PROFILER_END(iter_loop); return true; } @@ -2773,6 +2775,7 @@ bool compute_step_2( for (int i = 0; i < worker_data.size(); i++) { UTL_PROFILER_BEGIN(before_iter, "before_iterations"); + std::vector intermediate_points; // = worker_data[i].load_points(worker_data[i].intermediate_points_cache_file_name); load_vector_data(worker_data[i].intermediate_points_cache_file_name.string(), intermediate_points); @@ -2830,6 +2833,8 @@ bool compute_step_2( double delta = 100000.0; double lm_factor = 1.0; + UTL_PROFILER_END(before_iter); + process_worker_step_lidar_odometry_core( worker_data[i], i > 0 ? worker_data[i - 1] : worker_data[0], From 58841b19ad7c42deb077224b2fc4bfdb734c5c78 Mon Sep 17 00:00:00 2001 From: Danil An Date: Thu, 26 Feb 2026 20:34:53 +0400 Subject: [PATCH 2/4] Fix: replace tbb::combinable with fixed-size chunk accumulators for deterministic multithreaded Hessian accumulation --- .../lidar_odometry_utils_optimizers.cpp | 101 +++++++++--------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp index c273ac3f..a2bfa881 100644 --- a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp +++ b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp @@ -4,8 +4,8 @@ #include #include #include -#include #include +#include #include @@ -1826,58 +1826,57 @@ void optimize_lidar_odometry( UTL_PROFILER_BEGIN(hessian_compute, "hessian_compute"); if (multithread) { - using MatrixPair = std::pair; - tbb::combinable thread_local_matrices( - [&intermediate_trajectory]() - { - return std::make_pair( - Eigen::MatrixXd::Zero(intermediate_trajectory.size() * 6, intermediate_trajectory.size() * 6), - Eigen::MatrixXd::Zero(intermediate_trajectory.size() * 6, 1)); - }); - tbb::combinable thread_local_stats; - - tbb::parallel_for( - tbb::blocked_range(0, intermediate_points.size()), - [&](const tbb::blocked_range& range) - { - auto& [local_AtPA, local_AtPB] = thread_local_matrices.local(); - auto& local_stats = thread_local_stats.local(); - for (size_t i = range.begin(); i != range.end(); ++i) - { - compute_hessian( - intermediate_points[i], - intermediate_trajectory, - tait_bryan_poses, - buckets_indoor, - bucket_size_indoor, - buckets_outdoor, - bucket_size_outdoor, - max_distance, - ablation_study_use_hierarchical_rgd, - ablation_study_use_view_point_and_normal_vectors, - ablation_study_use_planarity, - ablation_study_use_norm, - local_AtPA, - local_AtPB, - local_stats, - ablation_study_use_threshold_outer_rgd, - convergence_result, - convergence_delta_threshold_outer_rgd); - } - }); + const size_t num_points = intermediate_points.size(); + const size_t hw_threads = std::thread::hardware_concurrency(); + const size_t num_chunks = hw_threads > 0 ? hw_threads : 4; // fallback to 4 if hw_concurrency unknown + const size_t chunk_size = (num_points + num_chunks - 1) / num_chunks; // ceiling division - thread_local_matrices.combine_each( - [&AtPAndt, &AtPBndt](const MatrixPair& local) - { - AtPAndt += local.first; - AtPBndt += local.second; - }); - thread_local_stats.combine_each( - [&lookup_stats](const LookupStats& local) + const size_t mat_dim = intermediate_trajectory.size() * 6; + std::vector> chunk_matrices(num_chunks); + std::vector chunk_stats(num_chunks); + for (size_t c = 0; c < num_chunks; ++c) + { + chunk_matrices[c].first = Eigen::MatrixXd::Zero(mat_dim, mat_dim); + chunk_matrices[c].second = Eigen::MatrixXd::Zero(mat_dim, 1); + } + + tbb::parallel_for(size_t(0), num_chunks, [&](size_t chunk) + { + const size_t start = chunk * chunk_size; + const size_t end = std::min(start + chunk_size, num_points); + auto& [local_AtPA, local_AtPB] = chunk_matrices[chunk]; + auto& local_stats = chunk_stats[chunk]; + for (size_t i = start; i < end; ++i) { - lookup_stats.indoor_lookups += local.indoor_lookups; - lookup_stats.outdoor_lookups += local.outdoor_lookups; - }); + compute_hessian( + intermediate_points[i], + intermediate_trajectory, + tait_bryan_poses, + buckets_indoor, + bucket_size_indoor, + buckets_outdoor, + bucket_size_outdoor, + max_distance, + ablation_study_use_hierarchical_rgd, + ablation_study_use_view_point_and_normal_vectors, + ablation_study_use_planarity, + ablation_study_use_norm, + local_AtPA, + local_AtPB, + local_stats, + ablation_study_use_threshold_outer_rgd, + convergence_result, + convergence_delta_threshold_outer_rgd); + } + }); + + for (size_t c = 0; c < num_chunks; ++c) + { + AtPAndt += chunk_matrices[c].first; + AtPBndt += chunk_matrices[c].second; + lookup_stats.indoor_lookups += chunk_stats[c].indoor_lookups; + lookup_stats.outdoor_lookups += chunk_stats[c].outdoor_lookups; + } } else { From e4c027092f95e9a3360b055d6c450feb089e2895 Mon Sep 17 00:00:00 2001 From: Danil An Date: Thu, 26 Feb 2026 20:41:50 +0400 Subject: [PATCH 3/4] cland formatting --- .../lidar_odometry_utils_optimizers.cpp | 61 ++++++++++--------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp index a2bfa881..501d158f 100644 --- a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp +++ b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp @@ -1840,35 +1840,38 @@ void optimize_lidar_odometry( chunk_matrices[c].second = Eigen::MatrixXd::Zero(mat_dim, 1); } - tbb::parallel_for(size_t(0), num_chunks, [&](size_t chunk) - { - const size_t start = chunk * chunk_size; - const size_t end = std::min(start + chunk_size, num_points); - auto& [local_AtPA, local_AtPB] = chunk_matrices[chunk]; - auto& local_stats = chunk_stats[chunk]; - for (size_t i = start; i < end; ++i) + tbb::parallel_for( + size_t(0), + num_chunks, + [&](size_t chunk) { - compute_hessian( - intermediate_points[i], - intermediate_trajectory, - tait_bryan_poses, - buckets_indoor, - bucket_size_indoor, - buckets_outdoor, - bucket_size_outdoor, - max_distance, - ablation_study_use_hierarchical_rgd, - ablation_study_use_view_point_and_normal_vectors, - ablation_study_use_planarity, - ablation_study_use_norm, - local_AtPA, - local_AtPB, - local_stats, - ablation_study_use_threshold_outer_rgd, - convergence_result, - convergence_delta_threshold_outer_rgd); - } - }); + const size_t start = chunk * chunk_size; + const size_t end = std::min(start + chunk_size, num_points); + auto& [local_AtPA, local_AtPB] = chunk_matrices[chunk]; + auto& local_stats = chunk_stats[chunk]; + for (size_t i = start; i < end; ++i) + { + compute_hessian( + intermediate_points[i], + intermediate_trajectory, + tait_bryan_poses, + buckets_indoor, + bucket_size_indoor, + buckets_outdoor, + bucket_size_outdoor, + max_distance, + ablation_study_use_hierarchical_rgd, + ablation_study_use_view_point_and_normal_vectors, + ablation_study_use_planarity, + ablation_study_use_norm, + local_AtPA, + local_AtPB, + local_stats, + ablation_study_use_threshold_outer_rgd, + convergence_result, + convergence_delta_threshold_outer_rgd); + } + }); for (size_t c = 0; c < num_chunks; ++c) { @@ -2774,7 +2777,7 @@ bool compute_step_2( for (int i = 0; i < worker_data.size(); i++) { UTL_PROFILER_BEGIN(before_iter, "before_iterations"); - + std::vector intermediate_points; // = worker_data[i].load_points(worker_data[i].intermediate_points_cache_file_name); load_vector_data(worker_data[i].intermediate_points_cache_file_name.string(), intermediate_points); From 5932033553d6fdeae4726b3e6a9ca85618b88387 Mon Sep 17 00:00:00 2001 From: Danil An Date: Fri, 27 Feb 2026 11:07:14 +0400 Subject: [PATCH 4/4] Fix: deterministic Hessian accumulation via per-point storage and in-order sequential summation, making ST and MT paths similar and deterministic --- .../lidar_odometry_utils_optimizers.cpp | 168 +++++++----------- 1 file changed, 63 insertions(+), 105 deletions(-) diff --git a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp index 501d158f..fef3b255 100644 --- a/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp +++ b/apps/lidar_odometry_step_1/lidar_odometry_utils_optimizers.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include @@ -1452,12 +1451,11 @@ static void add_indoor_hessian_contribution( const Eigen::Vector3d& point_source, const TaitBryanPoseWithTrig& pose_trig, const Eigen::Vector3d& viewport, - const int matrix_offset, const bool ablation_study_use_view_point_and_normal_vectors, const bool ablation_study_use_planarity, const bool ablation_study_use_norm, - Eigen::MatrixXd& AtPAndt, - Eigen::MatrixXd& AtPBndt) + Eigen::Matrix& out_AtPA, + Eigen::Matrix& out_AtPB) { // Use precomputed cov_inverse from update_rgd const Eigen::Matrix3d& infm = indoor_bucket.cov_inverse; @@ -1551,8 +1549,8 @@ static void add_indoor_hessian_contribution( AtPB *= w; } - AtPAndt.block<6, 6>(matrix_offset, matrix_offset) += AtPA; - AtPBndt.block<6, 1>(matrix_offset, 0) -= AtPB; + out_AtPA += AtPA; + out_AtPB += AtPB; } // Helper to process outdoor bucket contribution @@ -1561,11 +1559,10 @@ static void add_outdoor_hessian_contribution( const Eigen::Vector3d& point_source, const TaitBryanPoseWithTrig& pose_trig, const Eigen::Vector3d& viewport, - const int matrix_offset, const bool ablation_study_use_view_point_and_normal_vectors, const bool ablation_study_use_planarity, - Eigen::MatrixXd& AtPAndt, - Eigen::MatrixXd& AtPBndt) + Eigen::Matrix& out_AtPA, + Eigen::Matrix& out_AtPB) { // Use precomputed cov_inverse from update_rgd const Eigen::Matrix3d& infm = outdoor_bucket.cov_inverse; @@ -1652,12 +1649,13 @@ static void add_outdoor_hessian_contribution( AtPB *= planarity; } - AtPAndt.block<6, 6>(matrix_offset, matrix_offset) += AtPA; - AtPBndt.block<6, 1>(matrix_offset, 0) -= AtPB; + out_AtPA += AtPA; + out_AtPB += AtPB; } // Unified hessian function that handles indoor and optionally outdoor contributions static void compute_hessian( + size_t point_index, const Point3Di& point, const std::vector& trajectory, const std::vector& tait_bryan_poses, @@ -1670,9 +1668,9 @@ static void compute_hessian( const bool ablation_study_use_view_point_and_normal_vectors, const bool ablation_study_use_planarity, const bool ablation_study_use_norm, - Eigen::MatrixXd& AtPAndt, - Eigen::MatrixXd& AtPBndt, - LookupStats& stats, + std::vector>& point_AtPA, + std::vector>& point_AtPB, + std::vector& per_point_stats, const bool& ablation_study_use_threshold_outer_rgd, const double& convergence_result, const double& convergence_delta_threshold_outer_rgd) @@ -1689,29 +1687,25 @@ static void compute_hessian( const Eigen::Vector3d point_global = trajectory[point.index_pose] * point.point; const Eigen::Vector3d& point_source = point.point; const TaitBryanPoseWithTrig& pose_trig = tait_bryan_poses[point.index_pose]; - const int matrix_offset = point.index_pose * 6; const Eigen::Vector3d viewport = trajectory[point.index_pose].translation(); // Indoor contribution (independent) const auto indoor_key = get_rgd_index_3d(point_global, bucket_size_indoor); const auto indoor_it = buckets_indoor.find(indoor_key); - ++stats.indoor_lookups; + ++per_point_stats[point_index].indoor_lookups; - const NDT::Bucket* indoor_bucket = nullptr; if (indoor_it != buckets_indoor.end()) { - indoor_bucket = &indoor_it->second; add_indoor_hessian_contribution( - *indoor_bucket, + indoor_it->second, point_source, pose_trig, viewport, - matrix_offset, ablation_study_use_view_point_and_normal_vectors, ablation_study_use_planarity, ablation_study_use_norm, - AtPAndt, - AtPBndt); + point_AtPA[point_index], + point_AtPB[point_index]); } // Outdoor contribution (independent, only when hierarchical mode and range >= 5.0) @@ -1736,7 +1730,7 @@ static void compute_hessian( { const auto outdoor_key = get_rgd_index_3d(point_global, bucket_size_outdoor); const auto outdoor_it = buckets_outdoor.find(outdoor_key); - ++stats.outdoor_lookups; + ++per_point_stats[point_index].outdoor_lookups; if (outdoor_it != buckets_outdoor.end()) { @@ -1745,11 +1739,10 @@ static void compute_hessian( point_source, pose_trig, viewport, - matrix_offset, ablation_study_use_view_point_and_normal_vectors, ablation_study_use_planarity, - AtPAndt, - AtPBndt); + point_AtPA[point_index], + point_AtPB[point_index]); } } } @@ -1824,87 +1817,52 @@ void optimize_lidar_odometry( UTL_PROFILER_END(pre_hessian); UTL_PROFILER_BEGIN(hessian_compute, "hessian_compute"); - if (multithread) - { - const size_t num_points = intermediate_points.size(); - const size_t hw_threads = std::thread::hardware_concurrency(); - const size_t num_chunks = hw_threads > 0 ? hw_threads : 4; // fallback to 4 if hw_concurrency unknown - const size_t chunk_size = (num_points + num_chunks - 1) / num_chunks; // ceiling division - - const size_t mat_dim = intermediate_trajectory.size() * 6; - std::vector> chunk_matrices(num_chunks); - std::vector chunk_stats(num_chunks); - for (size_t c = 0; c < num_chunks; ++c) - { - chunk_matrices[c].first = Eigen::MatrixXd::Zero(mat_dim, mat_dim); - chunk_matrices[c].second = Eigen::MatrixXd::Zero(mat_dim, 1); - } - - tbb::parallel_for( - size_t(0), - num_chunks, - [&](size_t chunk) - { - const size_t start = chunk * chunk_size; - const size_t end = std::min(start + chunk_size, num_points); - auto& [local_AtPA, local_AtPB] = chunk_matrices[chunk]; - auto& local_stats = chunk_stats[chunk]; - for (size_t i = start; i < end; ++i) - { - compute_hessian( - intermediate_points[i], - intermediate_trajectory, - tait_bryan_poses, - buckets_indoor, - bucket_size_indoor, - buckets_outdoor, - bucket_size_outdoor, - max_distance, - ablation_study_use_hierarchical_rgd, - ablation_study_use_view_point_and_normal_vectors, - ablation_study_use_planarity, - ablation_study_use_norm, - local_AtPA, - local_AtPB, - local_stats, - ablation_study_use_threshold_outer_rgd, - convergence_result, - convergence_delta_threshold_outer_rgd); - } - }); + const size_t num_points = intermediate_points.size(); + using Mat6x6 = Eigen::Matrix; + using Vec6x1 = Eigen::Matrix; + std::vector point_AtPA(num_points, Mat6x6::Zero()); + std::vector point_AtPB(num_points, Vec6x1::Zero()); + std::vector per_point_stats(num_points); + + auto compute_point = [&](size_t i) + { + compute_hessian( + i, + intermediate_points[i], + intermediate_trajectory, + tait_bryan_poses, + buckets_indoor, + bucket_size_indoor, + buckets_outdoor, + bucket_size_outdoor, + max_distance, + ablation_study_use_hierarchical_rgd, + ablation_study_use_view_point_and_normal_vectors, + ablation_study_use_planarity, + ablation_study_use_norm, + point_AtPA, + point_AtPB, + per_point_stats, + ablation_study_use_threshold_outer_rgd, + convergence_result, + convergence_delta_threshold_outer_rgd); + }; - for (size_t c = 0; c < num_chunks; ++c) - { - AtPAndt += chunk_matrices[c].first; - AtPBndt += chunk_matrices[c].second; - lookup_stats.indoor_lookups += chunk_stats[c].indoor_lookups; - lookup_stats.outdoor_lookups += chunk_stats[c].outdoor_lookups; - } - } + if (multithread) + tbb::parallel_for(size_t(0), num_points, compute_point); else - { - for (const auto& pt : intermediate_points) - { - compute_hessian( - pt, - intermediate_trajectory, - tait_bryan_poses, - buckets_indoor, - bucket_size_indoor, - buckets_outdoor, - bucket_size_outdoor, - max_distance, - ablation_study_use_hierarchical_rgd, - ablation_study_use_view_point_and_normal_vectors, - ablation_study_use_planarity, - ablation_study_use_norm, - AtPAndt, - AtPBndt, - lookup_stats, - ablation_study_use_threshold_outer_rgd, - convergence_result, - convergence_delta_threshold_outer_rgd); - } + for (size_t i = 0; i < num_points; ++i) + compute_point(i); + + // Sequential sum in point order -- guarantees identical accumulation order + // for both ST and MT paths, avoiding FP non-associativity nondeterminism. + for (size_t i = 0; i < num_points; ++i) + { + const int c = intermediate_points[i].index_pose * 6; + AtPAndt.block<6, 6>(c, c) += point_AtPA[i]; + AtPBndt.block<6, 1>(c, 0) -= point_AtPB[i]; + lookup_stats.indoor_lookups += per_point_stats[i].indoor_lookups; + lookup_stats.outdoor_lookups += per_point_stats[i].outdoor_lookups; } UTL_PROFILER_END(hessian_compute);