From 3709f4434f3a19d63f5338e060cf63fb5f55e58d Mon Sep 17 00:00:00 2001 From: pnikutta Date: Mon, 21 Jul 2025 16:02:58 +0200 Subject: [PATCH 1/4] fix invalid pointer when running with two points --- cpp/dbscan.cpp | 23 +++++++++++++++++------ python/dbscan_test.py | 12 ++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/cpp/dbscan.cpp b/cpp/dbscan.cpp index 0f43514..1d6066e 100644 --- a/cpp/dbscan.cpp +++ b/cpp/dbscan.cpp @@ -52,8 +52,12 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto // FIRST PASS OVER THE POINTS for (auto const& pt : points) { - auto const bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; - auto const bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; + auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + + bin_x = bin_x < num_bins_x ? bin_x : num_bins_x - 1; + bin_y = bin_y < num_bins_y ? bin_y : num_bins_y - 1; + auto const index{bin_y * num_bins_x + bin_x}; counts_[index] += 1; } @@ -68,9 +72,13 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto std::vector new_point_to_point_index_map(std::size(points)); std::uint32_t i{0}; for (auto const& pt : points) { - auto const bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; - auto const bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; + auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + + bin_x = bin_x < num_bins_x ? bin_x : num_bins_x - 1; + bin_y = bin_y < num_bins_y ? bin_y : num_bins_y - 1; auto const index{bin_y * num_bins_x + bin_x}; + auto const new_pt_index{scratch[index]}; scratch[index] += 1; new_points[new_pt_index] = pt; @@ -86,8 +94,11 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto #pragma omp parallel for for (auto i = 0UL; i < std::size(new_points); ++i) { auto const pt{new_points[i]}; - auto const bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; - auto const bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; + auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + + bin_x = bin_x < static_cast(num_bins_x) ? bin_x : num_bins_x - 1; + bin_y = bin_y < static_cast(num_bins_y) ? bin_y : num_bins_y - 1; std::vector local_neighbors; diff --git a/python/dbscan_test.py b/python/dbscan_test.py index 52d49d3..6d2dceb 100644 --- a/python/dbscan_test.py +++ b/python/dbscan_test.py @@ -61,5 +61,17 @@ def test_blobs(): assert all(np.count_nonzero(y_pred == label) == 500 for label in range(n_blobs)) +def test_two_points(): + """Test using two points.""" + X = np.array([[0, 0], [1, 1]]) + + dbscan = py_dbscan.DBSCAN(0.5, 2) + y_pred = dbscan.fit_predict(X) + + assert y_pred.shape[0] == 2 + np.testing.assert_array_equal(y_pred, np.array([-1, -1])) + + + if __name__ == "__main__": sys.exit(pytest.main([__file__, "-rP"])) From 75b6d49cc91d2ac1e80ba5b031d54cdca779e0ad Mon Sep 17 00:00:00 2001 From: pnikutta Date: Thu, 24 Jul 2025 16:30:15 +0200 Subject: [PATCH 2/4] fix: change num_bins calculation instead of bins index directly --- cpp/dbscan.cpp | 25 ++++++++----------------- python/dbscan_test.py | 4 ++-- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/cpp/dbscan.cpp b/cpp/dbscan.cpp index 1d6066e..598a171 100644 --- a/cpp/dbscan.cpp +++ b/cpp/dbscan.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace dbscan { @@ -44,8 +45,9 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto // derive num_bins out of it float const range_x{max[0] - min[0]}; float const range_y{max[1] - min[1]}; - auto const num_bins_x{static_cast(std::ceil(range_x / eps_))}; - auto const num_bins_y{static_cast(std::ceil(range_y / eps_))}; + // add 1e-7 to handle the case where range is exactly divisible by eps_ + auto const num_bins_x{static_cast(std::ceil((range_x + 1e-7f) / eps_))}; + auto const num_bins_y{static_cast(std::ceil((range_y + 1e-7f) / eps_))}; // count number of points in every bin counts_.assign(num_bins_x * num_bins_y, 0); @@ -54,10 +56,6 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto for (auto const& pt : points) { auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; - - bin_x = bin_x < num_bins_x ? bin_x : num_bins_x - 1; - bin_y = bin_y < num_bins_y ? bin_y : num_bins_y - 1; - auto const index{bin_y * num_bins_x + bin_x}; counts_[index] += 1; } @@ -72,13 +70,9 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto std::vector new_point_to_point_index_map(std::size(points)); std::uint32_t i{0}; for (auto const& pt : points) { - auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; - auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; - - bin_x = bin_x < num_bins_x ? bin_x : num_bins_x - 1; - bin_y = bin_y < num_bins_y ? bin_y : num_bins_y - 1; + auto const bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; + auto const bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; auto const index{bin_y * num_bins_x + bin_x}; - auto const new_pt_index{scratch[index]}; scratch[index] += 1; new_points[new_pt_index] = pt; @@ -94,11 +88,8 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto #pragma omp parallel for for (auto i = 0UL; i < std::size(new_points); ++i) { auto const pt{new_points[i]}; - auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; - auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; - - bin_x = bin_x < static_cast(num_bins_x) ? bin_x : num_bins_x - 1; - bin_y = bin_y < static_cast(num_bins_y) ? bin_y : num_bins_y - 1; + auto const bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; + auto const bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; std::vector local_neighbors; diff --git a/python/dbscan_test.py b/python/dbscan_test.py index 6d2dceb..bc7963f 100644 --- a/python/dbscan_test.py +++ b/python/dbscan_test.py @@ -61,8 +61,8 @@ def test_blobs(): assert all(np.count_nonzero(y_pred == label) == 500 for label in range(n_blobs)) -def test_two_points(): - """Test using two points.""" +def test_points_on_border(): + """Test using two points on the border with a eps which divides point / min_point without remainder.""" X = np.array([[0, 0], [1, 1]]) dbscan = py_dbscan.DBSCAN(0.5, 2) From 5adc01bc342375008d65547aabff7f9337862df6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Jul 2025 14:32:25 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- python/dbscan_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/dbscan_test.py b/python/dbscan_test.py index bc7963f..a1d4deb 100644 --- a/python/dbscan_test.py +++ b/python/dbscan_test.py @@ -72,6 +72,5 @@ def test_points_on_border(): np.testing.assert_array_equal(y_pred, np.array([-1, -1])) - if __name__ == "__main__": sys.exit(pytest.main([__file__, "-rP"])) From 0768089e2a9c6cdc2db107470f57e3ea4043a144 Mon Sep 17 00:00:00 2001 From: Alexander Hans Date: Thu, 24 Jul 2025 23:15:15 +0200 Subject: [PATCH 4/4] cleanup --- cpp/dbscan.cpp | 5 ++--- python/dbscan_test.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/dbscan.cpp b/cpp/dbscan.cpp index 598a171..6011473 100644 --- a/cpp/dbscan.cpp +++ b/cpp/dbscan.cpp @@ -4,7 +4,6 @@ #include #include #include -#include namespace dbscan { @@ -54,8 +53,8 @@ auto Dbscan::fit_predict(std::vector const& points) -> std::vecto // FIRST PASS OVER THE POINTS for (auto const& pt : points) { - auto bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; - auto bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; + auto const bin_x{static_cast(std::floor((pt[0] - min[0]) / eps_))}; + auto const bin_y{static_cast(std::floor((pt[1] - min[1]) / eps_))}; auto const index{bin_y * num_bins_x + bin_x}; counts_[index] += 1; } diff --git a/python/dbscan_test.py b/python/dbscan_test.py index a1d4deb..634792b 100644 --- a/python/dbscan_test.py +++ b/python/dbscan_test.py @@ -62,14 +62,14 @@ def test_blobs(): def test_points_on_border(): - """Test using two points on the border with a eps which divides point / min_point without remainder.""" + """Test using two points on the border with an eps which divides (point - min_point) without remainder.""" X = np.array([[0, 0], [1, 1]]) dbscan = py_dbscan.DBSCAN(0.5, 2) y_pred = dbscan.fit_predict(X) assert y_pred.shape[0] == 2 - np.testing.assert_array_equal(y_pred, np.array([-1, -1])) + np.testing.assert_equal(y_pred, np.array([-1, -1])) if __name__ == "__main__":