From 1b91666377a34e49548097caf9f482a45a5b4e93 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Tue, 27 Jan 2026 13:23:09 +0530 Subject: [PATCH 1/2] BugFix: SIGABRT in quantiles_sketch::deserialize(): dereferencing empty std::optional (libc++ verbose_abort) --- kll/include/kll_sketch_impl.hpp | 37 +++++++++++++-------- quantiles/include/quantiles_sketch_impl.hpp | 37 +++++++++++++-------- req/include/req_sketch_impl.hpp | 37 +++++++++++++-------- sampling/include/ebpps_sample_impl.hpp | 25 +++++++++----- 4 files changed, 86 insertions(+), 50 deletions(-) diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp index fde0a314..44fe6a15 100644 --- a/kll/include/kll_sketch_impl.hpp +++ b/kll/include/kll_sketch_impl.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include "conditional_forward.hpp" #include "count_zeros.hpp" @@ -481,18 +482,22 @@ kll_sketch kll_sketch::deserialize(std::istream& is, const Ser read(is, levels.data(), sizeof(levels[0]) * num_levels); } levels[num_levels] = capacity; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; if (!is_single_item) { - sd.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - sd.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } A alloc(allocator); auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); }; @@ -565,18 +570,22 @@ kll_sketch kll_sketch::deserialize(const void* bytes, size_t s ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels); } levels[num_levels] = capacity; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; if (!is_single_item) { - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } A alloc(allocator); auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); }; diff --git a/quantiles/include/quantiles_sketch_impl.hpp b/quantiles/include/quantiles_sketch_impl.hpp index 50c82c18..2dacf21e 100644 --- a/quantiles/include/quantiles_sketch_impl.hpp +++ b/quantiles/include/quantiles_sketch_impl.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include "count_zeros.hpp" #include "conditional_forward.hpp" @@ -393,18 +394,22 @@ auto quantiles_sketch::deserialize(std::istream &is, const SerDe& serde const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0); const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; - serde.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + serde.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - serde.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + serde.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); if (serial_version == 1) { read(is); // no longer used @@ -507,18 +512,22 @@ auto quantiles_sketch::deserialize(const void* bytes, size_t size, cons const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0); const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0; - optional tmp; // space to deserialize min and max optional min_item; optional max_item; - ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += serde.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += serde.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); if (serial_version == 1) { uint64_t unused_long; diff --git a/req/include/req_sketch_impl.hpp b/req/include/req_sketch_impl.hpp index a28e74e2..3c1c2fc1 100755 --- a/req/include/req_sketch_impl.hpp +++ b/req/include/req_sketch_impl.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace datasketches { @@ -461,7 +462,6 @@ req_sketch req_sketch::deserialize(std::istream& is, const Ser const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK); if (is_empty) return req_sketch(k, hra, comparator, allocator); - optional tmp; // space to deserialize min and max optional min_item; optional max_item; @@ -472,14 +472,19 @@ req_sketch req_sketch::deserialize(std::istream& is, const Ser uint64_t n = 1; if (num_levels > 1) { n = read(is); - sd.deserialize(is, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - sd.deserialize(is, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + sd.deserialize(is, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } if (raw_items) { @@ -537,7 +542,6 @@ req_sketch req_sketch::deserialize(const void* bytes, size_t s const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK); if (is_empty) return req_sketch(k, hra, comparator, allocator); - optional tmp; // space to deserialize min and max optional min_item; optional max_item; @@ -549,14 +553,19 @@ req_sketch req_sketch::deserialize(const void* bytes, size_t s if (num_levels > 1) { ensure_minimum_memory(end_ptr - ptr, sizeof(n)); ptr += copy_from_mem(ptr, n); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize min and max. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - min_item.emplace(*tmp); - (*tmp).~T(); - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + min_item.emplace(std::move(*tmp)); + tmp->~T(); + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde call did not throw, repackage and cleanup - max_item.emplace(*tmp); - (*tmp).~T(); + max_item.emplace(std::move(*tmp)); + tmp->~T(); } if (raw_items) { diff --git a/sampling/include/ebpps_sample_impl.hpp b/sampling/include/ebpps_sample_impl.hpp index 88a86ae0..c48b32aa 100644 --- a/sampling/include/ebpps_sample_impl.hpp +++ b/sampling/include/ebpps_sample_impl.hpp @@ -28,6 +28,7 @@ #include #include #include +#include namespace datasketches { @@ -365,11 +366,15 @@ std::pair, size_t> ebpps_sample::deserialize(const uint optional partial_item; if (has_partial) { - optional tmp; // space to deserialize - ptr += sd.deserialize(ptr, end_ptr - ptr, &*tmp, 1); + // Space to deserialize. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + ptr += sd.deserialize(ptr, end_ptr - ptr, tmp, 1); // serde did not throw so place item and clean up - partial_item.emplace(*tmp); - (*tmp).~T(); + partial_item.emplace(std::move(*tmp)); + tmp->~T(); } return std::pair, size_t>( @@ -400,11 +405,15 @@ ebpps_sample ebpps_sample::deserialize(std::istream& is, const SerDe optional partial_item; if (has_partial) { - optional tmp; // space to deserialize - sd.deserialize(is, &*tmp, 1); + // Space to deserialize. + // serde::deserialize expects allocated but not initialized storage. + typename std::aligned_storage::type tmp_storage; + T* tmp = reinterpret_cast(&tmp_storage); + + sd.deserialize(is, tmp, 1); // serde did not throw so place item and clean up - partial_item.emplace(*tmp); - (*tmp).~T(); + partial_item.emplace(std::move(*tmp)); + tmp->~T(); } if (!is.good()) throw std::runtime_error("error reading from std::istream"); From ba2aa6909aa5269af7fc3aa67ad9ced4f75a2938 Mon Sep 17 00:00:00 2001 From: Mahesh G Pai Date: Tue, 27 Jan 2026 18:54:09 +0530 Subject: [PATCH 2/2] Added testcases --- .github/workflows/hardening.yml | 59 +++++++ common/test/CMakeLists.txt | 69 +++++++- common/test/deserialize_hardening_test.cpp | 188 +++++++++++++++++++++ 3 files changed, 310 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/hardening.yml create mode 100644 common/test/deserialize_hardening_test.cpp diff --git a/.github/workflows/hardening.yml b/.github/workflows/hardening.yml new file mode 100644 index 00000000..e264ebd9 --- /dev/null +++ b/.github/workflows/hardening.yml @@ -0,0 +1,59 @@ +name: libc++ Hardening Tests + +on: + push: + branches: + - master + pull_request: + branches: + - master + workflow_dispatch: + +env: + BUILD_TYPE: Debug + +jobs: + hardening-test: + name: C++17 with libc++ Hardening Mode + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + persist-credentials: false + + - name: Install LLVM and libc++ + run: | + # Install LLVM/Clang with libc++ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - + sudo add-apt-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" + sudo apt-get update + sudo apt-get install -y clang-18 libc++-18-dev libc++abi-18-dev + + - name: Configure with C++17 and libc++ hardening + env: + CC: clang-18 + CXX: clang++-18 + run: | + cmake -B build -S . \ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_TESTS=ON \ + -DCMAKE_CXX_FLAGS="-stdlib=libc++ -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG" \ + -DCMAKE_EXE_LINKER_FLAGS="-stdlib=libc++ -lc++abi" + + - name: Build hardening tests + run: cmake --build build --target hardening_test --config ${{ env.BUILD_TYPE }} + + - name: Run hardening tests + run: | + cd build + ./common/test/hardening_test "[deserialize_hardening]" + + - name: Report results + if: always() + run: | + echo "✅ Tests passed with libc++ hardening enabled!" + echo "This verifies the fix for issue #477 prevents SIGABRT." diff --git a/common/test/CMakeLists.txt b/common/test/CMakeLists.txt index 7593bd0b..d190b628 100644 --- a/common/test/CMakeLists.txt +++ b/common/test/CMakeLists.txt @@ -75,12 +75,28 @@ target_sources(common_test # now the integration test part add_executable(integration_test) -target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple common_test_lib) - -set_target_properties(integration_test PROPERTIES - CXX_STANDARD 11 - CXX_STANDARD_REQUIRED YES -) +target_link_libraries(integration_test count cpc density fi hll kll req sampling theta tuple quantiles common_test_lib) + +# Use C++17 if CMAKE_CXX_STANDARD is set to 17+, otherwise C++11 +# This allows hardening tests to use std::optional with libc++ hardening +if(DEFINED CMAKE_CXX_STANDARD) + if(CMAKE_CXX_STANDARD MATCHES "17|20|23") + set_target_properties(integration_test PROPERTIES + CXX_STANDARD ${CMAKE_CXX_STANDARD} + CXX_STANDARD_REQUIRED YES + ) + else() + set_target_properties(integration_test PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED YES + ) + endif() +else() + set_target_properties(integration_test PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED YES + ) +endif() add_test( NAME integration_test @@ -91,3 +107,44 @@ target_sources(integration_test PRIVATE integration_test.cpp ) + +# Separate hardening test executable (header-only, no pre-compiled libs) +# This ensures the sketch code is compiled with C++17 + hardening +# Always build this target - it will use CMAKE_CXX_STANDARD if set, otherwise C++17 +message(STATUS "CMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}") + +add_executable(hardening_test) +target_link_libraries(hardening_test common common_test_lib) + +# Include directories for header-only sketch implementations +target_include_directories(hardening_test PRIVATE + ${CMAKE_SOURCE_DIR}/quantiles/include + ${CMAKE_SOURCE_DIR}/kll/include + ${CMAKE_SOURCE_DIR}/req/include + ${CMAKE_SOURCE_DIR}/common/include +) + +# Use C++17 minimum for hardening tests +if(CMAKE_CXX_STANDARD AND CMAKE_CXX_STANDARD GREATER_EQUAL 17) + set_target_properties(hardening_test PROPERTIES + CXX_STANDARD ${CMAKE_CXX_STANDARD} + CXX_STANDARD_REQUIRED YES + ) + message(STATUS "hardening_test will use C++${CMAKE_CXX_STANDARD}") +else() + set_target_properties(hardening_test PROPERTIES + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED YES + ) + message(STATUS "hardening_test will use C++17 (default)") +endif() + +add_test( + NAME hardening_test + COMMAND hardening_test "[deserialize_hardening]" +) + +target_sources(hardening_test + PRIVATE + deserialize_hardening_test.cpp +) diff --git a/common/test/deserialize_hardening_test.cpp b/common/test/deserialize_hardening_test.cpp new file mode 100644 index 00000000..64e654b4 --- /dev/null +++ b/common/test/deserialize_hardening_test.cpp @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +// Include all affected sketch types +#include +#include +#include + +namespace datasketches { + +/** + * Test for fix of issue #477: + * BUG: SIGABRT in deserialize(): dereferencing empty std::optional (libc++ verbose_abort) + * + * These tests exercise the actual deserialization code path that contained the bug. + * With buggy code (&*tmp on empty optional) and hardening enabled, these will SIGABRT. + * With fixed code (aligned_storage), these pass normally. + * + * IMPORTANT: These tests actually call deserialize() on multi-item sketches, which + * exercises the buggy code path where min/max are deserialized. + */ + +TEST_CASE("quantiles_sketch: deserialize multi-item sketch", "[deserialize_hardening]") { + // Create sketch with multiple items (so min/max are stored in serialization) + quantiles_sketch sketch(128); + for (int i = 0; i < 1000; i++) { + sketch.update(static_cast(i)); + } + + // Serialize + auto bytes = sketch.serialize(); + + // Deserialize - WITH BUGGY CODE AND HARDENING, THIS WILL SIGABRT HERE + // The bug is: sd.deserialize(is, &*tmp, 1) where tmp is empty optional + auto sketch2 = quantiles_sketch::deserialize(bytes.data(), bytes.size()); + + // Verify deserialization worked correctly + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); + REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5)); +} + +TEST_CASE("quantiles_sketch: deserialize from stream", "[deserialize_hardening]") { + quantiles_sketch sketch(256); + for (int i = 0; i < 2000; i++) { + sketch.update(static_cast(i) * 0.5f); + } + + // Serialize to stream + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream - exercises the buggy code path + auto sketch2 = quantiles_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("kll_sketch: deserialize multi-item sketch", "[deserialize_hardening]") { + kll_sketch sketch(200); + for (int i = 0; i < 1500; i++) { + sketch.update(static_cast(i)); + } + + auto bytes = sketch.serialize(); + + // Deserialize - exercises buggy &*tmp code path + auto sketch2 = kll_sketch::deserialize(bytes.data(), bytes.size()); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("kll_sketch: deserialize from stream", "[deserialize_hardening]") { + kll_sketch sketch(400); + for (int i = 0; i < 3000; i++) { + sketch.update(i); + } + + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream + auto sketch2 = kll_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("req_sketch: deserialize multi-level sketch", "[deserialize_hardening]") { + // REQ sketch only has the bug when num_levels > 1 + // We need to add enough items to trigger multiple levels + req_sketch sketch(12); + for (int i = 0; i < 10000; i++) { + sketch.update(static_cast(i)); + } + + auto bytes = sketch.serialize(); + + // Deserialize - exercises buggy code path when num_levels > 1 + auto sketch2 = req_sketch::deserialize(bytes.data(), bytes.size()); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("req_sketch: deserialize from stream", "[deserialize_hardening]") { + req_sketch sketch(20); + for (int i = 0; i < 15000; i++) { + sketch.update(static_cast(i) * 0.1); + } + + std::stringstream ss; + sketch.serialize(ss); + + // Deserialize from stream + auto sketch2 = req_sketch::deserialize(ss); + + REQUIRE(sketch2.get_n() == sketch.get_n()); + REQUIRE(sketch2.get_min_item() == sketch.get_min_item()); + REQUIRE(sketch2.get_max_item() == sketch.get_max_item()); +} + +TEST_CASE("multiple sketch types: stress test", "[deserialize_hardening]") { + SECTION("quantiles with various sizes") { + for (int k : {64, 128, 256}) { + quantiles_sketch sketch(k); + for (int i = 0; i < 5000; i++) { + sketch.update(i); + } + auto bytes = sketch.serialize(); + auto sketch2 = quantiles_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } + + SECTION("kll with various sizes") { + for (int k : {100, 200, 400}) { + kll_sketch sketch(k); + for (int i = 0; i < 4000; i++) { + sketch.update(static_cast(i) / 10.0); + } + auto bytes = sketch.serialize(); + auto sketch2 = kll_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } + + SECTION("req with various sizes") { + for (int k : {12, 20}) { + req_sketch sketch(k); + for (int i = 0; i < 8000; i++) { + sketch.update(static_cast(i)); + } + auto bytes = sketch.serialize(); + auto sketch2 = req_sketch::deserialize(bytes.data(), bytes.size()); + REQUIRE(sketch2.get_n() == sketch.get_n()); + } + } +} + +} // namespace datasketches