From da95fd28419a982d8c6d802ab153362f724c470f Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:19:38 +0900 Subject: [PATCH 01/11] feat: add utf8cpp --- NOTICE | 3 + common/CMakeLists.txt | 11 + common/include/third_party/utf8cpp/LICENSE | 23 + common/include/third_party/utf8cpp/utf8.h | 46 ++ .../third_party/utf8cpp/utf8/checked.h | 359 +++++++++++++ .../include/third_party/utf8cpp/utf8/core.h | 500 ++++++++++++++++++ .../include/third_party/utf8cpp/utf8/cpp11.h | 70 +++ .../include/third_party/utf8cpp/utf8/cpp17.h | 96 ++++ .../include/third_party/utf8cpp/utf8/cpp20.h | 124 +++++ .../third_party/utf8cpp/utf8/unchecked.h | 286 ++++++++++ 10 files changed, 1518 insertions(+) create mode 100644 common/include/third_party/utf8cpp/LICENSE create mode 100644 common/include/third_party/utf8cpp/utf8.h create mode 100644 common/include/third_party/utf8cpp/utf8/checked.h create mode 100644 common/include/third_party/utf8cpp/utf8/core.h create mode 100644 common/include/third_party/utf8cpp/utf8/cpp11.h create mode 100644 common/include/third_party/utf8cpp/utf8/cpp17.h create mode 100644 common/include/third_party/utf8cpp/utf8/cpp20.h create mode 100644 common/include/third_party/utf8cpp/utf8/unchecked.h diff --git a/NOTICE b/NOTICE index 11ba6f6c..6a2376d9 100644 --- a/NOTICE +++ b/NOTICE @@ -10,3 +10,6 @@ The Apache Software Foundation (http://www.apache.org/). Prior to moving to ASF, the software for this project was developed at Yahoo Inc. (https://developer.yahoo.com). + +This product includes utf8cpp (https://github.com/nemtrif/utfcpp), +licensed under the Boost Software License, Version 1.0. 
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 8514433b..2d5c7330 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -51,3 +51,19 @@ install(FILES
   include/serde.hpp
   include/xxhash64.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
+
+# install(FILES) drops the source directory of each file, but utf8.h includes
+# "utf8/checked.h" and "utf8/unchecked.h", so the utf8/ level needs its own rule.
+install(FILES
+  include/third_party/utf8cpp/utf8.h
+  include/third_party/utf8cpp/LICENSE
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp")
+
+install(FILES
+  include/third_party/utf8cpp/utf8/checked.h
+  include/third_party/utf8cpp/utf8/core.h
+  include/third_party/utf8cpp/utf8/cpp11.h
+  include/third_party/utf8cpp/utf8/cpp17.h
+  include/third_party/utf8cpp/utf8/cpp20.h
+  include/third_party/utf8cpp/utf8/unchecked.h
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp/utf8")
diff --git a/common/include/third_party/utf8cpp/LICENSE b/common/include/third_party/utf8cpp/LICENSE
new file mode 100644
index 00000000..36b7cd93
--- /dev/null
+++ b/common/include/third_party/utf8cpp/LICENSE
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/common/include/third_party/utf8cpp/utf8.h b/common/include/third_party/utf8cpp/utf8.h new file mode 100644 index 00000000..b5135309 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8.h @@ -0,0 +1,46 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +/* +To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro +and set it to one of the values used by the __cplusplus predefined macro. + +For instance, + #define UTF_CPP_CPLUSPLUS 199711L +will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. +Some library features will be disabled. + +If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. +*/ + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/common/include/third_party/utf8cpp/utf8/checked.h b/common/include/third_party/utf8cpp/utf8/checked.h new file mode 100644 index 00000000..96ceb4d5 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/checked.h @@ -0,0 +1,359 @@ +// Copyright 2006-2016 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of 
machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include <stdexcept> // std::exception base, plus std::out_of_range / std::logic_error used by utf8::iterator
+
+namespace utf8
+{
+    // Base for the exceptions that may be thrown from the library
+    class exception : public ::std::exception {
+    };
+
+    // Exceptions that may be thrown from the library functions.
+    class invalid_code_point : public exception { // carries the rejected code point
+        utfchar32_t cp;
+    public:
+        invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {}
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
+        utfchar32_t code_point() const {return cp;}
+    };
+
+    class invalid_utf8 : public exception { // carries the offending octet
+        utfchar8_t u8;
+    public:
+        invalid_utf8 (utfchar8_t u) : u8(u) {}
+        invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {} // plain char converted to the library's octet type
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
+        utfchar8_t utf8_octet() const {return u8;}
+    };
+
+    class invalid_utf16 : public exception { // carries the offending 16-bit code unit
+        utfchar16_t u16;
+    public:
+        invalid_utf16 (utfchar16_t u) : u16(u) {}
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
+        utfchar16_t utf16_word() const {return u16;}
+    };
+
+    class not_enough_room : public exception { // stateless: only the message distinguishes it
+    public:
+        virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
+    };
+
+    /// The library API - 
functions intended to be called by the users + + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append(cp, result); + } + + inline void append(utfchar32_t cp, std::string& s) + { + append(cp, std::back_inserter(s)); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline 
std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it, octet_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(static_cast(*it)); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + utfchar32_t next16(word_iterator& it, word_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); + if (err_code == internal::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + utfchar32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::next(it, end); + } + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 
0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = static_cast(utf8::internal::mask16(*start++)); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + const utfchar32_t trail_surrogate = static_cast(utf8::internal::mask16(*start++)); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + const utfchar32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t 
difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#include "cpp17.h" +#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later +#include "cpp11.h" +#endif // C++ 11 or later + +#endif //header guard + diff --git a/common/include/third_party/utf8cpp/utf8/core.h b/common/include/third_party/utf8cpp/utf8/core.h new file mode 100644 index 00000000..8e128c18 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/core.h @@ -0,0 +1,500 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software 
and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <iterator> // std::iterator_traits, std::back_insert_iterator
+#include <cstring>  // std::strlen
+#include <string>   // std::string
+
+// Determine the C++ standard version.
+// If the user defines UTF_CPP_CPLUSPLUS, use that.
+// Otherwise, trust the unreliable predefined macro __cplusplus + +#if !defined UTF_CPP_CPLUSPLUS + #define UTF_CPP_CPLUSPLUS __cplusplus +#endif + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #define UTF_CPP_OVERRIDE override + #define UTF_CPP_NOEXCEPT noexcept + #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert"); +#else // C++ 98/03 + #define UTF_CPP_OVERRIDE + #define UTF_CPP_NOEXCEPT throw() + // Not worth simulating static_assert: + #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition); +#endif // C++ 11 or later + + +namespace utf8 +{ +// The typedefs for 8-bit, 16-bit and 32-bit code units +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later + typedef char8_t utfchar8_t; + #else // C++ 11/14/17 + typedef unsigned char utfchar8_t; + #endif + typedef char16_t utfchar16_t; + typedef char32_t utfchar32_t; +#else // C++ 98/03 + typedef unsigned char utfchar8_t; + typedef unsigned short utfchar16_t; + typedef unsigned int utfchar32_t; +#endif // C++ 11 or later + +// Helper code - not intended to be directly called by the library users. 
May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; + const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; + const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + + // Maximum valid value for a Unicode code point + const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline utfchar8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + + template + inline utfchar16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + inline bool is_lead_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); + } + + inline bool is_trail_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + inline bool is_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + inline bool is_code_point_valid(utfchar32_t cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + + template + int sequence_length(octet_iterator lead_it) + { + const utfchar8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + inline bool is_overlong_sequence(utfchar32_t cp, int 
length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, const octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == 
end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + utfchar32_t cp = 0; + // Determine the sequence length based on the lead octet + const int length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + utfchar32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + + template + utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) + { + // Make sure the iterator dereferences a large enough type + typedef typename std::iterator_traits::value_type word_type; + UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); + // Check the edge case: + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + word_iterator original_it = it; + + utf_error err = UTF8_OK; + + const utfchar16_t first_word = *it++; + if (!is_surrogate(first_word)) { + code_point = first_word; + return UTF8_OK; + } + else { + if (it == end) + err = NOT_ENOUGH_ROOM; + else if (is_lead_surrogate(first_word)) { + const utfchar16_t second_word = *it++; + if (is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return UTF8_OK; + } else + err = INCOMPLETE_SEQUENCE; + + } else { + err = INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + + // Internal implementation of both checked and unchecked append() function + // This function will be invoked by the overloads below, as they will know + // the octet_type. 
+ template + octet_iterator append(utfchar32_t cp, octet_iterator result) { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + // One of the following overloads will be invoked from the API calls + + // A simple (but dangerous) case: the caller appends byte(s) to a char array + inline char* append(utfchar32_t cp, char* result) { + return append(cp, result); + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append(cp, std::back_inserter(str)); + template + std::back_insert_iterator append + (utfchar32_t cp, std::back_insert_iterator result) { + return append, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine octet_type + // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. + template + octet_iterator append(utfchar32_t cp, octet_iterator result) { + return append(cp, result); + } + + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. 
+ template + word_iterator append16(utfchar32_t cp, word_iterator result) { + UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); + if (is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append16(cp, std::back_inserter(str)); + template + std::back_insert_iterator append16 + (utfchar32_t cp, std::back_insert_iterator result) { + return append16, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16(cp, result); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + inline const char* find_invalid(const char* str) + { + const char* end = str + std::strlen(str); + return find_invalid(str, end); + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string::npos : static_cast(invalid - s.begin()); + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + inline bool is_valid(const char* str) + { + return (*(utf8::find_invalid(str)) == '\0'); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp11.h b/common/include/third_party/utf8cpp/utf8/cpp11.h new file mode 100644 index 00000000..691633c8 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp11.h @@ -0,0 +1,70 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 +#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 + +#include "checked.h" + +namespace utf8 +{ + inline void append16(utfchar32_t cp, std::u16string& s) + { + append16(cp, std::back_inserter(s)); + } + + inline std::string utf16to8(const std::u16string& s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(const std::u32string& s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp17.h b/common/include/third_party/utf8cpp/utf8/cpp17.h new file mode 100644 index 00000000..07587300 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp17.h @@ -0,0 +1,96 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and 
transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp11.h" + +namespace utf8 +{ + inline std::string utf16to8(std::u16string_view s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(std::string_view s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(std::u32string_view s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(std::string_view s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(std::string_view s) + { + std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(std::string_view s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(std::string_view s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(std::string_view s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(std::string_view s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp20.h b/common/include/third_party/utf8cpp/utf8/cpp20.h new file mode 100644 index 00000000..07b61d0f --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby 
granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/unchecked.h b/common/include/third_party/utf8cpp/utf8/unchecked.h new file mode 100644 index 00000000..173d0302 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/unchecked.h @@ -0,0 +1,286 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + return internal::append(cp, result); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append(replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append(replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, 
output_iterator out) + { + static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it) + { + utfchar32_t cp = utf8::internal::mask8(*it); + switch (utf8::internal::sequence_length(it)) { + case 1: + break; + case 2: + ++it; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp = static_cast(cp + ((*it) & 0x3f)); + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp = static_cast(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff)); + ++it; + cp = static_cast(cp + ((*it) & 0x3f)); + break; + } + ++it; + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + utfchar32_t next16(word_iterator& it) + { + utfchar32_t cp = utf8::internal::mask16(*it++); + if (utf8::internal::is_lead_surrogate(cp)) + return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; + return cp; + } + + template + utfchar32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + template + void advance(octet_iterator& it, distance_type n) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for 
(distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } + } + + template + typename std::iterator_traits::difference_type + distance(octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start == end) + return result; + utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + utfchar32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + 
iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + +#endif // header guard + From 2b48f475ed20ed576342f68f7a7e3afa401bf12c Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:21:02 +0900 Subject: [PATCH 02/11] feat: add aos tuple sketch --- tuple/CMakeLists.txt | 2 + tuple/include/array_of_strings_sketch.hpp | 150 +++++++++ .../include/array_of_strings_sketch_impl.hpp | 284 ++++++++++++++++++ tuple/test/CMakeLists.txt | 5 +- .../aos_sketch_deserialize_from_java_test.cpp | 172 +++++++++++ tuple/test/aos_sketch_serialize_for_java.cpp | 155 ++++++++++ tuple/test/array_of_strings_sketch_test.cpp | 243 +++++++++++++++ ...uple_sketch_deserialize_from_java_test.cpp | 2 +- 8 files changed, 1011 insertions(+), 2 deletions(-) create mode 100644 tuple/include/array_of_strings_sketch.hpp create mode 100644 tuple/include/array_of_strings_sketch_impl.hpp create mode 100644 tuple/test/aos_sketch_deserialize_from_java_test.cpp create mode 100644 tuple/test/aos_sketch_serialize_for_java.cpp create mode 100644 tuple/test/array_of_strings_sketch_test.cpp diff --git 
a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt index 4b0a48c7..54df11ee 100644 --- a/tuple/CMakeLists.txt +++ b/tuple/CMakeLists.txt @@ -54,4 +54,6 @@ install(FILES include/array_tuple_intersection_impl.hpp include/array_tuple_a_not_b.hpp include/array_tuple_a_not_b_impl.hpp + include/array_of_strings_sketch.hpp + include/array_of_strings_sketch_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp new file mode 100644 index 00000000..a3f8ddd7 --- /dev/null +++ b/tuple/include/array_of_strings_sketch.hpp @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_HPP_ + +#include +#include + +#include "array_tuple_sketch.hpp" +#include "xxhash64.h" + +namespace datasketches { + +// default update policy for an array of strings +template> +class default_array_of_strings_update_policy { +public: + using array_of_strings = array; + + explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator()); + + array_of_strings create() const; + + void update(array_of_strings& array, const array_of_strings& input) const; + + void update(array_of_strings& array, const array_of_strings* input) const; + +private: + Allocator allocator_; +}; + +// serializer/deserializer for an array of strings +// Requirements: all strings must be valid UTF-8 and array size must be <= 127. +template> +struct array_of_strings_serde { + using array_of_strings = array; + + void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; + void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; + size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; + size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; + size_t size_of_item(const array_of_strings& item) const; + +private: + static void check_num_nodes(uint8_t num_nodes); + static uint32_t compute_total_bytes(const array_of_strings& item); + static void check_utf8(const std::string& value); +}; + +/** + * Extended class of compact_tuple_sketch for array of strings + * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
+ */ +template> +class compact_array_of_strings_tuple_sketch: + public compact_tuple_sketch< + array, + typename std::allocator_traits::template rebind_alloc> + > { +public: + using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + using Base = compact_tuple_sketch; + using vector_bytes = typename Base::vector_bytes; + + template + compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); + + void serialize(std::ostream& os) const; + vector_bytes serialize(unsigned header_size_bytes = 0) const; + + static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, + const Allocator& allocator = Allocator()); + static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, + const Allocator& allocator = Allocator()); + +private: + explicit compact_array_of_strings_tuple_sketch(Base&& base); +}; + +/** + * Extended class of update_tuple_sketch for array of strings + * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
+ */ +template> +class update_array_of_strings_tuple_sketch: + public update_tuple_sketch< + array, + array, + default_array_of_strings_update_policy, + typename std::allocator_traits::template rebind_alloc> + > { +public: + using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + using policy_type = default_array_of_strings_update_policy; + using Base = update_tuple_sketch< + array_of_strings, + array_of_strings, + policy_type, + summary_allocator + >; + using resize_factor = typename Base::resize_factor; + class builder; + using Base::update; + + void update(const array_of_strings& key, const array_of_strings& value); + compact_array_of_strings_tuple_sketch compact(bool ordered = true) const; + +private: + update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, + uint64_t seed, const policy_type& policy, const summary_allocator& allocator); + + // Matches Java Util.PRIME for ArrayOfStrings key hashing. + static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; + + static uint64_t hash_key(const array_of_strings& key); +}; + +template +class update_array_of_strings_tuple_sketch::builder: + public tuple_base_builder { +public: + builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator()); + + update_array_of_strings_tuple_sketch build() const; +}; + +} /* namespace datasketches */ + +#include "array_of_strings_sketch_impl.hpp" + +#endif diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp new file mode 100644 index 00000000..264f79bf --- /dev/null +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ + +#include + +#include "common_defs.hpp" +#include "third_party/utf8cpp/utf8.h" + +namespace datasketches { + +template +default_array_of_strings_update_policy::default_array_of_strings_update_policy(const Allocator& allocator): + allocator_(allocator) {} + +template +auto default_array_of_strings_update_policy::create() const -> array_of_strings { + return array_of_strings(0, "", allocator_); +} + +template +void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings& input +) const { + const auto length = input.size(); + array = array_of_strings(length, "", allocator_); + for (uint8_t i = 0; i < length; ++i) array[i] = input[i]; +} + +template +void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings* input +) const { + if (input == nullptr) { + array = array_of_strings(0, "", allocator_); + return; + } + const auto length = input->size(); + array = array_of_strings(length, "", allocator_); + for (uint8_t i = 0; i < length; ++i) array[i] = (*input)[i]; +} + +template +update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( + uint8_t lg_cur_size, uint8_t lg_nom_size, 
resize_factor rf, float p, uint64_t theta, + uint64_t seed, const policy_type& policy, const summary_allocator& allocator +): +Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} + +template +void update_array_of_strings_tuple_sketch::update( + const array_of_strings& key, const array_of_strings& value +) { + const uint64_t hash = hash_key(key); + Base::update(hash, value); +} + +template +uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { + XXHash64 hasher(STRING_ARR_HASH_SEED); + const auto size = static_cast(key.size()); + for (size_t i = 0; i < size; ++i) { + const auto& entry = key[static_cast(i)]; + hasher.add(entry.data(), entry.size()); + if (i + 1 < size) hasher.add(",", 1); + } + return hasher.hash(); +} + +template +compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { + return compact_array_of_strings_tuple_sketch(*this, ordered); +} + +// builder + +template +update_array_of_strings_tuple_sketch::builder::builder( + const policy_type& policy, const summary_allocator& allocator +): +tuple_base_builder(policy, allocator) {} + +template +auto update_array_of_strings_tuple_sketch::builder::build() const -> update_array_of_strings_tuple_sketch { + return update_array_of_strings_tuple_sketch( + this->starting_lg_size(), + this->lg_k_, + this->rf_, + this->p_, + this->starting_theta(), + this->seed_, + this->policy_, + this->allocator_ + ); +} + +template +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + const Sketch& sketch, bool ordered +): Base(sketch, ordered) {} + +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + Base&& base +): Base(std::move(base)) {} + +template +void compact_array_of_strings_tuple_sketch::serialize(std::ostream& os) const { + Base::serialize(os, array_of_strings_serde()); +} + +template +auto 
compact_array_of_strings_tuple_sketch::serialize(unsigned header_size_bytes) const -> vector_bytes { + return Base::serialize(header_size_bytes, array_of_strings_serde()); +} + +template +auto compact_array_of_strings_tuple_sketch::deserialize( + std::istream& is, uint64_t seed, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + summary_allocator alloc(allocator); + auto base = Base::deserialize(is, seed, array_of_strings_serde(), alloc); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +auto compact_array_of_strings_tuple_sketch::deserialize( + const void* bytes, size_t size, uint64_t seed, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + summary_allocator alloc(allocator); + auto base = Base::deserialize(bytes, size, seed, array_of_strings_serde(), alloc); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +void array_of_strings_serde::serialize( + std::ostream& os, const array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + write(os, total_bytes); + write(os, num_nodes); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_utf8(data[j]); + const uint32_t length = static_cast(data[j].size()); + write(os, length); + os.write(data[j].data(), length); + } + } +} + +template +void array_of_strings_serde::deserialize( + std::istream& is, array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + read(is); // total_bytes + const uint8_t num_nodes = read(is); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, "", Allocator()); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = read(is); + std::string value(length, '\0'); + is.read(&value[0], length); + check_utf8(value); + array[j] = std::move(value); 
+    }
+    new (&items[i]) array_of_strings(std::move(array));
+  }
+}
+
+// Serializes `num` items into the buffer at `ptr`, which holds `capacity` bytes.
+// Layout per item, as written below:
+//   uint32_t total_bytes  - size of the whole item record, this field included
+//   uint8_t  num_nodes    - number of strings in the item
+//   then for each string: uint32_t length followed by `length` raw bytes
+// Returns the number of bytes written.
+// compute_total_bytes() and check_utf8() reject oversized items and invalid UTF-8;
+// check_memory_size() is relied on to reject a record that does not fit in `capacity`.
+template<typename Allocator>
+size_t array_of_strings_serde<Allocator>::serialize(
+  void* ptr, size_t capacity, const array_of_strings* items, unsigned num
+) const {
+  uint8_t* ptr8 = static_cast<uint8_t*>(ptr);
+  size_t bytes_written = 0;
+
+  for (unsigned i = 0; i < num; ++i) {
+    const uint32_t total_bytes = compute_total_bytes(items[i]);
+    const uint8_t num_nodes = static_cast<uint8_t>(items[i].size());
+    // total_bytes covers the header and every string, so one check guards the whole record
+    check_memory_size(bytes_written + total_bytes, capacity);
+    bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written);
+    bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written);
+    const std::string* data = items[i].data();
+    for (uint8_t j = 0; j < num_nodes; ++j) {
+      check_utf8(data[j]);
+      const uint32_t length = static_cast<uint32_t>(data[j].size());
+
+      bytes_written += copy_to_mem(length, ptr8 + bytes_written);
+      bytes_written += copy_to_mem(data[j].data(), ptr8 + bytes_written, length);
+    }
+  }
+  return bytes_written;
+}
+
+// Deserializes `num` items from the buffer at `ptr`, which holds `capacity` bytes,
+// constructing each one in place in the uninitialized storage `items[i]`.
+// Returns the number of bytes consumed.
+// The buffer is treated as untrusted: every read (record header, node count, each
+// string length and each string payload) is bounds-checked against `capacity`
+// before it happens, so a truncated or corrupted image is rejected by
+// check_memory_size() instead of being read past its end.
+// check_num_nodes() and check_utf8() reject more than 127 strings and invalid UTF-8.
+template<typename Allocator>
+size_t array_of_strings_serde<Allocator>::deserialize(
+  const void* ptr, size_t capacity, array_of_strings* items, unsigned num
+) const {
+  const uint8_t* ptr8 = static_cast<const uint8_t*>(ptr);
+  size_t bytes_read = 0;
+
+  for (unsigned i = 0; i < num; ++i) {
+    check_memory_size(bytes_read + sizeof(uint32_t), capacity);
+    const size_t item_start = bytes_read;
+    uint32_t total_bytes;
+    bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes);
+    check_memory_size(item_start + total_bytes, capacity);
+    // total_bytes comes from the input and may be too small to cover the fields
+    // that follow, so each subsequent read is checked on its own
+    check_memory_size(bytes_read + sizeof(uint8_t), capacity);
+    uint8_t num_nodes;
+    bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes);
+    check_num_nodes(num_nodes);
+    array_of_strings array(num_nodes, "", Allocator());
+    for (uint8_t j = 0; j < num_nodes; ++j) {
+      check_memory_size(bytes_read + sizeof(uint32_t), capacity);
+      uint32_t length;
+      bytes_read += copy_from_mem(ptr8 + bytes_read, length);
+      // validate the declared length before allocating and copying the payload
+      check_memory_size(bytes_read + length, capacity);
+      std::string value(length, '\0');
+      bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length);
+      check_utf8(value);
+      array[j] = std::move(value);
+    }
+    new (&items[i]) array_of_strings(std::move(array));
+  }
+  return bytes_read;
+}
+
+template +size_t array_of_strings_serde::size_of_item(const array_of_strings& item) const { + return compute_total_bytes(item); +} + +template +void array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { + if (num_nodes > 127) { + throw std::runtime_error("array_of_strings size exceeds 127"); + } +} + +template +uint32_t array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { + const auto count = item.size(); + check_num_nodes(static_cast(count)); + size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); + const std::string* data = item.data(); + for (uint32_t j = 0; j < count; ++j) { + total += data[j].size(); + } + if (total > std::numeric_limits::max()) { + throw std::runtime_error("array_of_strings serialized size exceeds uint32_t max"); + } + return static_cast(total); +} + +template +void array_of_strings_serde::check_utf8(const std::string& value) { + if (!utf8::is_valid(value.begin(), value.end())) { + throw std::runtime_error("array_of_strings contains invalid UTF-8"); + } +} + +} /* namespace datasketches */ + +#endif diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 4ca6a503..8c561745 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -23,7 +23,7 @@ set_target_properties(tuple_test PROPERTIES CXX_STANDARD_REQUIRED YES ) -file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH) +file(TO_CMAKE_PATH "${CMAKE_SOURCE_DIR}" THETA_TEST_BINARY_PATH) string(APPEND THETA_TEST_BINARY_PATH "/") target_compile_definitions(tuple_test PRIVATE @@ -44,6 +44,7 @@ target_sources(tuple_test tuple_a_not_b_test.cpp tuple_jaccard_similarity_test.cpp array_of_doubles_sketch_test.cpp + array_of_strings_sketch_test.cpp engagement_test.cpp ) @@ -52,6 +53,7 @@ target_sources(tuple_test PRIVATE aod_sketch_deserialize_from_java_test.cpp tuple_sketch_deserialize_from_java_test.cpp + aos_sketch_deserialize_from_java_test.cpp ) endif() @@ -60,5 +62,6 @@ 
target_sources(tuple_test PRIVATE aod_sketch_serialize_for_java.cpp tuple_sketch_serialize_for_java.cpp + aos_sketch_serialize_for_java.cpp ) endif() diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp new file mode 100644 index 00000000..15e9d6dd --- /dev/null +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -0,0 +1,172 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + // assume the binary sketches for this test have been generated by datasketches-java code + // in the subdirectory called "java" in the root directory of this project + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; + + TEST_CASE("aos sketch one value", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + + TEST_CASE("aos sketch three values", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + } + + TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk", 
std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + + TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + + TEST_CASE("aos sketch unicode strings", "[serde_compat]") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_unicode_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = 
true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + } + + TEST_CASE("aos sketch empty strings", "[serde_compat]") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(testBinaryInputPath + "aos_empty_strings_java.sk", std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + } +} diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp new file mode 100644 index 00000000..3a154132 --- /dev/null +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using aos_sketch = update_array_of_strings_tuple_sketch<>; +using array_of_strings = aos_sketch::array_of_strings; + +static array_of_strings make_array(std::initializer_list items) { + array_of_strings array(static_cast(items.size()), ""); + size_t i = 0; + for (const auto& item: items) { + array[static_cast(i)] = item; + ++i; + } + return array; +} + +TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(key, value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); + } +} + +TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(3, ""); + value[0] = "a" + std::to_string(i); + value[1] = "b" + std::to_string(i); + value[2] = 
"c" + std::to_string(i); + sketch.update(key, value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); + } +} + +TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { + auto sketch = aos_sketch::builder() + .set_lg_k(12) + .set_resize_factor(resize_factor::X8) + .set_p(0.01f) + .build(); + array_of_strings key(1, ""); + key[0] = "key1"; + array_of_strings value(1, ""); + value[0] = "value1"; + sketch.update(key, value); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); +} + +TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(2, ""); + key[0] = "key" + std::to_string(i); + key[1] = "subkey" + std::to_string(i % 10); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(key, value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); + } +} + +TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update( + make_array({u8"키", u8"열쇠"}), + make_array({u8"밸류", u8"값"}) + ); + sketch.update( + make_array({u8"🔑", u8"🗝️"}), + make_array({u8"📦", u8"🎁"}) + ); + sketch.update( + make_array({u8"ключ1", u8"ключ2"}), + make_array({u8"ценить1", u8"ценить2"}) + ); + REQUIRE_FALSE(sketch.is_empty()); + 
REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); +} + +TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update( + make_array({""}), + make_array({"empty_key_value"}) + ); + sketch.update( + make_array({"empty_value_key"}), + make_array({""}) + ); + sketch.update( + make_array({"", ""}), + make_array({"", ""}) + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); + sketch.compact().serialize(os); +} + +} /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp new file mode 100644 index 00000000..45e554bd --- /dev/null +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using array_of_strings = array; + +TEST_CASE("aos update policy", "[tuple_sketch]") { + default_array_of_strings_update_policy<> policy; + + SECTION("create empty") { + auto values = policy.create(); + REQUIRE(values.size() == 0); + } + + SECTION("replace array") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "alpha"; + input[1] = "beta"; + policy.update(values, input); + REQUIRE(values.size() == 2); + REQUIRE(values[0] == "alpha"); + REQUIRE(values[1] == "beta"); + input[0] = "changed"; + REQUIRE(values[0] == "alpha"); + + array_of_strings input2(1, "", std::allocator()); + input2[0] = "gamma"; + policy.update(values, input2); + REQUIRE(values.size() == 1); + REQUIRE(values[0] == "gamma"); + } + + SECTION("nullptr clears") { + array_of_strings values(2, "", std::allocator()); + values[0] = "one"; + values[1] = "two"; + + policy.update(values, static_cast(nullptr)); + REQUIRE(values.size() == 0); + } + + SECTION("pointer input copies") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "first"; + input[1] = "second"; + policy.update(values, &input); + REQUIRE(values.size() == 2); + REQUIRE(values[1] == "second"); + input[1] = "changed"; + REQUIRE(values[1] == "second"); + } +} + +TEST_CASE("aos sketch update", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto* entry: entries) array[i++] = entry; + return array; + }; + + SECTION("same key replaces summary") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(make_array({"alpha", "beta"}), make_array({"first"})); + sketch.update(make_array({"alpha", "beta"}), make_array({"second", 
"third"})); + + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 2); + REQUIRE(it->second[0] == "second"); + REQUIRE(it->second[1] == "third"); + } + + SECTION("distinct keys retain multiple entries") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(make_array({"a", "bc"}), make_array({"one"})); + sketch.update(make_array({"ab", "c"}), make_array({"two"})); + + REQUIRE(sketch.get_num_retained() == 2); + + bool saw_one = false; + bool saw_two = false; + for (const auto& entry: sketch) { + REQUIRE(entry.second.size() == 1); + if (entry.second[0] == "one") saw_one = true; + if (entry.second[0] == "two") saw_two = true; + } + REQUIRE(saw_one); + REQUIRE(saw_two); + } + + SECTION("empty key") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(make_array({}), make_array({"value"})); + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 1); + REQUIRE(it->second[0] == "value"); + } +} + +TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto& entry: entries) array[i++] = entry; + return array; + }; + + auto collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + typedef std::pair entry_type; + std::vector entries; + for (const auto& entry: sketch) entries.push_back(entry); + struct entry_less { + bool operator()(const entry_type& lhs, const entry_type& rhs) const { + return lhs.first < rhs.first; + } + }; + std::sort(entries.begin(), entries.end(), entry_less()); + return entries; + }; + + auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { + std::stringstream ss; + 
ss.exceptions(std::ios::failbit | std::ios::badbit); + compact_sketch.serialize(ss); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(ss); + + auto bytes = compact_sketch.serialize(); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(bytes.data(), bytes.size()); + + const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { + &deserialized_stream, + &deserialized_bytes + }; + for (int list_index = 0; list_index < 2; ++list_index) { + const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index]; + REQUIRE(compact_sketch.is_empty() == deserialized->is_empty()); + REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode()); + REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered()); + REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained()); + REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10)); + REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10)); + REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10)); + REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized->get_upper_bound(1)).margin(1e-10)); + + auto original_entries = collect_entries(compact_sketch); + auto round_trip_entries = collect_entries(*deserialized); + REQUIRE(original_entries.size() == round_trip_entries.size()); + for (size_t i = 0; i < original_entries.size(); ++i) { + REQUIRE(original_entries[i].first == round_trip_entries[i].first); + REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size()); + for (size_t j = 0; j < original_entries[i].second.size(); ++j) { + REQUIRE(original_entries[i].second[static_cast(j)] == + round_trip_entries[i].second[static_cast(j)]); + } + } + } + }; + + auto exercise_ordering = [&](const update_array_of_strings_tuple_sketch<>& sketch) { + auto ordered = 
sketch.compact(true); + auto unordered = sketch.compact(false); + check_round_trip(ordered); + check_round_trip(unordered); + }; + + SECTION("empty sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + exercise_ordering(sketch); + } + + SECTION("single entry sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + sketch.update(make_array({"key"}), make_array({"value"})); + exercise_ordering(sketch); + } + + SECTION("multiple entries exact mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); + for (int i = 0; i < 50; ++i) { + sketch.update( + make_array({std::string("key-") + std::to_string(i)}), + make_array({std::string("value-") + std::to_string(i), "extra"}) + ); + } + REQUIRE_FALSE(sketch.is_estimation_mode()); + exercise_ordering(sketch); + } + + SECTION("multiple entries estimation mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + for (int i = 0; i < 10000; ++i) { + sketch.update( + make_array({std::string("key-") + std::to_string(i)}), + make_array({std::string("value-") + std::to_string(i)}) + ); + } + REQUIRE(sketch.is_estimation_mode()); + exercise_ordering(sketch); + } +} + +} /* namespace datasketches */ diff --git a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp index 408223f9..cf589cd0 100644 --- a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp @@ -25,7 +25,7 @@ namespace datasketches { // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project -static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; +static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; TEST_CASE("tuple sketch int", "[serde_compat]") { 
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; From 307fe02179bf4a1776897ffff7bbee1913125402 Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:22:16 +0900 Subject: [PATCH 03/11] test: rollback test file path --- tuple/test/tuple_sketch_deserialize_from_java_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp index cf589cd0..408223f9 100644 --- a/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/tuple_sketch_deserialize_from_java_test.cpp @@ -25,7 +25,7 @@ namespace datasketches { // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project -static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; +static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; TEST_CASE("tuple sketch int", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; From a1e24c80e6336bccacc02a75a256643b5958d3be Mon Sep 17 00:00:00 2001 From: proost Date: Thu, 22 Jan 2026 01:23:42 +0900 Subject: [PATCH 04/11] chore: rollback test directory --- tuple/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 8c561745..3d7ccca3 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -23,7 +23,7 @@ set_target_properties(tuple_test PROPERTIES CXX_STANDARD_REQUIRED YES ) -file(TO_CMAKE_PATH "${CMAKE_SOURCE_DIR}" THETA_TEST_BINARY_PATH) +file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" THETA_TEST_BINARY_PATH) string(APPEND THETA_TEST_BINARY_PATH "/") target_compile_definitions(tuple_test PRIVATE From 9381dcd227a5a06f8e18803dd21aa6c877b46b25 Mon Sep 17 00:00:00 2001 From: proost Date: Sun, 25 Jan 2026 
22:14:47 +0900 Subject: [PATCH 05/11] fix: empty string handling --- tuple/include/array_of_strings_sketch.hpp | 66 +++- .../include/array_of_strings_sketch_impl.hpp | 101 +++--- .../aos_sketch_deserialize_from_java_test.cpp | 311 ++++++++++++------ tuple/test/aos_sketch_serialize_for_java.cpp | 12 +- tuple/test/array_of_strings_sketch_test.cpp | 16 +- 5 files changed, 334 insertions(+), 172 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index a3f8ddd7..4442fd64 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -49,8 +49,11 @@ class default_array_of_strings_update_policy { // serializer/deserializer for an array of strings // Requirements: all strings must be valid UTF-8 and array size must be <= 127. template> -struct array_of_strings_serde { +struct default_array_of_strings_serde { using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + + explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; @@ -59,6 +62,8 @@ struct array_of_strings_serde { size_t size_of_item(const array_of_strings& item) const; private: + Allocator allocator_; + summary_allocator summary_allocator_; static void check_num_nodes(uint8_t num_nodes); static uint32_t compute_total_bytes(const array_of_strings& item); static void check_utf8(const std::string& value); @@ -79,17 +84,41 @@ class compact_array_of_strings_tuple_sketch: using summary_allocator = typename std::allocator_traits::template rebind_alloc; using Base = compact_tuple_sketch; using vector_bytes = typename Base::vector_bytes; - + using Base::serialize; + + /** + * Copy constructor. 
+ * Constructs a compact sketch from another sketch (update or compact) + * @param other sketch to be constructed from + * @param ordered if true make the resulting sketch ordered + */ template compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); - void serialize(std::ostream& os) const; - vector_bytes serialize(unsigned header_size_bytes = 0) const; - + /** + * This method deserializes a sketch from a given stream. + * @param is input stream + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, - const Allocator& allocator = Allocator()); + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + + /** + * This method deserializes a sketch from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, - const Allocator& allocator = Allocator()); + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); private: explicit compact_array_of_strings_tuple_sketch(Base&& base); @@ -97,20 +126,20 @@ class compact_array_of_strings_tuple_sketch: /** * Extended class of update_tuple_sketch for array of strings - * Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
*/ -template> +template class Policy = default_array_of_strings_update_policy, + typename Allocator = std::allocator> class update_array_of_strings_tuple_sketch: public update_tuple_sketch< array, array, - default_array_of_strings_update_policy, + Policy, typename std::allocator_traits::template rebind_alloc> > { public: using array_of_strings = array; using summary_allocator = typename std::allocator_traits::template rebind_alloc; - using policy_type = default_array_of_strings_update_policy; + using policy_type = Policy; using Base = update_tuple_sketch< array_of_strings, array_of_strings, @@ -121,7 +150,18 @@ class update_array_of_strings_tuple_sketch: class builder; using Base::update; + /** + * Updates the sketch with string array for both key and value. + * @param key the given string array key + * @param value the given string array value + */ void update(const array_of_strings& key, const array_of_strings& value); + + /** + * Converts this sketch to a compact sketch (ordered or unordered). 
+ * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ compact_array_of_strings_tuple_sketch compact(bool ordered = true) const; private: @@ -134,8 +174,8 @@ class update_array_of_strings_tuple_sketch: static uint64_t hash_key(const array_of_strings& key); }; -template -class update_array_of_strings_tuple_sketch::builder: +template class Policy, typename Allocator> +class update_array_of_strings_tuple_sketch::builder: public tuple_base_builder { public: builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator()); diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 264f79bf..b95987a0 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -40,9 +40,9 @@ template void default_array_of_strings_update_policy::update( array_of_strings& array, const array_of_strings& input ) const { - const auto length = input.size(); - array = array_of_strings(length, "", allocator_); - for (uint8_t i = 0; i < length; ++i) array[i] = input[i]; + const auto length = static_cast(input.size()); + array = array_of_strings(static_cast(length), "", allocator_); + for (size_t i = 0; i < length; ++i) array[i] = input[i]; } template @@ -53,53 +53,53 @@ void default_array_of_strings_update_policy::update( array = array_of_strings(0, "", allocator_); return; } - const auto length = input->size(); - array = array_of_strings(length, "", allocator_); - for (uint8_t i = 0; i < length; ++i) array[i] = (*input)[i]; + const auto length = static_cast(input->size()); + array = array_of_strings(static_cast(length), "", allocator_); + for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } -template -update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( +template class Policy, typename Allocator> 
+update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const policy_type& policy, const summary_allocator& allocator ): Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} -template -void update_array_of_strings_tuple_sketch::update( +template class Policy, typename Allocator> +void update_array_of_strings_tuple_sketch::update( const array_of_strings& key, const array_of_strings& value ) { const uint64_t hash = hash_key(key); Base::update(hash, value); } -template -uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { +template class Policy, typename Allocator> +uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { XXHash64 hasher(STRING_ARR_HASH_SEED); const auto size = static_cast(key.size()); for (size_t i = 0; i < size; ++i) { - const auto& entry = key[static_cast(i)]; + const auto& entry = key[i]; hasher.add(entry.data(), entry.size()); if (i + 1 < size) hasher.add(",", 1); } return hasher.hash(); } -template -compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { +template class Policy, typename Allocator> +compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { return compact_array_of_strings_tuple_sketch(*this, ordered); } // builder -template -update_array_of_strings_tuple_sketch::builder::builder( +template class Policy, typename Allocator> +update_array_of_strings_tuple_sketch::builder::builder( const policy_type& policy, const summary_allocator& allocator ): tuple_base_builder(policy, allocator) {} -template -auto update_array_of_strings_tuple_sketch::builder::build() const -> update_array_of_strings_tuple_sketch { +template class Policy, typename Allocator> +auto update_array_of_strings_tuple_sketch::builder::build() const -> 
update_array_of_strings_tuple_sketch { return update_array_of_strings_tuple_sketch( this->starting_lg_size(), this->lg_k_, @@ -124,35 +124,32 @@ compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple ): Base(std::move(base)) {} template -void compact_array_of_strings_tuple_sketch::serialize(std::ostream& os) const { - Base::serialize(os, array_of_strings_serde()); -} - -template -auto compact_array_of_strings_tuple_sketch::serialize(unsigned header_size_bytes) const -> vector_bytes { - return Base::serialize(header_size_bytes, array_of_strings_serde()); -} - -template +template auto compact_array_of_strings_tuple_sketch::deserialize( - std::istream& is, uint64_t seed, const Allocator& allocator + std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator ) -> compact_array_of_strings_tuple_sketch { summary_allocator alloc(allocator); - auto base = Base::deserialize(is, seed, array_of_strings_serde(), alloc); + auto base = Base::deserialize(is, seed, sd, alloc); return compact_array_of_strings_tuple_sketch(std::move(base)); } template +template auto compact_array_of_strings_tuple_sketch::deserialize( - const void* bytes, size_t size, uint64_t seed, const Allocator& allocator + const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator ) -> compact_array_of_strings_tuple_sketch { summary_allocator alloc(allocator); - auto base = Base::deserialize(bytes, size, seed, array_of_strings_serde(), alloc); + auto base = Base::deserialize(bytes, size, seed, sd, alloc); return compact_array_of_strings_tuple_sketch(std::move(base)); } template -void array_of_strings_serde::serialize( +default_array_of_strings_serde::default_array_of_strings_serde(const Allocator& allocator): + allocator_(allocator), + summary_allocator_(allocator) {} + +template +void default_array_of_strings_serde::serialize( std::ostream& os, const array_of_strings* items, unsigned num ) const { for (unsigned i = 0; i < num; ++i) { @@ -171,27 
+168,34 @@ void array_of_strings_serde::serialize( } template -void array_of_strings_serde::deserialize( +void default_array_of_strings_serde::deserialize( std::istream& is, array_of_strings* items, unsigned num ) const { for (unsigned i = 0; i < num; ++i) { read(is); // total_bytes + if (!is) throw std::runtime_error("array_of_strings stream read failed"); const uint8_t num_nodes = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", Allocator()); + array_of_strings array(num_nodes, "", allocator_); for (uint8_t j = 0; j < num_nodes; ++j) { const uint32_t length = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); std::string value(length, '\0'); - is.read(&value[0], length); + if (length != 0) { + is.read(value.data(), length); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + } check_utf8(value); array[j] = std::move(value); } - new (&items[i]) array_of_strings(std::move(array)); + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); } } template -size_t array_of_strings_serde::serialize( +size_t default_array_of_strings_serde::serialize( void* ptr, size_t capacity, const array_of_strings* items, unsigned num ) const { uint8_t* ptr8 = static_cast(ptr); @@ -216,7 +220,7 @@ size_t array_of_strings_serde::serialize( } template -size_t array_of_strings_serde::deserialize( +size_t default_array_of_strings_serde::deserialize( const void* ptr, size_t capacity, array_of_strings* items, unsigned num ) const { const uint8_t* ptr8 = static_cast(ptr); @@ -231,34 +235,37 @@ size_t array_of_strings_serde::deserialize( uint8_t num_nodes; bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); check_num_nodes(num_nodes); - array_of_strings array(num_nodes, "", Allocator()); + array_of_strings array(num_nodes, "", allocator_); for (uint8_t j = 0; j < 
num_nodes; ++j) { uint32_t length; bytes_read += copy_from_mem(ptr8 + bytes_read, length); std::string value(length, '\0'); - bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + if (length != 0) { + bytes_read += copy_from_mem(ptr8 + bytes_read, value.data(), length); + } check_utf8(value); array[j] = std::move(value); } - new (&items[i]) array_of_strings(std::move(array)); + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); } return bytes_read; } template -size_t array_of_strings_serde::size_of_item(const array_of_strings& item) const { +size_t default_array_of_strings_serde::size_of_item(const array_of_strings& item) const { return compute_total_bytes(item); } template -void array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { +void default_array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { if (num_nodes > 127) { throw std::runtime_error("array_of_strings size exceeds 127"); } } template -uint32_t array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { +uint32_t default_array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { const auto count = item.size(); check_num_nodes(static_cast(count)); size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); @@ -273,7 +280,7 @@ uint32_t array_of_strings_serde::compute_total_bytes(const array_of_s } template -void array_of_strings_serde::check_utf8(const std::string& value) { +void default_array_of_strings_serde::check_utf8(const std::string& value) { if (!utf8::is_valid(value.begin(), value.end())) { throw std::runtime_error("array_of_strings contains invalid UTF-8"); } diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp index 15e9d6dd..af37d6c2 100644 --- a/tuple/test/aos_sketch_deserialize_from_java_test.cpp +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -26,21 +26,53 @@ 
namespace datasketches { // assume the binary sketches for this test have been generated by datasketches-java code // in the subdirectory called "java" in the root directory of this project - static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "java/"; + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; + + static std::vector read_binary_file(const std::string& path) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + is.seekg(0, std::ios::end); + const auto size = static_cast(is.tellg()); + is.seekg(0, std::ios::beg); + std::vector bytes(size); + if (size != 0) { + is.read(reinterpret_cast(bytes.data()), size); + } + return bytes; + } TEST_CASE("aos sketch one value", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; for (const unsigned n: n_arr) { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE(sketch.is_empty() == (n == 0)); - REQUIRE(sketch.is_estimation_mode() == (n > 1000)); - REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 1); + const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& 
entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } } } } @@ -48,125 +80,204 @@ namespace datasketches { TEST_CASE("aos sketch three values", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; for (const unsigned n: n_arr) { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE(sketch.is_empty() == (n == 0)); - REQUIRE(sketch.is_estimation_mode() == (n > 1000)); - REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 3); + const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + 
} + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } } } } TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE_FALSE(sketch.is_empty()); - REQUIRE(sketch.get_num_retained() == 0); + const auto path = testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } } TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; for (const unsigned n: n_arr) { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk", std::ios::binary); - 
const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE(sketch.is_empty() == (n == 0)); - REQUIRE(sketch.is_estimation_mode() == (n > 1000)); - REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 1); + const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } } } } TEST_CASE("aos sketch unicode strings", "[serde_compat]") { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_unicode_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE_FALSE(sketch.is_empty()); - REQUIRE_FALSE(sketch.is_estimation_mode()); - REQUIRE(sketch.get_num_retained() == 3); + const auto path = testBinaryInputPath + 
"aos_unicode_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); - const std::vector> expected_values = { - {"밸류", "값"}, - {"📦", "🎁"}, - {"ценить1", "ценить2"} - }; - std::vector matched(expected_values.size(), false); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); - REQUIRE(entry.second.size() == 2); + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); - bool found = false; - for (size_t i = 0; i < expected_values.size(); ++i) { - if (matched[i]) continue; - const auto& expected = expected_values[i]; - if (entry.second.size() != expected.size()) continue; - bool equal = true; - for (size_t j = 0; j < expected.size(); ++j) { - if (entry.second[j] != expected[j]) { - equal = false; + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; break; } } - if (equal) { - matched[i] = true; - found = true; - break; - } + REQUIRE(found); } - REQUIRE(found); + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes 
= read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); } - for (bool found: matched) REQUIRE(found); } TEST_CASE("aos sketch empty strings", "[serde_compat]") { - std::ifstream is; - is.exceptions(std::ios::failbit | std::ios::badbit); - is.open(testBinaryInputPath + "aos_empty_strings_java.sk", std::ios::binary); - const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize(is); - REQUIRE_FALSE(sketch.is_empty()); - REQUIRE_FALSE(sketch.is_estimation_mode()); - REQUIRE(sketch.get_num_retained() == 3); - const std::vector> expected_values = { - {"empty_key_value"}, - {""}, - {"", ""} - }; - std::vector matched(expected_values.size(), false); - for (const auto& entry: sketch) { - REQUIRE(entry.first < sketch.get_theta64()); + const auto path = testBinaryInputPath + "aos_empty_strings_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); - bool found = false; - for (size_t i = 0; i < expected_values.size(); ++i) { - if (matched[i]) continue; - const auto& expected = expected_values[i]; - if (entry.second.size() != expected.size()) continue; - bool equal = true; - for (size_t j = 0; j < expected.size(); ++j) { - if (entry.second[j] != expected[j]) { - equal = false; + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) 
{ + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; break; } } - if (equal) { - matched[i] = true; - found = true; - break; - } + REQUIRE(found); } - REQUIRE(found); + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); } - for (bool found: matched) REQUIRE(found); } } diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp index 3a154132..db506825 100644 --- a/tuple/test/aos_sketch_serialize_for_java.cpp +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -52,7 +52,7 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } @@ -72,7 +72,7 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } @@ -90,7 +90,7 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { REQUIRE_FALSE(sketch.is_empty()); 
REQUIRE(sketch.get_num_retained() == 0); std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { @@ -108,7 +108,7 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } @@ -129,7 +129,7 @@ TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { @@ -149,7 +149,7 @@ TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); - sketch.compact().serialize(os); + sketch.compact().serialize(os, default_array_of_strings_serde<>()); } } /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 45e554bd..3e3673aa 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -65,7 +65,7 @@ TEST_CASE("aos update policy", "[tuple_sketch]") { values[0] = "one"; values[1] = "two"; - policy.update(values, static_cast(nullptr)); + policy.update(values, nullptr); REQUIRE(values.size() == 0); } @@ -162,11 +162,15 @@ TEST_CASE("aos sketch: serialize deserialize", 
"[tuple_sketch]") { auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { std::stringstream ss; ss.exceptions(std::ios::failbit | std::ios::badbit); - compact_sketch.serialize(ss); - auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize(ss); - - auto bytes = compact_sketch.serialize(); - auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize(bytes.data(), bytes.size()); + compact_sketch.serialize(ss, default_array_of_strings_serde<>()); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize( + ss, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>()); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { &deserialized_stream, From 46c945d5c9f59a0bfd1e7afe6c8256fdd3e8d4f4 Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 27 Jan 2026 01:11:18 +0900 Subject: [PATCH 06/11] refactor: remove update sketch --- tuple/include/array_of_strings_sketch.hpp | 79 ++++++------------- .../include/array_of_strings_sketch_impl.hpp | 55 +++---------- tuple/test/aos_sketch_serialize_for_java.cpp | 43 ++++------ tuple/test/array_of_strings_sketch_test.cpp | 43 ++++++---- 4 files changed, 79 insertions(+), 141 deletions(-) diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp index 4442fd64..db147723 100644 --- a/tuple/include/array_of_strings_sketch.hpp +++ b/tuple/include/array_of_strings_sketch.hpp @@ -69,6 +69,12 @@ struct default_array_of_strings_serde { static void check_utf8(const std::string& value); }; +/** + * Hashes an array of strings using ArrayOfStrings-compatible hashing. 
+ */ +template> +uint64_t hash_array_of_strings_key(const array& key); + /** * Extended class of compact_tuple_sketch for array of strings * Requirements: all strings must be valid UTF-8 and array size must be <= 127. @@ -125,63 +131,26 @@ class compact_array_of_strings_tuple_sketch: }; /** - * Extended class of update_tuple_sketch for array of strings + * Convenience alias for update_tuple_sketch for array of strings */ -template class Policy = default_array_of_strings_update_policy, - typename Allocator = std::allocator> -class update_array_of_strings_tuple_sketch: - public update_tuple_sketch< - array, - array, - Policy, - typename std::allocator_traits::template rebind_alloc> - > { -public: - using array_of_strings = array; - using summary_allocator = typename std::allocator_traits::template rebind_alloc; - using policy_type = Policy; - using Base = update_tuple_sketch< - array_of_strings, - array_of_strings, - policy_type, - summary_allocator - >; - using resize_factor = typename Base::resize_factor; - class builder; - using Base::update; - - /** - * Updates the sketch with string array for both key and value. - * @param key the given string array key - * @param value the given string array value - */ - void update(const array_of_strings& key, const array_of_strings& value); +template, + typename Policy = default_array_of_strings_update_policy> +using update_array_of_strings_tuple_sketch = update_tuple_sketch< + array, + array, + Policy, + typename std::allocator_traits::template rebind_alloc> +>; - /** - * Converts this sketch to a compact sketch (ordered or unordered). 
- * @param ordered optional flag to specify if an ordered sketch should be produced - * @return compact array of strings sketch - */ - compact_array_of_strings_tuple_sketch compact(bool ordered = true) const; - -private: - update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, - uint64_t seed, const policy_type& policy, const summary_allocator& allocator); - - // Matches Java Util.PRIME for ArrayOfStrings key hashing. - static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; - - static uint64_t hash_key(const array_of_strings& key); -}; - -template class Policy, typename Allocator> -class update_array_of_strings_tuple_sketch::builder: - public tuple_base_builder { -public: - builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator()); - - update_array_of_strings_tuple_sketch build() const; -}; +/** + * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered). 
+ * @param sketch input sketch + * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ +template, typename Policy = default_array_of_strings_update_policy> +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); } /* namespace datasketches */ diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index b95987a0..01a3daba 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -58,23 +58,10 @@ void default_array_of_strings_update_policy::update( for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; } -template class Policy, typename Allocator> -update_array_of_strings_tuple_sketch::update_array_of_strings_tuple_sketch( - uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, - uint64_t seed, const policy_type& policy, const summary_allocator& allocator -): -Base(lg_cur_size, lg_nom_size, rf, p, theta, seed, policy, allocator) {} - -template class Policy, typename Allocator> -void update_array_of_strings_tuple_sketch::update( - const array_of_strings& key, const array_of_strings& value -) { - const uint64_t hash = hash_key(key); - Base::update(hash, value); -} - -template class Policy, typename Allocator> -uint64_t update_array_of_strings_tuple_sketch::hash_key(const array_of_strings& key) { +template +uint64_t hash_array_of_strings_key(const array& key) { + // Matches Java Util.PRIME for ArrayOfStrings key hashing. 
+ static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; XXHash64 hasher(STRING_ARR_HASH_SEED); const auto size = static_cast(key.size()); for (size_t i = 0; i < size; ++i) { @@ -85,31 +72,11 @@ uint64_t update_array_of_strings_tuple_sketch::hash_key(const return hasher.hash(); } -template class Policy, typename Allocator> -compact_array_of_strings_tuple_sketch update_array_of_strings_tuple_sketch::compact(bool ordered) const { - return compact_array_of_strings_tuple_sketch(*this, ordered); -} - -// builder - -template class Policy, typename Allocator> -update_array_of_strings_tuple_sketch::builder::builder( - const policy_type& policy, const summary_allocator& allocator -): -tuple_base_builder(policy, allocator) {} - -template class Policy, typename Allocator> -auto update_array_of_strings_tuple_sketch::builder::build() const -> update_array_of_strings_tuple_sketch { - return update_array_of_strings_tuple_sketch( - this->starting_lg_size(), - this->lg_k_, - this->rf_, - this->p_, - this->starting_theta(), - this->seed_, - this->policy_, - this->allocator_ - ); +template +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered +) { + return compact_array_of_strings_tuple_sketch(sketch, ordered); } template @@ -183,7 +150,7 @@ void default_array_of_strings_serde::deserialize( if (!is) throw std::runtime_error("array_of_strings stream read failed"); std::string value(length, '\0'); if (length != 0) { - is.read(value.data(), length); + is.read(&value[0], length); if (!is) throw std::runtime_error("array_of_strings stream read failed"); } check_utf8(value); @@ -241,7 +208,7 @@ size_t default_array_of_strings_serde::deserialize( bytes_read += copy_from_mem(ptr8 + bytes_read, length); std::string value(length, '\0'); if (length != 0) { - bytes_read += copy_from_mem(ptr8 + bytes_read, value.data(), length); + bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); } 
check_utf8(value); array[j] = std::move(value); diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp index db506825..c6eb0dfc 100644 --- a/tuple/test/aos_sketch_serialize_for_java.cpp +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -26,7 +26,7 @@ namespace datasketches { using aos_sketch = update_array_of_strings_tuple_sketch<>; -using array_of_strings = aos_sketch::array_of_strings; +using array_of_strings = array; static array_of_strings make_array(std::initializer_list items) { array_of_strings array(static_cast(items.size()), ""); @@ -47,12 +47,12 @@ TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { key[0] = std::to_string(i); array_of_strings value(1, ""); value[0] = "value" + std::to_string(i); - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } @@ -67,12 +67,12 @@ TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { value[0] = "a" + std::to_string(i); value[1] = "b" + std::to_string(i); value[2] = "c" + std::to_string(i); - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } @@ -86,11 +86,11 @@ TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { key[0] = "key1"; array_of_strings value(1, 
""); value[0] = "value1"; - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 0); std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { @@ -103,53 +103,44 @@ TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { key[1] = "subkey" + std::to_string(i % 10); array_of_strings value(1, ""); value[0] = "value" + std::to_string(i); - sketch.update(key, value); + sketch.update(hash_array_of_strings_key(key), value); } REQUIRE(sketch.is_empty() == (n == 0)); REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { auto sketch = aos_sketch::builder().build(); sketch.update( - make_array({u8"키", u8"열쇠"}), + hash_array_of_strings_key(make_array({u8"키", u8"열쇠"})), make_array({u8"밸류", u8"값"}) ); sketch.update( - make_array({u8"🔑", u8"🗝️"}), + hash_array_of_strings_key(make_array({u8"🔑", u8"🗝️"})), make_array({u8"📦", u8"🎁"}) ); sketch.update( - make_array({u8"ключ1", u8"ключ2"}), + hash_array_of_strings_key(make_array({u8"ключ1", u8"ключ2"})), make_array({u8"ценить1", u8"ценить2"}) ); REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, 
default_array_of_strings_serde<>()); } TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { auto sketch = aos_sketch::builder().build(); - sketch.update( - make_array({""}), - make_array({"empty_key_value"}) - ); - sketch.update( - make_array({"empty_value_key"}), - make_array({""}) - ); - sketch.update( - make_array({"", ""}), - make_array({"", ""}) - ); + sketch.update(hash_array_of_strings_key(make_array({""})), make_array({"empty_key_value"})); + sketch.update(hash_array_of_strings_key(make_array({"empty_value_key"})), make_array({""})); + sketch.update(hash_array_of_strings_key(make_array({"", ""})), make_array({"", ""})); REQUIRE_FALSE(sketch.is_empty()); REQUIRE(sketch.get_num_retained() == 3); std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); - sketch.compact().serialize(os, default_array_of_strings_serde<>()); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); } } /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 3e3673aa..59cc04ca 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -18,7 +18,6 @@ */ #include -#include #include #include #include @@ -94,8 +93,14 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { SECTION("same key replaces summary") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({"alpha", "beta"}), make_array({"first"})); - sketch.update(make_array({"alpha", "beta"}), make_array({"second", "third"})); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"first"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"second", "third"}) + ); REQUIRE(sketch.get_num_retained() == 1); @@ -109,8 +114,14 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { SECTION("distinct keys retain 
multiple entries") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({"a", "bc"}), make_array({"one"})); - sketch.update(make_array({"ab", "c"}), make_array({"two"})); + sketch.update( + hash_array_of_strings_key(make_array({"a", "bc"})), + make_array({"one"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"ab", "c"})), + make_array({"two"}) + ); REQUIRE(sketch.get_num_retained() == 2); @@ -128,7 +139,7 @@ TEST_CASE("aos sketch update", "[tuple_sketch]") { SECTION("empty key") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({}), make_array({"value"})); + sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"})); REQUIRE(sketch.get_num_retained() == 1); auto it = sketch.begin(); @@ -201,46 +212,46 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { } }; - auto exercise_ordering = [&](const update_array_of_strings_tuple_sketch<>& sketch) { - auto ordered = sketch.compact(true); - auto unordered = sketch.compact(false); + auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) { + auto ordered = compact_array_of_strings_sketch(sketch, true); + auto unordered = compact_array_of_strings_sketch(sketch, false); check_round_trip(ordered); check_round_trip(unordered); }; SECTION("empty sketch") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - exercise_ordering(sketch); + run_tests(sketch); } SECTION("single entry sketch") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); - sketch.update(make_array({"key"}), make_array({"value"})); - exercise_ordering(sketch); + sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"})); + run_tests(sketch); } SECTION("multiple entries exact mode") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); for (int i = 0; i < 50; ++i) { sketch.update( - 
make_array({std::string("key-") + std::to_string(i)}), + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), make_array({std::string("value-") + std::to_string(i), "extra"}) ); } REQUIRE_FALSE(sketch.is_estimation_mode()); - exercise_ordering(sketch); + run_tests(sketch); } SECTION("multiple entries estimation mode") { auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); for (int i = 0; i < 10000; ++i) { sketch.update( - make_array({std::string("key-") + std::to_string(i)}), + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), make_array({std::string("value-") + std::to_string(i)}) ); } REQUIRE(sketch.is_estimation_mode()); - exercise_ordering(sketch); + run_tests(sketch); } } From 342248f294020a362a35c34d9b40ea7224d27438 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 00:37:41 +0900 Subject: [PATCH 07/11] fix: control array and element life cycle --- tuple/include/array_tuple_sketch.hpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 547b240c..54a000e8 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -34,17 +34,22 @@ class array { public: using value_type = T; using allocator_type = Allocator; + using alloc_traits = std::allocator_traits; - explicit array(uint8_t size, T value, const Allocator& allocator = Allocator()): - allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - std::fill(array_, array_ + size_, value); + explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): + allocator_(allocator), size_(size), array_(size_ == 0 ? 
nullptr : allocator_.allocate(size_)) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } } array(const array& other): allocator_(other.allocator_), size_(other.size_), - array_(allocator_.allocate(size_)) + array_(size_ == 0 ? nullptr : allocator_.allocate(size_)) { - std::copy(other.array_, other.array_ + size_, array_); + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -52,9 +57,15 @@ class array { array_(other.array_) { other.array_ = nullptr; + other.size_ = 0; } ~array() { - if (array_ != nullptr) allocator_.deallocate(array_, size_); + if (array_ != nullptr) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + allocator_.deallocate(array_, size_); + } } array& operator=(const array& other) { array copy(other); From 2c712e99ff17d8c6eff3c2c1dc53db5b10e0613d Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 00:48:12 +0900 Subject: [PATCH 08/11] fix: null ptr to empty array --- tuple/include/array_tuple_sketch.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 54a000e8..416816e0 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -37,7 +37,7 @@ class array { using alloc_traits = std::allocator_traits; explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): - allocator_(allocator), size_(size), array_(size_ == 0 ? 
nullptr : allocator_.allocate(size_)) { + allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { for (uint8_t i = 0; i < size_; ++i) { alloc_traits::construct(allocator_, array_ + i, value); } @@ -45,7 +45,7 @@ class array { array(const array& other): allocator_(other.allocator_), size_(other.size_), - array_(size_ == 0 ? nullptr : allocator_.allocate(size_)) + array_(allocator_.allocate(size_)) { for (uint8_t i = 0; i < size_; ++i) { alloc_traits::construct(allocator_, array_ + i, other.array_[i]); From d463bfb159cd338fd88094ba600a50cfcc0e34d2 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 00:54:32 +0900 Subject: [PATCH 09/11] test: serde validation cases --- .../include/array_of_strings_sketch_impl.hpp | 3 --- tuple/test/array_of_strings_sketch_test.cpp | 26 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp index 01a3daba..f5fa0652 100644 --- a/tuple/include/array_of_strings_sketch_impl.hpp +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -240,9 +240,6 @@ uint32_t default_array_of_strings_serde::compute_total_bytes(const ar for (uint32_t j = 0; j < count; ++j) { total += data[j].size(); } - if (total > std::numeric_limits::max()) { - throw std::runtime_error("array_of_strings serialized size exceeds uint32_t max"); - } return static_cast(total); } diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp index 59cc04ca..74b225b0 100644 --- a/tuple/test/array_of_strings_sketch_test.cpp +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -255,4 +255,30 @@ TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { } } +TEST_CASE("aos serde validation", "[tuple_sketch]") { + default_array_of_strings_serde<> serde; + + SECTION("invalid utf8 rejected") { + array_of_strings array(1, "", std::allocator()); + const std::string invalid_utf8("\xC3\x28", 
2); + array[0] = invalid_utf8; + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("invalid UTF-8") + ); + } + + SECTION("too many nodes rejected") { + array_of_strings array(128, "", std::allocator()); + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("size exceeds 127") + ); + } +} + } /* namespace datasketches */ From 3b3a13de52445e663472f630bd41080810977649 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 03:32:39 +0900 Subject: [PATCH 10/11] perf: avoid allocation if data type is primitive --- tuple/include/array_tuple_sketch.hpp | 38 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 416816e0..9baa2010 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -22,6 +22,9 @@ #include #include +#include +#include +#include #include "serde.hpp" #include "tuple_sketch.hpp" @@ -38,18 +41,14 @@ class array { explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - for (uint8_t i = 0; i < size_; ++i) { - alloc_traits::construct(allocator_, array_ + i, value); - } + init_values(value, std::is_trivially_copyable()); } array(const array& other): allocator_(other.allocator_), size_(other.size_), array_(allocator_.allocate(size_)) { - for (uint8_t i = 0; i < size_; ++i) { - alloc_traits::construct(allocator_, array_ + i, other.array_[i]); - } + copy_from(other, std::is_trivially_copyable()); } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -61,9 +60,7 @@ class array { } ~array() { if (array_ != nullptr) { - for (uint8_t i = 0; i < size_; ++i) { - 
alloc_traits::destroy(allocator_, array_ + i); - } + destroy_values(std::is_trivially_destructible()); allocator_.deallocate(array_, size_); } } @@ -90,6 +87,29 @@ class array { return true; } private: + void init_values(const T& value, std::true_type) { + std::fill(array_, array_ + size_, value); + } + void init_values(const T& value, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } + } + void copy_from(const array& other, std::true_type) { + std::copy(other.array_, other.array_ + size_, array_); + } + void copy_from(const array& other, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } + } + void destroy_values(std::true_type) {} + void destroy_values(std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + } + Allocator allocator_; uint8_t size_; T* array_; From 189d22de11b53ff48bb279d066586abbd5af71b9 Mon Sep 17 00:00:00 2001 From: proost Date: Wed, 28 Jan 2026 03:33:57 +0900 Subject: [PATCH 11/11] chore: remove unused header --- tuple/include/array_tuple_sketch.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 9baa2010..03761ff4 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include "serde.hpp"