diff --git a/NOTICE b/NOTICE index 11ba6f6c..6a2376d9 100644 --- a/NOTICE +++ b/NOTICE @@ -10,3 +10,6 @@ The Apache Software Foundation (http://www.apache.org/). Prior to moving to ASF, the software for this project was developed at Yahoo Inc. (https://developer.yahoo.com). + +This product includes utf8cpp (https://github.com/nemtrif/utfcpp), +licensed under the Boost Software License, Version 1.0. diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 8514433b..2d5c7330 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -51,3 +51,19 @@ install(FILES include/serde.hpp include/xxhash64.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") + +# install(FILES) does not keep source sub-directories, so the utf8/ headers need their +# own destination: utf8.h includes them as "utf8/checked.h" and "utf8/unchecked.h". +install(FILES + include/third_party/utf8cpp/utf8.h + include/third_party/utf8cpp/LICENSE + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp") + +install(FILES + include/third_party/utf8cpp/utf8/checked.h + include/third_party/utf8cpp/utf8/core.h + include/third_party/utf8cpp/utf8/cpp11.h + include/third_party/utf8cpp/utf8/cpp17.h + include/third_party/utf8cpp/utf8/cpp20.h + include/third_party/utf8cpp/utf8/unchecked.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches/third_party/utf8cpp/utf8") diff --git a/common/include/third_party/utf8cpp/LICENSE b/common/include/third_party/utf8cpp/LICENSE new file mode 100644 index 00000000..36b7cd93 --- /dev/null +++ b/common/include/third_party/utf8cpp/LICENSE @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the 
Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/common/include/third_party/utf8cpp/utf8.h b/common/include/third_party/utf8cpp/utf8.h new file mode 100644 index 00000000..b5135309 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8.h @@ -0,0 +1,46 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +/* +To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro +and set it to one of the values used by the __cplusplus predefined macro. + +For instance, + #define UTF_CPP_CPLUSPLUS 199711L +will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. +Some library features will be disabled. + +If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. +*/ + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/common/include/third_party/utf8cpp/utf8/checked.h b/common/include/third_party/utf8cpp/utf8/checked.h new file mode 100644 index 00000000..96ceb4d5 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/checked.h @@ -0,0 +1,359 @@ +// Copyright 2006-2016 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of 
machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include <stdexcept> + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + utfchar32_t cp; + public: + invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } + utfchar32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + utfchar8_t u8; + public: + invalid_utf8 (utfchar8_t u) : u8(u) {} + invalid_utf8 (char c) : u8(static_cast<utfchar8_t>(c)) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } + utfchar8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + utfchar16_t u16; + public: + invalid_utf16 (utfchar16_t u) : u16(u) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } + utfchar16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } + }; + + /// The library API - 
functions intended to be called by the users + + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append(cp, result); + } + + inline void append(utfchar32_t cp, std::string& s) + { + append(cp, std::back_inserter(s)); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline 
std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it, octet_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(static_cast(*it)); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + utfchar32_t next16(word_iterator& it, word_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); + if (err_code == internal::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + utfchar32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::next(it, end); + } + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 
0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = static_cast(utf8::internal::mask16(*start++)); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + const utfchar32_t trail_surrogate = static_cast(utf8::internal::mask16(*start++)); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + const utfchar32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t 
difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#include "cpp17.h" +#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later +#include "cpp11.h" +#endif // C++ 11 or later + +#endif //header guard + diff --git a/common/include/third_party/utf8cpp/utf8/core.h b/common/include/third_party/utf8cpp/utf8/core.h new file mode 100644 index 00000000..8e128c18 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/core.h @@ -0,0 +1,500 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software 
and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include <iterator> +#include <cstring> +#include <string> + +// Determine the C++ standard version. +// If the user defines UTF_CPP_CPLUSPLUS, use that. 
+// Otherwise, trust the unreliable predefined macro __cplusplus + +#if !defined UTF_CPP_CPLUSPLUS + #define UTF_CPP_CPLUSPLUS __cplusplus +#endif + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #define UTF_CPP_OVERRIDE override + #define UTF_CPP_NOEXCEPT noexcept + #define UTF_CPP_STATIC_ASSERT(condition) static_assert(condition, "UTFCPP static assert"); +#else // C++ 98/03 + #define UTF_CPP_OVERRIDE + #define UTF_CPP_NOEXCEPT throw() + // Not worth simulating static_assert: + #define UTF_CPP_STATIC_ASSERT(condition) (void)(condition); +#endif // C++ 11 or later + + +namespace utf8 +{ +// The typedefs for 8-bit, 16-bit and 32-bit code units +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later + typedef char8_t utfchar8_t; + #else // C++ 11/14/17 + typedef unsigned char utfchar8_t; + #endif + typedef char16_t utfchar16_t; + typedef char32_t utfchar32_t; +#else // C++ 98/03 + typedef unsigned char utfchar8_t; + typedef unsigned short utfchar16_t; + typedef unsigned int utfchar32_t; +#endif // C++ 11 or later + +// Helper code - not intended to be directly called by the library users. 
May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; + const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; + const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + + // Maximum valid value for a Unicode code point + const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline utfchar8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + + template + inline utfchar16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + inline bool is_lead_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(LEAD_SURROGATE_MAX)); + } + + inline bool is_trail_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(TRAIL_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + inline bool is_surrogate(utfchar32_t cp) + { + return (cp >= static_cast(LEAD_SURROGATE_MIN) && cp <= static_cast(TRAIL_SURROGATE_MAX)); + } + + inline bool is_code_point_valid(utfchar32_t cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + + template + int sequence_length(octet_iterator lead_it) + { + const utfchar8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + inline bool is_overlong_sequence(utfchar32_t cp, int 
length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, const octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == 
end) + return NOT_ENOUGH_ROOM; + + code_point = static_cast(utf8::internal::mask8(*it)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((utf8::internal::mask8(*it) << 6) & 0xfff)); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = static_cast(code_point + ((*it) & 0x3f)); + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + utfchar32_t cp = 0; + // Determine the sequence length based on the lead octet + const int length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + utfchar32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + + template + utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) + { + // Make sure the iterator dereferences a large enough type + typedef typename std::iterator_traits::value_type word_type; + UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); + // Check the edge case: + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + word_iterator original_it = it; + + utf_error err = UTF8_OK; + + const utfchar16_t first_word = *it++; + if (!is_surrogate(first_word)) { + code_point = first_word; + return UTF8_OK; + } + else { + if (it == end) + err = NOT_ENOUGH_ROOM; + else if (is_lead_surrogate(first_word)) { + const utfchar16_t second_word = *it++; + if (is_trail_surrogate(static_cast(second_word))) { + code_point = static_cast(first_word << 10) + static_cast(second_word) + SURROGATE_OFFSET; + return UTF8_OK; + } else + err = INCOMPLETE_SEQUENCE; + + } else { + err = INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + + // Internal implementation of both checked and unchecked append() function + // This function will be invoked by the overloads below, as they will know + // the octet_type. 
+ template <typename octet_iterator, typename octet_type> + octet_iterator append(utfchar32_t cp, octet_iterator result) { + if (cp < 0x80) // one octet + *(result++) = static_cast<octet_type>(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0); + *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0); + *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0); + *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); + } + return result; + } + + // One of the following overloads will be invoked from the API calls + + // A simple (but dangerous) case: the caller appends byte(s) to a char array + inline char* append(utfchar32_t cp, char* result) { + return append<char*, char>(cp, result); + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append(cp, std::back_inserter(str)); + template <typename container_type> + std::back_insert_iterator<container_type> append + (utfchar32_t cp, std::back_insert_iterator<container_type> result) { + return append<std::back_insert_iterator<container_type>, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine octet_type + // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. + template <typename octet_iterator> + octet_iterator append(utfchar32_t cp, octet_iterator result) { + return append<octet_iterator, utfchar8_t>(cp, result); + } + + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. 
+ template + word_iterator append16(utfchar32_t cp, word_iterator result) { + UTF_CPP_STATIC_ASSERT(sizeof(word_type) >= sizeof(utfchar16_t)); + if (is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append16(cp, std::back_inserter(str)); + template + std::back_insert_iterator append16 + (utfchar32_t cp, std::back_insert_iterator result) { + return append16, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16(cp, result); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + inline const char* find_invalid(const char* str) + { + const char* end = str + std::strlen(str); + return find_invalid(str, end); + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string::npos : static_cast(invalid - s.begin()); + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + inline bool is_valid(const char* str) + { + return (*(utf8::find_invalid(str)) == '\0'); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp11.h b/common/include/third_party/utf8cpp/utf8/cpp11.h new file mode 100644 index 00000000..691633c8 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp11.h @@ -0,0 +1,70 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 +#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 + +#include "checked.h" + +namespace utf8 +{ + inline void append16(utfchar32_t cp, std::u16string& s) + { + append16(cp, std::back_inserter(s)); + } + + inline std::string utf16to8(const std::u16string& s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(const std::u32string& s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp17.h b/common/include/third_party/utf8cpp/utf8/cpp17.h new file mode 100644 index 00000000..07587300 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp17.h @@ -0,0 +1,96 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and 
transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp11.h" + +namespace utf8 +{ + inline std::string utf16to8(std::u16string_view s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(std::string_view s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(std::u32string_view s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(std::string_view s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(std::string_view s) + { + std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(std::string_view s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(std::string_view s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(std::string_view s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(std::string_view s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/cpp20.h b/common/include/third_party/utf8cpp/utf8/cpp20.h new file mode 100644 index 00000000..07b61d0f --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby 
granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/common/include/third_party/utf8cpp/utf8/unchecked.h b/common/include/third_party/utf8cpp/utf8/unchecked.h new file mode 100644 index 00000000..173d0302 --- /dev/null +++ b/common/include/third_party/utf8cpp/utf8/unchecked.h @@ -0,0 +1,286 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + return internal::append(cp, result); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append(replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append(replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, 
output_iterator out) + { + static const utfchar32_t replacement_marker = static_cast(utf8::internal::mask16(0xfffd)); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it) + { + utfchar32_t cp = utf8::internal::mask8(*it); + switch (utf8::internal::sequence_length(it)) { + case 1: + break; + case 2: + ++it; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp = static_cast(cp + ((*it) & 0x3f)); + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp = static_cast(cp + ((utf8::internal::mask8(*it) << 6) & 0xfff)); + ++it; + cp = static_cast(cp + ((*it) & 0x3f)); + break; + } + ++it; + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + utfchar32_t next16(word_iterator& it) + { + utfchar32_t cp = utf8::internal::mask16(*it++); + if (utf8::internal::is_lead_surrogate(cp)) + return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; + return cp; + } + + template + utfchar32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + template + void advance(octet_iterator& it, distance_type n) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for 
(distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } + } + + template + typename std::iterator_traits::difference_type + distance(octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start == end) + return result; + utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + utfchar32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + 
iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + +#endif // header guard + diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt index 4b0a48c7..54df11ee 100644 --- a/tuple/CMakeLists.txt +++ b/tuple/CMakeLists.txt @@ -54,4 +54,6 @@ install(FILES include/array_tuple_intersection_impl.hpp include/array_tuple_a_not_b.hpp include/array_tuple_a_not_b_impl.hpp + include/array_of_strings_sketch.hpp + include/array_of_strings_sketch_impl.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/tuple/include/array_of_strings_sketch.hpp b/tuple/include/array_of_strings_sketch.hpp new file mode 100644 index 00000000..db147723 --- /dev/null +++ b/tuple/include/array_of_strings_sketch.hpp @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_HPP_ + +#include +#include + +#include "array_tuple_sketch.hpp" +#include "xxhash64.h" + +namespace datasketches { + +// default update policy for an array of strings +template> +class default_array_of_strings_update_policy { +public: + using array_of_strings = array; + + explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator()); + + array_of_strings create() const; + + void update(array_of_strings& array, const array_of_strings& input) const; + + void update(array_of_strings& array, const array_of_strings* input) const; + +private: + Allocator allocator_; +}; + +// serializer/deserializer for an array of strings +// Requirements: all strings must be valid UTF-8 and array size must be <= 127. 
+template> +struct default_array_of_strings_serde { + using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + + explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); + + void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; + void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; + size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; /* returns the number of bytes written */ + size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; /* returns the number of bytes read */ + size_t size_of_item(const array_of_strings& item) const; /* serialized size of one item in bytes */ + +private: + Allocator allocator_; + summary_allocator summary_allocator_; + static void check_num_nodes(uint8_t num_nodes); /* throws std::runtime_error if num_nodes > 127 */ + static uint32_t compute_total_bytes(const array_of_strings& item); + static void check_utf8(const std::string& value); /* throws std::runtime_error if value is not valid UTF-8 */ +}; + +/** + * Hashes an array of strings (XXHash64 over the entries, with ',' between consecutive entries) using ArrayOfStrings-compatible hashing. + */ +template> +uint64_t hash_array_of_strings_key(const array& key); + +/** + * Extension of compact_tuple_sketch whose summaries are arrays of strings. + * Requirements: all strings must be valid UTF-8 and array size must be <= 127. + */ +template> +class compact_array_of_strings_tuple_sketch: + public compact_tuple_sketch< + array, + typename std::allocator_traits::template rebind_alloc> + > { +public: + using array_of_strings = array; + using summary_allocator = typename std::allocator_traits::template rebind_alloc; + using Base = compact_tuple_sketch; + using vector_bytes = typename Base::vector_bytes; + using Base::serialize; + + /** + * Converting constructor.
+ * Constructs a compact sketch from another sketch (update or compact) + * @param sketch sketch to be constructed from + * @param ordered if true make the resulting sketch ordered + */ + template + compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); + + /** + * This method deserializes a sketch from a given stream. + * @param is input stream + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> + static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + + /** + * This method deserializes a sketch from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param seed the seed for the hash function that was used to create the sketch + * @param sd instance of a SerDe + * @param allocator instance of an Allocator + * @return an instance of the sketch + */ + template> + static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); + +private: + explicit compact_array_of_strings_tuple_sketch(Base&& base); +}; + +/** + * Convenience alias for update_tuple_sketch for array of strings + */ +template, + typename Policy = default_array_of_strings_update_policy> +using update_array_of_strings_tuple_sketch = update_tuple_sketch< + array, + array, + Policy, + typename std::allocator_traits::template rebind_alloc> +>; + +/** + * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered).
+ * @param sketch input sketch + * @param ordered optional flag to specify if an ordered sketch should be produced + * @return compact array of strings sketch + */ +template, typename Policy = default_array_of_strings_update_policy> +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered = true); + +} /* namespace datasketches */ + +#include "array_of_strings_sketch_impl.hpp" + +#endif diff --git a/tuple/include/array_of_strings_sketch_impl.hpp b/tuple/include/array_of_strings_sketch_impl.hpp new file mode 100644 index 00000000..f5fa0652 --- /dev/null +++ b/tuple/include/array_of_strings_sketch_impl.hpp @@ -0,0 +1,255 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ +#define ARRAY_OF_STRINGS_SKETCH_IMPL_HPP_ + +#include + +#include "common_defs.hpp" +#include "third_party/utf8cpp/utf8.h" + +namespace datasketches { + +template +default_array_of_strings_update_policy::default_array_of_strings_update_policy(const Allocator& allocator): + allocator_(allocator) {} + +template +auto default_array_of_strings_update_policy::create() const -> array_of_strings { + return array_of_strings(0, "", allocator_); +} + +template +void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings& input +) const { + const auto length = static_cast(input.size()); + array = array_of_strings(static_cast(length), "", allocator_); + for (size_t i = 0; i < length; ++i) array[i] = input[i]; +} + +template +void default_array_of_strings_update_policy::update( + array_of_strings& array, const array_of_strings* input +) const { + if (input == nullptr) { + array = array_of_strings(0, "", allocator_); + return; + } + const auto length = static_cast(input->size()); + array = array_of_strings(static_cast(length), "", allocator_); + for (size_t i = 0; i < length; ++i) array[i] = (*input)[i]; +} + +template +uint64_t hash_array_of_strings_key(const array& key) { + // Matches Java Util.PRIME for ArrayOfStrings key hashing. 
+ static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL; + XXHash64 hasher(STRING_ARR_HASH_SEED); + const auto size = static_cast(key.size()); + for (size_t i = 0; i < size; ++i) { + const auto& entry = key[i]; + hasher.add(entry.data(), entry.size()); + if (i + 1 < size) hasher.add(",", 1); + } + return hasher.hash(); +} + +template +compact_array_of_strings_tuple_sketch compact_array_of_strings_sketch( + const update_array_of_strings_tuple_sketch& sketch, bool ordered +) { + return compact_array_of_strings_tuple_sketch(sketch, ordered); +} + +template +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + const Sketch& sketch, bool ordered +): Base(sketch, ordered) {} + +template +compact_array_of_strings_tuple_sketch::compact_array_of_strings_tuple_sketch( + Base&& base +): Base(std::move(base)) {} + +template +template +auto compact_array_of_strings_tuple_sketch::deserialize( + std::istream& is, uint64_t seed, const SerDe& sd, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + summary_allocator alloc(allocator); + auto base = Base::deserialize(is, seed, sd, alloc); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +template +auto compact_array_of_strings_tuple_sketch::deserialize( + const void* bytes, size_t size, uint64_t seed, const SerDe& sd, const Allocator& allocator +) -> compact_array_of_strings_tuple_sketch { + summary_allocator alloc(allocator); + auto base = Base::deserialize(bytes, size, seed, sd, alloc); + return compact_array_of_strings_tuple_sketch(std::move(base)); +} + +template +default_array_of_strings_serde::default_array_of_strings_serde(const Allocator& allocator): + allocator_(allocator), + summary_allocator_(allocator) {} + +template +void default_array_of_strings_serde::serialize( + std::ostream& os, const array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = 
compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + write(os, total_bytes); + write(os, num_nodes); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_utf8(data[j]); + const uint32_t length = static_cast(data[j].size()); + write(os, length); + os.write(data[j].data(), length); + } + } +} + +template +void default_array_of_strings_serde::deserialize( + std::istream& is, array_of_strings* items, unsigned num +) const { + for (unsigned i = 0; i < num; ++i) { + read(is); // total_bytes + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + const uint8_t num_nodes = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, "", allocator_); + for (uint8_t j = 0; j < num_nodes; ++j) { + const uint32_t length = read(is); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + std::string value(length, '\0'); + if (length != 0) { + is.read(&value[0], length); + if (!is) throw std::runtime_error("array_of_strings stream read failed"); + } + check_utf8(value); + array[j] = std::move(value); + } + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); + } +} + +template +size_t default_array_of_strings_serde::serialize( + void* ptr, size_t capacity, const array_of_strings* items, unsigned num +) const { + uint8_t* ptr8 = static_cast(ptr); + size_t bytes_written = 0; + + for (unsigned i = 0; i < num; ++i) { + const uint32_t total_bytes = compute_total_bytes(items[i]); + const uint8_t num_nodes = static_cast(items[i].size()); + check_memory_size(bytes_written + total_bytes, capacity); + bytes_written += copy_to_mem(total_bytes, ptr8 + bytes_written); + bytes_written += copy_to_mem(num_nodes, ptr8 + bytes_written); + const std::string* data = items[i].data(); + for (uint8_t j = 0; j < num_nodes; 
++j) { + check_utf8(data[j]); + const uint32_t length = static_cast(data[j].size()); + + bytes_written += copy_to_mem(length, ptr8 + bytes_written); + bytes_written += copy_to_mem(data[j].data(), ptr8 + bytes_written, length); + } + } + return bytes_written; +} +// Reads num items from the buffer [ptr, ptr + capacity), constructing each one in items[i] via the summary allocator; every read is preceded by a check_memory_size bounds check. Returns the number of bytes consumed. +template +size_t default_array_of_strings_serde::deserialize( + const void* ptr, size_t capacity, array_of_strings* items, unsigned num +) const { + const uint8_t* ptr8 = static_cast(ptr); + size_t bytes_read = 0; + + for (unsigned i = 0; i < num; ++i) { + check_memory_size(bytes_read + sizeof(uint32_t) + sizeof(uint8_t), capacity); // item header (total_bytes + num_nodes) must fit + const size_t item_start = bytes_read; + uint32_t total_bytes; + bytes_read += copy_from_mem(ptr8 + bytes_read, total_bytes); + check_memory_size(item_start + total_bytes, capacity); + uint8_t num_nodes; + bytes_read += copy_from_mem(ptr8 + bytes_read, num_nodes); + check_num_nodes(num_nodes); + array_of_strings array(num_nodes, "", allocator_); + for (uint8_t j = 0; j < num_nodes; ++j) { + check_memory_size(bytes_read + sizeof(uint32_t), capacity); // the 4-byte length prefix must fit + uint32_t length; + bytes_read += copy_from_mem(ptr8 + bytes_read, length); + check_memory_size(bytes_read + length, capacity); // length is untrusted input: validate before allocating or copying + std::string value(length, '\0'); + if (length != 0) bytes_read += copy_from_mem(ptr8 + bytes_read, &value[0], length); + check_utf8(value); + array[j] = std::move(value); + } + summary_allocator alloc(summary_allocator_); + std::allocator_traits::construct(alloc, &items[i], std::move(array)); + } + return bytes_read; +} + +template +size_t default_array_of_strings_serde::size_of_item(const array_of_strings& item) const { + return compute_total_bytes(item); +} + +template +void default_array_of_strings_serde::check_num_nodes(uint8_t num_nodes) { + if (num_nodes > 127) { + throw std::runtime_error("array_of_strings size exceeds 127"); + } +} + +template +uint32_t default_array_of_strings_serde::compute_total_bytes(const array_of_strings& item) { + const auto count = item.size(); + check_num_nodes(static_cast(count)); + size_t total = sizeof(uint32_t) + sizeof(uint8_t) + count * sizeof(uint32_t); + const std::string* data = 
item.data(); + for (uint32_t j = 0; j < count; ++j) { + total += data[j].size(); + } + return static_cast(total); +} + +template +void default_array_of_strings_serde::check_utf8(const std::string& value) { + if (!utf8::is_valid(value.begin(), value.end())) { + throw std::runtime_error("array_of_strings contains invalid UTF-8"); + } +} + +} /* namespace datasketches */ + +#endif diff --git a/tuple/include/array_tuple_sketch.hpp b/tuple/include/array_tuple_sketch.hpp index 547b240c..03761ff4 100644 --- a/tuple/include/array_tuple_sketch.hpp +++ b/tuple/include/array_tuple_sketch.hpp @@ -22,6 +22,8 @@ #include #include +#include +#include #include "serde.hpp" #include "tuple_sketch.hpp" @@ -34,17 +36,18 @@ class array { public: using value_type = T; using allocator_type = Allocator; + using alloc_traits = std::allocator_traits; - explicit array(uint8_t size, T value, const Allocator& allocator = Allocator()): + explicit array(uint8_t size, const T& value, const Allocator& allocator = Allocator()): allocator_(allocator), size_(size), array_(allocator_.allocate(size_)) { - std::fill(array_, array_ + size_, value); + init_values(value, std::is_trivially_copyable()); } array(const array& other): allocator_(other.allocator_), size_(other.size_), array_(allocator_.allocate(size_)) { - std::copy(other.array_, other.array_ + size_, array_); + copy_from(other, std::is_trivially_copyable()); } array(array&& other) noexcept: allocator_(std::move(other.allocator_)), @@ -52,9 +55,13 @@ class array { array_(other.array_) { other.array_ = nullptr; + other.size_ = 0; } ~array() { - if (array_ != nullptr) allocator_.deallocate(array_, size_); + if (array_ != nullptr) { + destroy_values(std::is_trivially_destructible()); + allocator_.deallocate(array_, size_); + } } array& operator=(const array& other) { array copy(other); @@ -79,6 +86,29 @@ class array { return true; } private: + void init_values(const T& value, std::true_type) { + std::fill(array_, array_ + size_, value); + } + void 
init_values(const T& value, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, value); + } + } + void copy_from(const array& other, std::true_type) { + std::copy(other.array_, other.array_ + size_, array_); + } + void copy_from(const array& other, std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::construct(allocator_, array_ + i, other.array_[i]); + } + } + void destroy_values(std::true_type) {} + void destroy_values(std::false_type) { + for (uint8_t i = 0; i < size_; ++i) { + alloc_traits::destroy(allocator_, array_ + i); + } + } + Allocator allocator_; uint8_t size_; T* array_; diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt index 4ca6a503..3d7ccca3 100644 --- a/tuple/test/CMakeLists.txt +++ b/tuple/test/CMakeLists.txt @@ -44,6 +44,7 @@ target_sources(tuple_test tuple_a_not_b_test.cpp tuple_jaccard_similarity_test.cpp array_of_doubles_sketch_test.cpp + array_of_strings_sketch_test.cpp engagement_test.cpp ) @@ -52,6 +53,7 @@ target_sources(tuple_test PRIVATE aod_sketch_deserialize_from_java_test.cpp tuple_sketch_deserialize_from_java_test.cpp + aos_sketch_deserialize_from_java_test.cpp ) endif() @@ -60,5 +62,6 @@ target_sources(tuple_test PRIVATE aod_sketch_serialize_for_java.cpp tuple_sketch_serialize_for_java.cpp + aos_sketch_serialize_for_java.cpp ) endif() diff --git a/tuple/test/aos_sketch_deserialize_from_java_test.cpp b/tuple/test/aos_sketch_deserialize_from_java_test.cpp new file mode 100644 index 00000000..af37d6c2 --- /dev/null +++ b/tuple/test/aos_sketch_deserialize_from_java_test.cpp @@ -0,0 +1,283 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + // assume the binary sketches for this test have been generated by datasketches-java code + // in the subdirectory called "java" in the root directory of this project + static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/"; + + static std::vector read_binary_file(const std::string& path) { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + is.seekg(0, std::ios::end); + const auto size = static_cast(is.tellg()); + is.seekg(0, std::ios::beg); + std::vector bytes(size); + if (size != 0) { + is.read(reinterpret_cast(bytes.data()), size); + } + return bytes; + } + + TEST_CASE("aos sketch one value", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_1_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + 
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + } + + TEST_CASE("aos sketch three values", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_3_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 3); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + 
REQUIRE(entry.second.size() == 3); + } + } + } + } + + TEST_CASE("aos sketch non-empty no entries", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_1_non_empty_no_entries_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + } + } + + TEST_CASE("aos sketch multi keys strings", "[serde_compat]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + const auto path = testBinaryInputPath + "aos_multikey_n" + std::to_string(n) + "_java.sk"; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n > 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.is_estimation_mode() == (n 
> 1000)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 1); + } + } + } + } + + TEST_CASE("aos sketch unicode strings", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_unicode_java.sk"; + auto check = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + + const std::vector> expected_values = { + {"밸류", "값"}, + {"📦", "🎁"}, + {"ценить1", "ценить2"} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + REQUIRE(entry.second.size() == 2); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + } + + TEST_CASE("aos sketch empty strings", "[serde_compat]") { + const auto path = testBinaryInputPath + "aos_empty_strings_java.sk"; + auto check = 
[](const compact_array_of_strings_tuple_sketch<>& sketch) { + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE_FALSE(sketch.is_estimation_mode()); + REQUIRE(sketch.get_num_retained() == 3); + const std::vector> expected_values = { + {"empty_key_value"}, + {""}, + {"", ""} + }; + std::vector matched(expected_values.size(), false); + for (const auto& entry: sketch) { + REQUIRE(entry.first < sketch.get_theta64()); + + bool found = false; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (matched[i]) continue; + const auto& expected = expected_values[i]; + if (entry.second.size() != expected.size()) continue; + bool equal = true; + for (size_t j = 0; j < expected.size(); ++j) { + if (entry.second[j] != expected[j]) { + equal = false; + break; + } + } + if (equal) { + matched[i] = true; + found = true; + break; + } + } + REQUIRE(found); + } + for (bool found: matched) REQUIRE(found); + }; + SECTION("stream") { + std::ifstream is; + is.exceptions(std::ios::failbit | std::ios::badbit); + is.open(path, std::ios::binary); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + is, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + SECTION("bytes") { + const auto bytes = read_binary_file(path); + const auto sketch = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + check(sketch); + } + } +} diff --git a/tuple/test/aos_sketch_serialize_for_java.cpp b/tuple/test/aos_sketch_serialize_for_java.cpp new file mode 100644 index 00000000..c6eb0dfc --- /dev/null +++ b/tuple/test/aos_sketch_serialize_for_java.cpp @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using aos_sketch = update_array_of_strings_tuple_sketch<>; +using array_of_strings = array; + +static array_of_strings make_array(std::initializer_list items) { + array_of_strings array(static_cast(items.size()), ""); + size_t i = 0; + for (const auto& item: items) { + array[static_cast(i)] = item; + ++i; + } + return array; +} + +TEST_CASE("aos sketch generate one value", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_1_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate three values", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = 
aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(1, ""); + key[0] = std::to_string(i); + array_of_strings value(3, ""); + value[0] = "a" + std::to_string(i); + value[1] = "b" + std::to_string(i); + value[2] = "c" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_3_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate non-empty no entries", "[serialize_for_java]") { + auto sketch = aos_sketch::builder() + .set_lg_k(12) + .set_resize_factor(resize_factor::X8) + .set_p(0.01f) + .build(); + array_of_strings key(1, ""); + key[0] = "key1"; + array_of_strings value(1, ""); + value[0] = "value1"; + sketch.update(hash_array_of_strings_key(key), value); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 0); + std::ofstream os("aos_1_non_empty_no_entries_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +TEST_CASE("aos sketch generate multi key strings", "[serialize_for_java]") { + const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (const unsigned n: n_arr) { + auto sketch = aos_sketch::builder().build(); + for (unsigned i = 0; i < n; ++i) { + array_of_strings key(2, ""); + key[0] = "key" + std::to_string(i); + key[1] = "subkey" + std::to_string(i % 10); + array_of_strings value(1, ""); + value[0] = "value" + std::to_string(i); + sketch.update(hash_array_of_strings_key(key), value); + } + REQUIRE(sketch.is_empty() == (n == 0)); + REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03)); + std::ofstream os("aos_multikey_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); + 
compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); + } +} + +TEST_CASE("aos sketch generate unicode strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update( + hash_array_of_strings_key(make_array({u8"키", u8"열쇠"})), + make_array({u8"밸류", u8"값"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({u8"🔑", u8"🗝️"})), + make_array({u8"📦", u8"🎁"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({u8"ключ1", u8"ключ2"})), + make_array({u8"ценить1", u8"ценить2"}) + ); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_unicode_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +TEST_CASE("aos sketch generate empty strings", "[serialize_for_java]") { + auto sketch = aos_sketch::builder().build(); + sketch.update(hash_array_of_strings_key(make_array({""})), make_array({"empty_key_value"})); + sketch.update(hash_array_of_strings_key(make_array({"empty_value_key"})), make_array({""})); + sketch.update(hash_array_of_strings_key(make_array({"", ""})), make_array({"", ""})); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_num_retained() == 3); + std::ofstream os("aos_empty_strings_cpp.sk", std::ios::binary); + compact_array_of_strings_sketch(sketch).serialize(os, default_array_of_strings_serde<>()); +} + +} /* namespace datasketches */ diff --git a/tuple/test/array_of_strings_sketch_test.cpp b/tuple/test/array_of_strings_sketch_test.cpp new file mode 100644 index 00000000..74b225b0 --- /dev/null +++ b/tuple/test/array_of_strings_sketch_test.cpp @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include "array_of_strings_sketch.hpp" + +namespace datasketches { + +using array_of_strings = array; + +TEST_CASE("aos update policy", "[tuple_sketch]") { + default_array_of_strings_update_policy<> policy; + + SECTION("create empty") { + auto values = policy.create(); + REQUIRE(values.size() == 0); + } + + SECTION("replace array") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "alpha"; + input[1] = "beta"; + policy.update(values, input); + REQUIRE(values.size() == 2); + REQUIRE(values[0] == "alpha"); + REQUIRE(values[1] == "beta"); + input[0] = "changed"; + REQUIRE(values[0] == "alpha"); + + array_of_strings input2(1, "", std::allocator()); + input2[0] = "gamma"; + policy.update(values, input2); + REQUIRE(values.size() == 1); + REQUIRE(values[0] == "gamma"); + } + + SECTION("nullptr clears") { + array_of_strings values(2, "", std::allocator()); + values[0] = "one"; + values[1] = "two"; + + policy.update(values, nullptr); + REQUIRE(values.size() == 0); + } + + SECTION("pointer input copies") { + auto values = policy.create(); + + array_of_strings input(2, "", std::allocator()); + input[0] = "first"; + input[1] = "second"; + policy.update(values, &input); + REQUIRE(values.size() == 2); + REQUIRE(values[1] == "second"); + input[1] = "changed"; + REQUIRE(values[1] 
== "second"); + } +} + +TEST_CASE("aos sketch update", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { + array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto* entry: entries) array[i++] = entry; + return array; + }; + + SECTION("same key replaces summary") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"first"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"alpha", "beta"})), + make_array({"second", "third"}) + ); + + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 2); + REQUIRE(it->second[0] == "second"); + REQUIRE(it->second[1] == "third"); + } + + SECTION("distinct keys retain multiple entries") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update( + hash_array_of_strings_key(make_array({"a", "bc"})), + make_array({"one"}) + ); + sketch.update( + hash_array_of_strings_key(make_array({"ab", "c"})), + make_array({"two"}) + ); + + REQUIRE(sketch.get_num_retained() == 2); + + bool saw_one = false; + bool saw_two = false; + for (const auto& entry: sketch) { + REQUIRE(entry.second.size() == 1); + if (entry.second[0] == "one") saw_one = true; + if (entry.second[0] == "two") saw_two = true; + } + REQUIRE(saw_one); + REQUIRE(saw_two); + } + + SECTION("empty key") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + + sketch.update(hash_array_of_strings_key(make_array({})), make_array({"value"})); + REQUIRE(sketch.get_num_retained() == 1); + + auto it = sketch.begin(); + REQUIRE(it != sketch.end()); + REQUIRE(it->second.size() == 1); + REQUIRE(it->second[0] == "value"); + } +} + +TEST_CASE("aos sketch: serialize deserialize", "[tuple_sketch]") { + auto make_array = [](std::initializer_list entries) { 
+ array_of_strings array(static_cast(entries.size()), "", std::allocator()); + uint8_t i = 0; + for (const auto& entry: entries) array[i++] = entry; + return array; + }; + + auto collect_entries = [](const compact_array_of_strings_tuple_sketch<>& sketch) { + typedef std::pair entry_type; + std::vector entries; + for (const auto& entry: sketch) entries.push_back(entry); + struct entry_less { + bool operator()(const entry_type& lhs, const entry_type& rhs) const { + return lhs.first < rhs.first; + } + }; + std::sort(entries.begin(), entries.end(), entry_less()); + return entries; + }; + + auto check_round_trip = [&](const compact_array_of_strings_tuple_sketch<>& compact_sketch) { + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + compact_sketch.serialize(ss, default_array_of_strings_serde<>()); + auto deserialized_stream = compact_array_of_strings_tuple_sketch<>::deserialize( + ss, DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + auto bytes = compact_sketch.serialize(0, default_array_of_strings_serde<>()); + auto deserialized_bytes = compact_array_of_strings_tuple_sketch<>::deserialize( + bytes.data(), bytes.size(), DEFAULT_SEED, default_array_of_strings_serde<>() + ); + + const compact_array_of_strings_tuple_sketch<>* deserialized_list[2] = { + &deserialized_stream, + &deserialized_bytes + }; + for (int list_index = 0; list_index < 2; ++list_index) { + const compact_array_of_strings_tuple_sketch<>* deserialized = deserialized_list[list_index]; + REQUIRE(compact_sketch.is_empty() == deserialized->is_empty()); + REQUIRE(compact_sketch.is_estimation_mode() == deserialized->is_estimation_mode()); + REQUIRE(compact_sketch.is_ordered() == deserialized->is_ordered()); + REQUIRE(compact_sketch.get_num_retained() == deserialized->get_num_retained()); + REQUIRE(compact_sketch.get_theta() == Approx(deserialized->get_theta()).margin(1e-10)); + REQUIRE(compact_sketch.get_estimate() == Approx(deserialized->get_estimate()).margin(1e-10)); + 
REQUIRE(compact_sketch.get_lower_bound(1) == Approx(deserialized->get_lower_bound(1)).margin(1e-10)); + REQUIRE(compact_sketch.get_upper_bound(1) == Approx(deserialized->get_upper_bound(1)).margin(1e-10)); + + auto original_entries = collect_entries(compact_sketch); + auto round_trip_entries = collect_entries(*deserialized); + REQUIRE(original_entries.size() == round_trip_entries.size()); + for (size_t i = 0; i < original_entries.size(); ++i) { + REQUIRE(original_entries[i].first == round_trip_entries[i].first); + REQUIRE(original_entries[i].second.size() == round_trip_entries[i].second.size()); + for (size_t j = 0; j < original_entries[i].second.size(); ++j) { + REQUIRE(original_entries[i].second[static_cast(j)] == + round_trip_entries[i].second[static_cast(j)]); + } + } + } + }; + + auto run_tests = [&](const update_array_of_strings_tuple_sketch<>& sketch) { + auto ordered = compact_array_of_strings_sketch(sketch, true); + auto unordered = compact_array_of_strings_sketch(sketch, false); + check_round_trip(ordered); + check_round_trip(unordered); + }; + + SECTION("empty sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + run_tests(sketch); + } + + SECTION("single entry sketch") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + sketch.update(hash_array_of_strings_key(make_array({"key"})), make_array({"value"})); + run_tests(sketch); + } + + SECTION("multiple entries exact mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().set_lg_k(8).build(); + for (int i = 0; i < 50; ++i) { + sketch.update( + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), + make_array({std::string("value-") + std::to_string(i), "extra"}) + ); + } + REQUIRE_FALSE(sketch.is_estimation_mode()); + run_tests(sketch); + } + + SECTION("multiple entries estimation mode") { + auto sketch = update_array_of_strings_tuple_sketch<>::builder().build(); + for (int i = 0; i < 10000; ++i) { + 
sketch.update( + hash_array_of_strings_key(make_array({std::string("key-") + std::to_string(i)})), + make_array({std::string("value-") + std::to_string(i)}) + ); + } + REQUIRE(sketch.is_estimation_mode()); + run_tests(sketch); + } +} + +TEST_CASE("aos serde validation", "[tuple_sketch]") { + default_array_of_strings_serde<> serde; + + SECTION("invalid utf8 rejected") { + array_of_strings array(1, "", std::allocator()); + const std::string invalid_utf8("\xC3\x28", 2); + array[0] = invalid_utf8; + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("invalid UTF-8") + ); + } + + SECTION("too many nodes rejected") { + array_of_strings array(128, "", std::allocator()); + std::stringstream ss; + ss.exceptions(std::ios::failbit | std::ios::badbit); + REQUIRE_THROWS_WITH( + serde.serialize(ss, &array, 1), + Catch::Matchers::Contains("size exceeds 127") + ); + } +} + +} /* namespace datasketches */