diff --git a/builtin-functions/kphp-light/stdlib/regex-functions.txt b/builtin-functions/kphp-light/stdlib/regex-functions.txt index dda204a2d3..54c35bb468 100644 --- a/builtin-functions/kphp-light/stdlib/regex-functions.txt +++ b/builtin-functions/kphp-light/stdlib/regex-functions.txt @@ -39,11 +39,10 @@ function preg_replace_callback( &$replace_count ::: int = TODO, $flags ::: int = 0): string | ^3 | null; +function preg_split ($pattern ::: string, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false; + // ===== UNSUPPORTED ===== /** @kphp-extern-func-info stub generation-required */ function preg_last_error() ::: int; -/** @kphp-extern-func-info stub */ -function preg_split ($pattern ::: regexp, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false; - diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 4f4e0948cb..0b4420d496 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -7,9 +7,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -29,9 +29,38 @@ #include "runtime-light/stdlib/string/regex-include.h" #include "runtime-light/stdlib/string/regex-state.h" -namespace { +namespace std { + +template<> +struct formatter { + static constexpr size_t ERROR_BUFFER_LENGTH{256}; -constexpr size_t ERROR_BUFFER_LENGTH{256}; + template + constexpr auto parse(ParseContext& ctx) const noexcept { + return ctx.begin(); + } + + template + auto format(kphp::regex::details::pcre2_error error, FmtContext& ctx) const noexcept { + std::array buffer{}; + auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; + if (ret_code < 0) [[unlikely]] { + switch (ret_code) { + case PCRE2_ERROR_BADDATA: + return format_to(ctx.out(), "unknown error ({})", error.code); + case PCRE2_ERROR_NOMEMORY: + return format_to(ctx.out(), "[truncated] {}", buffer.data()); + default: + kphp::log::error("unsupported regex error code: {}", ret_code); + } + } + return format_to(ctx.out(), "{}", buffer.data()); + } +}; + +} // namespace std + +namespace { enum class trailing_unmatch : uint8_t { skip, include }; @@ -39,12 +68,7 @@ using backref = std::string_view; using regex_pcre2_group_names_t = kphp::stl::vector; struct RegexInfo final { - std::string_view regex; - // non-null-terminated regex without delimiters and PCRE modifiers - // - // regex -> ~pattern~im\0 - // regex_body -> pattern - std::string_view regex_body; + const string& regex; std::string_view subject; std::string_view replacement; @@ -53,7 +77,7 @@ struct RegexInfo final { // number of groups including entire match uint32_t capture_count{}; // compiled regex - regex_pcre2_code_t regex_code{nullptr}; + pcre2_code_8* regex_code{nullptr}; // vector of group names regex_pcre2_group_names_t group_names; @@ -68,7 +92,7 @@ struct RegexInfo final { RegexInfo() = delete; - RegexInfo(std::string_view regex_, std::string_view subject_, std::string_view replacement_) noexcept + RegexInfo(const string& regex_, std::string_view subject_, std::string_view replacement_) noexcept : regex(regex_), subject(subject_), replacement(replacement_) {} @@ -76,15 +100,10 @@ struct RegexInfo final { class pcre2_match_view { public: - pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t ret_code) noexcept + pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, size_t num_groups) noexcept : m_subject_data{subject}, m_ovector_ptr{ovector}, - m_num_groups{ret_code} {} - - pcre2_match_view(const pcre2_match_view&) noexcept = default; - pcre2_match_view(pcre2_match_view&&) noexcept = default; - pcre2_match_view& operator=(const pcre2_match_view&) noexcept = default; - pcre2_match_view& operator=(pcre2_match_view&&) noexcept = default; + m_num_groups{num_groups} {} int32_t size() const noexcept { return m_num_groups; @@ -98,7 +117,7 @@ class pcre2_match_view { kphp::log::assertion(m_ovector_ptr); // ovector is an array of offset pairs PCRE2_SIZE start{m_ovector_ptr[2 * i]}; - PCRE2_SIZE end{m_ovector_ptr[2 * i + 1]}; + PCRE2_SIZE end{m_ovector_ptr[(2 * i) + 1]}; if (start == PCRE2_UNSET) { return std::nullopt; @@ -110,7 +129,7 @@ class pcre2_match_view { private: std::string_view m_subject_data; const PCRE2_SIZE* m_ovector_ptr; - int32_t m_num_groups; + size_t m_num_groups; }; template @@ -134,11 +153,11 @@ bool correct_offset(int64_t& offset, std::string_view subject) noexcept { return offset <= subject.size(); } -int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subject) noexcept { +int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept { // all multibyte utf8 runes consist of subsequent bytes, // these subsequent bytes start with 10 bit pattern // 0xc0 selects the two most significant bits, then we compare it to 0x80 (0b10000000) - while (offset < static_cast(subject.size()) && ((static_cast(subject[offset])) & 0xc0) == 0x80) { + while (offset < subject.size() && ((static_cast(subject[offset])) & 0xc0) == 0x80) { offset++; } return offset; @@ -173,11 +192,11 @@ class preg_replacement_parser { // $1, ${1} if (preg_replacement.front() == '{') { return try_get_backref(preg_replacement.substr(1)) - .and_then([this](auto value) noexcept -> std::optional { - auto digits_end_pos{1 + value.size()}; + .and_then([this](auto br) noexcept -> std::optional { + auto digits_end_pos{1 + br.size()}; if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { - preg_replacement = preg_replacement.substr(1 + value.size() + 1); - return value; + preg_replacement = preg_replacement.substr(1 + br.size() + 1); + return br; } return std::nullopt; @@ -186,27 +205,27 @@ class preg_replacement_parser { } return try_get_backref(preg_replacement) - .transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos{value.size()}; + .transform([this](auto br) noexcept -> replacement_term { + auto digits_end_pos{br.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); - return value; + return br; }) .value_or('$'); case '\\': { // \1 - auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos{value.size()}; + auto opt_back_reference{try_get_backref(preg_replacement).transform([this](auto br) noexcept -> replacement_term { + auto digits_end_pos{br.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); - return value; + return br; })}; - if (back_reference_opt.has_value()) { - return *back_reference_opt; + if (opt_back_reference.has_value()) { + return *opt_back_reference; } else { - auto res{preg_replacement.front()}; - if (res == '$' || res == '\\') { + auto c{preg_replacement.front()}; + if (c == '$' || c == '\\') { preg_replacement = preg_replacement.substr(1); - return res; + return c; } return '\\'; } @@ -255,7 +274,7 @@ class preg_replacement_parser { } return *this; } - iterator operator++(int) noexcept { + iterator operator++(int) noexcept { // NOLINT iterator temp{*this}; ++(*this); return temp; @@ -277,7 +296,29 @@ class preg_replacement_parser { } }; -bool parse_regex(RegexInfo& regex_info) noexcept { +bool compile_regex(RegexInfo& regex_info) noexcept { + const vk::final_action finalizer{[®ex_info]() noexcept { + if (regex_info.regex_code != nullptr) [[likely]] { + pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); + ++regex_info.capture_count; // to also count entire match + } else { + regex_info.capture_count = 0; + } + }}; + + auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.compile_context) [[unlikely]] { + return false; + } + + // check runtime cache + if (auto opt_ref{regex_state.get_compiled_regex(regex_info.regex)}; opt_ref.has_value()) { + const auto& [compile_options, regex_code]{opt_ref->get()}; + regex_info.compile_options = compile_options; + regex_info.regex_code = regex_code.get(); + return true; + } + if (regex_info.regex.empty()) { kphp::log::warning("empty regex"); return false; @@ -324,18 +365,18 @@ bool parse_regex(RegexInfo& regex_info) noexcept { } uint32_t compile_options{}; - regex_info.regex_body = regex_info.regex; + std::string_view regex_body{regex_info.regex.c_str(), regex_info.regex.size()}; // remove start delimiter - regex_info.regex_body.remove_prefix(1); + regex_body.remove_prefix(1); // parse compile options and skip all symbols until the end delimiter - for (; !regex_info.regex_body.empty() && regex_info.regex_body.back() != end_delim; regex_info.regex_body.remove_suffix(1)) { + for (; !regex_body.empty() && regex_body.back() != end_delim; regex_body.remove_suffix(1)) { // spaces and newlines are ignored - if (regex_info.regex_body.back() == ' ' || regex_info.regex_body.back() == '\n') { + if (regex_body.back() == ' ' || regex_body.back() == '\n') { continue; } - switch (regex_info.regex_body.back()) { + switch (regex_body.back()) { case 'i': { compile_options |= PCRE2_CASELESS; break; @@ -377,20 +418,20 @@ bool parse_regex(RegexInfo& regex_info) noexcept { break; } default: { - kphp::log::warning("unsupported regex modifier {}", regex_info.regex_body.back()); + kphp::log::warning("unsupported regex modifier {}", regex_body.back()); break; } } } - if (regex_info.regex_body.empty()) { - kphp::log::warning("no ending regex delimiter: {}", regex_info.regex); + if (regex_body.empty()) { + kphp::log::warning("no ending regex delimiter: {}", regex_info.regex.c_str()); return false; } // UTF-8 validation if (static_cast(compile_options & PCRE2_UTF)) { - if (!mb_UTF8_check(regex_info.regex.data())) [[unlikely]] { - kphp::log::warning("invalid UTF-8 pattern: {}", regex_info.regex); + if (!mb_UTF8_check(regex_info.regex.c_str())) [[unlikely]] { + kphp::log::warning("invalid UTF-8 pattern: {}", regex_info.regex.c_str()); return false; } if (!mb_UTF8_check(regex_info.subject.data())) [[unlikely]] { @@ -400,48 +441,24 @@ bool parse_regex(RegexInfo& regex_info) noexcept { } // remove the end delimiter - regex_info.regex_body.remove_suffix(1); + regex_body.remove_suffix(1); regex_info.compile_options = compile_options; - return true; -} -bool compile_regex(RegexInfo& regex_info) noexcept { - const vk::final_action finalizer{[®ex_info]() noexcept { - if (regex_info.regex_code != nullptr) [[likely]] { - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); - ++regex_info.capture_count; // to also count entire match - } else { - regex_info.capture_count = 0; - } - }}; - - auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.compile_context) [[unlikely]] { - return false; - } - - // check runtime cache - if (const auto it{regex_state.regex_pcre2_code_cache.find(regex_info.regex)}; it != regex_state.regex_pcre2_code_cache.end()) { - regex_info.regex_code = it->second; - return true; - } // compile pcre2_code int32_t error_number{}; PCRE2_SIZE error_offset{}; - regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_info.regex_body.data()), regex_info.regex_body.size(), - regex_info.compile_options, std::addressof(error_number), std::addressof(error_offset), - regex_state.compile_context.get())}; + regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_body.data()), regex_body.size(), regex_info.compile_options, + std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get()), + pcre2_code_free_8}; if (!regex_code) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(error_number, reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); + kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, kphp::regex::details::pcre2_error{.code = error_number}); return false; } + regex_info.regex_code = regex_code.get(); // add compiled code to runtime cache - regex_state.regex_pcre2_code_cache.emplace(regex_info.regex, regex_code); + regex_state.add_compiled_regex(regex_info.regex, compile_options, std::move(regex_code)); - regex_info.regex_code = regex_code; return true; } @@ -475,76 +492,49 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { return true; } -std::optional match_regex(const RegexInfo& regex_info, size_t offset, uint32_t match_options) noexcept { - const auto& regex_state{RegexInstanceState::get()}; - if (regex_info.regex_code == nullptr || !regex_state.match_context) [[unlikely]] { - return std::nullopt; - } - - int32_t match_count{pcre2_match_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), offset, - match_options, regex_state.regex_pcre2_match_data.get(), regex_state.match_context.get())}; - // From https://www.pcre.org/current/doc/html/pcre2_match.html - // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set - // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. - if (match_count < 0 && match_count != PCRE2_ERROR_NOMATCH) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(match_count, reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data()); - return std::nullopt; - } - // zero if the vector of offsets is too small - return match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; -} - class matcher { public: matcher(const RegexInfo& info, size_t match_from) noexcept - : m_regex_info{std::addressof(info)}, + : m_regex_info{info}, m_match_options{info.match_options}, m_current_offset{match_from} { - if (info.regex_code == nullptr) { - return; - } + kphp::log::assertion(info.regex_code != nullptr); const auto& regex_state{RegexInstanceState::get()}; m_match_data = regex_state.regex_pcre2_match_data.get(); - if (!m_match_data) { - return; - } - - m_has_error = false; + kphp::log::assertion(m_match_data); } - bool has_error() const noexcept { - return m_has_error; - } + std::expected, kphp::regex::details::pcre2_error> next() noexcept { + const auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); - std::optional next() noexcept { - const auto& ri{*m_regex_info}; - auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; + const auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { // Try to find match - auto match_count_opt{match_regex(ri, m_current_offset, m_match_options)}; - if (!match_count_opt.has_value()) { - // std::nullopt means error - m_has_error = true; - return std::nullopt; + int32_t ret_code{pcre2_match_8(m_regex_info.regex_code, reinterpret_cast(m_regex_info.subject.data()), m_regex_info.subject.size(), + m_current_offset, m_match_options, m_match_data, regex_state.match_context.get())}; + // From https://www.pcre.org/current/doc/html/pcre2_match.html + // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set + // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. + if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { + return std::unexpected{kphp::regex::details::pcre2_error{.code = ret_code}}; } + size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; - auto ret_code{*match_count_opt}; - - if (ret_code == 0) { + if (match_count == 0) { // If match is not found - if (m_match_options == ri.match_options || m_current_offset == ri.subject.size()) { + if (m_match_options == m_regex_info.match_options || m_current_offset == m_regex_info.subject.size()) { // Here we are sure that there are no more matches here return std::nullopt; } // Here we know that we were looking for a non-empty and anchored match, // and we're going to try searching from the next character with the default options. ++m_current_offset; - m_current_offset = static_cast(ri.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, ri.subject) : m_current_offset; - m_match_options = ri.match_options; + m_current_offset = + static_cast(m_regex_info.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, m_regex_info.subject) : m_current_offset; + m_match_options = m_regex_info.match_options; continue; } @@ -558,18 +548,17 @@ class matcher { m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } else { // Else use default options - m_match_options = ri.match_options; + m_match_options = m_regex_info.match_options; } - return pcre2_match_view{ri.subject, ovector, ret_code}; + return pcre2_match_view{m_regex_info.subject, ovector, match_count}; } } private: - const RegexInfo* const m_regex_info{nullptr}; + const RegexInfo& m_regex_info; uint64_t m_match_options{}; PCRE2_SIZE m_current_offset{}; pcre2_match_data_8* m_match_data{nullptr}; - bool m_has_error{true}; }; // returns the ending offset of the entire match @@ -589,8 +578,8 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional return end_offset; } - const auto offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; - const auto unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; // calculate last matched group int64_t last_matched_group{-1}; for (auto i{0}; i < regex_info.match_count; ++i) { @@ -606,24 +595,24 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional array output{array_size{static_cast(regex_info.group_names.size() + named_groups_count), named_groups_count == 0}}; for (auto i{0}; i < regex_info.group_names.size(); ++i) { // skip unmatched groups at the end unless unmatched_as_null is set - if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !unmatched_as_null) [[unlikely]] { + if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !is_unmatched_as_null) [[unlikely]] { break; } - const auto match_start{ovector[static_cast(2 * i)]}; - const auto match_end{ovector[static_cast(2 * i + 1)]}; + const auto match_start_offset{ovector[static_cast(2 * i)]}; + const auto match_end_offset{ovector[static_cast(2 * i + 1)]}; - mixed match_val; // NULL value - if (match_start != PCRE2_UNSET) { // handle matched group - const auto match_size{match_end - match_start}; - match_val = string{std::next(regex_info.subject.data(), match_start), static_cast(match_size)}; - } else if (!unmatched_as_null) { // handle unmatched group + mixed match_val; // NULL value + if (match_start_offset != PCRE2_UNSET) { // handle matched group + const auto match_size{match_end_offset - match_start_offset}; + match_val = string{std::next(regex_info.subject.data(), match_start_offset), static_cast(match_size)}; + } else if (!is_unmatched_as_null) { // handle unmatched group match_val = string{}; } mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(match_val), static_cast(match_start)); + if (is_offset_capture) { + output_val = array::create(std::move(match_val), static_cast(match_start_offset)); } else { output_val = std::move(match_val); } @@ -631,7 +620,7 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional if (regex_info.group_names[i] != nullptr) { output.set_value(string{regex_info.group_names[i]}, output_val); } - output.push_back(output_val); + output.emplace_back(output_val); } (*opt_matches).get() = std::move(output); @@ -692,31 +681,35 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(regex_info.replace_count, reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("pcre2_substitute error {}", buffer.data()); + kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = static_cast(regex_info.replace_count)}); return false; } } else { // replace only 'limit' times - size_t match_offset{}; size_t substitute_offset{}; int64_t replacement_diff_acc{}; PCRE2_SIZE length_after_replace{buffer_length}; string str_after_replace{regex_info.subject.data(), static_cast(regex_info.subject.size())}; + matcher pcre2_matcher{regex_info, {}}; for (; regex_info.replace_count < limit; ++regex_info.replace_count) { - auto match_count_opt{match_regex(regex_info, match_offset, regex_info.match_options)}; - if (!match_count_opt.has_value()) [[unlikely]] { + auto expected_opt_match_view{pcre2_matcher.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); return false; } - regex_info.match_count = *match_count_opt; - if (regex_info.match_count == 0) { + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { break; } - const auto* ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; - const auto match_start_offset{ovector[0]}; - const auto match_end_offset{ovector[1]}; + auto match_view{*opt_match_view}; + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { + return false; + } + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; + const auto match_start_offset{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data())}; + const auto match_end_offset{match_start_offset + entire_pattern_match_string_view.size()}; length_after_replace = buffer_length; if (auto replace_one_ret_code{pcre2_substitute_8( @@ -724,11 +717,10 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); + kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = replace_one_ret_code}); return false; } - match_offset = match_end_offset; replacement_diff_acc += regex_info.replacement.size() - (match_end_offset - match_start_offset); substitute_offset = match_end_offset + replacement_diff_acc; str_after_replace = {runtime_ctx.static_SB.buffer(), static_cast(length_after_replace)}; @@ -745,11 +737,106 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return true; } +std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bool no_empty, bool delim_capture, bool offset_capture) noexcept { + if (limit == 0) { + limit = kphp::regex::PREG_NOLIMIT; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + + array output{}; + + matcher pcre2_matcher{regex_info, {}}; + size_t offset{}; + for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { + auto expected_opt_match_view{pcre2_matcher.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + pcre2_match_view match_view{*opt_match_view}; + + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { + return std::nullopt; + } + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; + + if (const auto size{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) - offset}; !no_empty || size != 0) { + string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + ++out_parts_count; + } + + if (delim_capture) { + for (size_t i{1}; i < match_view.size(); i++) { + auto opt_submatch{match_view.get_group(i)}; + auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; + const auto size{submatch_string_view.size()}; + if (!no_empty || size != 0) { + string val; + if (opt_submatch.has_value()) [[likely]] { + val = string{submatch_string_view.data(), static_cast(size)}; + } + + mixed output_val; + if (offset_capture) { + output_val = + array::create(std::move(val), opt_submatch + .transform([®ex_info](auto submatch_string_view) noexcept { + return static_cast(std::distance(regex_info.subject.data(), submatch_string_view.data())); + }) + .value_or(-1)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + } + } + + offset = std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) + entire_pattern_match_string_view.size(); + } + + const auto size{regex_info.subject.size() - offset}; + if (!no_empty || size != 0) { + string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + + return output; +} + } // namespace Optional f$preg_match(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { - RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { return false; @@ -757,20 +844,24 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } - if (!parse_regex(regex_info)) [[unlikely]] { - return false; - } if (!compile_regex(regex_info)) [[unlikely]] { return false; } if (!collect_group_names(regex_info)) [[unlikely]] { return false; } - auto match_count_opt{match_regex(regex_info, offset, regex_info.match_options)}; - if (!match_count_opt.has_value()) [[unlikely]] { + + const auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(regex_info.regex_code != nullptr && regex_state.match_context); + + auto expected_opt_match_view{matcher{regex_info, static_cast(offset)}.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't match by pcre2 regex due to error: {}", expected_opt_match_view.error()); return false; } - regex_info.match_count = *match_count_opt; + auto opt_match_view{*expected_opt_match_view}; + + regex_info.match_count = opt_match_view.transform(&pcre2_match_view::size).value_or(0); std::optional> matches{}; if (opt_matches.has_value()) { @@ -786,7 +877,7 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt Optional f$preg_match_all(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { int64_t entire_match_count{}; - RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { @@ -795,9 +886,6 @@ Optional f$preg_match_all(const string& pattern, const string& subject, if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } - if (!parse_regex(regex_info)) [[unlikely]] { - return false; - } if (!compile_regex(regex_info)) [[unlikely]] { return false; } @@ -825,20 +913,20 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } } - matcher m{regex_info, static_cast(offset)}; - if (m.has_error()) { - return false; - } + matcher pcre2_matcher{regex_info, static_cast(offset)}; - for (auto match_view_opt{m.next()}; match_view_opt.has_value(); match_view_opt = m.next()) { - pcre2_match_view match_view{*match_view_opt}; + auto expected_opt_match_view{pcre2_matcher.next()}; + while (expected_opt_match_view.has_value() && expected_opt_match_view->has_value()) { + pcre2_match_view match_view{**expected_opt_match_view}; regex_info.match_count = match_view.size(); set_all_matches(regex_info, flags, matches); if (regex_info.match_count > 0) { ++entire_match_count; } + expected_opt_match_view = pcre2_matcher.next(); } - if (m.has_error()) [[unlikely]] { + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't find all matches due to match error: {}", expected_opt_match_view.error()); return false; } @@ -856,7 +944,7 @@ Optional f$preg_replace(const string& pattern, const string& replacement } }}; - if (limit < 0 && limit != kphp::regex::PREG_REPLACE_NOLIMIT) [[unlikely]] { + if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { kphp::log::warning("invalid limit {} in preg_replace", limit); return {}; } @@ -880,12 +968,12 @@ Optional f$preg_replace(const string& pattern, const string& replacement } } - RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; - bool success{parse_regex(regex_info)}; - success &= compile_regex(regex_info); - success &= replace_regex(regex_info, limit == kphp::regex::PREG_REPLACE_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)); - if (!success) [[unlikely]] { + if (!compile_regex(regex_info)) [[unlikely]] { + return {}; + } + if (!replace_regex(regex_info, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))) { return {}; } count = regex_info.replace_count; @@ -915,7 +1003,8 @@ Optional f$preg_replace(const mixed& pattern, const string& replacement, const auto& pattern_arr{pattern.as_array()}; for (const auto& it : pattern_arr) { int64_t replace_one_count{}; - if (auto replace_result{f$preg_replace(it.get_value().to_string(), replacement, result, limit, replace_one_count)}; replace_result.has_value()) [[likely]] { + if (Optional replace_result{f$preg_replace(it.get_value().to_string(), replacement, result, limit, replace_one_count)}; replace_result.has_value()) + [[likely]] { count += replace_one_count; result = std::move(replace_result.val()); } else { @@ -962,8 +1051,8 @@ Optional f$preg_replace(const mixed& pattern, const mixed& replacement, } int64_t replace_one_count{}; - if (auto replace_result{f$preg_replace(pattern_it.get_value().to_string(), replacement_str, result, limit, replace_one_count)}; replace_result.has_value()) - [[likely]] { + if (Optional replace_result{f$preg_replace(pattern_it.get_value().to_string(), replacement_str, result, limit, replace_one_count)}; + replace_result.has_value()) [[likely]] { count += replace_one_count; result = std::move(replace_result.val()); } else { @@ -999,7 +1088,7 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed array result{subject_arr.size()}; for (const auto& it : subject_arr) { int64_t replace_one_count{}; - if (auto replace_result{f$preg_replace(pattern, replacement, it.get_value().to_string(), limit, replace_one_count)}; replace_result.has_value()) + if (Optional replace_result{f$preg_replace(pattern, replacement, it.get_value().to_string(), limit, replace_one_count)}; replace_result.has_value()) [[likely]] { count += replace_one_count; result.set_value(it.get_key(), std::move(replace_result.val())); @@ -1011,3 +1100,23 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed return std::move(result); } + +Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + + if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, + kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { + return false; + } + if (!compile_regex(regex_info)) [[unlikely]] { + return false; + } + auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, // + (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, // + (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; + if (!opt_output.has_value()) [[unlikely]] { + return false; + } + + return std::move(*opt_output); +} diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index 2cc794208e..a2a52d3479 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -18,6 +18,14 @@ namespace kphp::regex { +namespace details { + +struct pcre2_error { + int32_t code{}; +}; + +} // namespace details + inline constexpr int64_t PREG_NO_ERROR = 0; inline constexpr int64_t PREG_INTERNAL_ERROR = 1; inline constexpr int64_t PREG_BACKTRACK_LIMIT_ERROR = 2; @@ -34,7 +42,7 @@ inline constexpr auto PREG_SPLIT_DELIM_CAPTURE = static_cast(1U << 4U); inline constexpr auto PREG_SPLIT_OFFSET_CAPTURE = static_cast(1U << 5U); inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); -inline constexpr int64_t PREG_REPLACE_NOLIMIT = -1; +inline constexpr int64_t PREG_NOLIMIT = -1; } // namespace kphp::regex @@ -66,20 +74,20 @@ Optional f$preg_match_all(const string& pattern, const string& subject, // === preg_replace =============================================================================== -Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; -Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; -Optional f$preg_replace(const mixed& pattern, const mixed& replacement, const string& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const mixed& pattern, const mixed& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; -mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; template> -auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept { return f$preg_replace(regex, replace_val, subject.val(), limit, opt_count); } @@ -87,7 +95,7 @@ auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, i // === preg_replace_callback ====================================================================== template> F> -kphp::coro::task> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +kphp::coro::task> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { static_assert(std::same_as>, string>); @@ -122,7 +130,7 @@ kphp::coro::task> f$preg_replace_callback(string pattern, F cal } template -kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { if (!regex_impl_::valid_preg_replace_mixed(pattern)) [[unlikely]] { @@ -161,7 +169,7 @@ kphp::coro::task> f$preg_replace_callback(mixed pattern, F call } template -kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(subject)) [[unlikely]] { @@ -200,7 +208,7 @@ kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed } template> -auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept -> decltype(f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, opt_count, flags)) { @@ -209,10 +217,14 @@ auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t // === preg_split ================================================================================= -inline Optional> f$preg_split(const string& /*unused*/, const string& /*unused*/, int64_t /*unused*/ = -1, int64_t /*unused*/ = 0) { - kphp::log::error("call to unsupported function"); -} +Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; -inline Optional> f$preg_split(const mixed& /*unused*/, const string& /*unused*/, int64_t /*unused*/ = -1, int64_t /*unused*/ = 0) { - kphp::log::error("call to unsupported function"); +inline Optional> f$preg_split(const mixed& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { + if (!pattern.is_string()) [[unlikely]] { + kphp::log::warning("preg_split() expects parameter 1 to be string, {} given", pattern.get_type_or_class_name()); + return false; + } + return f$preg_split(pattern.as_string(), subject, limit, flags); } diff --git a/runtime-light/stdlib/string/regex-include.h b/runtime-light/stdlib/string/regex-include.h index b6924163b9..b0670a06ed 100644 --- a/runtime-light/stdlib/string/regex-include.h +++ b/runtime-light/stdlib/string/regex-include.h @@ -13,4 +13,4 @@ using regex_pcre2_general_context_t = std::unique_ptr; using regex_pcre2_match_context_t = std::unique_ptr; using regex_pcre2_match_data_t = std::unique_ptr; -using regex_pcre2_code_t = pcre2_code_8*; +using regex_pcre2_code_t = std::unique_ptr; diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index f8bd465140..dbf005549d 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -32,7 +32,7 @@ RegexInstanceState::RegexInstanceState() noexcept : regex_pcre2_general_context(pcre2_general_context_create_8(regex_malloc, regex_free, nullptr), pcre2_general_context_free_8), compile_context(pcre2_compile_context_create_8(regex_pcre2_general_context.get()), pcre2_compile_context_free_8), match_context(pcre2_match_context_create_8(regex_pcre2_general_context.get()), pcre2_match_context_free_8), - regex_pcre2_match_data(pcre2_match_data_create_8(MATCH_DATA_SIZE, regex_pcre2_general_context.get()), pcre2_match_data_free_8) { + regex_pcre2_match_data(pcre2_match_data_create_8(OVECTOR_SIZE, regex_pcre2_general_context.get()), pcre2_match_data_free_8) { if (!regex_pcre2_general_context) [[unlikely]] { kphp::log::error("can't create pcre2_general_context"); } diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 6d6fb64b1e..ea61a9cf9f 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -5,30 +5,55 @@ #pragma once #include -#include +#include +#include +#include +#include #include "common/mixin/not_copyable.h" #include "runtime-common/core/allocator/script-allocator.h" +#include "runtime-common/core/runtime-core.h" #include "runtime-common/core/std/containers.h" -#include "runtime-light/metaprogramming/concepts.h" #include "runtime-light/stdlib/string/regex-include.h" struct RegexInstanceState final : private vk::not_copyable { - template - using unordered_map = kphp::stl::unordered_map; +private: + using hasher_type = decltype([](const string& s) noexcept { return static_cast(s.hash()); }); - static constexpr size_t MAX_SUBPATTERNS_COUNT = 512; - // match data size should be a multiple of 3 since it holds ovector triples (see pcre2 docs) - static constexpr size_t MATCH_DATA_SIZE = 3 * MAX_SUBPATTERNS_COUNT; - static constexpr auto REPLACE_BUFFER_SIZE = static_cast(16U * 1024U); + static constexpr size_t MAX_SUBPATTERNS_COUNT{512}; + + struct compiled_regex { + // PCRE compile options of the regex + uint32_t compile_options{}; + // compiled regex + regex_pcre2_code_t regex_code; + }; + + kphp::stl::unordered_map regex_pcre2_code_cache; + +public: + static constexpr size_t OVECTOR_SIZE{MAX_SUBPATTERNS_COUNT + 1}; + static constexpr size_t REPLACE_BUFFER_SIZE{size_t{16U} * size_t{1024U}}; const regex_pcre2_general_context_t regex_pcre2_general_context; const regex_pcre2_compile_context_t compile_context; const regex_pcre2_match_context_t match_context; regex_pcre2_match_data_t regex_pcre2_match_data; - unordered_map regex_pcre2_code_cache; RegexInstanceState() noexcept; + std::optional> get_compiled_regex(const string& regex) const noexcept { + if (const auto it{regex_pcre2_code_cache.find(regex)}; it != regex_pcre2_code_cache.end()) { + return it->second; + } + return std::nullopt; + } + + std::optional> add_compiled_regex(string regex, uint32_t compile_options, + regex_pcre2_code_t regex_code) noexcept { + return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = std::move(regex_code)}) + .first->second; + } + static RegexInstanceState& get() noexcept; }; diff --git a/tests/phpt/dl/496_regex.php b/tests/phpt/dl/496_regex.php index a9adc843a2..9e4f705940 100644 --- a/tests/phpt/dl/496_regex.php +++ b/tests/phpt/dl/496_regex.php @@ -1,4 +1,4 @@ -@ok callback benchmark k2_skip +@ok callback benchmark