|
6 | 6 |
|
7 | 7 | #include <algorithm> |
8 | 8 | #include <array> |
| 9 | +#include <cctype> |
| 10 | +#include <concepts> |
9 | 11 | #include <cstddef> |
10 | 12 | #include <cstdint> |
11 | 13 | #include <functional> |
@@ -33,6 +35,7 @@ constexpr size_t ERROR_BUFFER_LENGTH = 256; |
33 | 35 |
|
// Two-valued option: `skip` or `include`.
// NOTE(review): presumably selects whether trailing unmatched groups are kept in match results;
// the code that reads it is outside this view, so confirm before relying on this.
34 | 36 | enum class trailing_unmatch : uint8_t { skip, include }; |
35 | 37 |
|
// Textual form of a back-reference number found in a replacement string (the digits that
// follow `$`, `${` or a backslash). It is a non-owning view, so the viewed text must outlive it.
| 38 | +using backref = std::string_view; |
// Vector of C-string pointers that allocates through the script allocator.
// NOTE(review): presumably holds capture-group names; how it is indexed is not visible here.
36 | 39 | using regex_pcre2_group_names_t = kphp::stl::vector<const char*, kphp::memory::script_allocator>; |
37 | 40 |
|
38 | 41 | struct RegexInfo final { |
@@ -102,6 +105,139 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec |
102 | 105 | return offset; |
103 | 106 | } |
104 | 107 |
|
| 108 | +std::optional<backref> try_get_backref(std::string_view preg_replacement) noexcept { |
| 109 | + if (preg_replacement.empty() || !std::isdigit(preg_replacement[0])) { |
| 110 | + return std::nullopt; |
| 111 | + } |
| 112 | + |
| 113 | + if (preg_replacement.size() == 1 || !std::isdigit(preg_replacement[1])) { |
| 114 | + return backref{preg_replacement.substr(0, 1)}; |
| 115 | + } |
| 116 | + |
| 117 | + return backref{preg_replacement.substr(0, 2)}; |
| 118 | +} |
| 119 | + |
| 120 | +using replacement_term = std::variant<char, backref>; |
| 121 | + |
| 122 | +class preg_replacement_parser { |
| 123 | + std::string_view preg_replacement; |
| 124 | + |
| 125 | + replacement_term parse_term_internal() noexcept { |
| 126 | + kphp::log::assertion(!preg_replacement.empty()); |
| 127 | + auto first_char{preg_replacement.front()}; |
| 128 | + preg_replacement = preg_replacement.substr(1); |
| 129 | + if (preg_replacement.empty()) { |
| 130 | + return first_char; |
| 131 | + } |
| 132 | + switch (first_char) { |
| 133 | + case '$': |
| 134 | + // $1, ${1} |
| 135 | + if (preg_replacement.front() == '{') { |
| 136 | + return try_get_backref(preg_replacement.substr(1)) |
| 137 | + .and_then([this](auto value) noexcept -> std::optional<replacement_term> { |
| 138 | + auto digits_end_pos = 1 + value.size(); |
| 139 | + if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { |
| 140 | + preg_replacement = preg_replacement.substr(1 + value.size() + 1); |
| 141 | + return value; |
| 142 | + } |
| 143 | + |
| 144 | + return std::nullopt; |
| 145 | + }) |
| 146 | + .value_or('$'); |
| 147 | + } |
| 148 | + |
| 149 | + return try_get_backref(preg_replacement) |
| 150 | + .transform([this](auto value) noexcept -> replacement_term { |
| 151 | + auto digits_end_pos = value.size(); |
| 152 | + preg_replacement = preg_replacement.substr(digits_end_pos); |
| 153 | + return value; |
| 154 | + }) |
| 155 | + .value_or('$'); |
| 156 | + |
| 157 | + case '\\': { |
| 158 | + // \1 |
| 159 | + auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { |
| 160 | + auto digits_end_pos = value.size(); |
| 161 | + preg_replacement = preg_replacement.substr(digits_end_pos); |
| 162 | + return value; |
| 163 | + })}; |
| 164 | + if (back_reference_opt.has_value()) { |
| 165 | + return *back_reference_opt; |
| 166 | + } else { |
| 167 | + auto res{preg_replacement.front()}; |
| 168 | + if (res == '$' || res == '\\') { |
| 169 | + preg_replacement = preg_replacement.substr(1); |
| 170 | + return res; |
| 171 | + } |
| 172 | + return '\\'; |
| 173 | + } |
| 174 | + } |
| 175 | + default: |
| 176 | + return first_char; |
| 177 | + } |
| 178 | + } |
| 179 | + |
| 180 | +public: |
| 181 | + explicit preg_replacement_parser(std::string_view preg_replacement) noexcept |
| 182 | + : preg_replacement{preg_replacement} {} |
| 183 | + |
| 184 | + struct iterator { |
| 185 | + preg_replacement_parser* parser{nullptr}; |
| 186 | + replacement_term current_term{'\0'}; |
| 187 | + |
| 188 | + using difference_type = std::ptrdiff_t; |
| 189 | + using value_type = replacement_term; |
| 190 | + using reference = const replacement_term&; |
| 191 | + using pointer = const replacement_term*; |
| 192 | + using iterator_category = std::input_iterator_tag; |
| 193 | + |
| 194 | + iterator() noexcept = default; |
| 195 | + explicit iterator(preg_replacement_parser* p) noexcept |
| 196 | + : parser{p} { |
| 197 | + if (parser->preg_replacement.empty()) { |
| 198 | + parser = nullptr; |
| 199 | + } else { |
| 200 | + current_term = parser->parse_term_internal(); |
| 201 | + } |
| 202 | + } |
| 203 | + |
| 204 | + reference operator*() const noexcept { |
| 205 | + return current_term; |
| 206 | + } |
| 207 | + pointer operator->() const noexcept { |
| 208 | + return std::addressof(current_term); |
| 209 | + } |
| 210 | + |
| 211 | + iterator& operator++() noexcept { |
| 212 | + if (!parser->preg_replacement.empty()) { |
| 213 | + current_term = parser->parse_term_internal(); |
| 214 | + } else { |
| 215 | + parser = nullptr; |
| 216 | + } |
| 217 | + return *this; |
| 218 | + } |
| 219 | + iterator operator++(int) noexcept { |
| 220 | + iterator temp = *this; |
| 221 | + ++(*this); |
| 222 | + return temp; |
| 223 | + } |
| 224 | + |
| 225 | + friend bool operator==(const iterator& a, const iterator& b) noexcept { |
| 226 | + return a.parser == b.parser; |
| 227 | + } |
| 228 | + friend bool operator!=(const iterator& a, const iterator& b) noexcept { |
| 229 | + return !(a == b); |
| 230 | + } |
| 231 | + }; |
| 232 | + |
| 233 | + iterator begin() noexcept { |
| 234 | + return iterator{this}; |
| 235 | + } |
| 236 | + iterator end() noexcept { |
| 237 | + return iterator{}; |
| 238 | + } |
| 239 | +}; |
| 240 | + |
105 | 241 | bool parse_regex(RegexInfo& regex_info) noexcept { |
106 | 242 | if (regex_info.regex.empty()) { |
107 | 243 | kphp::log::warning("empty regex"); |
@@ -591,20 +727,23 @@ Optional<string> f$preg_replace(const string& pattern, const string& replacement |
591 | 727 | return {}; |
592 | 728 | } |
593 | 729 |
|
594 | | - string pcre2_replacement{replacement}; |
595 | | - { // we need to replace PHP's back references with PCRE2 ones |
596 | | - static constexpr std::string_view backreference_pattern = R"(/\\(\d)/)"; |
597 | | - static constexpr std::string_view backreference_replacement = "$$$1"; |
598 | | - |
599 | | - RegexInfo regex_info{backreference_pattern, {replacement.c_str(), replacement.size()}, backreference_replacement}; |
600 | | - bool success{parse_regex(regex_info)}; |
601 | | - success &= compile_regex(regex_info); |
602 | | - success &= replace_regex(regex_info, std::numeric_limits<uint64_t>::max()); |
603 | | - if (!success) [[unlikely]] { |
604 | | - kphp::log::warning("can't replace PHP back references with PCRE2 ones"); |
605 | | - return {}; |
| 730 | + // we need to replace PHP's back references with PCRE2 ones |
| 731 | + auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; |
| 732 | + kphp::stl::string<kphp::memory::script_allocator> pcre2_replacement{}; |
| 733 | + for (const auto& term : parser) { |
| 734 | + if (std::holds_alternative<char>(term)) { |
| 735 | + auto c{std::get<char>(term)}; |
| 736 | + pcre2_replacement.push_back(c); |
| 737 | + if (c == '$') { |
| 738 | + pcre2_replacement.push_back('$'); |
| 739 | + } |
| 740 | + } else { |
| 741 | + auto backreference{std::get<backref>(term)}; |
| 742 | + pcre2_replacement.reserve(pcre2_replacement.size() + backreference.size() + 3); |
| 743 | + pcre2_replacement.append("${"); |
| 744 | + pcre2_replacement.append(backreference); |
| 745 | + pcre2_replacement.append("}"); |
606 | 746 | } |
607 | | - pcre2_replacement = regex_info.opt_replace_result.has_value() ? *std::move(regex_info.opt_replace_result) : replacement; |
608 | 747 | } |
609 | 748 |
|
610 | 749 | RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; |
|
0 commit comments