|
6 | 6 |
|
7 | 7 | #include <algorithm> |
8 | 8 | #include <array> |
| 9 | +#include <concepts> |
9 | 10 | #include <cstddef> |
10 | 11 | #include <cstdint> |
11 | 12 | #include <functional> |
@@ -71,6 +72,10 @@ struct RegexInfo final { |
71 | 72 | replacement(replacement_) {} |
72 | 73 | }; |
73 | 74 |
|
// A numbered group reference found inside a replacement string.
struct backref {
  // The group number exactly as it was written (try_get_backref yields one or
  // two decimal digits). This is a non-owning view into the replacement string,
  // so that string must outlive the backref.
  std::string_view digits{};
};
| 78 | + |
74 | 79 | template<typename... Args> |
75 | 80 | requires((std::is_same_v<Args, int64_t> && ...) && sizeof...(Args) > 0) |
76 | 81 | bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { |
@@ -102,6 +107,118 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec |
102 | 107 | return offset; |
103 | 108 | } |
104 | 109 |
|
| 110 | +std::optional<backref> try_get_backref(std::string_view preg_replacement) noexcept { |
| 111 | + if (preg_replacement.empty() || preg_replacement[0] < '0' || preg_replacement[0] > '9') { |
| 112 | + return std::nullopt; |
| 113 | + } |
| 114 | + |
| 115 | + if (preg_replacement.size() == 1 || preg_replacement[1] < '0' || preg_replacement[1] > '9') { |
| 116 | + return backref{preg_replacement.substr(0, 1)}; |
| 117 | + } |
| 118 | + |
| 119 | + return backref{preg_replacement.substr(0, 2)}; |
| 120 | +} |
| 121 | + |
| 122 | +template<typename T, typename F> |
| 123 | +requires std::convertible_to<std::invoke_result_t<F>, T> |
| 124 | +auto value_or_else(std::optional<T>&& opt, F&& alternative_func) noexcept -> T { |
| 125 | + if (opt.has_value()) { |
| 126 | + return std::move(*std::move(opt)); |
| 127 | + } else { |
| 128 | + return std::forward<F>(alternative_func)(); |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +using replacement_term = std::variant<char, backref>; |
| 133 | + |
| 134 | +class preg_replacement_unescaper { |
| 135 | + std::string_view preg_replacement; |
| 136 | + |
| 137 | +public: |
| 138 | + preg_replacement_unescaper(std::string_view preg_replacement) |
| 139 | + : preg_replacement{preg_replacement} {} |
| 140 | + |
| 141 | + bool has_next() const noexcept { |
| 142 | + return !preg_replacement.empty(); |
| 143 | + } |
| 144 | + |
| 145 | + replacement_term unescape_term() noexcept { |
| 146 | + auto first_char{preg_replacement.front()}; |
| 147 | + preg_replacement = preg_replacement.substr(1); |
| 148 | + if (preg_replacement.empty()) { |
| 149 | + return first_char; |
| 150 | + } |
| 151 | + switch (first_char) { |
| 152 | + case '$': |
| 153 | + if (preg_replacement.front() == '{') { |
| 154 | + return try_get_backref(preg_replacement.substr(1)) |
| 155 | + .and_then([this](auto value) noexcept -> std::optional<replacement_term> { |
| 156 | + auto digits_end_pos = 1 + value.digits.size(); |
| 157 | + if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { |
| 158 | + preg_replacement = preg_replacement.substr(1 + value.digits.size() + 1); |
| 159 | + return value; |
| 160 | + } |
| 161 | + |
| 162 | + return std::nullopt; |
| 163 | + }) |
| 164 | + .value_or('$'); |
| 165 | + } |
| 166 | + |
| 167 | + return try_get_backref(preg_replacement) |
| 168 | + .transform([this](auto value) noexcept -> replacement_term { |
| 169 | + auto digits_end_pos = value.digits.size(); |
| 170 | + preg_replacement = preg_replacement.substr(digits_end_pos); |
| 171 | + return value; |
| 172 | + }) |
| 173 | + .value_or('$'); |
| 174 | + |
| 175 | + case '\\': |
| 176 | + return value_or_else(try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { |
| 177 | + auto digits_end_pos = value.digits.size(); |
| 178 | + preg_replacement = preg_replacement.substr(digits_end_pos); |
| 179 | + return value; |
| 180 | + }), |
| 181 | + [this] noexcept { |
| 182 | + auto res{preg_replacement.front()}; |
| 183 | + if (res == '$' || res == '\\') { |
| 184 | + preg_replacement = preg_replacement.substr(1); |
| 185 | + return res; |
| 186 | + } |
| 187 | + return '\\'; |
| 188 | + }); |
| 189 | + default: |
| 190 | + return first_char; |
| 191 | + } |
| 192 | + } |
| 193 | +}; |
| 194 | + |
| 195 | +class pcre2_replacement_escaper { |
| 196 | + kphp::stl::string<kphp::memory::script_allocator> pcre2_replacement{}; |
| 197 | + |
| 198 | +public: |
| 199 | + void operator()(char c) noexcept { |
| 200 | + pcre2_replacement.push_back(c); |
| 201 | + if (c == '$') { |
| 202 | + pcre2_replacement.push_back('$'); |
| 203 | + } |
| 204 | + } |
| 205 | + |
| 206 | + void operator()(backref backreference) noexcept { |
| 207 | + pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); |
| 208 | + pcre2_replacement.append("${"); |
| 209 | + pcre2_replacement.append(backreference.digits); |
| 210 | + pcre2_replacement.append("}"); |
| 211 | + } |
| 212 | + |
| 213 | + void escape_term(const replacement_term& term) noexcept { |
| 214 | + std::visit(*this, term); |
| 215 | + } |
| 216 | + |
| 217 | + kphp::stl::string<kphp::memory::script_allocator>& result() noexcept { |
| 218 | + return pcre2_replacement; |
| 219 | + } |
| 220 | +}; |
| 221 | + |
105 | 222 | bool parse_regex(RegexInfo& regex_info) noexcept { |
106 | 223 | if (regex_info.regex.empty()) { |
107 | 224 | kphp::log::warning("empty regex"); |
@@ -591,21 +708,13 @@ Optional<string> f$preg_replace(const string& pattern, const string& replacement |
591 | 708 | return {}; |
592 | 709 | } |
593 | 710 |
|
594 | | - string pcre2_replacement{replacement}; |
595 | | - { // we need to replace PHP's back references with PCRE2 ones |
596 | | - static constexpr std::string_view backreference_pattern = R"(/\\(\d)/)"; |
597 | | - static constexpr std::string_view backreference_replacement = "$$$1"; |
598 | | - |
599 | | - RegexInfo regex_info{backreference_pattern, {replacement.c_str(), replacement.size()}, backreference_replacement}; |
600 | | - bool success{parse_regex(regex_info)}; |
601 | | - success &= compile_regex(regex_info); |
602 | | - success &= replace_regex(regex_info, std::numeric_limits<uint64_t>::max()); |
603 | | - if (!success) [[unlikely]] { |
604 | | - kphp::log::warning("can't replace PHP back references with PCRE2 ones"); |
605 | | - return {}; |
606 | | - } |
607 | | - pcre2_replacement = regex_info.opt_replace_result.has_value() ? *std::move(regex_info.opt_replace_result) : replacement; |
| 711 | + // we need to replace PHP's back references with PCRE2 ones |
| 712 | + auto unescaper{preg_replacement_unescaper{{replacement.c_str(), replacement.size()}}}; |
| 713 | + pcre2_replacement_escaper escaper{}; |
| 714 | + while (unescaper.has_next()) { |
| 715 | + escaper.escape_term(unescaper.unescape_term()); |
608 | 716 | } |
| 717 | + auto& pcre2_replacement{escaper.result()}; |
609 | 718 |
|
610 | 719 | RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; |
611 | 720 |
|
|
0 commit comments