|
| 1 | +// Compiler for PHP (aka KPHP) |
| 2 | +// Copyright (c) 2024 LLC «V Kontakte» |
| 3 | +// Distributed under the GPL v3 License, see LICENSE.notice.txt |
| 4 | + |
| 5 | +#include "runtime-light/stdlib/string/pcre2-functions.h" |
| 6 | + |
| 7 | +#include <cstddef> |
| 8 | +#include <cstdint> |
| 9 | +#include <iterator> |
| 10 | +#include <memory> |
| 11 | +#include <optional> |
| 12 | +#include <string_view> |
| 13 | + |
| 14 | +#include "runtime-common/core/runtime-core.h" |
| 15 | +#include "runtime-common/stdlib/string/mbstring-functions.h" |
| 16 | +#include "runtime-light/stdlib/diagnostics/logs.h" |
| 17 | +#include "runtime-light/stdlib/string/regex-state.h" |
| 18 | + |
| 19 | +namespace kphp::pcre2 { |
| 20 | + |
namespace {

// capacity (in bytes) of the stack buffers that receive PCRE2 error messages
constexpr size_t ERROR_BUFFER_LENGTH = 256;

} // namespace
| 26 | + |
| 27 | +std::optional<std::string_view> match_view::get_group(size_t i) const noexcept { |
| 28 | + kphp::log::assertion(i >= 0 && i < m_num_groups && m_ovector_ptr); |
| 29 | + // ovector is an array of offset pairs |
| 30 | + PCRE2_SIZE start{m_ovector_ptr[2 * i]}; |
| 31 | + PCRE2_SIZE end{m_ovector_ptr[2 * i + 1]}; |
| 32 | + |
| 33 | + if (start == PCRE2_UNSET) { |
| 34 | + return std::nullopt; |
| 35 | + } |
| 36 | + |
| 37 | + return m_subject_data.substr(start, end - start); |
| 38 | +} |
| 39 | + |
| 40 | +const compiled_regex* compiled_regex::compile(const string& regex) noexcept { |
| 41 | + auto& regex_state{RegexInstanceState::get()}; |
| 42 | + if (!regex_state.compile_context) [[unlikely]] { |
| 43 | + return nullptr; |
| 44 | + } |
| 45 | + |
| 46 | + // check runtime cache |
| 47 | + if (auto* compiled_regex{regex_state.get_compiled_regex(regex)}; compiled_regex != nullptr) { |
| 48 | + return compiled_regex; |
| 49 | + } |
| 50 | + if (regex.empty()) { |
| 51 | + kphp::log::warning("empty regex"); |
| 52 | + return nullptr; |
| 53 | + } |
| 54 | + |
| 55 | + char end_delim{}; |
| 56 | + switch (const char start_delim{regex[0]}; start_delim) { |
| 57 | + case '(': { |
| 58 | + end_delim = ')'; |
| 59 | + break; |
| 60 | + } |
| 61 | + case '[': { |
| 62 | + end_delim = ']'; |
| 63 | + break; |
| 64 | + } |
| 65 | + case '{': { |
| 66 | + end_delim = '}'; |
| 67 | + break; |
| 68 | + } |
| 69 | + case '<': { |
| 70 | + end_delim = '>'; |
| 71 | + break; |
| 72 | + } |
| 73 | + case '>': |
| 74 | + case '!' ... '\'': |
| 75 | + case '*' ... '/': |
| 76 | + case ':': |
| 77 | + case ';': |
| 78 | + case '=': |
| 79 | + case '?': |
| 80 | + case '@': |
| 81 | + case '^': |
| 82 | + case '_': |
| 83 | + case '`': |
| 84 | + case '|': |
| 85 | + case '~': { |
| 86 | + end_delim = start_delim; |
| 87 | + break; |
| 88 | + } |
| 89 | + default: { |
| 90 | + kphp::log::warning("wrong regex delimiter {}", start_delim); |
| 91 | + return nullptr; |
| 92 | + } |
| 93 | + } |
| 94 | + |
| 95 | + uint32_t compile_options{}; |
| 96 | + // non-null-terminated regex without delimiters and PCRE modifiers |
| 97 | + // |
| 98 | + // regex -> ~pattern~im\0 |
| 99 | + // regex_body -> pattern |
| 100 | + std::string_view regex_body = {regex.c_str(), regex.size()}; |
| 101 | + |
| 102 | + // remove start delimiter |
| 103 | + regex_body.remove_prefix(1); |
| 104 | + // parse compile options and skip all symbols until the end delimiter |
| 105 | + for (; !regex_body.empty() && regex_body.back() != end_delim; regex_body.remove_suffix(1)) { |
| 106 | + // spaces and newlines are ignored |
| 107 | + if (regex_body.back() == ' ' || regex_body.back() == '\n') { |
| 108 | + continue; |
| 109 | + } |
| 110 | + |
| 111 | + switch (regex_body.back()) { |
| 112 | + case 'i': { |
| 113 | + compile_options |= PCRE2_CASELESS; |
| 114 | + break; |
| 115 | + } |
| 116 | + case 'm': { |
| 117 | + compile_options |= PCRE2_MULTILINE; |
| 118 | + break; |
| 119 | + } |
| 120 | + case 's': { |
| 121 | + compile_options |= PCRE2_DOTALL; |
| 122 | + break; |
| 123 | + } |
| 124 | + case 'x': { |
| 125 | + compile_options |= PCRE2_EXTENDED; |
| 126 | + break; |
| 127 | + } |
| 128 | + case 'A': { |
| 129 | + compile_options |= PCRE2_ANCHORED; |
| 130 | + break; |
| 131 | + } |
| 132 | + case 'D': { |
| 133 | + compile_options |= PCRE2_DOLLAR_ENDONLY; |
| 134 | + break; |
| 135 | + } |
| 136 | + case 'U': { |
| 137 | + compile_options |= PCRE2_UNGREEDY; |
| 138 | + break; |
| 139 | + } |
| 140 | + case 'X': { |
| 141 | + compile_options |= PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL; |
| 142 | + break; |
| 143 | + } |
| 144 | + case 'J': { |
| 145 | + compile_options |= PCRE2_INFO_JCHANGED; |
| 146 | + break; |
| 147 | + } |
| 148 | + case 'u': { |
| 149 | + compile_options |= PCRE2_UTF | PCRE2_UCP; |
| 150 | + break; |
| 151 | + } |
| 152 | + default: { |
| 153 | + kphp::log::warning("unsupported regex modifier {}", regex_body.back()); |
| 154 | + break; |
| 155 | + } |
| 156 | + } |
| 157 | + } |
| 158 | + |
| 159 | + if (regex_body.empty()) { |
| 160 | + kphp::log::warning("no ending regex delimiter: {}", regex.c_str()); |
| 161 | + return nullptr; |
| 162 | + } |
| 163 | + // UTF-8 validation |
| 164 | + if (static_cast<bool>(compile_options & PCRE2_UTF) && !mb_UTF8_check(regex.c_str())) [[unlikely]] { |
| 165 | + kphp::log::warning("invalid UTF-8 pattern: {}", regex.c_str()); |
| 166 | + return nullptr; |
| 167 | + } |
| 168 | + |
| 169 | + // remove the end delimiter |
| 170 | + regex_body.remove_suffix(1); |
| 171 | + |
| 172 | + // compile pcre2_code |
| 173 | + int32_t error_number{}; |
| 174 | + PCRE2_SIZE error_offset{}; |
| 175 | + regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR8>(regex_body.data()), regex_body.size(), compile_options, |
| 176 | + std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get())}; |
| 177 | + if (!regex_code) [[unlikely]] { |
| 178 | + std::array<char, ERROR_BUFFER_LENGTH> buffer{}; |
| 179 | + pcre2_get_error_message_8(error_number, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size()); |
| 180 | + kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); |
| 181 | + return nullptr; |
| 182 | + } |
| 183 | + |
| 184 | + return regex_state.add_compiled_regex(regex, {compile_options, *regex_code}); |
| 185 | +} |
| 186 | + |
| 187 | +group_names_t compiled_regex::collect_group_names() const noexcept { |
| 188 | + // vector of group names |
| 189 | + group_names_t group_names; |
| 190 | + |
| 191 | + // initialize an array of strings to hold group names |
| 192 | + group_names.resize(groups_count()); |
| 193 | + |
| 194 | + uint32_t name_count{}; |
| 195 | + pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMECOUNT, std::addressof(name_count)); |
| 196 | + if (name_count == 0) { |
| 197 | + return group_names; |
| 198 | + } |
| 199 | + |
| 200 | + PCRE2_SPTR8 name_table{}; |
| 201 | + uint32_t name_entry_size{}; |
| 202 | + pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMETABLE, std::addressof(name_table)); |
| 203 | + pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(name_entry_size)); |
| 204 | + |
| 205 | + PCRE2_SPTR8 entry{name_table}; |
| 206 | + for (auto i{0}; i < name_count; ++i) { |
| 207 | + const auto group_number{static_cast<uint16_t>((entry[0] << 8) | entry[1])}; |
| 208 | + PCRE2_SPTR8 group_name{std::next(entry, 2)}; |
| 209 | + group_names[group_number] = reinterpret_cast<const char*>(group_name); |
| 210 | + std::advance(entry, name_entry_size); |
| 211 | + } |
| 212 | + |
| 213 | + return group_names; |
| 214 | +} |
| 215 | + |
| 216 | +std::optional<match_view> compiled_regex::match(std::string_view subject, size_t offset, uint32_t match_options) const noexcept { |
| 217 | + const auto& regex_state{RegexInstanceState::get()}; |
| 218 | + if (!regex_state.match_context) [[unlikely]] { |
| 219 | + return std::nullopt; |
| 220 | + } |
| 221 | + |
| 222 | + auto* match_data = regex_state.regex_pcre2_match_data.get(); |
| 223 | + |
| 224 | + int32_t match_count{pcre2_match_8(std::addressof(regex_code), reinterpret_cast<PCRE2_SPTR8>(subject.data()), subject.size(), offset, match_options, |
| 225 | + match_data, regex_state.match_context.get())}; |
| 226 | + // From https://www.pcre.org/current/doc/html/pcre2_match.html |
| 227 | + // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set |
| 228 | + // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. |
| 229 | + if (match_count < 0 && match_count != PCRE2_ERROR_NOMATCH) [[unlikely]] { |
| 230 | + std::array<char, ERROR_BUFFER_LENGTH> buffer{}; |
| 231 | + pcre2_get_error_message_8(match_count, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size()); |
| 232 | + kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data()); |
| 233 | + return std::nullopt; |
| 234 | + } |
| 235 | + return match_view{subject, pcre2_get_ovector_pointer_8(match_data), match_count != PCRE2_ERROR_NOMATCH ? match_count : 0}; |
| 236 | +} |
| 237 | + |
| 238 | +uint32_t compiled_regex::named_groups_count() const noexcept { |
| 239 | + // retrieve the named groups count |
| 240 | + uint32_t name_count{}; |
| 241 | + pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMECOUNT, std::addressof(name_count)); |
| 242 | + return name_count; |
| 243 | +} |
| 244 | + |
| 245 | +std::optional<string> compiled_regex::replace(const string& subject, uint32_t replace_options, std::string_view replacement, uint32_t match_options, |
| 246 | + uint64_t limit, int64_t& replace_count) const noexcept { |
| 247 | + replace_count = 0; |
| 248 | + |
| 249 | + const auto& regex_state{RegexInstanceState::get()}; |
| 250 | + auto& runtime_ctx{RuntimeContext::get()}; |
| 251 | + if (!regex_state.match_context) [[unlikely]] { |
| 252 | + return std::nullopt; |
| 253 | + } |
| 254 | + |
| 255 | + if (!validate({subject.c_str(), subject.size()})) [[unlikely]] { |
| 256 | + return std::nullopt; |
| 257 | + } |
| 258 | + |
| 259 | + const PCRE2_SIZE buffer_length{std::max( |
| 260 | + {static_cast<string::size_type>(subject.size()), static_cast<string::size_type>(RegexInstanceState::REPLACE_BUFFER_SIZE), runtime_ctx.static_SB.size()})}; |
| 261 | + runtime_ctx.static_SB.clean().reserve(buffer_length); |
| 262 | + PCRE2_SIZE output_length{buffer_length}; |
| 263 | + |
| 264 | + // replace all occurences |
| 265 | + if (limit == std::numeric_limits<uint64_t>::max()) [[likely]] { |
| 266 | + replace_count = pcre2_substitute_8(std::addressof(regex_code), reinterpret_cast<PCRE2_SPTR8>(subject.c_str()), subject.size(), 0, |
| 267 | + replace_options | PCRE2_SUBSTITUTE_GLOBAL, nullptr, regex_state.match_context.get(), |
| 268 | + reinterpret_cast<PCRE2_SPTR8>(replacement.data()), replacement.size(), |
| 269 | + reinterpret_cast<PCRE2_UCHAR8*>(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); |
| 270 | + |
| 271 | + if (replace_count < 0) [[unlikely]] { |
| 272 | + std::array<char, ERROR_BUFFER_LENGTH> buffer{}; |
| 273 | + pcre2_get_error_message_8(replace_count, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size()); |
| 274 | + kphp::log::warning("pcre2_substitute error {}", buffer.data()); |
| 275 | + return std::nullopt; |
| 276 | + } |
| 277 | + } else { // replace only 'limit' times |
| 278 | + size_t match_offset{}; |
| 279 | + size_t substitute_offset{}; |
| 280 | + int64_t replacement_diff_acc{}; |
| 281 | + PCRE2_SIZE length_after_replace{buffer_length}; |
| 282 | + string str_after_replace{subject}; |
| 283 | + |
| 284 | + for (; replace_count < limit; ++replace_count) { |
| 285 | + auto match_view_opt{match({subject.c_str(), subject.size()}, match_offset, match_options)}; |
| 286 | + if (!match_view_opt.has_value()) [[unlikely]] { |
| 287 | + return std::nullopt; |
| 288 | + } |
| 289 | + auto& match_view{*match_view_opt}; |
| 290 | + if (match_view.size() == 0) { |
| 291 | + break; |
| 292 | + } |
| 293 | + |
| 294 | + const auto entire_pattern_match_opt{match_view.get_group({})}; |
| 295 | + if (!entire_pattern_match_opt.has_value()) [[unlikely]] { |
| 296 | + return std::nullopt; |
| 297 | + } |
| 298 | + auto entire_pattern_match{*entire_pattern_match_opt}; |
| 299 | + |
| 300 | + length_after_replace = buffer_length; |
| 301 | + if (auto replace_one_ret_code{pcre2_substitute_8(std::addressof(regex_code), reinterpret_cast<PCRE2_SPTR8>(str_after_replace.c_str()), |
| 302 | + str_after_replace.size(), substitute_offset, replace_options, nullptr, regex_state.match_context.get(), |
| 303 | + reinterpret_cast<PCRE2_SPTR8>(replacement.data()), replacement.size(), |
| 304 | + reinterpret_cast<PCRE2_UCHAR8*>(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; |
| 305 | + replace_one_ret_code != 1) [[unlikely]] { |
| 306 | + kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); |
| 307 | + return std::nullopt; |
| 308 | + } |
| 309 | + |
| 310 | + match_offset = entire_pattern_match.data() - subject.c_str() + entire_pattern_match.size(); |
| 311 | + replacement_diff_acc += replacement.size() - entire_pattern_match.size(); |
| 312 | + substitute_offset = match_offset + replacement_diff_acc; |
| 313 | + str_after_replace = {runtime_ctx.static_SB.buffer(), static_cast<string::size_type>(length_after_replace)}; |
| 314 | + } |
| 315 | + |
| 316 | + output_length = length_after_replace; |
| 317 | + } |
| 318 | + |
| 319 | + if (replace_count > 0) { |
| 320 | + runtime_ctx.static_SB.set_pos(output_length); |
| 321 | + return runtime_ctx.static_SB.str(); |
| 322 | + } |
| 323 | + |
| 324 | + return subject; |
| 325 | +} |
| 326 | + |
| 327 | +uint32_t compiled_regex::groups_count() const noexcept { |
| 328 | + // number of groups including entire match |
| 329 | + uint32_t capture_count{}; |
| 330 | + pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_CAPTURECOUNT, std::addressof(capture_count)); |
| 331 | + return capture_count + 1; // to also count entire match |
| 332 | +} |
| 333 | + |
| 334 | +} // namespace kphp::pcre2 |
0 commit comments