Skip to content

Commit 375e746

Browse files
committed
squash
1 parent 5e5feb5 commit 375e746

10 files changed

Lines changed: 655 additions & 453 deletions

File tree

builtin-functions/kphp-light/stdlib/regex-functions.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,10 @@ function preg_replace_callback(
3939
&$replace_count ::: int = TODO,
4040
$flags ::: int = 0): string | ^3 | null;
4141

42+
function preg_split ($pattern ::: string, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false;
43+
4244
// ===== UNSUPPORTED =====
4345

4446
/** @kphp-extern-func-info stub generation-required */
4547
function preg_last_error() ::: int;
4648

47-
/** @kphp-extern-func-info stub */
48-
function preg_split ($pattern ::: regexp, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false;
49-

runtime-light/stdlib/stdlib.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ prepend(
3333
rpc/rpc-tl-request.cpp
3434
serialization/serialization-state.cpp
3535
server/http-functions.cpp
36+
string/pcre2-functions.cpp
3637
string/regex-functions.cpp
3738
string/regex-state.cpp
3839
string/string-state.cpp
Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
// Compiler for PHP (aka KPHP)
2+
// Copyright (c) 2024 LLC «V Kontakte»
3+
// Distributed under the GPL v3 License, see LICENSE.notice.txt
4+
5+
#include "runtime-light/stdlib/string/pcre2-functions.h"
6+
7+
#include <cstddef>
8+
#include <cstdint>
9+
#include <iterator>
10+
#include <memory>
11+
#include <optional>
12+
#include <string_view>
13+
14+
#include "runtime-common/core/runtime-core.h"
15+
#include "runtime-common/stdlib/string/mbstring-functions.h"
16+
#include "runtime-light/stdlib/diagnostics/logs.h"
17+
#include "runtime-light/stdlib/string/regex-state.h"
18+
19+
namespace kphp::pcre2 {
20+
21+
namespace {

// Size, in bytes, of the stack buffers handed to pcre2_get_error_message_8()
constexpr size_t ERROR_BUFFER_LENGTH = 256;

} // namespace
26+
27+
std::optional<std::string_view> match_view::get_group(size_t i) const noexcept {
28+
kphp::log::assertion(i >= 0 && i < m_num_groups && m_ovector_ptr);
29+
// ovector is an array of offset pairs
30+
PCRE2_SIZE start{m_ovector_ptr[2 * i]};
31+
PCRE2_SIZE end{m_ovector_ptr[2 * i + 1]};
32+
33+
if (start == PCRE2_UNSET) {
34+
return std::nullopt;
35+
}
36+
37+
return m_subject_data.substr(start, end - start);
38+
}
39+
40+
// Compiles a PHP-style regex of the form `<delim>pattern<delim>modifiers` into PCRE2 code.
//
// The result is looked up in, and on success stored into, the per-instance regex cache
// held by RegexInstanceState, so repeated calls with the same `regex` reuse the compiled code.
//
// Returns nullptr (after logging a warning where applicable) when:
//  - the instance has no PCRE2 compile context;
//  - `regex` is empty, starts with an unsupported delimiter or has no closing delimiter;
//  - the `u` modifier is given and `regex` is not valid UTF-8;
//  - pcre2_compile_8() rejects the pattern.
// Otherwise returns whatever RegexInstanceState::add_compiled_regex() returns.
const compiled_regex* compiled_regex::compile(const string& regex) noexcept {
  auto& regex_state{RegexInstanceState::get()};
  if (!regex_state.compile_context) [[unlikely]] {
    return nullptr;
  }

  // check runtime cache
  if (auto* compiled_regex{regex_state.get_compiled_regex(regex)}; compiled_regex != nullptr) {
    return compiled_regex;
  }
  if (regex.empty()) {
    kphp::log::warning("empty regex");
    return nullptr;
  }

  // bracket-like opening delimiters are closed by their counterpart,
  // every other accepted delimiter closes itself
  char end_delim{};
  switch (const char start_delim{regex[0]}; start_delim) {
  case '(': {
    end_delim = ')';
    break;
  }
  case '[': {
    end_delim = ']';
    break;
  }
  case '{': {
    end_delim = '}';
    break;
  }
  case '<': {
    end_delim = '>';
    break;
  }
  case '>':
  case '!' ... '\'':
  case '*' ... '/':
  case ':':
  case ';':
  case '=':
  case '?':
  case '@':
  case '^':
  case '_':
  case '`':
  case '|':
  case '~': {
    end_delim = start_delim;
    break;
  }
  default: {
    kphp::log::warning("wrong regex delimiter {}", start_delim);
    return nullptr;
  }
  }

  uint32_t compile_options{};
  // non-null-terminated regex without delimiters and PCRE modifiers
  //
  // regex      ->  ~pattern~im\0
  // regex_body ->   pattern
  std::string_view regex_body = {regex.c_str(), regex.size()};

  // remove start delimiter
  regex_body.remove_prefix(1);
  // parse compile options (scanning backwards) and drop all symbols until the end delimiter
  for (; !regex_body.empty() && regex_body.back() != end_delim; regex_body.remove_suffix(1)) {
    // spaces and newlines are ignored
    if (regex_body.back() == ' ' || regex_body.back() == '\n') {
      continue;
    }

    switch (regex_body.back()) {
    case 'i': {
      compile_options |= PCRE2_CASELESS;
      break;
    }
    case 'm': {
      compile_options |= PCRE2_MULTILINE;
      break;
    }
    case 's': {
      compile_options |= PCRE2_DOTALL;
      break;
    }
    case 'x': {
      compile_options |= PCRE2_EXTENDED;
      break;
    }
    case 'A': {
      compile_options |= PCRE2_ANCHORED;
      break;
    }
    case 'D': {
      compile_options |= PCRE2_DOLLAR_ENDONLY;
      break;
    }
    case 'U': {
      compile_options |= PCRE2_UNGREEDY;
      break;
    }
    case 'X': {
      // PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL is an *extra* option (see pcre2_set_compile_extra_options),
      // not a pcre2_compile() option bit: OR-ing it into compile_options would silently enable the
      // unrelated PCRE2_ALT_BSUX flag that shares its numeric value. PCRE2 already treats unknown
      // escapes as errors unless that extra option is enabled, so the modifier needs no option bit.
      // NOTE(review): assumes the shared compile context does not enable
      // PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL -- confirm against RegexInstanceState.
      break;
    }
    case 'J': {
      // allow duplicate group names; PCRE2_INFO_JCHANGED is a pcre2_pattern_info() request code
      // and must not be used as a compile option
      compile_options |= PCRE2_DUPNAMES;
      break;
    }
    case 'u': {
      compile_options |= PCRE2_UTF | PCRE2_UCP;
      break;
    }
    default: {
      kphp::log::warning("unsupported regex modifier {}", regex_body.back());
      break;
    }
    }
  }

  // the loop above consumed everything without meeting the end delimiter
  if (regex_body.empty()) {
    kphp::log::warning("no ending regex delimiter: {}", regex.c_str());
    return nullptr;
  }
  // UTF-8 validation
  if (static_cast<bool>(compile_options & PCRE2_UTF) && !mb_UTF8_check(regex.c_str())) [[unlikely]] {
    kphp::log::warning("invalid UTF-8 pattern: {}", regex.c_str());
    return nullptr;
  }

  // remove the end delimiter
  regex_body.remove_suffix(1);

  // compile pcre2_code
  int32_t error_number{};
  PCRE2_SIZE error_offset{};
  regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast<PCRE2_SPTR8>(regex_body.data()), regex_body.size(), compile_options,
                                                std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get())};
  if (!regex_code) [[unlikely]] {
    std::array<char, ERROR_BUFFER_LENGTH> buffer{};
    pcre2_get_error_message_8(error_number, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size());
    kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data());
    return nullptr;
  }

  return regex_state.add_compiled_regex(regex, {compile_options, *regex_code});
}
186+
187+
// Builds a container with one slot per capture group (slot 0 is the entire match)
// and stores the name of every named group in the slot of its group number.
// Slots of unnamed groups keep the value produced by resize().
group_names_t compiled_regex::collect_group_names() const noexcept {
  group_names_t names_by_group;
  names_by_group.resize(groups_count());

  uint32_t named_total{};
  pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMECOUNT, std::addressof(named_total));
  if (named_total == 0) {
    // the pattern declares no named groups, nothing to fill in
    return names_by_group;
  }

  uint32_t entry_size{};
  PCRE2_SPTR8 table{};
  pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMETABLE, std::addressof(table));
  pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMEENTRYSIZE, std::addressof(entry_size));

  // every table entry is `entry_size` bytes long: two bytes holding the group number
  // (most significant byte first) followed by the group name
  PCRE2_SPTR8 cursor{table};
  for (uint32_t left{named_total}; left > 0; --left) {
    const auto number{static_cast<uint16_t>((cursor[0] << 8) | cursor[1])};
    names_by_group[number] = reinterpret_cast<const char*>(std::next(cursor, 2));
    std::advance(cursor, entry_size);
  }

  return names_by_group;
}
215+
216+
std::optional<match_view> compiled_regex::match(std::string_view subject, size_t offset, uint32_t match_options) const noexcept {
217+
const auto& regex_state{RegexInstanceState::get()};
218+
if (!regex_state.match_context) [[unlikely]] {
219+
return std::nullopt;
220+
}
221+
222+
auto* match_data = regex_state.regex_pcre2_match_data.get();
223+
224+
int32_t match_count{pcre2_match_8(std::addressof(regex_code), reinterpret_cast<PCRE2_SPTR8>(subject.data()), subject.size(), offset, match_options,
225+
match_data, regex_state.match_context.get())};
226+
// From https://www.pcre.org/current/doc/html/pcre2_match.html
227+
// The return from pcre2_match() is one more than the highest numbered capturing pair that has been set
228+
// (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors.
229+
if (match_count < 0 && match_count != PCRE2_ERROR_NOMATCH) [[unlikely]] {
230+
std::array<char, ERROR_BUFFER_LENGTH> buffer{};
231+
pcre2_get_error_message_8(match_count, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size());
232+
kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data());
233+
return std::nullopt;
234+
}
235+
return match_view{subject, pcre2_get_ovector_pointer_8(match_data), match_count != PCRE2_ERROR_NOMATCH ? match_count : 0};
236+
}
237+
238+
// Number of named capture groups, as reported by PCRE2_INFO_NAMECOUNT.
uint32_t compiled_regex::named_groups_count() const noexcept {
  uint32_t named_total{0};
  pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_NAMECOUNT, std::addressof(named_total));
  return named_total;
}
244+
245+
std::optional<string> compiled_regex::replace(const string& subject, uint32_t replace_options, std::string_view replacement, uint32_t match_options,
246+
uint64_t limit, int64_t& replace_count) const noexcept {
247+
replace_count = 0;
248+
249+
const auto& regex_state{RegexInstanceState::get()};
250+
auto& runtime_ctx{RuntimeContext::get()};
251+
if (!regex_state.match_context) [[unlikely]] {
252+
return std::nullopt;
253+
}
254+
255+
if (!validate({subject.c_str(), subject.size()})) [[unlikely]] {
256+
return std::nullopt;
257+
}
258+
259+
const PCRE2_SIZE buffer_length{std::max(
260+
{static_cast<string::size_type>(subject.size()), static_cast<string::size_type>(RegexInstanceState::REPLACE_BUFFER_SIZE), runtime_ctx.static_SB.size()})};
261+
runtime_ctx.static_SB.clean().reserve(buffer_length);
262+
PCRE2_SIZE output_length{buffer_length};
263+
264+
// replace all occurences
265+
if (limit == std::numeric_limits<uint64_t>::max()) [[likely]] {
266+
replace_count = pcre2_substitute_8(std::addressof(regex_code), reinterpret_cast<PCRE2_SPTR8>(subject.c_str()), subject.size(), 0,
267+
replace_options | PCRE2_SUBSTITUTE_GLOBAL, nullptr, regex_state.match_context.get(),
268+
reinterpret_cast<PCRE2_SPTR8>(replacement.data()), replacement.size(),
269+
reinterpret_cast<PCRE2_UCHAR8*>(runtime_ctx.static_SB.buffer()), std::addressof(output_length));
270+
271+
if (replace_count < 0) [[unlikely]] {
272+
std::array<char, ERROR_BUFFER_LENGTH> buffer{};
273+
pcre2_get_error_message_8(replace_count, reinterpret_cast<PCRE2_UCHAR8*>(buffer.data()), buffer.size());
274+
kphp::log::warning("pcre2_substitute error {}", buffer.data());
275+
return std::nullopt;
276+
}
277+
} else { // replace only 'limit' times
278+
size_t match_offset{};
279+
size_t substitute_offset{};
280+
int64_t replacement_diff_acc{};
281+
PCRE2_SIZE length_after_replace{buffer_length};
282+
string str_after_replace{subject};
283+
284+
for (; replace_count < limit; ++replace_count) {
285+
auto match_view_opt{match({subject.c_str(), subject.size()}, match_offset, match_options)};
286+
if (!match_view_opt.has_value()) [[unlikely]] {
287+
return std::nullopt;
288+
}
289+
auto& match_view{*match_view_opt};
290+
if (match_view.size() == 0) {
291+
break;
292+
}
293+
294+
const auto entire_pattern_match_opt{match_view.get_group({})};
295+
if (!entire_pattern_match_opt.has_value()) [[unlikely]] {
296+
return std::nullopt;
297+
}
298+
auto entire_pattern_match{*entire_pattern_match_opt};
299+
300+
length_after_replace = buffer_length;
301+
if (auto replace_one_ret_code{pcre2_substitute_8(std::addressof(regex_code), reinterpret_cast<PCRE2_SPTR8>(str_after_replace.c_str()),
302+
str_after_replace.size(), substitute_offset, replace_options, nullptr, regex_state.match_context.get(),
303+
reinterpret_cast<PCRE2_SPTR8>(replacement.data()), replacement.size(),
304+
reinterpret_cast<PCRE2_UCHAR8*>(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))};
305+
replace_one_ret_code != 1) [[unlikely]] {
306+
kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code);
307+
return std::nullopt;
308+
}
309+
310+
match_offset = entire_pattern_match.data() - subject.c_str() + entire_pattern_match.size();
311+
replacement_diff_acc += replacement.size() - entire_pattern_match.size();
312+
substitute_offset = match_offset + replacement_diff_acc;
313+
str_after_replace = {runtime_ctx.static_SB.buffer(), static_cast<string::size_type>(length_after_replace)};
314+
}
315+
316+
output_length = length_after_replace;
317+
}
318+
319+
if (replace_count > 0) {
320+
runtime_ctx.static_SB.set_pos(output_length);
321+
return runtime_ctx.static_SB.str();
322+
}
323+
324+
return subject;
325+
}
326+
327+
// Total number of groups of the pattern: the PCRE2_INFO_CAPTURECOUNT value plus one
// for the entire match.
uint32_t compiled_regex::groups_count() const noexcept {
  uint32_t captures{0};
  pcre2_pattern_info_8(std::addressof(regex_code), PCRE2_INFO_CAPTURECOUNT, std::addressof(captures));
  // the extra one accounts for the entire match
  return 1 + captures;
}
333+
334+
} // namespace kphp::pcre2

0 commit comments

Comments
 (0)