Skip to content

Commit c7627c2

Browse files
authored
[k2] fix preg_replace function (#1464)
* split tests * fix preg_replace function
1 parent 8e93667 commit c7627c2

6 files changed

Lines changed: 541 additions & 392 deletions

File tree

.idea/encodings.xml

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

runtime-light/stdlib/string/regex-functions.cpp

Lines changed: 152 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66

77
#include <algorithm>
88
#include <array>
9+
#include <cctype>
10+
#include <concepts>
911
#include <cstddef>
1012
#include <cstdint>
1113
#include <functional>
@@ -33,6 +35,7 @@ constexpr size_t ERROR_BUFFER_LENGTH = 256;
3335

3436
enum class trailing_unmatch : uint8_t { skip, include };
3537

38+
using backref = std::string_view;
3639
using regex_pcre2_group_names_t = kphp::stl::vector<const char*, kphp::memory::script_allocator>;
3740

3841
struct RegexInfo final {
@@ -102,6 +105,139 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec
102105
return offset;
103106
}
104107

108+
std::optional<backref> try_get_backref(std::string_view preg_replacement) noexcept {
109+
if (preg_replacement.empty() || !std::isdigit(preg_replacement[0])) {
110+
return std::nullopt;
111+
}
112+
113+
if (preg_replacement.size() == 1 || !std::isdigit(preg_replacement[1])) {
114+
return backref{preg_replacement.substr(0, 1)};
115+
}
116+
117+
return backref{preg_replacement.substr(0, 2)};
118+
}
119+
120+
using replacement_term = std::variant<char, backref>;
121+
122+
class preg_replacement_parser {
123+
std::string_view preg_replacement;
124+
125+
replacement_term parse_term_internal() noexcept {
126+
kphp::log::assertion(!preg_replacement.empty());
127+
auto first_char{preg_replacement.front()};
128+
preg_replacement = preg_replacement.substr(1);
129+
if (preg_replacement.empty()) {
130+
return first_char;
131+
}
132+
switch (first_char) {
133+
case '$':
134+
// $1, ${1}
135+
if (preg_replacement.front() == '{') {
136+
return try_get_backref(preg_replacement.substr(1))
137+
.and_then([this](auto value) noexcept -> std::optional<replacement_term> {
138+
auto digits_end_pos = 1 + value.size();
139+
if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') {
140+
preg_replacement = preg_replacement.substr(1 + value.size() + 1);
141+
return value;
142+
}
143+
144+
return std::nullopt;
145+
})
146+
.value_or('$');
147+
}
148+
149+
return try_get_backref(preg_replacement)
150+
.transform([this](auto value) noexcept -> replacement_term {
151+
auto digits_end_pos = value.size();
152+
preg_replacement = preg_replacement.substr(digits_end_pos);
153+
return value;
154+
})
155+
.value_or('$');
156+
157+
case '\\': {
158+
// \1
159+
auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term {
160+
auto digits_end_pos = value.size();
161+
preg_replacement = preg_replacement.substr(digits_end_pos);
162+
return value;
163+
})};
164+
if (back_reference_opt.has_value()) {
165+
return *back_reference_opt;
166+
} else {
167+
auto res{preg_replacement.front()};
168+
if (res == '$' || res == '\\') {
169+
preg_replacement = preg_replacement.substr(1);
170+
return res;
171+
}
172+
return '\\';
173+
}
174+
}
175+
default:
176+
return first_char;
177+
}
178+
}
179+
180+
public:
181+
explicit preg_replacement_parser(std::string_view preg_replacement) noexcept
182+
: preg_replacement{preg_replacement} {}
183+
184+
struct iterator {
185+
preg_replacement_parser* parser{nullptr};
186+
replacement_term current_term{'\0'};
187+
188+
using difference_type = std::ptrdiff_t;
189+
using value_type = replacement_term;
190+
using reference = const replacement_term&;
191+
using pointer = const replacement_term*;
192+
using iterator_category = std::input_iterator_tag;
193+
194+
iterator() noexcept = default;
195+
explicit iterator(preg_replacement_parser* p) noexcept
196+
: parser{p} {
197+
if (parser->preg_replacement.empty()) {
198+
parser = nullptr;
199+
} else {
200+
current_term = parser->parse_term_internal();
201+
}
202+
}
203+
204+
reference operator*() const noexcept {
205+
return current_term;
206+
}
207+
pointer operator->() const noexcept {
208+
return std::addressof(current_term);
209+
}
210+
211+
iterator& operator++() noexcept {
212+
if (!parser->preg_replacement.empty()) {
213+
current_term = parser->parse_term_internal();
214+
} else {
215+
parser = nullptr;
216+
}
217+
return *this;
218+
}
219+
iterator operator++(int) noexcept {
220+
iterator temp = *this;
221+
++(*this);
222+
return temp;
223+
}
224+
225+
friend bool operator==(const iterator& a, const iterator& b) noexcept {
226+
return a.parser == b.parser;
227+
}
228+
friend bool operator!=(const iterator& a, const iterator& b) noexcept {
229+
return !(a == b);
230+
}
231+
};
232+
233+
iterator begin() noexcept {
234+
return iterator{this};
235+
}
236+
iterator end() noexcept {
237+
return iterator{};
238+
}
239+
};
240+
105241
bool parse_regex(RegexInfo& regex_info) noexcept {
106242
if (regex_info.regex.empty()) {
107243
kphp::log::warning("empty regex");
@@ -591,20 +727,23 @@ Optional<string> f$preg_replace(const string& pattern, const string& replacement
591727
return {};
592728
}
593729

594-
string pcre2_replacement{replacement};
595-
{ // we need to replace PHP's back references with PCRE2 ones
596-
static constexpr std::string_view backreference_pattern = R"(/\\(\d)/)";
597-
static constexpr std::string_view backreference_replacement = "$$$1";
598-
599-
RegexInfo regex_info{backreference_pattern, {replacement.c_str(), replacement.size()}, backreference_replacement};
600-
bool success{parse_regex(regex_info)};
601-
success &= compile_regex(regex_info);
602-
success &= replace_regex(regex_info, std::numeric_limits<uint64_t>::max());
603-
if (!success) [[unlikely]] {
604-
kphp::log::warning("can't replace PHP back references with PCRE2 ones");
605-
return {};
730+
// we need to replace PHP's back references with PCRE2 ones
731+
auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}};
732+
kphp::stl::string<kphp::memory::script_allocator> pcre2_replacement{};
733+
for (const auto& term : parser) {
734+
if (std::holds_alternative<char>(term)) {
735+
auto c{std::get<char>(term)};
736+
pcre2_replacement.push_back(c);
737+
if (c == '$') {
738+
pcre2_replacement.push_back('$');
739+
}
740+
} else {
741+
auto backreference{std::get<backref>(term)};
742+
pcre2_replacement.reserve(pcre2_replacement.size() + backreference.size() + 3);
743+
pcre2_replacement.append("${");
744+
pcre2_replacement.append(backreference);
745+
pcre2_replacement.append("}");
606746
}
607-
pcre2_replacement = regex_info.opt_replace_result.has_value() ? *std::move(regex_info.opt_replace_result) : replacement;
608747
}
609748

610749
RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}};
Lines changed: 82 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,82 @@
1+
@ok callback benchmark k2_skip
2+
<?php
3+
4+
define('RE_URL_PATTERN', '(?<![A-Za-z\$0-9À-ßà-ÿ¸¨\-\_])(https?:\/\/)?((?:[A-Za-z\$0-9À-ßà-ÿ¸¨](?:[A-Za-z\$0-9\-\_À-ßà-ÿ¸¨]*[A-Za-z\$0-9À-ßà-ÿ¸¨])?\.){1,5}[A-Za-z\$ðôóêÐÔÓÊ\-\d]{2,22}(?::\d{2,5})?)((?:\/(?:(?:\&amp;|\&#33;|,[_%]|[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.~=;:]+|\[[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\]|\([A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\))*(?:,[_%]|[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿\-\_#%?+\/\$.~=;:]*[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\_#%?+\/\$~=]|\[[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\]|\([A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\)))?)?)');
5+
6+
$text = 'ß ñëûøàë, ÷òî â iOS 7 ïîÿâèëèñü ëîêàëüíûå ïóø-óâåäîìëåíèÿ. Íî òóò http://blog.derand.net/2010/08/local-notifications-ios-40.html óòâåðæäàåòñÿ, ÷òî åùå â ÷åòâåðòîé.';
7+
$text = preg_replace_callback('/'.RE_URL_PATTERN.'/', 'prcConvertHyperref', $text);
8+
9+
/**
10+
* @kphp-required
11+
* @param string[] $matches
12+
* @return string
13+
*/
14+
function prcConvertHyperref($matches) {
  // the result of preg_match (not the captured text) is what gets substituted, as a string
  $match_result = preg_match('/\.([a-zA-ZðôóêÐÔÓÊ\-0-9]+)$/', $matches[2], $match);
  return (string)$match_result;
}
17+
18+
19+
/**
20+
* @kphp-required
21+
* @param string[] $param
22+
* @return string
23+
*/
24+
function cb($param) {
  // dump the received matches, then always substitute the same fixed text
  $substitute = "yes!";
  var_dump($param);
  return $substitute;
}
28+
29+
30+
$input = "plain [indent] deep [indent] [abcd]deeper[/abcd] [/indent] deep [/indent] plain";
31+
32+
/**
33+
* @param mixed $input
34+
* @return string
35+
*/
36+
function parseTagsRecursive($input)
{
  global $count;
  var_dump ($input);

  // when invoked as a callback, $input is the array of matches: wrap the captured body
  if (is_array($input)) {
    $input = '<div style="margin-left: 10px">'.$input[1].'</div>';
  }

  // the function passes itself as the callback for every [indent]...[/indent] match
  $pattern = '#\[indent]((?:[^[]|\[(?!/?indent])|(?R))+)\[/indent]#';
  $replaced = preg_replace_callback($pattern, 'parseTagsRecursive', $input, -1, $count);
  var_dump ($count);

  return (string)$replaced;
}
52+
53+
$output = parseTagsRecursive($input);
54+
55+
echo $output, "\n";
56+
57+
58+
/**
59+
* @kphp-required
60+
* @param string[] $x
61+
* @return string
62+
*/
63+
function g($x) {
  // wrap the whole match in single quotes
  return "'" . $x[0] . "'";
}
66+
67+
var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb')));
68+
69+
@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz'))));
70+
71+
/**
72+
* @kphp-required
73+
* @param string[] $m
74+
* @return string
75+
*/
76+
function tmp($m) {
  // lower-case the whole match
  $whole_match = $m[0];
  return strtolower($whole_match);
}
79+
80+
var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC'));
81+
82+
var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde'));

0 commit comments

Comments
 (0)