Skip to content

Commit aa70f2e

Browse files
committed
fix preg_replace function
1 parent f24b2bb commit aa70f2e

6 files changed

Lines changed: 512 additions & 393 deletions

File tree

.idea/encodings.xml

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

runtime-light/stdlib/string/regex-functions.cpp

Lines changed: 123 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <algorithm>
88
#include <array>
9+
#include <concepts>
910
#include <cstddef>
1011
#include <cstdint>
1112
#include <functional>
@@ -71,6 +72,10 @@ struct RegexInfo final {
7172
replacement(replacement_) {}
7273
};
7374

75+
// A numbered capture-group reference extracted from a replacement string.
struct backref {
  // The group number exactly as it was written, kept as a non-owning view over
  // the digit characters of the replacement string it was parsed from.
  std::string_view digits;
};
78+
7479
template<typename... Args>
7580
requires((std::is_same_v<Args, int64_t> && ...) && sizeof...(Args) > 0)
7681
bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept {
@@ -102,6 +107,118 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec
102107
return offset;
103108
}
104109

110+
std::optional<backref> try_get_backref(std::string_view preg_replacement) noexcept {
111+
if (preg_replacement.empty() || preg_replacement[0] < '0' || preg_replacement[0] > '9') {
112+
return std::nullopt;
113+
}
114+
115+
if (preg_replacement.size() == 1 || preg_replacement[1] < '0' || preg_replacement[1] > '9') {
116+
return backref{preg_replacement.substr(0, 1)};
117+
}
118+
119+
return backref{preg_replacement.substr(0, 2)};
120+
}
121+
122+
template<typename T, typename F>
123+
requires std::convertible_to<std::invoke_result_t<F>, T>
124+
auto value_or_else(std::optional<T>&& opt, F&& alternative_func) noexcept -> T {
125+
if (opt.has_value()) {
126+
return std::move(*std::move(opt));
127+
} else {
128+
return std::forward<F>(alternative_func)();
129+
}
130+
}
131+
132+
// One parsed unit of a replacement string: either a single character or a capture-group reference.
using replacement_term = std::variant<char, backref>;
133+
134+
class preg_replacement_unescaper {
135+
std::string_view preg_replacement;
136+
137+
public:
138+
preg_replacement_unescaper(std::string_view preg_replacement)
139+
: preg_replacement{preg_replacement} {}
140+
141+
bool has_next() const noexcept {
142+
return !preg_replacement.empty();
143+
}
144+
145+
replacement_term unescape_term() noexcept {
146+
auto first_char{preg_replacement.front()};
147+
preg_replacement = preg_replacement.substr(1);
148+
if (preg_replacement.empty()) {
149+
return first_char;
150+
}
151+
switch (first_char) {
152+
case '$':
153+
if (preg_replacement.front() == '{') {
154+
return try_get_backref(preg_replacement.substr(1))
155+
.and_then([this](auto value) noexcept -> std::optional<replacement_term> {
156+
auto digits_end_pos = 1 + value.digits.size();
157+
if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') {
158+
preg_replacement = preg_replacement.substr(1 + value.digits.size() + 1);
159+
return value;
160+
}
161+
162+
return std::nullopt;
163+
})
164+
.value_or('$');
165+
}
166+
167+
return try_get_backref(preg_replacement)
168+
.transform([this](auto value) noexcept -> replacement_term {
169+
auto digits_end_pos = value.digits.size();
170+
preg_replacement = preg_replacement.substr(digits_end_pos);
171+
return value;
172+
})
173+
.value_or('$');
174+
175+
case '\\':
176+
return value_or_else(try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term {
177+
auto digits_end_pos = value.digits.size();
178+
preg_replacement = preg_replacement.substr(digits_end_pos);
179+
return value;
180+
}),
181+
[this] noexcept {
182+
auto res{preg_replacement.front()};
183+
if (res == '$' || res == '\\') {
184+
preg_replacement = preg_replacement.substr(1);
185+
return res;
186+
}
187+
return '\\';
188+
});
189+
default:
190+
return first_char;
191+
}
192+
}
193+
};
194+
195+
class pcre2_replacement_escaper {
196+
kphp::stl::string<kphp::memory::script_allocator> pcre2_replacement{};
197+
198+
public:
199+
void operator()(char c) noexcept {
200+
pcre2_replacement.push_back(c);
201+
if (c == '$') {
202+
pcre2_replacement.push_back('$');
203+
}
204+
}
205+
206+
void operator()(backref backreference) noexcept {
207+
pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3);
208+
pcre2_replacement.append("${");
209+
pcre2_replacement.append(backreference.digits);
210+
pcre2_replacement.append("}");
211+
}
212+
213+
void escape_term(const replacement_term& term) noexcept {
214+
std::visit(*this, term);
215+
}
216+
217+
kphp::stl::string<kphp::memory::script_allocator>& result() noexcept {
218+
return pcre2_replacement;
219+
}
220+
};
221+
105222
bool parse_regex(RegexInfo& regex_info) noexcept {
106223
if (regex_info.regex.empty()) {
107224
kphp::log::warning("empty regex");
@@ -591,21 +708,13 @@ Optional<string> f$preg_replace(const string& pattern, const string& replacement
591708
return {};
592709
}
593710

594-
string pcre2_replacement{replacement};
595-
{ // we need to replace PHP's back references with PCRE2 ones
596-
static constexpr std::string_view backreference_pattern = R"(/\\(\d)/)";
597-
static constexpr std::string_view backreference_replacement = "$$$1";
598-
599-
RegexInfo regex_info{backreference_pattern, {replacement.c_str(), replacement.size()}, backreference_replacement};
600-
bool success{parse_regex(regex_info)};
601-
success &= compile_regex(regex_info);
602-
success &= replace_regex(regex_info, std::numeric_limits<uint64_t>::max());
603-
if (!success) [[unlikely]] {
604-
kphp::log::warning("can't replace PHP back references with PCRE2 ones");
605-
return {};
606-
}
607-
pcre2_replacement = regex_info.opt_replace_result.has_value() ? *std::move(regex_info.opt_replace_result) : replacement;
711+
// we need to replace PHP's back references with PCRE2 ones
712+
auto unescaper{preg_replacement_unescaper{{replacement.c_str(), replacement.size()}}};
713+
pcre2_replacement_escaper escaper{};
714+
while (unescaper.has_next()) {
715+
escaper.escape_term(unescaper.unescape_term());
608716
}
717+
auto& pcre2_replacement{escaper.result()};
609718

610719
RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}};
611720

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
@ok callback benchmark k2_skip
2+
<?php
3+
4+
define('RE_URL_PATTERN', '(?<![A-Za-z\$0-9À-ßà-ÿ¸¨\-\_])(https?:\/\/)?((?:[A-Za-z\$0-9À-ßà-ÿ¸¨](?:[A-Za-z\$0-9\-\_À-ßà-ÿ¸¨]*[A-Za-z\$0-9À-ßà-ÿ¸¨])?\.){1,5}[A-Za-z\$ðôóêÐÔÓÊ\-\d]{2,22}(?::\d{2,5})?)((?:\/(?:(?:\&amp;|\&#33;|,[_%]|[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.~=;:]+|\[[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\]|\([A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\))*(?:,[_%]|[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿\-\_#%?+\/\$.~=;:]*[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\_#%?+\/\$~=]|\[[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\]|\([A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\)))?)?)');
5+
6+
$text = 'ß ñëûøàë, ÷òî â iOS 7 ïîÿâèëèñü ëîêàëüíûå ïóø-óâåäîìëåíèÿ. Íî òóò http://blog.derand.net/2010/08/local-notifications-ios-40.html óòâåðæäàåòñÿ, ÷òî åùå â ÷åòâåðòîé.';
7+
$text = preg_replace_callback('/'.RE_URL_PATTERN.'/', 'prcConvertHyperref', $text);
8+
9+
/**
10+
* @kphp-required
11+
* @param string[] $matches
12+
* @return string
13+
*/
14+
function prcConvertHyperref($matches) {
  // the callback result is the textual outcome of preg_match on capture group 2
  $match_result = preg_match('/\.([a-zA-ZðôóêÐÔÓÊ\-0-9]+)$/', $matches[2], $match);
  return (string)$match_result;
}
17+
18+
19+
/**
20+
* @kphp-required
21+
* @param string[] $param
22+
* @return string
23+
*/
24+
function cb($param) {
  // dump the matches first, then hand back a fixed replacement
  var_dump($param);
  $replacement = "yes!";
  return $replacement;
}
28+
29+
30+
$input = "plain [indent] deep [indent] [abcd]deeper[/abcd] [/indent] deep [/indent] plain";
31+
32+
/**
33+
* @param mixed $input
34+
* @return string
35+
*/
36+
function parseTagsRecursive($input)
{
  global $count;

  var_dump ($input);

  // when invoked as a callback the argument is the match array: wrap capture group 1
  $subject = is_array($input)
    ? '<div style="margin-left: 10px">'.$input[1].'</div>'
    : $input;

  $replaced = preg_replace_callback(
    '#\[indent]((?:[^[]|\[(?!/?indent])|(?R))+)\[/indent]#',
    'parseTagsRecursive',
    $subject,
    -1,
    $count
  );
  var_dump ($count);

  return (string)$replaced;
}
52+
53+
$output = parseTagsRecursive($input);
54+
55+
echo $output, "\n";
56+
57+
58+
/**
59+
* @kphp-required
60+
* @param string[] $x
61+
* @return string
62+
*/
63+
function g($x) {
  // wrap the whole match in single quotes
  $whole_match = $x[0];
  return "'{$whole_match}'";
}
66+
67+
var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb')));
68+
69+
@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz'))));
70+
71+
/**
72+
* @kphp-required
73+
* @param string[] $m
74+
* @return string
75+
*/
76+
function tmp($m) {
  // lower-case the whole match
  $whole_match = $m[0];
  return strtolower($whole_match);
}
79+
80+
var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC'));
81+
82+
var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde'));

0 commit comments

Comments
 (0)