Skip to content
4 changes: 3 additions & 1 deletion .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

165 changes: 152 additions & 13 deletions runtime-light/stdlib/string/regex-functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include <algorithm>
#include <array>
#include <cctype>
#include <concepts>
#include <cstddef>
#include <cstdint>
#include <functional>
Expand Down Expand Up @@ -33,6 +35,7 @@ constexpr size_t ERROR_BUFFER_LENGTH = 256;

enum class trailing_unmatch : uint8_t { skip, include };

using backref = std::string_view;
using regex_pcre2_group_names_t = kphp::stl::vector<const char*, kphp::memory::script_allocator>;

struct RegexInfo final {
Expand Down Expand Up @@ -102,6 +105,139 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec
return offset;
}

// Extracts the group number that starts `preg_replacement`, if there is one.
//
// Returns a view over the leading one or two ASCII digits of the input (never more than two;
// whatever follows is left for the caller), or std::nullopt when the input is empty or does not
// start with a digit. The returned view aliases the caller's buffer, nothing is copied.
std::optional<backref> try_get_backref(std::string_view preg_replacement) noexcept {
  // Deliberately not std::isdigit: replacement strings may contain bytes >= 0x80, which are negative
  // when char is signed, and passing a negative value to std::isdigit is undefined behaviour.
  // A plain range check is also independent of the current C locale.
  constexpr auto is_ascii_digit{[](char c) noexcept { return c >= '0' && c <= '9'; }};

  if (preg_replacement.empty() || !is_ascii_digit(preg_replacement.front())) {
    return std::nullopt;
  }

  const bool has_second_digit{preg_replacement.size() > 1 && is_ascii_digit(preg_replacement[1])};
  return backref{preg_replacement.substr(0, has_second_digit ? 2 : 1)};
}

using replacement_term = std::variant<char, backref>;

class preg_replacement_parser {
std::string_view preg_replacement;

replacement_term parse_term_internal() noexcept {
kphp::log::assertion(!preg_replacement.empty());
auto first_char{preg_replacement.front()};
preg_replacement = preg_replacement.substr(1);
if (preg_replacement.empty()) {
return first_char;
}
switch (first_char) {
case '$':
// $1, ${1}
if (preg_replacement.front() == '{') {
return try_get_backref(preg_replacement.substr(1))
.and_then([this](auto value) noexcept -> std::optional<replacement_term> {
auto digits_end_pos = 1 + value.size();
if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') {
preg_replacement = preg_replacement.substr(1 + value.size() + 1);
return value;
}

return std::nullopt;
})
.value_or('$');
}

return try_get_backref(preg_replacement)
.transform([this](auto value) noexcept -> replacement_term {
auto digits_end_pos = value.size();
preg_replacement = preg_replacement.substr(digits_end_pos);
return value;
})
.value_or('$');

case '\\': {
// \1
auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term {
auto digits_end_pos = value.size();
preg_replacement = preg_replacement.substr(digits_end_pos);
return value;
})};
if (back_reference_opt.has_value()) {
return *back_reference_opt;
} else {
auto res{preg_replacement.front()};
if (res == '$' || res == '\\') {
preg_replacement = preg_replacement.substr(1);
return res;
}
return '\\';
}
}
default:
return first_char;
}
}

public:
explicit preg_replacement_parser(std::string_view preg_replacement) noexcept
: preg_replacement{preg_replacement} {}

struct iterator {
preg_replacement_parser* parser{nullptr};
replacement_term current_term{'\0'};

using difference_type = std::ptrdiff_t;
using value_type = replacement_term;
using reference = const replacement_term&;
using pointer = const replacement_term*;
using iterator_category = std::input_iterator_tag;

iterator() noexcept = default;
explicit iterator(preg_replacement_parser* p) noexcept
: parser{p} {
if (parser->preg_replacement.empty()) {
parser = nullptr;
} else {
current_term = parser->parse_term_internal();
}
}

reference operator*() const noexcept {
return current_term;
}
pointer operator->() const noexcept {
return std::addressof(current_term);
}

iterator& operator++() noexcept {
if (!parser->preg_replacement.empty()) {
current_term = parser->parse_term_internal();
} else {
parser = nullptr;
}
return *this;
}
iterator operator++(int) noexcept {
iterator temp = *this;
++(*this);
return temp;
}

friend bool operator==(const iterator& a, const iterator& b) noexcept {
return a.parser == b.parser;
}
friend bool operator!=(const iterator& a, const iterator& b) noexcept {
return !(a == b);
}
};

iterator begin() noexcept {
return iterator{this};
}
iterator end() noexcept {
return iterator{};
}
};

bool parse_regex(RegexInfo& regex_info) noexcept {
if (regex_info.regex.empty()) {
kphp::log::warning("empty regex");
Expand Down Expand Up @@ -591,20 +727,23 @@ Optional<string> f$preg_replace(const string& pattern, const string& replacement
return {};
}

string pcre2_replacement{replacement};
{ // we need to replace PHP's back references with PCRE2 ones
static constexpr std::string_view backreference_pattern = R"(/\\(\d)/)";
static constexpr std::string_view backreference_replacement = "$$$1";

RegexInfo regex_info{backreference_pattern, {replacement.c_str(), replacement.size()}, backreference_replacement};
bool success{parse_regex(regex_info)};
success &= compile_regex(regex_info);
success &= replace_regex(regex_info, std::numeric_limits<uint64_t>::max());
if (!success) [[unlikely]] {
kphp::log::warning("can't replace PHP back references with PCRE2 ones");
return {};
// we need to replace PHP's back references with PCRE2 ones
auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}};
kphp::stl::string<kphp::memory::script_allocator> pcre2_replacement{};
for (const auto& term : parser) {
if (std::holds_alternative<char>(term)) {
auto c{std::get<char>(term)};
pcre2_replacement.push_back(c);
if (c == '$') {
pcre2_replacement.push_back('$');
}
} else {
auto backreference{std::get<backref>(term)};
pcre2_replacement.reserve(pcre2_replacement.size() + backreference.size() + 3);
pcre2_replacement.append("${");
pcre2_replacement.append(backreference);
pcre2_replacement.append("}");
}
pcre2_replacement = regex_info.opt_replace_result.has_value() ? *std::move(regex_info.opt_replace_result) : replacement;
}

RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}};
Expand Down
82 changes: 82 additions & 0 deletions tests/phpt/dl/002_preg_replace_callback.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
@ok callback benchmark k2_skip
<?php

// URL-matching pattern body (delimiters are added at the call site below).
// NOTE(review): the non-ASCII literals in the pattern and in $text look like a legacy single-byte
// encoding rendered as Latin-1 — confirm the file's real encoding before editing them.
define('RE_URL_PATTERN', '(?<![A-Za-z\$0-9À-ßà-ÿ¸¨\-\_])(https?:\/\/)?((?:[A-Za-z\$0-9À-ßà-ÿ¸¨](?:[A-Za-z\$0-9\-\_À-ßà-ÿ¸¨]*[A-Za-z\$0-9À-ßà-ÿ¸¨])?\.){1,5}[A-Za-z\$ðôóêÐÔÓÊ\-\d]{2,22}(?::\d{2,5})?)((?:\/(?:(?:\&amp;|\&#33;|,[_%]|[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.~=;:]+|\[[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\]|\([A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\))*(?:,[_%]|[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿\-\_#%?+\/\$.~=;:]*[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\_#%?+\/\$~=]|\[[A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\]|\([A-Za-z0-9\xa8\xb8\xc0-\xffºª¥´¯¿²³\-\_#%?+\/\$.,~=;:]*\)))?)?)');

// Sample text containing one URL; every match of the pattern is replaced by prcConvertHyperref()'s return value.
$text = 'ß ñëûøàë, ÷òî â iOS 7 ïîÿâèëèñü ëîêàëüíûå ïóø-óâåäîìëåíèÿ. Íî òóò http://blog.derand.net/2010/08/local-notifications-ios-40.html óòâåðæäàåòñÿ, ÷òî åùå â ÷åòâåðòîé.';
$text = preg_replace_callback('/'.RE_URL_PATTERN.'/', 'prcConvertHyperref', $text);

/**
 * Callback for the URL replacement: runs a "dot + trailing label" pattern against the second
 * capture group and returns preg_match()'s result converted to a string.
 *
 * @kphp-required
 * @param string[] $matches
 * @return string
 */
function prcConvertHyperref($matches) {
  $match_result = preg_match('/\.([a-zA-ZðôóêÐÔÓÊ\-0-9]+)$/', $matches[2], $match);
  return (string)$match_result;
}


/**
 * Callback that dumps the match array it receives and replaces the match with a fixed string.
 *
 * @kphp-required
 * @param string[] $param
 * @return string
 */
function cb($param) {
  var_dump($param);

  return 'yes!';
}


// Text with nested [indent] tags, fed to parseTagsRecursive() below.
$input = "plain [indent] deep [indent] [abcd]deeper[/abcd] [/indent] deep [/indent] plain";

/**
 * Replaces [indent]...[/indent] pairs with a <div>, using itself as the replacement callback.
 *
 * Called with a string it processes that string; called as a callback it receives the match
 * array, wraps capture group 1 into the <div> and processes the wrapped text again.
 * Dumps its argument and the replacement count (stored in the global $count) on every call.
 *
 * @param mixed $input
 * @return string
 */
function parseTagsRecursive($input)
{
  global $count;
  var_dump ($input);

  // When invoked as a callback, $input is the match array: wrap the tag body first.
  $subject = is_array($input)
    ? '<div style="margin-left: 10px">'.$input[1].'</div>'
    : $input;

  $res = preg_replace_callback('#\[indent]((?:[^[]|\[(?!/?indent])|(?R))+)\[/indent]#', 'parseTagsRecursive', $subject, -1, $count);
  var_dump ($count);

  return (string)$res;
}

// Expand the [indent] tags of $input and print the resulting markup.
$output = parseTagsRecursive($input);

echo $output, "\n";


/**
 * Callback that wraps the whole match in single quotes.
 *
 * @kphp-required
 * @param string[] $x
 * @return string
 */
function g($x) {
  return "'" . $x[0] . "'";
}

// Array subject with mixed integer and string keys.
var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb')));

// Nested array as subject; '@' suppresses any diagnostics raised by the call.
@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz'))));

/**
 * Callback that lower-cases the whole match.
 *
 * @kphp-required
 * @param string[] $m
 * @return string
 */
function tmp($m) {
  $whole_match = $m[0];
  return strtolower($whole_match);
}

// String subject: only the first character is matched and passed to tmp().
var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC'));

// Pattern with three capture groups; cb() dumps the match array it is given.
var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde'));
Loading
Loading