From f6a626d7dd707a397f2af4ebc29e2c25694f576c Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 27 Nov 2025 12:31:45 +0300 Subject: [PATCH 01/13] split tests --- .idea/encodings.xml | 4 +- tests/phpt/dl/002_preg_replace_callback.php | 82 +++++ tests/phpt/dl/003_preg_match_all.php | 213 +++++++++++ tests/phpt/dl/004_preg_replace.php | 91 +++++ tests/phpt/dl/496_regex.php | 378 -------------------- 5 files changed, 389 insertions(+), 379 deletions(-) create mode 100644 tests/phpt/dl/002_preg_replace_callback.php create mode 100644 tests/phpt/dl/003_preg_match_all.php create mode 100644 tests/phpt/dl/004_preg_replace.php diff --git a/.idea/encodings.xml b/.idea/encodings.xml index ea72052b66..f8989491c4 100644 --- a/.idea/encodings.xml +++ b/.idea/encodings.xml @@ -1,9 +1,11 @@ - + + + diff --git a/tests/phpt/dl/002_preg_replace_callback.php b/tests/phpt/dl/002_preg_replace_callback.php new file mode 100644 index 0000000000..16c97a8946 --- /dev/null +++ b/tests/phpt/dl/002_preg_replace_callback.php @@ -0,0 +1,82 @@ +@ok callback benchmark k2_skip +'.$input[1].''; + } + + + $res = preg_replace_callback($regex, 'parseTagsRecursive', $input, -1, $count); + var_dump ($count); + return (string)$res; + +} + +$output = parseTagsRecursive($input); + +echo $output, "\n"; + + +/** + * @kphp-required + * @param string[] $x + * @return string + */ +function g($x) { + return "'{$x[0]}'"; +} + +var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb'))); + +@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz')))); + +/** + * @kphp-required + * @param string[] $m + * @return string + */ +function tmp($m) { + return strtolower($m[0]); +} + +var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC')); + +var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde')); diff --git a/tests/phpt/dl/003_preg_match_all.php b/tests/phpt/dl/003_preg_match_all.php new file mode 100644 index 0000000000..36271182ad --- /dev/null +++ b/tests/phpt/dl/003_preg_match_all.php @@ -0,0 +1,213 @@ +@ok callback benchmark k2_skip +~', 'This is no more', $v)); var_dump ($v); +var_dump (preg_match_all ('~.*?~', 'This', $v)); var_dump ($v); +var_dump (preg_match_all ('~.*~', 'This', $v)); var_dump ($v); + +var_dump (preg_match_all ('~<.*?>~', 'This is no more', $v)); var_dump ($v); + +var_dump (preg_match_all ('~(\d+|)~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~(\d+\|)~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~((\d)+\|)~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~((\d)+\|)+~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~((\d+)\|)+~', '12|34|567|', $v)); var_dump ($v); + + +$html = "bold textclick me"; + +preg_match_all("/(<([\w]+)[^>]*>)(.*?)(<\/\\2>)/", $html, $matches, PREG_SET_ORDER); + +foreach ($matches as $val) { + echo "matched: " . $val[0] . "\n"; + echo "part 1: " . $val[1] . "\n"; + echo "part 2: " . $val[2] . "\n"; + echo "part 3: " . $val[3] . "\n"; + echo "part 4: " . $val[4] . "\n\n"; +} + +preg_match_all("/\(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{4}/x", + "Call 555-1212 or 1-800-555-1212", $phones); + +$str = <<\w+): (?P\d+)/', $str, $matches); + +print_r($matches); + + +$str0 = <<1)?(?P[a-c]+):() (?P\d+)(?Pa)?)', '=A=i', "/(a)?/") as $pattern) { + foreach (array('((1)?([a-c]+):() (\d+)(a)?)', '=A=i', "/(a)?/") as $pattern) { + foreach (array($str0, '', "a", "1abAcaba", "dad") as $str) { + preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE); + if ($i == 0) { + var_dump ("preg_match_all($pattern, $str, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE)"); + var_dump ($matches); + } + + preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + if ($i == 0) { + var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)"); + var_dump ($matches); + } + + preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER); + if ($i == 0) { + var_dump ("preg_match_all ($pattern, $str, PREG_PATTERN_ORDER)"); + var_dump ($matches); + } + + preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER); + if ($i == 0) { + var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER)"); + var_dump ($matches); + } + + preg_match ($pattern, $str, $matches, PREG_OFFSET_CAPTURE); + if ($i == 0) { + var_dump ("preg_match ($pattern, $str, PREG_OFFSET_CAPTURE)"); + var_dump ($matches); + } + + preg_match ($pattern, $str, $matches); + if ($i == 0) { + var_dump ("preg_match ($pattern, $str)"); + var_dump ($matches); + } + } + } +} + + +foreach (array(PREG_PATTERN_ORDER, PREG_SET_ORDER) as $flag) { + var_dump(preg_match_all('~ + (?P + (?P(\d{2})?\d\d) - + (?P(?:\d\d|[a-zA-Z]{2,3})) - + (?P[0-3]?\d)) + ~x', + '2006-05-13 e outra data: "12-Aug-37"', $m, $flag)); + + var_dump($m); +} + + +var_dump(preg_match_all('/((?:(?:unsigned|struct)\s+)?\w+)(?:\s*(\*+)\s+|\s+(\**))(\w+(?:\[\s*\w*\s*\])?)\s*(?:(=)[^,;]+)?((?:\s*,\s*\**\s*\w+(?:\[\s*\w*\s*\])?\s*(?:=[^,;]+)?)*)\s*;/', 'unsigned int xpto = 124; short a, b;', $m, PREG_SET_ORDER)); +var_dump($m); + +var_dump(preg_match_all('/(?:\([^)]+\))?(&?)([\w>.()-]+(?:\[\w+\])?)\s*,?((?:\)*\s*=)?)/', '&a, b, &c', $m, PREG_SET_ORDER)); +var_dump($m); + +var_dump(preg_match_all('/zend_parse_parameters(?:_ex\s*\([^,]+,[^,]+|\s*\([^,]+),\s*"([^"]*)"\s*,\s*([^{;]*)/', 'zend_parse_parameters( 0, "addd|s/", a, b, &c);', $m, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)); +var_dump($m); + + +$sampledata = " +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swfbitmap_init': +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:323: warning: assignment from incompatible pointer type +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swftextfield_setFont': +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:2597: warning: passing arg 2 of `SWFTextField_setFont' from incompatible pointer type +/p2/var/php_gcov/PHP_4_4/ext/oci8/oci8.c:1027: warning: `oci_ping' defined but not used +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getpgid': +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:484: warning: implicit declaration of function `getpgid' +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getsid': +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:506: warning: implicit declaration of function `getsid' +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_read_files': +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:302: warning: implicit declaration of function `pread' +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_write_files': +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:340: warning: implicit declaration of function `pwrite' +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_get_option': +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1862: warning: unused variable `timeout' +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_set_option': +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1941: warning: unused variable `timeout' +/p2/var/php_gcov/PHP_4_4/regex/regexec.c:19: warning: `nope' defined but not used +/p2/var/php_gcov/PHP_4_4/ext/standard/exec.c:50: warning: `php_make_safe_mode_command' defined but not used +/p2/var/php_gcov/PHP_4_4/ext/standard/image.c: In function `php_handle_jpc': +/p2/var/php_gcov/PHP_4_4/ext/standard/image.c:604: warning: unused variable `dummy_int' +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c: In function `php_gd_parse': +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c:1138: warning: implicit declaration of function `php_gd_lex' +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y: At top level: +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y:864: warning: return type defaults to `int' +/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c: In function `zif_msg_receive': +/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c:318: warning: passing arg 2 of `php_var_unserialize' from incompatible pointer type +/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c: In function `zif_yp_err_string': +/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c:372: warning: assignment discards qualifiers from pointer target type +Zend/zend_language_scanner.c:5944: warning: `yy_fatal_error' defined but not used +Zend/zend_language_scanner.c:2627: warning: `yy_last_accepting_state' defined but not used +Zend/zend_language_scanner.c:2628: warning: `yy_last_accepting_cpos' defined but not used +Zend/zend_language_scanner.c:2634: warning: `yy_more_flag' defined but not used +Zend/zend_language_scanner.c:2635: warning: `yy_more_len' defined but not used +Zend/zend_language_scanner.c:5483: warning: `yyunput' defined but not used +Zend/zend_language_scanner.c:5929: warning: `yy_top_state' defined but not used +conflicts: 2 shift/reduce +Zend/zend_ini_scanner.c:457: warning: `yy_last_accepting_state' defined but not used +Zend/zend_ini_scanner.c:458: warning: `yy_last_accepting_cpos' defined but not used +Zend/zend_ini_scanner.c:1361: warning: `yyunput' defined but not used +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c: In function `_safe_emalloc': +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 3) +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 4) +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 5) +/p2/var/php_gcov/PHP_4_4/Zend/zend_ini.c:338: warning: `zend_ini_displayer_cb' defined but not used +ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': +/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' +ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': +/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' +ext/ming/ming.o(.text+0xc115): In function `zim_swfmovie_namedAnchor': +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2207: undefined reference to `SWFMovie_namedAnchor' +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2209: undefined reference to `SWFMovie_xpto' +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2259: undefined reference to `SWFMovie_foo' +ext/ming/ming.o(.text+0x851): In function `zif_ming_setSWFCompression': +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:154: undefined reference to `Ming_setSWFCompression' +"; + +$gcc_regex = '/^((.+)(\(\.text\+0x[[:xdigit:]]+\))?: In function [`\'](\w+)\':\s+)?'. + '((?(1)(?(3)[^:\n]+|\2)|[^:\n]+)):(\d+): (?:(error|warning):\s+)?(.+)'. + str_repeat('(?:\s+\5:(\d+): (?:(error|warning):\s+)?(.+))?', 99). // capture up to 100 errors + '/m'; + + +var_dump(preg_match_all($gcc_regex, $sampledata, $m, PREG_SET_ORDER)); +print_r($m); + + +var_dump(preg_match_all('|(\w+)://([^\s"<]*[\w+#?/&=])|', "This is a text string", $matches, PREG_SET_ORDER)); +var_dump($matches); + +/** + * @return mixed + */ +function func1(){ + $string = 'what the word and the other word the'; + preg_match_all('/(?Pthe)/', $string, $matches); + return $matches['word']; +} +$words = func1(); +var_dump($words); + + +$pattern = +"/\s([\w_\.\/]+)(?:=([\'\"]?(?:[\w\d\s\?=\(\)\.,'_#\/\\:;&-]|(?:\\\\\"|\\\')?)+[\'\"]?))?/"; +$context = ""; + +$match = array(); + +if ($result = preg_match_all($pattern, $context, $match)) +{ +var_dump($result); +var_dump($match); +} + + +var_dump(preg_match_all('/\d+/', '123 456 789 012', $match, 0)); +var_dump($match); diff --git a/tests/phpt/dl/004_preg_replace.php b/tests/phpt/dl/004_preg_replace.php new file mode 100644 index 0000000000..74807b08ea --- /dev/null +++ b/tests/phpt/dl/004_preg_replace.php @@ -0,0 +1,91 @@ +@ok callback benchmark k2_skip +~', 'This is no more', $v)); var_dump ($v); -var_dump (preg_match_all ('~.*?~', 'This', $v)); var_dump ($v); -var_dump (preg_match_all ('~.*~', 'This', $v)); var_dump ($v); - -var_dump (preg_match_all ('~<.*?>~', 'This is no more', $v)); var_dump ($v); - -var_dump (preg_match_all ('~(\d+|)~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~(\d+\|)~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~((\d)+\|)~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~((\d)+\|)+~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~((\d+)\|)+~', '12|34|567|', $v)); var_dump ($v); - -var_dump (preg_replace ('~|q~', '{\0}', 'eq')); -var_dump (preg_replace ('~|q~', '{\0}', 'ex')); - -var_dump (preg_replace ('~|q~', 'w', 'e')); -var_dump (preg_replace ('~|q~', 'w', 'q')); -/* bug in PHP -var_dump (preg_replace ('~|й~u', 'п', 'р')); -*/ -var_dump (preg_replace ('~|й~u', 'п', 'й')); var_dump (preg_split ('~|й~u', 'п')); var_dump (preg_split ('~|й~u', 'й')); -define('RE_URL_PATTERN', '(?bold textclick me"; - -preg_match_all("/(<([\w]+)[^>]*>)(.*?)(<\/\\2>)/", $html, $matches, PREG_SET_ORDER); - -foreach ($matches as $val) { - echo "matched: " . $val[0] . "\n"; - echo "part 1: " . $val[1] . "\n"; - echo "part 2: " . $val[2] . "\n"; - echo "part 3: " . $val[3] . "\n"; - echo "part 4: " . $val[4] . "\n\n"; -} - -preg_match_all("/\(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{4}/x", - "Call 555-1212 or 1-800-555-1212", $phones); - -$str = <<\w+): (?P\d+)/', $str, $matches); - -print_r($matches); - $keywords = preg_split("/[\s,]+/", "hypertext language, programming"); print_r($keywords); @@ -124,58 +16,6 @@ function prcConvertHyperref($matches) { $chars = preg_split('/ /', $str, -1, PREG_SPLIT_OFFSET_CAPTURE); print_r($chars); -var_dump (preg_replace ('~a|~', 'b', 'a')); -var_dump (preg_replace ('~a|~', 'a', 'b')); - -$str0 = <<1)?(?P[a-c]+):() (?P\d+)(?Pa)?)', '=A=i', "/(a)?/") as $pattern) { - foreach (array('((1)?([a-c]+):() (\d+)(a)?)', '=A=i', "/(a)?/") as $pattern) { - foreach (array($str0, '', "a", "1abAcaba", "dad") as $str) { - preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE); - if ($i == 0) { - var_dump ("preg_match_all($pattern, $str, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE)"); - var_dump ($matches); - } - - preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); - if ($i == 0) { - var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)"); - var_dump ($matches); - } - - preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER); - if ($i == 0) { - var_dump ("preg_match_all ($pattern, $str, PREG_PATTERN_ORDER)"); - var_dump ($matches); - } - - preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER); - if ($i == 0) { - var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER)"); - var_dump ($matches); - } - - preg_match ($pattern, $str, $matches, PREG_OFFSET_CAPTURE); - if ($i == 0) { - var_dump ("preg_match ($pattern, $str, PREG_OFFSET_CAPTURE)"); - var_dump ($matches); - } - - preg_match ($pattern, $str, $matches); - if ($i == 0) { - var_dump ("preg_match ($pattern, $str)"); - var_dump ($matches); - } - } - } -} - foreach (array('2006-05-13', '06-12-12', 'data: "12-Aug-87"') as $s) { var_dump(preg_match('~ (?P @@ -187,123 +27,6 @@ function prcConvertHyperref($matches) { var_dump($m); } -foreach (array(PREG_PATTERN_ORDER, PREG_SET_ORDER) as $flag) { - var_dump(preg_match_all('~ - (?P - (?P(\d{2})?\d\d) - - (?P(?:\d\d|[a-zA-Z]{2,3})) - - (?P[0-3]?\d)) - ~x', - '2006-05-13 e outra data: "12-Aug-37"', $m, $flag)); - - var_dump($m); -} - - -var_dump(preg_match_all('/((?:(?:unsigned|struct)\s+)?\w+)(?:\s*(\*+)\s+|\s+(\**))(\w+(?:\[\s*\w*\s*\])?)\s*(?:(=)[^,;]+)?((?:\s*,\s*\**\s*\w+(?:\[\s*\w*\s*\])?\s*(?:=[^,;]+)?)*)\s*;/', 'unsigned int xpto = 124; short a, b;', $m, PREG_SET_ORDER)); -var_dump($m); - -var_dump(preg_match_all('/(?:\([^)]+\))?(&?)([\w>.()-]+(?:\[\w+\])?)\s*,?((?:\)*\s*=)?)/', '&a, b, &c', $m, PREG_SET_ORDER)); -var_dump($m); - -var_dump(preg_match_all('/zend_parse_parameters(?:_ex\s*\([^,]+,[^,]+|\s*\([^,]+),\s*"([^"]*)"\s*,\s*([^{;]*)/', 'zend_parse_parameters( 0, "addd|s/", a, b, &c);', $m, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)); -var_dump($m); - -var_dump(preg_replace(array('@//.*@', '@/\*.*\*/@sU'), array('', 'preg_replace("/[^\r\n]+/", "", \'$0\')'), "hello\n//x \n/*\ns\n*/")); - -$sampledata = " -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swfbitmap_init': -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:323: warning: assignment from incompatible pointer type -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swftextfield_setFont': -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:2597: warning: passing arg 2 of `SWFTextField_setFont' from incompatible pointer type -/p2/var/php_gcov/PHP_4_4/ext/oci8/oci8.c:1027: warning: `oci_ping' defined but not used -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getpgid': -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:484: warning: implicit declaration of function `getpgid' -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getsid': -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:506: warning: implicit declaration of function `getsid' -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_read_files': -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:302: warning: implicit declaration of function `pread' -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_write_files': -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:340: warning: implicit declaration of function `pwrite' -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_get_option': -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1862: warning: unused variable `timeout' -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_set_option': -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1941: warning: unused variable `timeout' -/p2/var/php_gcov/PHP_4_4/regex/regexec.c:19: warning: `nope' defined but not used -/p2/var/php_gcov/PHP_4_4/ext/standard/exec.c:50: warning: `php_make_safe_mode_command' defined but not used -/p2/var/php_gcov/PHP_4_4/ext/standard/image.c: In function `php_handle_jpc': -/p2/var/php_gcov/PHP_4_4/ext/standard/image.c:604: warning: unused variable `dummy_int' -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c: In function `php_gd_parse': -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c:1138: warning: implicit declaration of function `php_gd_lex' -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y: At top level: -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y:864: warning: return type defaults to `int' -/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c: In function `zif_msg_receive': -/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c:318: warning: passing arg 2 of `php_var_unserialize' from incompatible pointer type -/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c: In function `zif_yp_err_string': -/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c:372: warning: assignment discards qualifiers from pointer target type -Zend/zend_language_scanner.c:5944: warning: `yy_fatal_error' defined but not used -Zend/zend_language_scanner.c:2627: warning: `yy_last_accepting_state' defined but not used -Zend/zend_language_scanner.c:2628: warning: `yy_last_accepting_cpos' defined but not used -Zend/zend_language_scanner.c:2634: warning: `yy_more_flag' defined but not used -Zend/zend_language_scanner.c:2635: warning: `yy_more_len' defined but not used -Zend/zend_language_scanner.c:5483: warning: `yyunput' defined but not used -Zend/zend_language_scanner.c:5929: warning: `yy_top_state' defined but not used -conflicts: 2 shift/reduce -Zend/zend_ini_scanner.c:457: warning: `yy_last_accepting_state' defined but not used -Zend/zend_ini_scanner.c:458: warning: `yy_last_accepting_cpos' defined but not used -Zend/zend_ini_scanner.c:1361: warning: `yyunput' defined but not used -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c: In function `_safe_emalloc': -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 3) -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 4) -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 5) -/p2/var/php_gcov/PHP_4_4/Zend/zend_ini.c:338: warning: `zend_ini_displayer_cb' defined but not used -ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': -/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' -ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': -/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' -ext/ming/ming.o(.text+0xc115): In function `zim_swfmovie_namedAnchor': -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2207: undefined reference to `SWFMovie_namedAnchor' -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2209: undefined reference to `SWFMovie_xpto' -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2259: undefined reference to `SWFMovie_foo' -ext/ming/ming.o(.text+0x851): In function `zif_ming_setSWFCompression': -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:154: undefined reference to `Ming_setSWFCompression' -"; - -$gcc_regex = '/^((.+)(\(\.text\+0x[[:xdigit:]]+\))?: In function [`\'](\w+)\':\s+)?'. - '((?(1)(?(3)[^:\n]+|\2)|[^:\n]+)):(\d+): (?:(error|warning):\s+)?(.+)'. - str_repeat('(?:\s+\5:(\d+): (?:(error|warning):\s+)?(.+))?', 99). // capture up to 100 errors - '/m'; - - -var_dump(preg_match_all($gcc_regex, $sampledata, $m, PREG_SET_ORDER)); -print_r($m); - - -/** - * @kphp-required - * @param string[] $param - * @return string - */ -function cb($param) { - var_dump($param); - return "yes!"; -} - -#var_dump(preg_replace('', array(), '')); - -var_dump(preg_match_all('|(\w+)://([^\s"<]*[\w+#?/&=])|', "This is a text string", $matches, PREG_SET_ORDER)); -var_dump($matches); - -/** - * @return mixed - */ -function func1(){ - $string = 'what the word and the other word the'; - preg_match_all('/(?Pthe)/', $string, $matches); - return $matches['word']; -} -$words = func1(); -var_dump($words); $foo = 'bla bla bla'; @@ -318,18 +41,6 @@ function func1(){ var_dump(preg_match('@^(/([a-z]+))+$@', $subject, $m)); var_dump($m); var_dump(preg_match('@^(/(?:[a-z]+))+$@', $subject, $m)); var_dump($m); -$pattern = -"/\s([\w_\.\/]+)(?:=([\'\"]?(?:[\w\d\s\?=\(\)\.,'_#\/\\:;&-]|(?:\\\\\"|\\\')?)+[\'\"]?))?/"; -$context = ""; - -$match = array(); - -if ($result = preg_match_all($pattern, $context, $match)) -{ -var_dump($result); -var_dump($match); -} - $regex = '/(insert|drop|create|select|delete|update)([^;\']*('."('[^']*')+".')?)*(;|$)/i'; $sql = 'SELECT * FROM #__components'; @@ -343,24 +54,12 @@ function func1(){ */ - -var_dump(preg_replace(array('/\da(.)/ui', '@..@'), '$1', '12Abc')); -var_dump(preg_replace(array('/\da(.)/ui', '@(.)@'), '$1', array('x','a2aA', '1av2Ab'))); - - -var_dump(preg_replace(array('/[\w]+/'), array('$'), array('xyz', 'bdbd'))); -var_dump(preg_replace(array('/\s+/', '~[b-d]~'), array('$'), array('x y', 'bd bc'))); - - var_dump(preg_match('/\d+/', '123 456 789 012', $match, 0)); var_dump($match); var_dump(preg_match('/\d+/', '123 456 789 012', $match, 0)); var_dump($match); -var_dump(preg_match_all('/\d+/', '123 456 789 012', $match, 0)); -var_dump($match); - var_dump(preg_split('/PHP_(?:NAMED_)?(?:FUNCTION|METHOD)\s*\((\w+(?:,\s*\w+)?)\)/', "PHP_FUNCTION(s, preg_match)\n{\nlalala", -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE)); @@ -384,80 +83,3 @@ function func1(){ var_dump(preg_split('/(\d*)/', 'ab2c3u', -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE)); var_dump(preg_split('/(\d*)/', 'ab2c3u', -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE)); var_dump(preg_split('/(\d*)/', 'ab2c3u', -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE)); - - - - - -/* -PHP 5.2.0 - 5.3.6 bug -$text = '[CODE]<td align="$stylevar[right]">[/CODE]'; -$result = preg_replace(array('#\[(right)\](((?R)|[^[]+?|\[)*)\[/\\1\]#siU', '#\[(right)\](((?R)|[^[]+?|\[)*)\[/\\1\]#siU'), '', $text); -var_dump($text); -var_dump($result); - -$result = preg_replace('#\[(right)\](((?R)|[^[]+?|\[)*)\[/\\1\]#siU', '', $text); -var_dump($text); -var_dump($result); -*/ - - - - - - - -$input = "plain [indent] deep [indent] [abcd]deeper[/abcd] [/indent] deep [/indent] plain"; - -/** - * @param mixed $input - * @return string - */ -function parseTagsRecursive($input) -{ - global $count; - $regex = '#\[indent]((?:[^[]|\[(?!/?indent])|(?R))+)\[/indent]#'; - - if (is_array($input)) { - $input = '
'.$input[1].'
'; - } - - - $res = preg_replace_callback($regex, 'parseTagsRecursive', $input, -1, $count); - var_dump ($count); - return (string)$res; - -} - -$output = parseTagsRecursive($input); - -echo $output, "\n"; - - -/** - * @kphp-required - * @param string[] $x - * @return string - */ -function g($x) { - return "'{$x[0]}'"; -} - -var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb'))); - -@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz')))); - -/** - * @kphp-required - * @param string[] $m - * @return string - */ -function tmp($m) { - return strtolower($m[0]); -} - -var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC')); - -var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde')); - - From c7c6dac3384f38011552f0f2a650b66ac602b5ff Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 26 Nov 2025 11:39:00 +0300 Subject: [PATCH 02/13] fix preg_replace function --- .../stdlib/string/regex-functions.cpp | 137 ++++++++++++++++-- tests/phpt/dl/004_preg_replace.php | 2 +- 2 files changed, 124 insertions(+), 15 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 0aba6c839d..7a5ac72aaa 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -71,6 +72,10 @@ struct RegexInfo final { replacement(replacement_) {} }; +struct backref { + std::string_view digits; +}; + template requires((std::is_same_v && ...) && sizeof...(Args) > 0) bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { @@ -102,6 +107,118 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec return offset; } +std::optional try_get_backref(std::string_view preg_replacement) noexcept { + if (preg_replacement.empty() || preg_replacement[0] < '0' || preg_replacement[0] > '9') { + return std::nullopt; + } + + if (preg_replacement.size() == 1 || preg_replacement[1] < '0' || preg_replacement[1] > '9') { + return backref{preg_replacement.substr(0, 1)}; + } + + return backref{preg_replacement.substr(0, 2)}; +} + +template +requires std::convertible_to, T> +auto value_or_else(std::optional&& opt, F&& alternative_func) noexcept -> T { + if (opt.has_value()) { + return std::move(*std::move(opt)); + } else { + return std::forward(alternative_func)(); + } +} + +using replacement_term = std::variant; + +class preg_replacement_unescaper { + std::string_view preg_replacement; + +public: + preg_replacement_unescaper(std::string_view preg_replacement) + : preg_replacement{preg_replacement} {} + + bool has_next() const noexcept { + return !preg_replacement.empty(); + } + + replacement_term unescape_term() noexcept { + auto first_char{preg_replacement.front()}; + preg_replacement = preg_replacement.substr(1); + if (preg_replacement.empty()) { + return first_char; + } + switch (first_char) { + case '$': + if (preg_replacement.front() == '{') { + return try_get_backref(preg_replacement.substr(1)) + .and_then([this](auto value) noexcept -> std::optional { + auto digits_end_pos = 1 + value.digits.size(); + if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { + preg_replacement = preg_replacement.substr(1 + value.digits.size() + 1); + return value; + } + + return std::nullopt; + }) + .value_or('$'); + } + + return try_get_backref(preg_replacement) + .transform([this](auto value) noexcept -> replacement_term { + auto digits_end_pos = value.digits.size(); + preg_replacement = preg_replacement.substr(digits_end_pos); + return value; + }) + .value_or('$'); + + case '\\': + return value_or_else(try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { + auto digits_end_pos = value.digits.size(); + preg_replacement = preg_replacement.substr(digits_end_pos); + return value; + }), + [this] noexcept { + auto res{preg_replacement.front()}; + if (res == '$' || res == '\\') { + preg_replacement = preg_replacement.substr(1); + return res; + } + return '\\'; + }); + default: + return first_char; + } + } +}; + +class pcre2_replacement_escaper { + kphp::stl::string pcre2_replacement{}; + +public: + void operator()(char c) noexcept { + pcre2_replacement.push_back(c); + if (c == '$') { + pcre2_replacement.push_back('$'); + } + } + + void operator()(backref backreference) noexcept { + pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); + pcre2_replacement.append("${"); + pcre2_replacement.append(backreference.digits); + pcre2_replacement.append("}"); + } + + void escape_term(const replacement_term& term) noexcept { + std::visit(*this, term); + } + + kphp::stl::string& result() noexcept { + return pcre2_replacement; + } +}; + bool parse_regex(RegexInfo& regex_info) noexcept { if (regex_info.regex.empty()) { kphp::log::warning("empty regex"); @@ -591,21 +708,13 @@ Optional f$preg_replace(const string& pattern, const string& replacement return {}; } - string pcre2_replacement{replacement}; - { // we need to replace PHP's back references with PCRE2 ones - static constexpr std::string_view backreference_pattern = R"(/\\(\d)/)"; - static constexpr std::string_view backreference_replacement = "$$$1"; - - RegexInfo regex_info{backreference_pattern, {replacement.c_str(), replacement.size()}, backreference_replacement}; - bool success{parse_regex(regex_info)}; - success &= compile_regex(regex_info); - success &= replace_regex(regex_info, std::numeric_limits::max()); - if (!success) [[unlikely]] { - kphp::log::warning("can't replace PHP back references with PCRE2 ones"); - return {}; - } - pcre2_replacement = regex_info.opt_replace_result.has_value() ? *std::move(regex_info.opt_replace_result) : replacement; + // we need to replace PHP's back references with PCRE2 ones + auto unescaper{preg_replacement_unescaper{{replacement.c_str(), replacement.size()}}}; + pcre2_replacement_escaper escaper{}; + while (unescaper.has_next()) { + escaper.escape_term(unescaper.unescape_term()); } + auto& pcre2_replacement{escaper.result()}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; diff --git a/tests/phpt/dl/004_preg_replace.php b/tests/phpt/dl/004_preg_replace.php index 74807b08ea..8d5439a88d 100644 --- a/tests/phpt/dl/004_preg_replace.php +++ b/tests/phpt/dl/004_preg_replace.php @@ -1,4 +1,4 @@ -@ok callback benchmark k2_skip +@ok callback benchmark Date: Mon, 1 Dec 2025 12:59:00 +0300 Subject: [PATCH 03/13] inline value_or_else --- .../stdlib/string/regex-functions.cpp | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 7a5ac72aaa..5ab5b9ba03 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -119,16 +119,6 @@ std::optional try_get_backref(std::string_view preg_replacement) noexce return backref{preg_replacement.substr(0, 2)}; } -template -requires std::convertible_to, T> -auto value_or_else(std::optional&& opt, F&& alternative_func) noexcept -> T { - if (opt.has_value()) { - return std::move(*std::move(opt)); - } else { - return std::forward(alternative_func)(); - } -} - using replacement_term = std::variant; class preg_replacement_unescaper { @@ -172,20 +162,23 @@ class preg_replacement_unescaper { }) .value_or('$'); - case '\\': - return value_or_else(try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { + case '\\': { + auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { auto digits_end_pos = value.digits.size(); preg_replacement = preg_replacement.substr(digits_end_pos); return value; - }), - [this] noexcept { - auto res{preg_replacement.front()}; - if (res == '$' || res == '\\') { - preg_replacement = preg_replacement.substr(1); - return res; - } - return '\\'; - }); + })}; + if (back_reference_opt.has_value()) { + return *std::move(back_reference_opt); + } else { + auto res{preg_replacement.front()}; + if (res == '$' || res == '\\') { + preg_replacement = preg_replacement.substr(1); + return res; + } + return '\\'; + } + } default: return first_char; } From e7f20196dba653caa114469e8c640daf3d0f3f91 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 13:01:49 +0300 Subject: [PATCH 04/13] unescaper -> decoder, escaper -> encoder --- runtime-light/stdlib/string/regex-functions.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 5ab5b9ba03..d7ceab1b0e 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -121,11 +121,11 @@ std::optional try_get_backref(std::string_view preg_replacement) noexce using replacement_term = std::variant; -class preg_replacement_unescaper { +class preg_replacement_decoder { std::string_view preg_replacement; public: - preg_replacement_unescaper(std::string_view preg_replacement) + preg_replacement_decoder(std::string_view preg_replacement) : preg_replacement{preg_replacement} {} bool has_next() const noexcept { @@ -185,7 +185,7 @@ class preg_replacement_unescaper { } }; -class pcre2_replacement_escaper { +class pcre2_replacement_encoder { kphp::stl::string pcre2_replacement{}; public: @@ -702,12 +702,12 @@ Optional f$preg_replace(const string& pattern, const string& replacement } // we need to replace PHP's back references with PCRE2 ones - auto unescaper{preg_replacement_unescaper{{replacement.c_str(), replacement.size()}}}; - pcre2_replacement_escaper escaper{}; - while (unescaper.has_next()) { - escaper.escape_term(unescaper.unescape_term()); + auto decoder{preg_replacement_decoder{{replacement.c_str(), replacement.size()}}}; + pcre2_replacement_encoder encoder{}; + while (decoder.has_next()) { + encoder.escape_term(decoder.unescape_term()); } - auto& pcre2_replacement{escaper.result()}; + auto& pcre2_replacement{encoder.result()}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; From 82b645f01134ea15c0438bbb1e827ac6be922ebb Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 13:18:32 +0300 Subject: [PATCH 05/13] unescaper -> decoder, escaper -> encoder cc --- .../stdlib/string/regex-functions.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index d7ceab1b0e..f51009c7f4 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -132,7 +132,7 @@ class preg_replacement_decoder { return !preg_replacement.empty(); } - replacement_term unescape_term() noexcept { + replacement_term decode_term() noexcept { auto first_char{preg_replacement.front()}; preg_replacement = preg_replacement.substr(1); if (preg_replacement.empty()) { @@ -140,6 +140,7 @@ class preg_replacement_decoder { } switch (first_char) { case '$': + // $1, ${1} if (preg_replacement.front() == '{') { return try_get_backref(preg_replacement.substr(1)) .and_then([this](auto value) noexcept -> std::optional { @@ -163,6 +164,7 @@ class preg_replacement_decoder { .value_or('$'); case '\\': { + // \1 auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { auto digits_end_pos = value.digits.size(); preg_replacement = preg_replacement.substr(digits_end_pos); @@ -189,24 +191,20 @@ class pcre2_replacement_encoder { kphp::stl::string pcre2_replacement{}; public: - void operator()(char c) noexcept { + void encode_char(char c) noexcept { pcre2_replacement.push_back(c); if (c == '$') { pcre2_replacement.push_back('$'); } } - void operator()(backref backreference) noexcept { + void encode_backref(backref backreference) noexcept { pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); pcre2_replacement.append("${"); pcre2_replacement.append(backreference.digits); pcre2_replacement.append("}"); } - void escape_term(const replacement_term& term) noexcept { - std::visit(*this, term); - } - kphp::stl::string& result() noexcept { return pcre2_replacement; } @@ -705,7 +703,11 @@ Optional f$preg_replace(const string& pattern, const string& replacement auto decoder{preg_replacement_decoder{{replacement.c_str(), replacement.size()}}}; pcre2_replacement_encoder encoder{}; while (decoder.has_next()) { - encoder.escape_term(decoder.unescape_term()); + if (auto term{decoder.decode_term()}; std::holds_alternative(term)) { + encoder.encode_char(std::get(term)); + } else { + encoder.encode_backref(std::get(term)); + } } auto& pcre2_replacement{encoder.result()}; From 9352ec8c7154a9ec48dde5c369a3bd810bcfacd3 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 17:21:11 +0300 Subject: [PATCH 06/13] decode -> parse, encode -> format --- .../stdlib/string/regex-functions.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index f51009c7f4..8300cdf448 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -121,18 +121,18 @@ std::optional try_get_backref(std::string_view preg_replacement) noexce using replacement_term = std::variant; -class preg_replacement_decoder { +class preg_replacement_parser { std::string_view preg_replacement; public: - preg_replacement_decoder(std::string_view preg_replacement) + preg_replacement_parser(std::string_view preg_replacement) : preg_replacement{preg_replacement} {} bool has_next() const noexcept { return !preg_replacement.empty(); } - replacement_term decode_term() noexcept { + replacement_term parse_term() noexcept { auto first_char{preg_replacement.front()}; preg_replacement = preg_replacement.substr(1); if (preg_replacement.empty()) { @@ -187,18 +187,18 @@ class preg_replacement_decoder { } }; -class pcre2_replacement_encoder { +class pcre2_replacement_formatter { kphp::stl::string pcre2_replacement{}; public: - void encode_char(char c) noexcept { + void format_char(char c) noexcept { pcre2_replacement.push_back(c); if (c == '$') { pcre2_replacement.push_back('$'); } } - void encode_backref(backref backreference) noexcept { + void format_backref(backref backreference) noexcept { pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); pcre2_replacement.append("${"); pcre2_replacement.append(backreference.digits); @@ -700,16 +700,16 @@ Optional f$preg_replace(const string& pattern, const string& replacement } // we need to replace PHP's back references with PCRE2 ones - auto decoder{preg_replacement_decoder{{replacement.c_str(), replacement.size()}}}; - pcre2_replacement_encoder encoder{}; - while (decoder.has_next()) { - if (auto term{decoder.decode_term()}; std::holds_alternative(term)) { - encoder.encode_char(std::get(term)); + auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; + pcre2_replacement_formatter formatter{}; + while (parser.has_next()) { + if (auto term{parser.parse_term()}; std::holds_alternative(term)) { + formatter.format_char(std::get(term)); } else { - encoder.encode_backref(std::get(term)); + formatter.format_backref(std::get(term)); } } - auto& pcre2_replacement{encoder.result()}; + auto& pcre2_replacement{formatter.result()}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; From 0cb923b3d6629a999441db2c7720ef0f4dd19927 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 13:04:25 +0300 Subject: [PATCH 07/13] inline pcre2_replacement_formatter --- .../stdlib/string/regex-functions.cpp | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 8300cdf448..cc3429cd9e 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -187,29 +187,6 @@ class preg_replacement_parser { } }; -class pcre2_replacement_formatter { - kphp::stl::string pcre2_replacement{}; - -public: - void format_char(char c) noexcept { - pcre2_replacement.push_back(c); - if (c == '$') { - pcre2_replacement.push_back('$'); - } - } - - void format_backref(backref backreference) noexcept { - pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); - pcre2_replacement.append("${"); - pcre2_replacement.append(backreference.digits); - pcre2_replacement.append("}"); - } - - kphp::stl::string& result() noexcept { - return pcre2_replacement; - } -}; - bool parse_regex(RegexInfo& regex_info) noexcept { if (regex_info.regex.empty()) { kphp::log::warning("empty regex"); @@ -701,15 +678,22 @@ Optional f$preg_replace(const string& pattern, const string& replacement // we need to replace PHP's back references with PCRE2 ones auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; - pcre2_replacement_formatter formatter{}; + kphp::stl::string pcre2_replacement{}; while (parser.has_next()) { if (auto term{parser.parse_term()}; std::holds_alternative(term)) { - formatter.format_char(std::get(term)); + auto c{std::get(term)}; + pcre2_replacement.push_back(c); + if (c == '$') { + pcre2_replacement.push_back('$'); + } } else { - formatter.format_backref(std::get(term)); + auto backreference{std::get(term)}; + pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); + pcre2_replacement.append("${"); + pcre2_replacement.append(backreference.digits); + pcre2_replacement.append("}"); } } - auto& pcre2_replacement{formatter.result()}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; From 4b431806bd2d6b1d55ee8de469be2feee588a9e0 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 13:10:30 +0300 Subject: [PATCH 08/13] explicit noexcept preg_replacement_parser constructor --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index cc3429cd9e..6a05bdf97b 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -125,7 +125,7 @@ class preg_replacement_parser { std::string_view preg_replacement; public: - preg_replacement_parser(std::string_view preg_replacement) + explicit preg_replacement_parser(std::string_view preg_replacement) noexcept : preg_replacement{preg_replacement} {} bool has_next() const noexcept { From f925e7eee2f0edd4c9df70bf25e72ea6dd65b3e9 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 13:13:51 +0300 Subject: [PATCH 09/13] remove std::move --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 6a05bdf97b..ac45402069 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -171,7 +171,7 @@ class preg_replacement_parser { return value; })}; if (back_reference_opt.has_value()) { - return *std::move(back_reference_opt); + return *back_reference_opt; } else { auto res{preg_replacement.front()}; if (res == '$' || res == '\\') { From 7543b22d5e35024c1eec0712076a2b6c52685f42 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 14:26:32 +0300 Subject: [PATCH 10/13] rewrite preg_replacement_parser as range --- .../stdlib/string/regex-functions.cpp | 74 ++++++++++++++++--- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index ac45402069..24a4cd3972 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -124,15 +124,7 @@ using replacement_term = std::variant; class preg_replacement_parser { std::string_view preg_replacement; -public: - explicit preg_replacement_parser(std::string_view preg_replacement) noexcept - : preg_replacement{preg_replacement} {} - - bool has_next() const noexcept { - return !preg_replacement.empty(); - } - - replacement_term parse_term() noexcept { + replacement_term parse_term_internal() noexcept { auto first_char{preg_replacement.front()}; preg_replacement = preg_replacement.substr(1); if (preg_replacement.empty()) { @@ -185,6 +177,66 @@ class preg_replacement_parser { return first_char; } } + +public: + explicit preg_replacement_parser(std::string_view preg_replacement) noexcept + : preg_replacement{preg_replacement} {} + + struct iterator { + preg_replacement_parser* parser{nullptr}; + replacement_term current_term{'\0'}; + + using difference_type = std::ptrdiff_t; + using value_type = replacement_term; + using reference = const replacement_term&; + using pointer = const replacement_term*; + using iterator_category = std::input_iterator_tag; + + iterator() noexcept = default; + explicit iterator(preg_replacement_parser* p) noexcept + : parser{p} { + if (parser->preg_replacement.empty()) { + parser = nullptr; + } else { + current_term = parser->parse_term_internal(); + } + } + + reference operator*() const noexcept { + return current_term; + } + pointer operator->() const noexcept { + return std::addressof(current_term); + } + + iterator& operator++() noexcept { + if (!parser->preg_replacement.empty()) { + current_term = parser->parse_term_internal(); + } else { + parser = nullptr; + } + return *this; + } + iterator operator++(int) noexcept { + iterator temp = *this; + ++(*this); + return temp; + } + + friend bool operator==(const iterator& a, const iterator& b) noexcept { + return a.parser == b.parser; + } + friend bool operator!=(const iterator& a, const iterator& b) noexcept { + return !(a == b); + } + }; + + iterator begin() noexcept { + return iterator{this}; + } + iterator end() noexcept { + return iterator{}; + } }; bool parse_regex(RegexInfo& regex_info) noexcept { @@ -679,8 +731,8 @@ Optional f$preg_replace(const string& pattern, const string& replacement // we need to replace PHP's back references with PCRE2 ones auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; kphp::stl::string pcre2_replacement{}; - while (parser.has_next()) { - if (auto term{parser.parse_term()}; std::holds_alternative(term)) { + for (const auto& term : parser) { + if (std::holds_alternative(term)) { auto c{std::get(term)}; pcre2_replacement.push_back(c); if (c == '$') { From fed3f769a30db5ee9611d33facb915a6d481cd4e Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 16:15:38 +0300 Subject: [PATCH 11/13] use std::isdigit --- runtime-light/stdlib/string/regex-functions.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 24a4cd3972..fc8df95f1a 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -108,11 +109,11 @@ int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subjec } std::optional try_get_backref(std::string_view preg_replacement) noexcept { - if (preg_replacement.empty() || preg_replacement[0] < '0' || preg_replacement[0] > '9') { + if (preg_replacement.empty() || !std::isdigit(preg_replacement[0])) { return std::nullopt; } - if (preg_replacement.size() == 1 || preg_replacement[1] < '0' || preg_replacement[1] > '9') { + if (preg_replacement.size() == 1 || !std::isdigit(preg_replacement[1])) { return backref{preg_replacement.substr(0, 1)}; } From ba98c715b22c56dd928b81de3c947360d21fc2a3 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 4 Dec 2025 14:37:39 +0300 Subject: [PATCH 12/13] add assertion --- runtime-light/stdlib/string/regex-functions.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index fc8df95f1a..00bf6d53df 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -126,6 +126,7 @@ class preg_replacement_parser { std::string_view preg_replacement; replacement_term parse_term_internal() noexcept { + kphp::log::assertion(!preg_replacement.empty()); auto first_char{preg_replacement.front()}; preg_replacement = preg_replacement.substr(1); if (preg_replacement.empty()) { From e3309d664e66b4c4e5b6cd58750fd28e63db1e26 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Fri, 5 Dec 2025 12:25:21 +0300 Subject: [PATCH 13/13] using backref = std::string_view; --- runtime-light/stdlib/string/regex-functions.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 00bf6d53df..7bbe643afe 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -35,6 +35,7 @@ constexpr size_t ERROR_BUFFER_LENGTH = 256; enum class trailing_unmatch : uint8_t { skip, include }; +using backref = std::string_view; using regex_pcre2_group_names_t = kphp::stl::vector; struct RegexInfo final { @@ -73,10 +74,6 @@ struct RegexInfo final { replacement(replacement_) {} }; -struct backref { - std::string_view digits; -}; - template requires((std::is_same_v && ...) && sizeof...(Args) > 0) bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { @@ -138,9 +135,9 @@ class preg_replacement_parser { if (preg_replacement.front() == '{') { return try_get_backref(preg_replacement.substr(1)) .and_then([this](auto value) noexcept -> std::optional { - auto digits_end_pos = 1 + value.digits.size(); + auto digits_end_pos = 1 + value.size(); if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { - preg_replacement = preg_replacement.substr(1 + value.digits.size() + 1); + preg_replacement = preg_replacement.substr(1 + value.size() + 1); return value; } @@ -151,7 +148,7 @@ class preg_replacement_parser { return try_get_backref(preg_replacement) .transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos = value.digits.size(); + auto digits_end_pos = value.size(); preg_replacement = preg_replacement.substr(digits_end_pos); return value; }) @@ -160,7 +157,7 @@ class preg_replacement_parser { case '\\': { // \1 auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos = value.digits.size(); + auto digits_end_pos = value.size(); preg_replacement = preg_replacement.substr(digits_end_pos); return value; })}; @@ -742,9 +739,9 @@ Optional f$preg_replace(const string& pattern, const string& replacement } } else { auto backreference{std::get(term)}; - pcre2_replacement.reserve(pcre2_replacement.size() + backreference.digits.size() + 3); + pcre2_replacement.reserve(pcre2_replacement.size() + backreference.size() + 3); pcre2_replacement.append("${"); - pcre2_replacement.append(backreference.digits); + pcre2_replacement.append(backreference); pcre2_replacement.append("}"); } }