From f6a626d7dd707a397f2af4ebc29e2c25694f576c Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 27 Nov 2025 12:31:45 +0300 Subject: [PATCH 01/68] split tests --- .idea/encodings.xml | 4 +- tests/phpt/dl/002_preg_replace_callback.php | 82 +++++ tests/phpt/dl/003_preg_match_all.php | 213 +++++++++++ tests/phpt/dl/004_preg_replace.php | 91 +++++ tests/phpt/dl/496_regex.php | 378 -------------------- 5 files changed, 389 insertions(+), 379 deletions(-) create mode 100644 tests/phpt/dl/002_preg_replace_callback.php create mode 100644 tests/phpt/dl/003_preg_match_all.php create mode 100644 tests/phpt/dl/004_preg_replace.php diff --git a/.idea/encodings.xml b/.idea/encodings.xml index ea72052b66..f8989491c4 100644 --- a/.idea/encodings.xml +++ b/.idea/encodings.xml @@ -1,9 +1,11 @@ - + + + diff --git a/tests/phpt/dl/002_preg_replace_callback.php b/tests/phpt/dl/002_preg_replace_callback.php new file mode 100644 index 0000000000..16c97a8946 --- /dev/null +++ b/tests/phpt/dl/002_preg_replace_callback.php @@ -0,0 +1,82 @@ +@ok callback benchmark k2_skip +'.$input[1].''; + } + + + $res = preg_replace_callback($regex, 'parseTagsRecursive', $input, -1, $count); + var_dump ($count); + return (string)$res; + +} + +$output = parseTagsRecursive($input); + +echo $output, "\n"; + + +/** + * @kphp-required + * @param string[] $x + * @return string + */ +function g($x) { + return "'{$x[0]}'"; +} + +var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb'))); + +@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz')))); + +/** + * @kphp-required + * @param string[] $m + * @return string + */ +function tmp($m) { + return strtolower($m[0]); +} + +var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC')); + +var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde')); diff --git a/tests/phpt/dl/003_preg_match_all.php b/tests/phpt/dl/003_preg_match_all.php new file mode 100644 index 0000000000..36271182ad --- /dev/null +++ b/tests/phpt/dl/003_preg_match_all.php @@ -0,0 +1,213 @@ +@ok callback benchmark k2_skip +~', 'This is no more', $v)); var_dump ($v); +var_dump (preg_match_all ('~.*?~', 'This', $v)); var_dump ($v); +var_dump (preg_match_all ('~.*~', 'This', $v)); var_dump ($v); + +var_dump (preg_match_all ('~<.*?>~', 'This is no more', $v)); var_dump ($v); + +var_dump (preg_match_all ('~(\d+|)~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~(\d+\|)~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~((\d)+\|)~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~((\d)+\|)+~', '12|34|567|', $v)); var_dump ($v); +var_dump (preg_match_all ('~((\d+)\|)+~', '12|34|567|', $v)); var_dump ($v); + + +$html = "bold textclick me"; + +preg_match_all("/(<([\w]+)[^>]*>)(.*?)(<\/\\2>)/", $html, $matches, PREG_SET_ORDER); + +foreach ($matches as $val) { + echo "matched: " . $val[0] . "\n"; + echo "part 1: " . $val[1] . "\n"; + echo "part 2: " . $val[2] . "\n"; + echo "part 3: " . $val[3] . "\n"; + echo "part 4: " . $val[4] . "\n\n"; +} + +preg_match_all("/\(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{4}/x", + "Call 555-1212 or 1-800-555-1212", $phones); + +$str = <<\w+): (?P\d+)/', $str, $matches); + +print_r($matches); + + +$str0 = <<1)?(?P[a-c]+):() (?P\d+)(?Pa)?)', '=A=i', "/(a)?/") as $pattern) { + foreach (array('((1)?([a-c]+):() (\d+)(a)?)', '=A=i', "/(a)?/") as $pattern) { + foreach (array($str0, '', "a", "1abAcaba", "dad") as $str) { + preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE); + if ($i == 0) { + var_dump ("preg_match_all($pattern, $str, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE)"); + var_dump ($matches); + } + + preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); + if ($i == 0) { + var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)"); + var_dump ($matches); + } + + preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER); + if ($i == 0) { + var_dump ("preg_match_all ($pattern, $str, PREG_PATTERN_ORDER)"); + var_dump ($matches); + } + + preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER); + if ($i == 0) { + var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER)"); + var_dump ($matches); + } + + preg_match ($pattern, $str, $matches, PREG_OFFSET_CAPTURE); + if ($i == 0) { + var_dump ("preg_match ($pattern, $str, PREG_OFFSET_CAPTURE)"); + var_dump ($matches); + } + + preg_match ($pattern, $str, $matches); + if ($i == 0) { + var_dump ("preg_match ($pattern, $str)"); + var_dump ($matches); + } + } + } +} + + +foreach (array(PREG_PATTERN_ORDER, PREG_SET_ORDER) as $flag) { + var_dump(preg_match_all('~ + (?P + (?P(\d{2})?\d\d) - + (?P(?:\d\d|[a-zA-Z]{2,3})) - + (?P[0-3]?\d)) + ~x', + '2006-05-13 e outra data: "12-Aug-37"', $m, $flag)); + + var_dump($m); +} + + +var_dump(preg_match_all('/((?:(?:unsigned|struct)\s+)?\w+)(?:\s*(\*+)\s+|\s+(\**))(\w+(?:\[\s*\w*\s*\])?)\s*(?:(=)[^,;]+)?((?:\s*,\s*\**\s*\w+(?:\[\s*\w*\s*\])?\s*(?:=[^,;]+)?)*)\s*;/', 'unsigned int xpto = 124; short a, b;', $m, PREG_SET_ORDER)); +var_dump($m); + +var_dump(preg_match_all('/(?:\([^)]+\))?(&?)([\w>.()-]+(?:\[\w+\])?)\s*,?((?:\)*\s*=)?)/', '&a, b, &c', $m, PREG_SET_ORDER)); +var_dump($m); + +var_dump(preg_match_all('/zend_parse_parameters(?:_ex\s*\([^,]+,[^,]+|\s*\([^,]+),\s*"([^"]*)"\s*,\s*([^{;]*)/', 'zend_parse_parameters( 0, "addd|s/", a, b, &c);', $m, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)); +var_dump($m); + + +$sampledata = " +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swfbitmap_init': +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:323: warning: assignment from incompatible pointer type +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swftextfield_setFont': +/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:2597: warning: passing arg 2 of `SWFTextField_setFont' from incompatible pointer type +/p2/var/php_gcov/PHP_4_4/ext/oci8/oci8.c:1027: warning: `oci_ping' defined but not used +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getpgid': +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:484: warning: implicit declaration of function `getpgid' +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getsid': +/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:506: warning: implicit declaration of function `getsid' +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_read_files': +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:302: warning: implicit declaration of function `pread' +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_write_files': +/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:340: warning: implicit declaration of function `pwrite' +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_get_option': +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1862: warning: unused variable `timeout' +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_set_option': +/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1941: warning: unused variable `timeout' +/p2/var/php_gcov/PHP_4_4/regex/regexec.c:19: warning: `nope' defined but not used +/p2/var/php_gcov/PHP_4_4/ext/standard/exec.c:50: warning: `php_make_safe_mode_command' defined but not used +/p2/var/php_gcov/PHP_4_4/ext/standard/image.c: In function `php_handle_jpc': +/p2/var/php_gcov/PHP_4_4/ext/standard/image.c:604: warning: unused variable `dummy_int' +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c: In function `php_gd_parse': +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c:1138: warning: implicit declaration of function `php_gd_lex' +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y: At top level: +/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y:864: warning: return type defaults to `int' +/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c: In function `zif_msg_receive': +/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c:318: warning: passing arg 2 of `php_var_unserialize' from incompatible pointer type +/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c: In function `zif_yp_err_string': +/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c:372: warning: assignment discards qualifiers from pointer target type +Zend/zend_language_scanner.c:5944: warning: `yy_fatal_error' defined but not used +Zend/zend_language_scanner.c:2627: warning: `yy_last_accepting_state' defined but not used +Zend/zend_language_scanner.c:2628: warning: `yy_last_accepting_cpos' defined but not used +Zend/zend_language_scanner.c:2634: warning: `yy_more_flag' defined but not used +Zend/zend_language_scanner.c:2635: warning: `yy_more_len' defined but not used +Zend/zend_language_scanner.c:5483: warning: `yyunput' defined but not used +Zend/zend_language_scanner.c:5929: warning: `yy_top_state' defined but not used +conflicts: 2 shift/reduce +Zend/zend_ini_scanner.c:457: warning: `yy_last_accepting_state' defined but not used +Zend/zend_ini_scanner.c:458: warning: `yy_last_accepting_cpos' defined but not used +Zend/zend_ini_scanner.c:1361: warning: `yyunput' defined but not used +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c: In function `_safe_emalloc': +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 3) +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 4) +/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 5) +/p2/var/php_gcov/PHP_4_4/Zend/zend_ini.c:338: warning: `zend_ini_displayer_cb' defined but not used +ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': +/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' +ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': +/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' +ext/ming/ming.o(.text+0xc115): In function `zim_swfmovie_namedAnchor': +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2207: undefined reference to `SWFMovie_namedAnchor' +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2209: undefined reference to `SWFMovie_xpto' +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2259: undefined reference to `SWFMovie_foo' +ext/ming/ming.o(.text+0x851): In function `zif_ming_setSWFCompression': +/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:154: undefined reference to `Ming_setSWFCompression' +"; + +$gcc_regex = '/^((.+)(\(\.text\+0x[[:xdigit:]]+\))?: In function [`\'](\w+)\':\s+)?'. + '((?(1)(?(3)[^:\n]+|\2)|[^:\n]+)):(\d+): (?:(error|warning):\s+)?(.+)'. + str_repeat('(?:\s+\5:(\d+): (?:(error|warning):\s+)?(.+))?', 99). // capture up to 100 errors + '/m'; + + +var_dump(preg_match_all($gcc_regex, $sampledata, $m, PREG_SET_ORDER)); +print_r($m); + + +var_dump(preg_match_all('|(\w+)://([^\s"<]*[\w+#?/&=])|', "This is a text string", $matches, PREG_SET_ORDER)); +var_dump($matches); + +/** + * @return mixed + */ +function func1(){ + $string = 'what the word and the other word the'; + preg_match_all('/(?Pthe)/', $string, $matches); + return $matches['word']; +} +$words = func1(); +var_dump($words); + + +$pattern = +"/\s([\w_\.\/]+)(?:=([\'\"]?(?:[\w\d\s\?=\(\)\.,'_#\/\\:;&-]|(?:\\\\\"|\\\')?)+[\'\"]?))?/"; +$context = ""; + +$match = array(); + +if ($result = preg_match_all($pattern, $context, $match)) +{ +var_dump($result); +var_dump($match); +} + + +var_dump(preg_match_all('/\d+/', '123 456 789 012', $match, 0)); +var_dump($match); diff --git a/tests/phpt/dl/004_preg_replace.php b/tests/phpt/dl/004_preg_replace.php new file mode 100644 index 0000000000..74807b08ea --- /dev/null +++ b/tests/phpt/dl/004_preg_replace.php @@ -0,0 +1,91 @@ +@ok callback benchmark k2_skip +~', 'This is no more', $v)); var_dump ($v); -var_dump (preg_match_all ('~.*?~', 'This', $v)); var_dump ($v); -var_dump (preg_match_all ('~.*~', 'This', $v)); var_dump ($v); - -var_dump (preg_match_all ('~<.*?>~', 'This is no more', $v)); var_dump ($v); - -var_dump (preg_match_all ('~(\d+|)~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~(\d+\|)~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~((\d)+\|)~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~((\d)+\|)+~', '12|34|567|', $v)); var_dump ($v); -var_dump (preg_match_all ('~((\d+)\|)+~', '12|34|567|', $v)); var_dump ($v); - -var_dump (preg_replace ('~|q~', '{\0}', 'eq')); -var_dump (preg_replace ('~|q~', '{\0}', 'ex')); - -var_dump (preg_replace ('~|q~', 'w', 'e')); -var_dump (preg_replace ('~|q~', 'w', 'q')); -/* bug in PHP -var_dump (preg_replace ('~|й~u', 'п', 'р')); -*/ -var_dump (preg_replace ('~|й~u', 'п', 'й')); var_dump (preg_split ('~|й~u', 'п')); var_dump (preg_split ('~|й~u', 'й')); -define('RE_URL_PATTERN', '(?bold textclick me"; - -preg_match_all("/(<([\w]+)[^>]*>)(.*?)(<\/\\2>)/", $html, $matches, PREG_SET_ORDER); - -foreach ($matches as $val) { - echo "matched: " . $val[0] . "\n"; - echo "part 1: " . $val[1] . "\n"; - echo "part 2: " . $val[2] . "\n"; - echo "part 3: " . $val[3] . "\n"; - echo "part 4: " . $val[4] . "\n\n"; -} - -preg_match_all("/\(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{4}/x", - "Call 555-1212 or 1-800-555-1212", $phones); - -$str = <<\w+): (?P\d+)/', $str, $matches); - -print_r($matches); - $keywords = preg_split("/[\s,]+/", "hypertext language, programming"); print_r($keywords); @@ -124,58 +16,6 @@ function prcConvertHyperref($matches) { $chars = preg_split('/ /', $str, -1, PREG_SPLIT_OFFSET_CAPTURE); print_r($chars); -var_dump (preg_replace ('~a|~', 'b', 'a')); -var_dump (preg_replace ('~a|~', 'a', 'b')); - -$str0 = <<1)?(?P[a-c]+):() (?P\d+)(?Pa)?)', '=A=i', "/(a)?/") as $pattern) { - foreach (array('((1)?([a-c]+):() (\d+)(a)?)', '=A=i', "/(a)?/") as $pattern) { - foreach (array($str0, '', "a", "1abAcaba", "dad") as $str) { - preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE); - if ($i == 0) { - var_dump ("preg_match_all($pattern, $str, PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE)"); - var_dump ($matches); - } - - preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE); - if ($i == 0) { - var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)"); - var_dump ($matches); - } - - preg_match_all ($pattern, $str, $matches, PREG_PATTERN_ORDER); - if ($i == 0) { - var_dump ("preg_match_all ($pattern, $str, PREG_PATTERN_ORDER)"); - var_dump ($matches); - } - - preg_match_all ($pattern, $str, $matches, PREG_SET_ORDER); - if ($i == 0) { - var_dump ("preg_match_all ($pattern, $str, PREG_SET_ORDER)"); - var_dump ($matches); - } - - preg_match ($pattern, $str, $matches, PREG_OFFSET_CAPTURE); - if ($i == 0) { - var_dump ("preg_match ($pattern, $str, PREG_OFFSET_CAPTURE)"); - var_dump ($matches); - } - - preg_match ($pattern, $str, $matches); - if ($i == 0) { - var_dump ("preg_match ($pattern, $str)"); - var_dump ($matches); - } - } - } -} - foreach (array('2006-05-13', '06-12-12', 'data: "12-Aug-87"') as $s) { var_dump(preg_match('~ (?P @@ -187,123 +27,6 @@ function prcConvertHyperref($matches) { var_dump($m); } -foreach (array(PREG_PATTERN_ORDER, PREG_SET_ORDER) as $flag) { - var_dump(preg_match_all('~ - (?P - (?P(\d{2})?\d\d) - - (?P(?:\d\d|[a-zA-Z]{2,3})) - - (?P[0-3]?\d)) - ~x', - '2006-05-13 e outra data: "12-Aug-37"', $m, $flag)); - - var_dump($m); -} - - -var_dump(preg_match_all('/((?:(?:unsigned|struct)\s+)?\w+)(?:\s*(\*+)\s+|\s+(\**))(\w+(?:\[\s*\w*\s*\])?)\s*(?:(=)[^,;]+)?((?:\s*,\s*\**\s*\w+(?:\[\s*\w*\s*\])?\s*(?:=[^,;]+)?)*)\s*;/', 'unsigned int xpto = 124; short a, b;', $m, PREG_SET_ORDER)); -var_dump($m); - -var_dump(preg_match_all('/(?:\([^)]+\))?(&?)([\w>.()-]+(?:\[\w+\])?)\s*,?((?:\)*\s*=)?)/', '&a, b, &c', $m, PREG_SET_ORDER)); -var_dump($m); - -var_dump(preg_match_all('/zend_parse_parameters(?:_ex\s*\([^,]+,[^,]+|\s*\([^,]+),\s*"([^"]*)"\s*,\s*([^{;]*)/', 'zend_parse_parameters( 0, "addd|s/", a, b, &c);', $m, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)); -var_dump($m); - -var_dump(preg_replace(array('@//.*@', '@/\*.*\*/@sU'), array('', 'preg_replace("/[^\r\n]+/", "", \'$0\')'), "hello\n//x \n/*\ns\n*/")); - -$sampledata = " -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swfbitmap_init': -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:323: warning: assignment from incompatible pointer type -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c: In function `zif_swftextfield_setFont': -/p2/var/php_gcov/PHP_4_4/ext/ming/ming.c:2597: warning: passing arg 2 of `SWFTextField_setFont' from incompatible pointer type -/p2/var/php_gcov/PHP_4_4/ext/oci8/oci8.c:1027: warning: `oci_ping' defined but not used -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getpgid': -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:484: warning: implicit declaration of function `getpgid' -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c: In function `zif_posix_getsid': -/p2/var/php_gcov/PHP_4_4/ext/posix/posix.c:506: warning: implicit declaration of function `getsid' -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_read_files': -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:302: warning: implicit declaration of function `pread' -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c: In function `ps_write_files': -/p2/var/php_gcov/PHP_4_4/ext/session/mod_files.c:340: warning: implicit declaration of function `pwrite' -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_get_option': -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1862: warning: unused variable `timeout' -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c: In function `zif_socket_set_option': -/p2/var/php_gcov/PHP_4_4/ext/sockets/sockets.c:1941: warning: unused variable `timeout' -/p2/var/php_gcov/PHP_4_4/regex/regexec.c:19: warning: `nope' defined but not used -/p2/var/php_gcov/PHP_4_4/ext/standard/exec.c:50: warning: `php_make_safe_mode_command' defined but not used -/p2/var/php_gcov/PHP_4_4/ext/standard/image.c: In function `php_handle_jpc': -/p2/var/php_gcov/PHP_4_4/ext/standard/image.c:604: warning: unused variable `dummy_int' -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c: In function `php_gd_parse': -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.c:1138: warning: implicit declaration of function `php_gd_lex' -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y: At top level: -/p2/var/php_gcov/PHP_4_4/ext/standard/parsedate.y:864: warning: return type defaults to `int' -/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c: In function `zif_msg_receive': -/p2/var/php_gcov/PHP_4_4/ext/sysvmsg/sysvmsg.c:318: warning: passing arg 2 of `php_var_unserialize' from incompatible pointer type -/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c: In function `zif_yp_err_string': -/p2/var/php_gcov/PHP_4_4/ext/yp/yp.c:372: warning: assignment discards qualifiers from pointer target type -Zend/zend_language_scanner.c:5944: warning: `yy_fatal_error' defined but not used -Zend/zend_language_scanner.c:2627: warning: `yy_last_accepting_state' defined but not used -Zend/zend_language_scanner.c:2628: warning: `yy_last_accepting_cpos' defined but not used -Zend/zend_language_scanner.c:2634: warning: `yy_more_flag' defined but not used -Zend/zend_language_scanner.c:2635: warning: `yy_more_len' defined but not used -Zend/zend_language_scanner.c:5483: warning: `yyunput' defined but not used -Zend/zend_language_scanner.c:5929: warning: `yy_top_state' defined but not used -conflicts: 2 shift/reduce -Zend/zend_ini_scanner.c:457: warning: `yy_last_accepting_state' defined but not used -Zend/zend_ini_scanner.c:458: warning: `yy_last_accepting_cpos' defined but not used -Zend/zend_ini_scanner.c:1361: warning: `yyunput' defined but not used -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c: In function `_safe_emalloc': -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 3) -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 4) -/p2/var/php_gcov/PHP_4_4/Zend/zend_alloc.c:237: warning: long int format, size_t arg (arg 5) -/p2/var/php_gcov/PHP_4_4/Zend/zend_ini.c:338: warning: `zend_ini_displayer_cb' defined but not used -ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': -/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' -ext/mysql/libmysql/my_tempnam.o(.text+0x80): In function `my_tempnam': -/p2/var/php_gcov/PHP_4_4/ext/mysql/libmysql/my_tempnam.c:115: warning: the use of `tempnam' is dangerous, better use `mkstemp' -ext/ming/ming.o(.text+0xc115): In function `zim_swfmovie_namedAnchor': -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2207: undefined reference to `SWFMovie_namedAnchor' -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2209: undefined reference to `SWFMovie_xpto' -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:2259: undefined reference to `SWFMovie_foo' -ext/ming/ming.o(.text+0x851): In function `zif_ming_setSWFCompression': -/p2/var/php_gcov/PHP_5_2/ext/ming/ming.c:154: undefined reference to `Ming_setSWFCompression' -"; - -$gcc_regex = '/^((.+)(\(\.text\+0x[[:xdigit:]]+\))?: In function [`\'](\w+)\':\s+)?'. - '((?(1)(?(3)[^:\n]+|\2)|[^:\n]+)):(\d+): (?:(error|warning):\s+)?(.+)'. - str_repeat('(?:\s+\5:(\d+): (?:(error|warning):\s+)?(.+))?', 99). // capture up to 100 errors - '/m'; - - -var_dump(preg_match_all($gcc_regex, $sampledata, $m, PREG_SET_ORDER)); -print_r($m); - - -/** - * @kphp-required - * @param string[] $param - * @return string - */ -function cb($param) { - var_dump($param); - return "yes!"; -} - -#var_dump(preg_replace('', array(), '')); - -var_dump(preg_match_all('|(\w+)://([^\s"<]*[\w+#?/&=])|', "This is a text string", $matches, PREG_SET_ORDER)); -var_dump($matches); - -/** - * @return mixed - */ -function func1(){ - $string = 'what the word and the other word the'; - preg_match_all('/(?Pthe)/', $string, $matches); - return $matches['word']; -} -$words = func1(); -var_dump($words); $foo = 'bla bla bla'; @@ -318,18 +41,6 @@ function func1(){ var_dump(preg_match('@^(/([a-z]+))+$@', $subject, $m)); var_dump($m); var_dump(preg_match('@^(/(?:[a-z]+))+$@', $subject, $m)); var_dump($m); -$pattern = -"/\s([\w_\.\/]+)(?:=([\'\"]?(?:[\w\d\s\?=\(\)\.,'_#\/\\:;&-]|(?:\\\\\"|\\\')?)+[\'\"]?))?/"; -$context = ""; - -$match = array(); - -if ($result = preg_match_all($pattern, $context, $match)) -{ -var_dump($result); -var_dump($match); -} - $regex = '/(insert|drop|create|select|delete|update)([^;\']*('."('[^']*')+".')?)*(;|$)/i'; $sql = 'SELECT * FROM #__components'; @@ -343,24 +54,12 @@ function func1(){ */ - -var_dump(preg_replace(array('/\da(.)/ui', '@..@'), '$1', '12Abc')); -var_dump(preg_replace(array('/\da(.)/ui', '@(.)@'), '$1', array('x','a2aA', '1av2Ab'))); - - -var_dump(preg_replace(array('/[\w]+/'), array('$'), array('xyz', 'bdbd'))); -var_dump(preg_replace(array('/\s+/', '~[b-d]~'), array('$'), array('x y', 'bd bc'))); - - var_dump(preg_match('/\d+/', '123 456 789 012', $match, 0)); var_dump($match); var_dump(preg_match('/\d+/', '123 456 789 012', $match, 0)); var_dump($match); -var_dump(preg_match_all('/\d+/', '123 456 789 012', $match, 0)); -var_dump($match); - var_dump(preg_split('/PHP_(?:NAMED_)?(?:FUNCTION|METHOD)\s*\((\w+(?:,\s*\w+)?)\)/', "PHP_FUNCTION(s, preg_match)\n{\nlalala", -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE)); @@ -384,80 +83,3 @@ function func1(){ var_dump(preg_split('/(\d*)/', 'ab2c3u', -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE)); var_dump(preg_split('/(\d*)/', 'ab2c3u', -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE)); var_dump(preg_split('/(\d*)/', 'ab2c3u', -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE)); - - - - - -/* -PHP 5.2.0 - 5.3.6 bug -$text = '[CODE]<td align="$stylevar[right]">[/CODE]'; -$result = preg_replace(array('#\[(right)\](((?R)|[^[]+?|\[)*)\[/\\1\]#siU', '#\[(right)\](((?R)|[^[]+?|\[)*)\[/\\1\]#siU'), '', $text); -var_dump($text); -var_dump($result); - -$result = preg_replace('#\[(right)\](((?R)|[^[]+?|\[)*)\[/\\1\]#siU', '', $text); -var_dump($text); -var_dump($result); -*/ - - - - - - - -$input = "plain [indent] deep [indent] [abcd]deeper[/abcd] [/indent] deep [/indent] plain"; - -/** - * @param mixed $input - * @return string - */ -function parseTagsRecursive($input) -{ - global $count; - $regex = '#\[indent]((?:[^[]|\[(?!/?indent])|(?R))+)\[/indent]#'; - - if (is_array($input)) { - $input = '
'.$input[1].'
'; - } - - - $res = preg_replace_callback($regex, 'parseTagsRecursive', $input, -1, $count); - var_dump ($count); - return (string)$res; - -} - -$output = parseTagsRecursive($input); - -echo $output, "\n"; - - -/** - * @kphp-required - * @param string[] $x - * @return string - */ -function g($x) { - return "'{$x[0]}'"; -} - -var_dump(preg_replace_callback('@\b\w{1,2}\b@', 'g', array('a b3 bcd', 'v' => 'aksfjk', 12 => 'aa bb'))); - -@var_dump(preg_replace_callback('~\A.~', 'g', array(array('xyz')))); - -/** - * @kphp-required - * @param string[] $m - * @return string - */ -function tmp($m) { - return strtolower($m[0]); -} - -var_dump(preg_replace_callback('~\A.~', 'tmp', 'ABC')); - -var_dump(preg_replace_callback("/(ab)(cd)(e)/", "cb", 'abcde')); - - From 2e0adf92a936c2eddedbc40692a55ec8a9e6d897 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 27 Nov 2025 12:25:06 +0300 Subject: [PATCH 02/68] fix preg_match_all --- .../stdlib/string/regex-functions.cpp | 252 +++++++++++++++--- tests/phpt/dl/003_preg_match_all.php | 2 +- 2 files changed, 209 insertions(+), 45 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 0aba6c839d..69e6b0dc92 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -71,6 +71,37 @@ struct RegexInfo final { replacement(replacement_) {} }; +class pcre2_match_view { +public: + pcre2_match_view() = default; + + pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t rc) noexcept + : subject_data_{subject}, + ovector_ptr_{ovector}, + num_groups_{rc} {} + + int32_t size() const noexcept { + return num_groups_; + } + + std::optional get_group(int i) const noexcept { + kphp::log::assertion(i >= 0 && i < num_groups_ && ovector_ptr_); + PCRE2_SIZE start{ovector_ptr_[2 * i]}; + PCRE2_SIZE end{ovector_ptr_[2 * i + 1]}; + + if (start == PCRE2_UNSET) { + return std::nullopt; + } + + return subject_data_.substr(start, end - start); + } + +private: + std::string_view subject_data_; + PCRE2_SIZE* ovector_ptr_; + int32_t num_groups_; +}; + template requires((std::is_same_v && ...) && sizeof...(Args) > 0) bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { @@ -300,15 +331,14 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { return true; } -bool match_regex(RegexInfo& regex_info, size_t offset) noexcept { - regex_info.match_count = 0; +std::optional match_regex(const RegexInfo& regex_info, size_t offset, uint32_t match_options) noexcept { const auto& regex_state{RegexInstanceState::get()}; if (regex_info.regex_code == nullptr || !regex_state.match_context) [[unlikely]] { return false; } int32_t match_count{pcre2_match_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), offset, - regex_info.match_options, regex_state.regex_pcre2_match_data.get(), regex_state.match_context.get())}; + match_options, regex_state.regex_pcre2_match_data.get(), regex_state.match_context.get())}; // From https://www.pcre.org/current/doc/html/pcre2_match.html // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. @@ -318,10 +348,122 @@ bool match_regex(RegexInfo& regex_info, size_t offset) noexcept { kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data()); return false; } - regex_info.match_count = match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; - return true; + return match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; } +class pcre2_iterator { +public: + using value_type = pcre2_match_view; + using difference_type = std::ptrdiff_t; + using reference = value_type; + using pointer = const value_type*; + + pcre2_iterator() noexcept + : regex_info_{nullptr}, + match_data_{nullptr}, + is_end_{true}, + is_valid_{true} {} + + pcre2_iterator(const RegexInfo& info, size_t match_from) noexcept + : regex_info_{std::addressof(info)}, + match_options_{info.match_options}, + current_offset_{match_from}, + is_end_{true}, + is_valid_{false} { + if (info.regex_code == nullptr) { + return; + } + + const auto& regex_state{RegexInstanceState::get()}; + match_data_ = regex_state.regex_pcre2_match_data.get(); + if (!match_data_) { + return; + } + + is_valid_ = true; + is_end_ = false; + increment(); + } + + bool is_terminal() const noexcept { + return !is_valid_ || is_end_; + } + + bool is_valid() const noexcept { + return is_valid_; + } + + reference operator*() const noexcept { + PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(match_data_)}; + return pcre2_match_view{regex_info_->subject, ovector, last_rc_}; + } + + pcre2_iterator& operator++() noexcept { + increment(); + return *this; + } + pcre2_iterator operator++(int) noexcept { + pcre2_iterator temp{*this}; + increment(); + return temp; + } + + bool operator==(const pcre2_iterator& other) const noexcept { + return is_terminal() && other.is_terminal(); + } + bool operator!=(const pcre2_iterator& other) const noexcept { + return !(*this == other); + } + +private: + void increment() noexcept { + kphp::log::trace("incrementing pcre2_iterator with offset={}", current_offset_); + auto& ri{*regex_info_}; + auto* const ovector{pcre2_get_ovector_pointer_8(match_data_)}; + + while (true) { + auto match_count_opt{match_regex(ri, current_offset_, match_options_)}; + if (!match_count_opt.has_value()) { + is_end_ = true; + is_valid_ = false; + return; + } + + last_rc_ = *match_count_opt; + + if (last_rc_ == 0) { + if (match_options_ == ri.match_options || current_offset_ == ri.subject.size()) { + is_end_ = true; + return; + } + ++current_offset_; + current_offset_ = static_cast(ri.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(current_offset_, ri.subject) : current_offset_; + match_options_ = ri.match_options; + continue; + } + + PCRE2_SIZE match_start{ovector[0]}; + PCRE2_SIZE match_end{ovector[1]}; + + current_offset_ = match_end; + if (match_end == match_start) { + match_options_ |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } else { + match_options_ = ri.match_options; + } + return; + } + } + + const RegexInfo* regex_info_{nullptr}; + uint64_t match_options_; + PCRE2_SIZE current_offset_; + pcre2_match_data_8* match_data_{nullptr}; + int32_t last_rc_{}; + bool is_end_{false}; + bool is_valid_{false}; +}; + // returns the ending offset of the entire match PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional> opt_matches, trailing_unmatch last_unmatched_policy) noexcept { @@ -455,30 +597,32 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { string str_after_replace{regex_info.subject.data(), static_cast(regex_info.subject.size())}; for (; regex_info.replace_count < limit; ++regex_info.replace_count) { - if (!match_regex(regex_info, match_offset)) [[unlikely]] { + auto match_count_opt{match_regex(regex_info, match_offset, regex_info.match_options)}; + if (!match_count_opt.has_value()) [[unlikely]] { return false; } + regex_info.match_count = *match_count_opt; if (regex_info.match_count == 0) { break; } const auto* ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; - const auto match_start{ovector[0]}; - const auto match_end{ovector[1]}; + const auto match_start_offset{ovector[0]}; + const auto match_end_offset{ovector[1]}; length_after_replace = buffer_length; - if (auto replace_one{pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), - substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), - reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), - reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; - replace_one != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error {}", replace_one); + if (auto replace_one_rc{pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), + substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), + reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), + reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; + replace_one_rc != 1) [[unlikely]] { + kphp::log::warning("pcre2_substitute error {}", replace_one_rc); return false; } - match_offset = match_end; - replacement_diff_acc += regex_info.replacement.size() - (match_end - match_start); - substitute_offset = match_end + replacement_diff_acc; + match_offset = match_end_offset; + replacement_diff_acc += regex_info.replacement.size() - (match_end_offset - match_start_offset); + substitute_offset = match_end_offset + replacement_diff_acc; str_after_replace = {runtime_ctx.static_SB.buffer(), static_cast(length_after_replace)}; } @@ -499,15 +643,26 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt int64_t flags, int64_t offset) noexcept { RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; - bool success{valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)}; - success &= correct_offset(offset, regex_info.subject); - success &= parse_regex(regex_info); - success &= compile_regex(regex_info); - success &= collect_group_names(regex_info); - success &= match_regex(regex_info, offset); - if (!success) [[unlikely]] { + if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + return false; + } + if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } + if (!parse_regex(regex_info)) [[unlikely]] { + return false; + } + if (!compile_regex(regex_info)) [[unlikely]] { + return false; + } + if (!collect_group_names(regex_info)) [[unlikely]] { + return false; + } + auto match_count_opt = match_regex(regex_info, offset, regex_info.match_options); + if (!match_count_opt.has_value()) [[unlikely]] { + return false; + } + regex_info.match_count = *match_count_opt; std::optional> matches{}; if (opt_matches.has_value()) { @@ -525,12 +680,22 @@ Optional f$preg_match_all(const string& pattern, const string& subject, int64_t entire_match_count{}; RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; - bool success{valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, - kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)}; - success &= correct_offset(offset, regex_info.subject); - success &= parse_regex(regex_info); - success &= compile_regex(regex_info); - success &= collect_group_names(regex_info); + if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, kphp::regex::PREG_OFFSET_CAPTURE, + kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + return false; + } + if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { + return false; + } + if (!parse_regex(regex_info)) [[unlikely]] { + return false; + } + if (!compile_regex(regex_info)) [[unlikely]] { + return false; + } + if (!collect_group_names(regex_info)) [[unlikely]] { + return false; + } std::optional> matches{}; if (opt_matches.has_value()) { @@ -541,7 +706,7 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } // pre-init matches in case of pattern order - if (success && matches.has_value() && !static_cast(flags & kphp::regex::PREG_SET_ORDER)) [[likely]] { + if (matches.has_value() && !static_cast(flags & kphp::regex::PREG_SET_ORDER)) [[likely]] { auto& inner_ref{(*matches).get()}; const array init_val{}; for (const auto* group_name : regex_info.group_names) { @@ -552,23 +717,22 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } } - while (offset <= subject.size() && (success &= match_regex(regex_info, offset))) { - const auto next_offset{set_all_matches(regex_info, flags, matches)}; + pcre2_iterator it{regex_info, static_cast(offset)}; + if (!it.is_valid()) { + return false; + } + + pcre2_iterator end_it{}; + + for (; it != end_it; ++it) { + pcre2_match_view match_view{*it}; + regex_info.match_count = match_view.size(); + set_all_matches(regex_info, flags, matches); if (regex_info.match_count > 0) { ++entire_match_count; - if (next_offset == PCRE2_UNSET) [[unlikely]] { - break; - } else if (next_offset == offset) [[unlikely]] { - offset = next_offset + 1; - } else { - offset = next_offset; - } - } else { - ++offset; - offset = static_cast(regex_info.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(offset, regex_info.subject) : offset; } } - if (!success) [[unlikely]] { + if (!it.is_valid()) [[unlikely]] { return false; } diff --git a/tests/phpt/dl/003_preg_match_all.php b/tests/phpt/dl/003_preg_match_all.php index 36271182ad..96b2aa4a36 100644 --- a/tests/phpt/dl/003_preg_match_all.php +++ b/tests/phpt/dl/003_preg_match_all.php @@ -1,4 +1,4 @@ -@ok callback benchmark k2_skip +@ok callback benchmark ~', 'This is no more', $v)); var_dump ($v); From 04695df28399a6f27fad479d180318128206f318 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 12:06:49 +0300 Subject: [PATCH 03/68] int -> size_t --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 69e6b0dc92..928bd26685 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -84,7 +84,7 @@ class pcre2_match_view { return num_groups_; } - std::optional get_group(int i) const noexcept { + std::optional get_group(size_t i) const noexcept { kphp::log::assertion(i >= 0 && i < num_groups_ && ovector_ptr_); PCRE2_SIZE start{ovector_ptr_[2 * i]}; PCRE2_SIZE end{ovector_ptr_[2 * i + 1]}; From d743e587b0756eaebefa256272b459f1aaa55862 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 12:15:50 +0300 Subject: [PATCH 04/68] fix names --- .../stdlib/string/regex-functions.cpp | 90 +++++++++---------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 928bd26685..ecaca77bd1 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -75,10 +75,10 @@ class pcre2_match_view { public: pcre2_match_view() = default; - pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t rc) noexcept + pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t ret_code) noexcept : subject_data_{subject}, ovector_ptr_{ovector}, - num_groups_{rc} {} + num_groups_{ret_code} {} int32_t size() const noexcept { return num_groups_; @@ -359,43 +359,43 @@ class pcre2_iterator { using pointer = const value_type*; pcre2_iterator() noexcept - : regex_info_{nullptr}, - match_data_{nullptr}, - is_end_{true}, - is_valid_{true} {} + : m_regex_info{nullptr}, + m_match_data{nullptr}, + m_is_end{true}, + m_is_valid{true} {} pcre2_iterator(const RegexInfo& info, size_t match_from) noexcept - : regex_info_{std::addressof(info)}, - match_options_{info.match_options}, - current_offset_{match_from}, - is_end_{true}, - is_valid_{false} { + : m_regex_info{std::addressof(info)}, + m_match_options{info.match_options}, + m_current_offset{match_from}, + m_is_end{true}, + m_is_valid{false} { if (info.regex_code == nullptr) { return; } const auto& regex_state{RegexInstanceState::get()}; - match_data_ = regex_state.regex_pcre2_match_data.get(); - if (!match_data_) { + m_match_data = regex_state.regex_pcre2_match_data.get(); + if (!m_match_data) { return; } - is_valid_ = true; - is_end_ = false; + m_is_valid = true; + m_is_end = false; increment(); } bool is_terminal() const noexcept { - return !is_valid_ || is_end_; + return !m_is_valid || m_is_end; } bool is_valid() const noexcept { - return is_valid_; + return m_is_valid; } reference operator*() const noexcept { - PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(match_data_)}; - return pcre2_match_view{regex_info_->subject, ovector, last_rc_}; + PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data)}; + return pcre2_match_view{m_regex_info->subject, ovector, m_last_ret_code}; } pcre2_iterator& operator++() noexcept { @@ -417,51 +417,51 @@ class pcre2_iterator { private: void increment() noexcept { - kphp::log::trace("incrementing pcre2_iterator with offset={}", current_offset_); - auto& ri{*regex_info_}; - auto* const ovector{pcre2_get_ovector_pointer_8(match_data_)}; + kphp::log::trace("incrementing pcre2_iterator with offset={}", m_current_offset); + auto& ri{*m_regex_info}; + auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { - auto match_count_opt{match_regex(ri, current_offset_, match_options_)}; + auto match_count_opt{match_regex(ri, m_current_offset, m_match_options)}; if (!match_count_opt.has_value()) { - is_end_ = true; - is_valid_ = false; + m_is_end = true; + m_is_valid = false; return; } - last_rc_ = *match_count_opt; + m_last_ret_code = *match_count_opt; - if (last_rc_ == 0) { - if (match_options_ == ri.match_options || current_offset_ == ri.subject.size()) { - is_end_ = true; + if (m_last_ret_code == 0) { + if (m_match_options == ri.match_options || m_current_offset == ri.subject.size()) { + m_is_end = true; return; } - ++current_offset_; - current_offset_ = static_cast(ri.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(current_offset_, ri.subject) : current_offset_; - match_options_ = ri.match_options; + ++m_current_offset; + m_current_offset = static_cast(ri.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, ri.subject) : m_current_offset; + m_match_options = ri.match_options; continue; } PCRE2_SIZE match_start{ovector[0]}; PCRE2_SIZE match_end{ovector[1]}; - current_offset_ = match_end; + m_current_offset = match_end; if (match_end == match_start) { - match_options_ |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } else { - match_options_ = ri.match_options; + m_match_options = ri.match_options; } return; } } - const RegexInfo* regex_info_{nullptr}; - uint64_t match_options_; - PCRE2_SIZE current_offset_; - pcre2_match_data_8* match_data_{nullptr}; - int32_t last_rc_{}; - bool is_end_{false}; - bool is_valid_{false}; + const RegexInfo* m_regex_info{nullptr}; + uint64_t m_match_options; + PCRE2_SIZE m_current_offset; + pcre2_match_data_8* m_match_data{nullptr}; + int32_t m_last_ret_code{}; + bool m_is_end{false}; + bool m_is_valid{false}; }; // returns the ending offset of the entire match @@ -611,12 +611,12 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { const auto match_end_offset{ovector[1]}; length_after_replace = buffer_length; - if (auto replace_one_rc{pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), + if (auto replace_one_ret_code{pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; - replace_one_rc != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error {}", replace_one_rc); + replace_one_ret_code != 1) [[unlikely]] { + kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); return false; } From f0d0ca17a257b9ff8a039a3d87c6eb71ad5aa190 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 12:26:57 +0300 Subject: [PATCH 05/68] cc --- runtime-light/stdlib/string/regex-functions.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index ecaca77bd1..275b5e2d94 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -86,6 +86,7 @@ class pcre2_match_view { std::optional get_group(size_t i) const noexcept { kphp::log::assertion(i >= 0 && i < num_groups_ && ovector_ptr_); + // ovector is an array of offset pairs PCRE2_SIZE start{ovector_ptr_[2 * i]}; PCRE2_SIZE end{ovector_ptr_[2 * i + 1]}; @@ -611,8 +612,8 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { const auto match_end_offset{ovector[1]}; length_after_replace = buffer_length; - if (auto replace_one_ret_code{pcre2_substitute_8(regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), - substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), + if (auto replace_one_ret_code{pcre2_substitute_8( + regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { From c3b01c96d06b859d84bad34be38be465a9f6a1b8 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 12:42:58 +0300 Subject: [PATCH 06/68] cc --- runtime-light/stdlib/string/regex-functions.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 275b5e2d94..dd40d4776a 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -423,8 +423,10 @@ class pcre2_iterator { auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { + // Try to find match auto match_count_opt{match_regex(ri, m_current_offset, m_match_options)}; if (!match_count_opt.has_value()) { + // std::nullopt means error m_is_end = true; m_is_valid = false; return; @@ -433,23 +435,30 @@ class pcre2_iterator { m_last_ret_code = *match_count_opt; if (m_last_ret_code == 0) { + // If match is not found if (m_match_options == ri.match_options || m_current_offset == ri.subject.size()) { + // Here we are sure that there are no more matches here m_is_end = true; return; } + // Here we know that we were looking for a non-empty and anchored match, + // and we're going to try searching from the next character with the default options. ++m_current_offset; m_current_offset = static_cast(ri.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, ri.subject) : m_current_offset; m_match_options = ri.match_options; continue; } + // Match found PCRE2_SIZE match_start{ovector[0]}; PCRE2_SIZE match_end{ovector[1]}; m_current_offset = match_end; if (match_end == match_start) { + // If an empty match is found, try searching for a non-empty attached match next time. m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } else { + // Else use default options m_match_options = ri.match_options; } return; @@ -613,8 +622,8 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { length_after_replace = buffer_length; if (auto replace_one_ret_code{pcre2_substitute_8( - regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), substitute_offset, regex_info.replace_options, nullptr, regex_state.match_context.get(), - reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), + regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), substitute_offset, + regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); From f9cfe5c20a0a240041c279e7a70950eeb7c199d2 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 12:50:41 +0300 Subject: [PATCH 07/68] format --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index dd40d4776a..3e718bcb43 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -623,8 +623,8 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { length_after_replace = buffer_length; if (auto replace_one_ret_code{pcre2_substitute_8( regex_info.regex_code, reinterpret_cast(str_after_replace.c_str()), str_after_replace.size(), substitute_offset, - regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), - reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; + regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), + regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); return false; From 0776c8f1d31afbf6a04b0c237067d34c2dbae51a Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 1 Dec 2025 18:15:01 +0300 Subject: [PATCH 08/68] fixed names --- .../stdlib/string/regex-functions.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 3e718bcb43..21e8a79055 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -76,31 +76,31 @@ class pcre2_match_view { pcre2_match_view() = default; pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t ret_code) noexcept - : subject_data_{subject}, - ovector_ptr_{ovector}, - num_groups_{ret_code} {} + : m_subject_data{subject}, + m_ovector_ptr{ovector}, + m_num_groups{ret_code} {} int32_t size() const noexcept { - return num_groups_; + return m_num_groups; } std::optional get_group(size_t i) const noexcept { - kphp::log::assertion(i >= 0 && i < num_groups_ && ovector_ptr_); + kphp::log::assertion(i >= 0 && i < m_num_groups && m_ovector_ptr); // ovector is an array of offset pairs - PCRE2_SIZE start{ovector_ptr_[2 * i]}; - PCRE2_SIZE end{ovector_ptr_[2 * i + 1]}; + PCRE2_SIZE start{m_ovector_ptr[2 * i]}; + PCRE2_SIZE end{m_ovector_ptr[2 * i + 1]}; if (start == PCRE2_UNSET) { return std::nullopt; } - return subject_data_.substr(start, end - start); + return m_subject_data.substr(start, end - start); } private: - std::string_view subject_data_; - PCRE2_SIZE* ovector_ptr_; - int32_t num_groups_; + std::string_view m_subject_data; + PCRE2_SIZE* m_ovector_ptr; + int32_t m_num_groups; }; template From a5d597a6c89db07018437b410661f6e40e248bf2 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:01:47 +0300 Subject: [PATCH 09/68] add const in class fields --- runtime-light/stdlib/string/regex-functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 21e8a79055..4a7a725642 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -98,9 +98,9 @@ class pcre2_match_view { } private: - std::string_view m_subject_data; - PCRE2_SIZE* m_ovector_ptr; - int32_t m_num_groups; + const std::string_view m_subject_data; + const PCRE2_SIZE* const m_ovector_ptr{nullptr}; + const int32_t m_num_groups{}; }; template @@ -465,7 +465,7 @@ class pcre2_iterator { } } - const RegexInfo* m_regex_info{nullptr}; + const RegexInfo* const m_regex_info{nullptr}; uint64_t m_match_options; PCRE2_SIZE m_current_offset; pcre2_match_data_8* m_match_data{nullptr}; From cafa47c378ab8d35b7ecbc1f2129885316f47bdb Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:04:44 +0300 Subject: [PATCH 10/68] add noexcept to pcre2_match_view default constructor --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 4a7a725642..27f03eb04d 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -73,7 +73,7 @@ struct RegexInfo final { class pcre2_match_view { public: - pcre2_match_view() = default; + pcre2_match_view() noexcept = default; pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t ret_code) noexcept : m_subject_data{subject}, From 21868cb0e97571cc195ba14df5f0938689d83be5 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:11:45 +0300 Subject: [PATCH 11/68] fix initializing of pcre2_iterator fields --- runtime-light/stdlib/string/regex-functions.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 27f03eb04d..37bb1662e5 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -360,17 +360,12 @@ class pcre2_iterator { using pointer = const value_type*; pcre2_iterator() noexcept - : m_regex_info{nullptr}, - m_match_data{nullptr}, - m_is_end{true}, - m_is_valid{true} {} + : m_is_valid{true} {} pcre2_iterator(const RegexInfo& info, size_t match_from) noexcept : m_regex_info{std::addressof(info)}, m_match_options{info.match_options}, - m_current_offset{match_from}, - m_is_end{true}, - m_is_valid{false} { + m_current_offset{match_from} { if (info.regex_code == nullptr) { return; } @@ -470,7 +465,7 @@ class pcre2_iterator { PCRE2_SIZE m_current_offset; pcre2_match_data_8* m_match_data{nullptr}; int32_t m_last_ret_code{}; - bool m_is_end{false}; + bool m_is_end{true}; bool m_is_valid{false}; }; From 8e5740d46eb6c4b612d9bca7473b85353ac0199b Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:21:51 +0300 Subject: [PATCH 12/68] delete unused default constructor of pcre2_match_view --- runtime-light/stdlib/string/regex-functions.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 37bb1662e5..7ae971f095 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -73,8 +73,6 @@ struct RegexInfo final { class pcre2_match_view { public: - pcre2_match_view() noexcept = default; - pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t ret_code) noexcept : m_subject_data{subject}, m_ovector_ptr{ovector}, @@ -99,8 +97,8 @@ class pcre2_match_view { private: const std::string_view m_subject_data; - const PCRE2_SIZE* const m_ovector_ptr{nullptr}; - const int32_t m_num_groups{}; + const PCRE2_SIZE* const m_ovector_ptr; + const int32_t m_num_groups; }; template From 668e8477f0af19b049f498a700d18d67c1d03ecb Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:32:36 +0300 Subject: [PATCH 13/68] fix get_group argument check --- runtime-light/stdlib/string/regex-functions.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 7ae971f095..3a2618b268 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -83,7 +83,11 @@ class pcre2_match_view { } std::optional get_group(size_t i) const noexcept { - kphp::log::assertion(i >= 0 && i < m_num_groups && m_ovector_ptr); + if (i >= m_num_groups) { + return std::nullopt; + } + + kphp::log::assertion(m_ovector_ptr); // ovector is an array of offset pairs PCRE2_SIZE start{m_ovector_ptr[2 * i]}; PCRE2_SIZE end{m_ovector_ptr[2 * i + 1]}; From e66a58d1284c44dea81ae96e988f275b6051861d Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:41:54 +0300 Subject: [PATCH 14/68] add iterator_category --- runtime-light/stdlib/string/regex-functions.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 3a2618b268..35d6a8c201 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -360,6 +360,7 @@ class pcre2_iterator { using difference_type = std::ptrdiff_t; using reference = value_type; using pointer = const value_type*; + using iterator_category = std::forward_iterator_tag; pcre2_iterator() noexcept : m_is_valid{true} {} From c614e63f3ea47b3f80b8fb6db57d0dbcd99d8b37 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:44:25 +0300 Subject: [PATCH 15/68] remove const --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 35d6a8c201..06ef28a3fd 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -359,7 +359,7 @@ class pcre2_iterator { using value_type = pcre2_match_view; using difference_type = std::ptrdiff_t; using reference = value_type; - using pointer = const value_type*; + using pointer = value_type*; using iterator_category = std::forward_iterator_tag; pcre2_iterator() noexcept From 3cd144b2cdafffa70fb02e532e6cb02a913dafb5 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 15:49:41 +0300 Subject: [PATCH 16/68] remove kphp::log::trace --- runtime-light/stdlib/string/regex-functions.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 06ef28a3fd..a39156c7bd 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -416,7 +416,6 @@ class pcre2_iterator { private: void increment() noexcept { - kphp::log::trace("incrementing pcre2_iterator with offset={}", m_current_offset); auto& ri{*m_regex_info}; auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; From a23be3219c61338e87fde280fec4ae9bfcc455fe Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 16:02:33 +0300 Subject: [PATCH 17/68] brace init --- runtime-light/stdlib/string/regex-functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index a39156c7bd..ee85d61e16 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -324,7 +324,7 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_NAMEENTRYSIZE, std::addressof(name_entry_size)); PCRE2_SPTR8 entry{name_table}; - for (auto i = 0; i < name_count; ++i) { + for (auto i{0}; i < name_count; ++i) { const auto group_number{static_cast((entry[0] << 8) | entry[1])}; PCRE2_SPTR8 group_name{std::next(entry, 2)}; regex_info.group_names[group_number] = reinterpret_cast(group_name); @@ -492,7 +492,7 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional const auto unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; // calculate last matched group int64_t last_matched_group{-1}; - for (auto i = 0; i < regex_info.match_count; ++i) { + for (auto i{0}; i < regex_info.match_count; ++i) { if (ovector[static_cast(2 * i)] != PCRE2_UNSET) { last_matched_group = i; } @@ -503,7 +503,7 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional // reserve enough space for output array output{array_size{static_cast(regex_info.group_names.size() + named_groups_count), named_groups_count == 0}}; - for (auto i = 0; i < regex_info.group_names.size(); ++i) { + for (auto i{0}; i < regex_info.group_names.size(); ++i) { // skip unmatched groups at the end unless unmatched_as_null is set if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !unmatched_as_null) [[unlikely]] { break; @@ -665,7 +665,7 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt if (!collect_group_names(regex_info)) [[unlikely]] { return false; } - auto match_count_opt = match_regex(regex_info, offset, regex_info.match_options); + auto match_count_opt{match_regex(regex_info, offset, regex_info.match_options)}; if (!match_count_opt.has_value()) [[unlikely]] { return false; } From 0c13d970c7a88e715ab490dfffb09b63211d8ecc Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 17:19:07 +0300 Subject: [PATCH 18/68] fix match_regex return --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index ee85d61e16..8a5b92c363 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -337,7 +337,7 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { std::optional match_regex(const RegexInfo& regex_info, size_t offset, uint32_t match_options) noexcept { const auto& regex_state{RegexInstanceState::get()}; if (regex_info.regex_code == nullptr || !regex_state.match_context) [[unlikely]] { - return false; + return std::nullopt; } int32_t match_count{pcre2_match_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), offset, @@ -349,7 +349,7 @@ std::optional match_regex(const RegexInfo& regex_info, size_t offset, u std::array buffer{}; pcre2_get_error_message_8(match_count, reinterpret_cast(buffer.data()), buffer.size()); kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data()); - return false; + return std::nullopt; } return match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; } From 080b71747d71b36c84419432d83fa4fb50531497 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 3 Dec 2025 17:48:03 +0300 Subject: [PATCH 19/68] remove increment in constructor --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 8a5b92c363..02f7490bb4 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -351,6 +351,7 @@ std::optional match_regex(const RegexInfo& regex_info, size_t offset, u kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data()); return std::nullopt; } + // zero if the vector of offsets is too small return match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; } @@ -381,7 +382,6 @@ class pcre2_iterator { m_is_valid = true; m_is_end = false; - increment(); } bool is_terminal() const noexcept { From d41ce1e934d08fc9a42e91b5a618011ffd7eebcc Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 4 Dec 2025 17:20:03 +0300 Subject: [PATCH 20/68] add initializers --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 02f7490bb4..7e3a2e3ea5 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -463,8 +463,8 @@ class pcre2_iterator { } const RegexInfo* const m_regex_info{nullptr}; - uint64_t m_match_options; - PCRE2_SIZE m_current_offset; + uint64_t m_match_options{}; + PCRE2_SIZE m_current_offset{}; pcre2_match_data_8* m_match_data{nullptr}; int32_t m_last_ret_code{}; bool m_is_end{true}; From 19404da7a5ac9989cdb7494d60e4a66403941c20 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 4 Dec 2025 17:28:02 +0300 Subject: [PATCH 21/68] add const --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 7e3a2e3ea5..ff595246c2 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -416,7 +416,7 @@ class pcre2_iterator { private: void increment() noexcept { - auto& ri{*m_regex_info}; + const auto& ri{*m_regex_info}; auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { @@ -556,7 +556,7 @@ PCRE2_SIZE set_all_matches(const RegexInfo& regex_info, int64_t flags, std::opti mixed& all_matches{(*opt_all_matches).get()}; if (pattern_order) [[likely]] { - for (auto& it : std::as_const(matches)) { + for (const auto& it : std::as_const(matches)) { all_matches[it.get_key()].push_back(it.get_value()); } } else { From 2ba17d50db6d666618bb26a71ceb3328d78094f6 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Fri, 5 Dec 2025 13:01:47 +0300 Subject: [PATCH 22/68] replace iterator with "matcher" --- .../stdlib/string/regex-functions.cpp | 91 ++++++------------- 1 file changed, 27 insertions(+), 64 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index ff595246c2..66547df6ce 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -78,6 +78,11 @@ class pcre2_match_view { m_ovector_ptr{ovector}, m_num_groups{ret_code} {} + pcre2_match_view(const pcre2_match_view&) noexcept = default; + pcre2_match_view(pcre2_match_view&&) noexcept = default; + pcre2_match_view& operator=(const pcre2_match_view&) noexcept = default; + pcre2_match_view& operator=(pcre2_match_view&&) noexcept = default; + int32_t size() const noexcept { return m_num_groups; } @@ -100,9 +105,9 @@ class pcre2_match_view { } private: - const std::string_view m_subject_data; - const PCRE2_SIZE* const m_ovector_ptr; - const int32_t m_num_groups; + std::string_view m_subject_data; + const PCRE2_SIZE* m_ovector_ptr; + int32_t m_num_groups; }; template @@ -355,18 +360,9 @@ std::optional match_regex(const RegexInfo& regex_info, size_t offset, u return match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; } -class pcre2_iterator { +class matcher { public: - using value_type = pcre2_match_view; - using difference_type = std::ptrdiff_t; - using reference = value_type; - using pointer = value_type*; - using iterator_category = std::forward_iterator_tag; - - pcre2_iterator() noexcept - : m_is_valid{true} {} - - pcre2_iterator(const RegexInfo& info, size_t match_from) noexcept + matcher(const RegexInfo& info, size_t match_from) noexcept : m_regex_info{std::addressof(info)}, m_match_options{info.match_options}, m_current_offset{match_from} { @@ -380,42 +376,14 @@ class pcre2_iterator { return; } - m_is_valid = true; - m_is_end = false; - } - - bool is_terminal() const noexcept { - return !m_is_valid || m_is_end; + m_has_error = false; } - bool is_valid() const noexcept { - return m_is_valid; + bool has_error() const noexcept { + return m_has_error; } - reference operator*() const noexcept { - PCRE2_SIZE* ovector{pcre2_get_ovector_pointer_8(m_match_data)}; - return pcre2_match_view{m_regex_info->subject, ovector, m_last_ret_code}; - } - - pcre2_iterator& operator++() noexcept { - increment(); - return *this; - } - pcre2_iterator operator++(int) noexcept { - pcre2_iterator temp{*this}; - increment(); - return temp; - } - - bool operator==(const pcre2_iterator& other) const noexcept { - return is_terminal() && other.is_terminal(); - } - bool operator!=(const pcre2_iterator& other) const noexcept { - return !(*this == other); - } - -private: - void increment() noexcept { + std::optional next() noexcept { const auto& ri{*m_regex_info}; auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; @@ -424,19 +392,17 @@ class pcre2_iterator { auto match_count_opt{match_regex(ri, m_current_offset, m_match_options)}; if (!match_count_opt.has_value()) { // std::nullopt means error - m_is_end = true; - m_is_valid = false; - return; + m_has_error = true; + return std::nullopt; } - m_last_ret_code = *match_count_opt; + auto ret_code{*match_count_opt}; - if (m_last_ret_code == 0) { + if (ret_code == 0) { // If match is not found if (m_match_options == ri.match_options || m_current_offset == ri.subject.size()) { // Here we are sure that there are no more matches here - m_is_end = true; - return; + return std::nullopt; } // Here we know that we were looking for a non-empty and anchored match, // and we're going to try searching from the next character with the default options. @@ -458,17 +424,16 @@ class pcre2_iterator { // Else use default options m_match_options = ri.match_options; } - return; + return pcre2_match_view{ri.subject, ovector, ret_code}; } } +private: const RegexInfo* const m_regex_info{nullptr}; uint64_t m_match_options{}; PCRE2_SIZE m_current_offset{}; pcre2_match_data_8* m_match_data{nullptr}; - int32_t m_last_ret_code{}; - bool m_is_end{true}; - bool m_is_valid{false}; + bool m_has_error{true}; }; // returns the ending offset of the entire match @@ -724,22 +689,20 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } } - pcre2_iterator it{regex_info, static_cast(offset)}; - if (!it.is_valid()) { + matcher m{regex_info, static_cast(offset)}; + if (m.has_error()) { return false; } - pcre2_iterator end_it{}; - - for (; it != end_it; ++it) { - pcre2_match_view match_view{*it}; + for (auto match_view_opt{m.next()}; match_view_opt.has_value(); match_view_opt = m.next()) { + pcre2_match_view match_view{*match_view_opt}; regex_info.match_count = match_view.size(); set_all_matches(regex_info, flags, matches); if (regex_info.match_count > 0) { ++entire_match_count; } } - if (!it.is_valid()) [[unlikely]] { + if (m.has_error()) [[unlikely]] { return false; } From 48b620d73f9b17fcbff4f9076d92976d2596bf33 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Fri, 5 Dec 2025 17:50:20 +0300 Subject: [PATCH 23/68] brace init --- runtime-light/stdlib/string/regex-functions.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index daf5bdbd10..4f4e0948cb 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -31,7 +31,7 @@ namespace { -constexpr size_t ERROR_BUFFER_LENGTH = 256; +constexpr size_t ERROR_BUFFER_LENGTH{256}; enum class trailing_unmatch : uint8_t { skip, include }; @@ -174,7 +174,7 @@ class preg_replacement_parser { if (preg_replacement.front() == '{') { return try_get_backref(preg_replacement.substr(1)) .and_then([this](auto value) noexcept -> std::optional { - auto digits_end_pos = 1 + value.size(); + auto digits_end_pos{1 + value.size()}; if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { preg_replacement = preg_replacement.substr(1 + value.size() + 1); return value; @@ -187,7 +187,7 @@ class preg_replacement_parser { return try_get_backref(preg_replacement) .transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos = value.size(); + auto digits_end_pos{value.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); return value; }) @@ -196,7 +196,7 @@ class preg_replacement_parser { case '\\': { // \1 auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos = value.size(); + auto digits_end_pos{value.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); return value; })}; @@ -256,7 +256,7 @@ class preg_replacement_parser { return *this; } iterator operator++(int) noexcept { - iterator temp = *this; + iterator temp{*this}; ++(*this); return temp; } From caeabf82530a4999bc59afec2b87ae56938807d4 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Fri, 5 Dec 2025 21:26:33 +0300 Subject: [PATCH 24/68] implement preg_split --- .../kphp-light/stdlib/regex-functions.txt | 5 +- .../stdlib/string/regex-functions.cpp | 258 ++++++++++++------ runtime-light/stdlib/string/regex-functions.h | 34 ++- runtime-light/stdlib/string/regex-state.cpp | 18 +- runtime-light/stdlib/string/regex-state.h | 31 ++- tests/phpt/dl/496_regex.php | 2 +- tests/phpt/dl/497_preg_split.php | 2 +- 7 files changed, 244 insertions(+), 106 deletions(-) diff --git a/builtin-functions/kphp-light/stdlib/regex-functions.txt b/builtin-functions/kphp-light/stdlib/regex-functions.txt index dda204a2d3..54c35bb468 100644 --- a/builtin-functions/kphp-light/stdlib/regex-functions.txt +++ b/builtin-functions/kphp-light/stdlib/regex-functions.txt @@ -39,11 +39,10 @@ function preg_replace_callback( &$replace_count ::: int = TODO, $flags ::: int = 0): string | ^3 | null; +function preg_split ($pattern ::: string, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false; + // ===== UNSUPPORTED ===== /** @kphp-extern-func-info stub generation-required */ function preg_last_error() ::: int; -/** @kphp-extern-func-info stub */ -function preg_split ($pattern ::: regexp, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false; - diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 4f4e0948cb..5a9ec648dd 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -39,12 +39,7 @@ using backref = std::string_view; using regex_pcre2_group_names_t = kphp::stl::vector; struct RegexInfo final { - std::string_view regex; - // non-null-terminated regex without delimiters and PCRE modifiers - // - // regex -> ~pattern~im\0 - // regex_body -> pattern - std::string_view regex_body; + const string& regex; std::string_view subject; std::string_view replacement; @@ -68,7 +63,7 @@ struct RegexInfo final { RegexInfo() = delete; - RegexInfo(std::string_view regex_, std::string_view subject_, std::string_view replacement_) noexcept + RegexInfo(const string& regex_, std::string_view subject_, std::string_view replacement_) noexcept : regex(regex_), subject(subject_), replacement(replacement_) {} @@ -173,11 +168,11 @@ class preg_replacement_parser { // $1, ${1} if (preg_replacement.front() == '{') { return try_get_backref(preg_replacement.substr(1)) - .and_then([this](auto value) noexcept -> std::optional { - auto digits_end_pos{1 + value.size()}; + .and_then([this](auto br) noexcept -> std::optional { + auto digits_end_pos{1 + br.size()}; if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { - preg_replacement = preg_replacement.substr(1 + value.size() + 1); - return value; + preg_replacement = preg_replacement.substr(1 + br.size() + 1); + return br; } return std::nullopt; @@ -186,27 +181,27 @@ class preg_replacement_parser { } return try_get_backref(preg_replacement) - .transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos{value.size()}; + .transform([this](auto br) noexcept -> replacement_term { + auto digits_end_pos{br.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); - return value; + return br; }) .value_or('$'); case '\\': { // \1 - auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto value) noexcept -> replacement_term { - auto digits_end_pos{value.size()}; + auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto br) noexcept -> replacement_term { + auto digits_end_pos{br.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); - return value; + return br; })}; if (back_reference_opt.has_value()) { return *back_reference_opt; } else { - auto res{preg_replacement.front()}; - if (res == '$' || res == '\\') { + auto c{preg_replacement.front()}; + if (c == '$' || c == '\\') { preg_replacement = preg_replacement.substr(1); - return res; + return c; } return '\\'; } @@ -277,7 +272,20 @@ class preg_replacement_parser { } }; -bool parse_regex(RegexInfo& regex_info) noexcept { +bool compile_regex(RegexInfo& regex_info) noexcept { + auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.compile_context) [[unlikely]] { + return false; + } + + // check runtime cache + if (const auto ptr{regex_state.get_compiled_regex(regex_info.regex)}; ptr != nullptr) { + auto& [compile_options, regex_code]{*ptr}; + regex_info.compile_options = compile_options; + regex_info.regex_code = std::addressof(regex_code); + return true; + } + if (regex_info.regex.empty()) { kphp::log::warning("empty regex"); return false; @@ -324,18 +332,18 @@ bool parse_regex(RegexInfo& regex_info) noexcept { } uint32_t compile_options{}; - regex_info.regex_body = regex_info.regex; + auto regex_body{std::string_view{regex_info.regex.c_str(), regex_info.regex.size()}}; // remove start delimiter - regex_info.regex_body.remove_prefix(1); + regex_body.remove_prefix(1); // parse compile options and skip all symbols until the end delimiter - for (; !regex_info.regex_body.empty() && regex_info.regex_body.back() != end_delim; regex_info.regex_body.remove_suffix(1)) { + for (; !regex_body.empty() && regex_body.back() != end_delim; regex_body.remove_suffix(1)) { // spaces and newlines are ignored - if (regex_info.regex_body.back() == ' ' || regex_info.regex_body.back() == '\n') { + if (regex_body.back() == ' ' || regex_body.back() == '\n') { continue; } - switch (regex_info.regex_body.back()) { + switch (regex_body.back()) { case 'i': { compile_options |= PCRE2_CASELESS; break; @@ -377,20 +385,20 @@ bool parse_regex(RegexInfo& regex_info) noexcept { break; } default: { - kphp::log::warning("unsupported regex modifier {}", regex_info.regex_body.back()); + kphp::log::warning("unsupported regex modifier {}", regex_body.back()); break; } } } - if (regex_info.regex_body.empty()) { - kphp::log::warning("no ending regex delimiter: {}", regex_info.regex); + if (regex_body.empty()) { + kphp::log::warning("no ending regex delimiter: {}", regex_info.regex.c_str()); return false; } // UTF-8 validation if (static_cast(compile_options & PCRE2_UTF)) { - if (!mb_UTF8_check(regex_info.regex.data())) [[unlikely]] { - kphp::log::warning("invalid UTF-8 pattern: {}", regex_info.regex); + if (!mb_UTF8_check(regex_info.regex.c_str())) [[unlikely]] { + kphp::log::warning("invalid UTF-8 pattern: {}", regex_info.regex.c_str()); return false; } if (!mb_UTF8_check(regex_info.subject.data())) [[unlikely]] { @@ -400,12 +408,9 @@ bool parse_regex(RegexInfo& regex_info) noexcept { } // remove the end delimiter - regex_info.regex_body.remove_suffix(1); + regex_body.remove_suffix(1); regex_info.compile_options = compile_options; - return true; -} -bool compile_regex(RegexInfo& regex_info) noexcept { const vk::final_action finalizer{[®ex_info]() noexcept { if (regex_info.regex_code != nullptr) [[likely]] { pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); @@ -415,20 +420,10 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } }}; - auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.compile_context) [[unlikely]] { - return false; - } - - // check runtime cache - if (const auto it{regex_state.regex_pcre2_code_cache.find(regex_info.regex)}; it != regex_state.regex_pcre2_code_cache.end()) { - regex_info.regex_code = it->second; - return true; - } // compile pcre2_code int32_t error_number{}; PCRE2_SIZE error_offset{}; - regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_info.regex_body.data()), regex_info.regex_body.size(), + regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_body.data()), regex_body.size(), regex_info.compile_options, std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get())}; if (!regex_code) [[unlikely]] { @@ -439,7 +434,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } // add compiled code to runtime cache - regex_state.regex_pcre2_code_cache.emplace(regex_info.regex, regex_code); + regex_state.add_compiled_regex(regex_info.regex, compile_options, *regex_code); regex_info.regex_code = regex_code; return true; @@ -589,8 +584,8 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional return end_offset; } - const auto offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; - const auto unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; // calculate last matched group int64_t last_matched_group{-1}; for (auto i{0}; i < regex_info.match_count; ++i) { @@ -606,24 +601,24 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional array output{array_size{static_cast(regex_info.group_names.size() + named_groups_count), named_groups_count == 0}}; for (auto i{0}; i < regex_info.group_names.size(); ++i) { // skip unmatched groups at the end unless unmatched_as_null is set - if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !unmatched_as_null) [[unlikely]] { + if (last_unmatched_policy == trailing_unmatch::skip && i > last_matched_group && !is_unmatched_as_null) [[unlikely]] { break; } - const auto match_start{ovector[static_cast(2 * i)]}; - const auto match_end{ovector[static_cast(2 * i + 1)]}; + const auto match_start_offset{ovector[static_cast(2 * i)]}; + const auto match_end_offset{ovector[static_cast(2 * i + 1)]}; - mixed match_val; // NULL value - if (match_start != PCRE2_UNSET) { // handle matched group - const auto match_size{match_end - match_start}; - match_val = string{std::next(regex_info.subject.data(), match_start), static_cast(match_size)}; - } else if (!unmatched_as_null) { // handle unmatched group + mixed match_val; // NULL value + if (match_start_offset != PCRE2_UNSET) { // handle matched group + const auto match_size{match_end_offset - match_start_offset}; + match_val = string{std::next(regex_info.subject.data(), match_start_offset), static_cast(match_size)}; + } else if (!is_unmatched_as_null) { // handle unmatched group match_val = string{}; } mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(match_val), static_cast(match_start)); + if (is_offset_capture) { + output_val = array::create(std::move(match_val), static_cast(match_start_offset)); } else { output_val = std::move(match_val); } @@ -745,11 +740,109 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return true; } +std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val, bool no_empty, + bool delim_capture, bool offset_capture) noexcept { + const char* offset{regex_info.subject.data()}; + + if (limit_val == 0) { + limit_val = kphp::regex::PREG_NOLIMIT; + } + + const auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + + array output{}; + + matcher m{regex_info, {}}; + if (m.has_error()) { + return std::nullopt; + } + + for (auto match_view_opt{m.next()}; match_view_opt.has_value(); match_view_opt = m.next()) { + pcre2_match_view match_view{*match_view_opt}; + + auto entire_pattern_match_opt{match_view.get_group(0)}; + if (!entire_pattern_match_opt.has_value()) [[unlikely]] { + return std::nullopt; + } + auto entire_pattern_match_sv{*entire_pattern_match_opt}; + + if (!(limit_val == kphp::regex::PREG_NOLIMIT || limit_val > 1)) { + break; + } + + if (const auto size{entire_pattern_match_sv.data() - offset}; !no_empty || size != 0) { + auto val{string{offset, static_cast(size)}}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset - regex_info.subject.data())); + } else { + output_val = std::move(val); + } + + output.push_back(std::move(output_val)); + if (limit_val != kphp::regex::PREG_NOLIMIT) { + limit_val--; + } + } + + if (delim_capture) { + for (auto i{1uz}; i < match_view.size(); i++) { + auto sv_opt{match_view.get_group(i)}; + auto sv{sv_opt.value_or(std::string_view{})}; + const auto size{sv.size()}; + if (!no_empty || size != 0) { + string val; + if (sv_opt.has_value()) [[likely]] { + val = string{sv.data(), static_cast(size)}; + } + + mixed output_val; + if (offset_capture) { + output_val = array::create( + std::move(val), + sv_opt.transform([®ex_info](auto sv) noexcept { return static_cast(sv.data() - regex_info.subject.data()); }).value_or(-1)); + } else { + output_val = std::move(val); + } + + output.push_back(std::forward(output_val)); + } + } + } + + offset = std::next(entire_pattern_match_sv.data(), entire_pattern_match_sv.size()); + } + + if (m.has_error()) [[unlikely]] { + return std::nullopt; + } + + const auto size{regex_info.subject.size() - (offset - regex_info.subject.data())}; + if (!no_empty || size != 0) { + auto val{string{offset, static_cast(size)}}; + + mixed output_val; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset - regex_info.subject.data())); + } else { + output_val = std::move(val); + } + + output.push_back(std::forward(output_val)); + } + + return output; +} + } // namespace Optional f$preg_match(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { - RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { return false; @@ -757,9 +850,6 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } - if (!parse_regex(regex_info)) [[unlikely]] { - return false; - } if (!compile_regex(regex_info)) [[unlikely]] { return false; } @@ -786,7 +876,7 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt Optional f$preg_match_all(const string& pattern, const string& subject, Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { int64_t entire_match_count{}; - RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {}}; + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { @@ -795,9 +885,6 @@ Optional f$preg_match_all(const string& pattern, const string& subject, if (!correct_offset(offset, regex_info.subject)) [[unlikely]] { return false; } - if (!parse_regex(regex_info)) [[unlikely]] { - return false; - } if (!compile_regex(regex_info)) [[unlikely]] { return false; } @@ -856,7 +943,7 @@ Optional f$preg_replace(const string& pattern, const string& replacement } }}; - if (limit < 0 && limit != kphp::regex::PREG_REPLACE_NOLIMIT) [[unlikely]] { + if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { kphp::log::warning("invalid limit {} in preg_replace", limit); return {}; } @@ -880,11 +967,10 @@ Optional f$preg_replace(const string& pattern, const string& replacement } } - RegexInfo regex_info{{pattern.c_str(), pattern.size()}, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; - bool success{parse_regex(regex_info)}; - success &= compile_regex(regex_info); - success &= replace_regex(regex_info, limit == kphp::regex::PREG_REPLACE_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)); + bool success{compile_regex(regex_info)}; + success &= replace_regex(regex_info, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)); if (!success) [[unlikely]] { return {}; } @@ -915,7 +1001,8 @@ Optional f$preg_replace(const mixed& pattern, const string& replacement, const auto& pattern_arr{pattern.as_array()}; for (const auto& it : pattern_arr) { int64_t replace_one_count{}; - if (auto replace_result{f$preg_replace(it.get_value().to_string(), replacement, result, limit, replace_one_count)}; replace_result.has_value()) [[likely]] { + if (Optional replace_result{f$preg_replace(it.get_value().to_string(), replacement, result, limit, replace_one_count)}; replace_result.has_value()) + [[likely]] { count += replace_one_count; result = std::move(replace_result.val()); } else { @@ -962,8 +1049,8 @@ Optional f$preg_replace(const mixed& pattern, const mixed& replacement, } int64_t replace_one_count{}; - if (auto replace_result{f$preg_replace(pattern_it.get_value().to_string(), replacement_str, result, limit, replace_one_count)}; replace_result.has_value()) - [[likely]] { + if (Optional replace_result{f$preg_replace(pattern_it.get_value().to_string(), replacement_str, result, limit, replace_one_count)}; + replace_result.has_value()) [[likely]] { count += replace_one_count; result = std::move(replace_result.val()); } else { @@ -999,7 +1086,7 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed array result{subject_arr.size()}; for (const auto& it : subject_arr) { int64_t replace_one_count{}; - if (auto replace_result{f$preg_replace(pattern, replacement, it.get_value().to_string(), limit, replace_one_count)}; replace_result.has_value()) + if (Optional replace_result{f$preg_replace(pattern, replacement, it.get_value().to_string(), limit, replace_one_count)}; replace_result.has_value()) [[likely]] { count += replace_one_count; result.set_value(it.get_key(), std::move(replace_result.val())); @@ -1011,3 +1098,22 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed return std::move(result); } + +Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { + RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {}}; + + if (!valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, + kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { + return false; + } + if (!compile_regex(regex_info)) [[unlikely]] { + return false; + } + auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, + (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; + if (!opt_output.has_value()) [[unlikely]] { + return false; + } + + return *std::move(opt_output); +} diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index 2cc794208e..a00d44c83d 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -34,7 +34,7 @@ inline constexpr auto PREG_SPLIT_DELIM_CAPTURE = static_cast(1U << 4U); inline constexpr auto PREG_SPLIT_OFFSET_CAPTURE = static_cast(1U << 5U); inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); -inline constexpr int64_t PREG_REPLACE_NOLIMIT = -1; +inline constexpr int64_t PREG_NOLIMIT = -1; } // namespace kphp::regex @@ -66,20 +66,20 @@ Optional f$preg_match_all(const string& pattern, const string& subject, // === preg_replace =============================================================================== -Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; -Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; -Optional f$preg_replace(const mixed& pattern, const mixed& replacement, const string& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +Optional f$preg_replace(const mixed& pattern, const mixed& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; -mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; template> -auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept { return f$preg_replace(regex, replace_val, subject.val(), limit, opt_count); } @@ -87,7 +87,7 @@ auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, i // === preg_replace_callback ====================================================================== template> F> -kphp::coro::task> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +kphp::coro::task> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { static_assert(std::same_as>, string>); @@ -122,7 +122,7 @@ kphp::coro::task> f$preg_replace_callback(string pattern, F cal } template -kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { if (!regex_impl_::valid_preg_replace_mixed(pattern)) [[unlikely]] { @@ -161,7 +161,7 @@ kphp::coro::task> f$preg_replace_callback(mixed pattern, F call } template -kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { if (!regex_impl_::valid_preg_replace_mixed(pattern) || !regex_impl_::valid_preg_replace_mixed(subject)) [[unlikely]] { @@ -200,7 +200,7 @@ kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed } template> -auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit = kphp::regex::PREG_REPLACE_NOLIMIT, +auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept -> decltype(f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, opt_count, flags)) { @@ -209,10 +209,14 @@ auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t // === preg_split ================================================================================= -inline Optional> f$preg_split(const string& /*unused*/, const string& /*unused*/, int64_t /*unused*/ = -1, int64_t /*unused*/ = 0) { - kphp::log::error("call to unsupported function"); -} +Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; -inline Optional> f$preg_split(const mixed& /*unused*/, const string& /*unused*/, int64_t /*unused*/ = -1, int64_t /*unused*/ = 0) { - kphp::log::error("call to unsupported function"); +inline Optional> f$preg_split(const mixed& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { + if (!pattern.is_string()) [[unlikely]] { + kphp::log::warning("preg_split() expects parameter 1 to be string, {} given", pattern.get_type_or_class_name()); + return false; + } + return f$preg_split(pattern.as_string(), subject, limit, flags); } diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index f8bd465140..f37a4d3225 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -4,7 +4,11 @@ #include "runtime-light/stdlib/string/regex-state.h" +#include +#include + #include "runtime-common/core/allocator/script-malloc-interface.h" +#include "runtime-common/core/runtime-core.h" #include "runtime-light/state/instance-state.h" #include "runtime-light/stdlib/diagnostics/logs.h" #include "runtime-light/stdlib/string/regex-include.h" @@ -32,7 +36,7 @@ RegexInstanceState::RegexInstanceState() noexcept : regex_pcre2_general_context(pcre2_general_context_create_8(regex_malloc, regex_free, nullptr), pcre2_general_context_free_8), compile_context(pcre2_compile_context_create_8(regex_pcre2_general_context.get()), pcre2_compile_context_free_8), match_context(pcre2_match_context_create_8(regex_pcre2_general_context.get()), pcre2_match_context_free_8), - regex_pcre2_match_data(pcre2_match_data_create_8(MATCH_DATA_SIZE, regex_pcre2_general_context.get()), pcre2_match_data_free_8) { + regex_pcre2_match_data(pcre2_match_data_create_8(OVECTOR_SIZE, regex_pcre2_general_context.get()), pcre2_match_data_free_8) { if (!regex_pcre2_general_context) [[unlikely]] { kphp::log::error("can't create pcre2_general_context"); } @@ -44,6 +48,18 @@ RegexInstanceState::RegexInstanceState() noexcept } } +const RegexInstanceState::compiled_regex* RegexInstanceState::get_compiled_regex(const string& regex) const noexcept { + if (const auto it{regex_pcre2_code_cache.find(regex)}; it != regex_pcre2_code_cache.end()) { + return std::addressof(it->second); + } + return nullptr; +} + +const RegexInstanceState::compiled_regex* RegexInstanceState::add_compiled_regex(string regex, uint32_t compile_options, pcre2_code_8& regex_code) noexcept { + return std::addressof( + regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = regex_code}).first->second); +} + RegexInstanceState& RegexInstanceState::get() noexcept { return InstanceState::get().regex_instance_state; } diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 6d6fb64b1e..0764b6f416 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -5,30 +5,43 @@ #pragma once #include -#include +#include #include "common/mixin/not_copyable.h" #include "runtime-common/core/allocator/script-allocator.h" +#include "runtime-common/core/runtime-core.h" #include "runtime-common/core/std/containers.h" -#include "runtime-light/metaprogramming/concepts.h" #include "runtime-light/stdlib/string/regex-include.h" struct RegexInstanceState final : private vk::not_copyable { - template - using unordered_map = kphp::stl::unordered_map; +private: + using hasher_type = decltype([](const string& s) noexcept { return static_cast(s.hash()); }); - static constexpr size_t MAX_SUBPATTERNS_COUNT = 512; - // match data size should be a multiple of 3 since it holds ovector triples (see pcre2 docs) - static constexpr size_t MATCH_DATA_SIZE = 3 * MAX_SUBPATTERNS_COUNT; - static constexpr auto REPLACE_BUFFER_SIZE = static_cast(16U * 1024U); + static constexpr size_t MAX_SUBPATTERNS_COUNT{512}; + + struct compiled_regex { + // PCRE compile options of the regex + uint32_t compile_options{}; + // compiled regex + pcre2_code_8& regex_code; + }; + + kphp::stl::unordered_map regex_pcre2_code_cache; + +public: + static constexpr size_t OVECTOR_SIZE{MAX_SUBPATTERNS_COUNT + 1}; + static constexpr size_t REPLACE_BUFFER_SIZE{16U * 1024U}; const regex_pcre2_general_context_t regex_pcre2_general_context; const regex_pcre2_compile_context_t compile_context; const regex_pcre2_match_context_t match_context; regex_pcre2_match_data_t regex_pcre2_match_data; - unordered_map regex_pcre2_code_cache; RegexInstanceState() noexcept; + const compiled_regex* get_compiled_regex(const string& regex) const noexcept; + + const compiled_regex* add_compiled_regex(string regex, uint32_t compile_options, pcre2_code_8& regex_code) noexcept; + static RegexInstanceState& get() noexcept; }; diff --git a/tests/phpt/dl/496_regex.php b/tests/phpt/dl/496_regex.php index a9adc843a2..9e4f705940 100644 --- a/tests/phpt/dl/496_regex.php +++ b/tests/phpt/dl/496_regex.php @@ -1,4 +1,4 @@ -@ok callback benchmark k2_skip +@ok callback benchmark Date: Fri, 5 Dec 2025 21:34:44 +0300 Subject: [PATCH 25/68] format --- runtime-light/stdlib/string/regex-functions.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 5a9ec648dd..9caeafc842 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -423,9 +423,8 @@ bool compile_regex(RegexInfo& regex_info) noexcept { // compile pcre2_code int32_t error_number{}; PCRE2_SIZE error_offset{}; - regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_body.data()), regex_body.size(), - regex_info.compile_options, std::addressof(error_number), std::addressof(error_offset), - regex_state.compile_context.get())}; + regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_body.data()), regex_body.size(), regex_info.compile_options, + std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get())}; if (!regex_code) [[unlikely]] { std::array buffer{}; pcre2_get_error_message_8(error_number, reinterpret_cast(buffer.data()), buffer.size()); @@ -740,8 +739,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return true; } -std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val, bool no_empty, - bool delim_capture, bool offset_capture) noexcept { +std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val, bool no_empty, bool delim_capture, bool offset_capture) noexcept { const char* offset{regex_info.subject.data()}; if (limit_val == 0) { @@ -1109,8 +1107,8 @@ Optional> f$preg_split(const string& pattern, const string& subject if (!compile_regex(regex_info)) [[unlikely]] { return false; } - auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, - (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; + auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, + (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; if (!opt_output.has_value()) [[unlikely]] { return false; } From 1df66c9298938974ef5c69cb1fe230e15c378d4c Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Fri, 5 Dec 2025 21:45:32 +0300 Subject: [PATCH 26/68] fix regex_info.capture_count --- .../stdlib/string/regex-functions.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 9caeafc842..2ffa8af26c 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -273,6 +273,15 @@ class preg_replacement_parser { }; bool compile_regex(RegexInfo& regex_info) noexcept { + const vk::final_action finalizer{[®ex_info]() noexcept { + if (regex_info.regex_code != nullptr) [[likely]] { + pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); + ++regex_info.capture_count; // to also count entire match + } else { + regex_info.capture_count = 0; + } + }}; + auto& regex_state{RegexInstanceState::get()}; if (!regex_state.compile_context) [[unlikely]] { return false; @@ -411,15 +420,6 @@ bool compile_regex(RegexInfo& regex_info) noexcept { regex_body.remove_suffix(1); regex_info.compile_options = compile_options; - const vk::final_action finalizer{[®ex_info]() noexcept { - if (regex_info.regex_code != nullptr) [[likely]] { - pcre2_pattern_info_8(regex_info.regex_code, PCRE2_INFO_CAPTURECOUNT, std::addressof(regex_info.capture_count)); - ++regex_info.capture_count; // to also count entire match - } else { - regex_info.capture_count = 0; - } - }}; - // compile pcre2_code int32_t error_number{}; PCRE2_SIZE error_offset{}; From 6b729af899fa2c3c98477ecc71540bbbd658a38a Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 18:06:52 +0300 Subject: [PATCH 27/68] using std::distance --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 2ffa8af26c..26620abc0d 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -819,7 +819,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val return std::nullopt; } - const auto size{regex_info.subject.size() - (offset - regex_info.subject.data())}; + const auto size{regex_info.subject.size() - std::distance(regex_info.subject.data(), offset)}; if (!no_empty || size != 0) { auto val{string{offset, static_cast(size)}}; From b3678426db5e03db8c4242f3464b962fe6c21ed1 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 18:09:44 +0300 Subject: [PATCH 28/68] const auto ptr -> const auto* ptr --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 26620abc0d..ad3b049c48 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -288,7 +288,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } // check runtime cache - if (const auto ptr{regex_state.get_compiled_regex(regex_info.regex)}; ptr != nullptr) { + if (const auto* ptr{regex_state.get_compiled_regex(regex_info.regex)}; ptr != nullptr) { auto& [compile_options, regex_code]{*ptr}; regex_info.compile_options = compile_options; regex_info.regex_code = std::addressof(regex_code); From 0339c8b83ad732e3362d5de3ac1bffe93d49b852 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 18:13:26 +0300 Subject: [PATCH 29/68] make split_regex call more readable --- runtime-light/stdlib/string/regex-functions.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index ad3b049c48..94deeb967b 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -1107,7 +1107,8 @@ Optional> f$preg_split(const string& pattern, const string& subject if (!compile_regex(regex_info)) [[unlikely]] { return false; } - auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, + auto opt_output{split_regex(regex_info, limit, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, // + (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, // (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; if (!opt_output.has_value()) [[unlikely]] { return false; From aa8c218a0e2da1bb6cfe1ffd2d14eb644284781a Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 18:16:16 +0300 Subject: [PATCH 30/68] std::move(*opt_output); --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 94deeb967b..54d4d30fbf 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -1114,5 +1114,5 @@ Optional> f$preg_split(const string& pattern, const string& subject return false; } - return *std::move(opt_output); + return std::move(*opt_output); } From 7c55e31fd9740508bad73effda75a81c25f06cef Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 18:42:33 +0300 Subject: [PATCH 31/68] std::move(*opt_output); --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- runtime-light/stdlib/string/regex-state.cpp | 12 ++++++------ runtime-light/stdlib/string/regex-state.h | 6 ++++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 54d4d30fbf..443cc4e244 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -288,8 +288,8 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } // check runtime cache - if (const auto* ptr{regex_state.get_compiled_regex(regex_info.regex)}; ptr != nullptr) { - auto& [compile_options, regex_code]{*ptr}; + if (auto ref_opt{regex_state.get_compiled_regex(regex_info.regex)}; ref_opt.has_value()) { + auto& [compile_options, regex_code]{ref_opt->get()}; regex_info.compile_options = compile_options; regex_info.regex_code = std::addressof(regex_code); return true; diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index f37a4d3225..e560c36252 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -48,16 +48,16 @@ RegexInstanceState::RegexInstanceState() noexcept } } -const RegexInstanceState::compiled_regex* RegexInstanceState::get_compiled_regex(const string& regex) const noexcept { +std::optional> RegexInstanceState::get_compiled_regex(const string& regex) const noexcept { if (const auto it{regex_pcre2_code_cache.find(regex)}; it != regex_pcre2_code_cache.end()) { - return std::addressof(it->second); + return it->second; } - return nullptr; + return std::nullopt; } -const RegexInstanceState::compiled_regex* RegexInstanceState::add_compiled_regex(string regex, uint32_t compile_options, pcre2_code_8& regex_code) noexcept { - return std::addressof( - regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = regex_code}).first->second); +std::optional> RegexInstanceState::add_compiled_regex(string regex, uint32_t compile_options, + pcre2_code_8& regex_code) noexcept { + return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = regex_code}).first->second; } RegexInstanceState& RegexInstanceState::get() noexcept { diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 0764b6f416..688e28b58d 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -5,7 +5,9 @@ #pragma once #include +#include #include +#include #include "common/mixin/not_copyable.h" #include "runtime-common/core/allocator/script-allocator.h" @@ -39,9 +41,9 @@ struct RegexInstanceState final : private vk::not_copyable { RegexInstanceState() noexcept; - const compiled_regex* get_compiled_regex(const string& regex) const noexcept; + std::optional> get_compiled_regex(const string& regex) const noexcept; - const compiled_regex* add_compiled_regex(string regex, uint32_t compile_options, pcre2_code_8& regex_code) noexcept; + std::optional> add_compiled_regex(string regex, uint32_t compile_options, pcre2_code_8& regex_code) noexcept; static RegexInstanceState& get() noexcept; }; From ff4731484fd47329a356e1173778cc6797badd78 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 18:54:29 +0300 Subject: [PATCH 32/68] #include --- runtime-light/stdlib/string/regex-state.h | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 688e28b58d..ae47ad9d69 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include From d7c7ce7cc0658311b1a451300f161e786049a414 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 19:03:31 +0300 Subject: [PATCH 33/68] push_back -> emplace_back --- runtime-light/stdlib/string/regex-functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 443cc4e244..e9dec6a0ce 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -625,7 +625,7 @@ PCRE2_SIZE set_matches(const RegexInfo& regex_info, int64_t flags, std::optional if (regex_info.group_names[i] != nullptr) { output.set_value(string{regex_info.group_names[i]}, output_val); } - output.push_back(output_val); + output.emplace_back(output_val); } (*opt_matches).get() = std::move(output); @@ -781,7 +781,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val output_val = std::move(val); } - output.push_back(std::move(output_val)); + output.emplace_back(std::move(output_val)); if (limit_val != kphp::regex::PREG_NOLIMIT) { limit_val--; } @@ -807,7 +807,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val output_val = std::move(val); } - output.push_back(std::forward(output_val)); + output.emplace_back(std::forward(output_val)); } } } @@ -830,7 +830,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val output_val = std::move(val); } - output.push_back(std::forward(output_val)); + output.emplace_back(std::forward(output_val)); } return output; From 3e4fcbe608d5fb4751bc69f9ebaeee747d5c2b5b Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 19:06:09 +0300 Subject: [PATCH 34/68] std::move --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index e9dec6a0ce..5120fe40d5 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -807,7 +807,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val output_val = std::move(val); } - output.emplace_back(std::forward(output_val)); + output.emplace_back(std::move(output_val)); } } } @@ -830,7 +830,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val output_val = std::move(val); } - output.emplace_back(std::forward(output_val)); + output.emplace_back(std::move(output_val)); } return output; From 857dcaf1310eeb6c1d22c0eb34240e3aa62c98ee Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:17:52 +0300 Subject: [PATCH 35/68] fixes --- .../stdlib/string/regex-functions.cpp | 155 ++++++++---------- 1 file changed, 71 insertions(+), 84 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 5120fe40d5..1d62d1536f 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -469,66 +470,40 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { return true; } -std::optional match_regex(const RegexInfo& regex_info, size_t offset, uint32_t match_options) noexcept { - const auto& regex_state{RegexInstanceState::get()}; - if (regex_info.regex_code == nullptr || !regex_state.match_context) [[unlikely]] { - return std::nullopt; - } - - int32_t match_count{pcre2_match_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), offset, - match_options, regex_state.regex_pcre2_match_data.get(), regex_state.match_context.get())}; - // From https://www.pcre.org/current/doc/html/pcre2_match.html - // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set - // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. - if (match_count < 0 && match_count != PCRE2_ERROR_NOMATCH) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(match_count, reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't match pcre2 regex due to error: {}", buffer.data()); - return std::nullopt; - } - // zero if the vector of offsets is too small - return match_count != PCRE2_ERROR_NOMATCH ? match_count : 0; -} - class matcher { public: matcher(const RegexInfo& info, size_t match_from) noexcept : m_regex_info{std::addressof(info)}, m_match_options{info.match_options}, m_current_offset{match_from} { - if (info.regex_code == nullptr) { - return; - } + kphp::log::assertion(info.regex_code != nullptr); const auto& regex_state{RegexInstanceState::get()}; m_match_data = regex_state.regex_pcre2_match_data.get(); - if (!m_match_data) { - return; - } - - m_has_error = false; - } - - bool has_error() const noexcept { - return m_has_error; + kphp::log::assertion(m_match_data); } - std::optional next() noexcept { + std::expected, int32_t> next() noexcept { const auto& ri{*m_regex_info}; + + const auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(ri.regex_code != nullptr && regex_state.match_context); + auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { // Try to find match - auto match_count_opt{match_regex(ri, m_current_offset, m_match_options)}; - if (!match_count_opt.has_value()) { - // std::nullopt means error - m_has_error = true; - return std::nullopt; + int32_t ret_code{pcre2_match_8(ri.regex_code, reinterpret_cast(ri.subject.data()), ri.subject.size(), m_current_offset, m_match_options, + m_match_data, regex_state.match_context.get())}; + // From https://www.pcre.org/current/doc/html/pcre2_match.html + // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set + // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. + if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { + return std::unexpected{ret_code}; } + auto match_count{ret_code != PCRE2_ERROR_NOMATCH ? ret_code : 0}; - auto ret_code{*match_count_opt}; - - if (ret_code == 0) { + if (match_count == 0) { // If match is not found if (m_match_options == ri.match_options || m_current_offset == ri.subject.size()) { // Here we are sure that there are no more matches here @@ -554,7 +529,7 @@ class matcher { // Else use default options m_match_options = ri.match_options; } - return pcre2_match_view{ri.subject, ovector, ret_code}; + return pcre2_match_view{ri.subject, ovector, match_count}; } } @@ -563,7 +538,6 @@ class matcher { uint64_t m_match_options{}; PCRE2_SIZE m_current_offset{}; pcre2_match_data_8* m_match_data{nullptr}; - bool m_has_error{true}; }; // returns the ending offset of the entire match @@ -692,19 +666,22 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return false; } } else { // replace only 'limit' times - size_t match_offset{}; size_t substitute_offset{}; int64_t replacement_diff_acc{}; PCRE2_SIZE length_after_replace{buffer_length}; string str_after_replace{regex_info.subject.data(), static_cast(regex_info.subject.size())}; + matcher m{regex_info, {}}; for (; regex_info.replace_count < limit; ++regex_info.replace_count) { - auto match_count_opt{match_regex(regex_info, match_offset, regex_info.match_options)}; - if (!match_count_opt.has_value()) [[unlikely]] { + auto expected_opt_match_view{m.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + std::array buffer{}; + pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); + kphp::log::warning("can't replace by pcre2 regex due to match error: {}", buffer.data()); return false; } - regex_info.match_count = *match_count_opt; - if (regex_info.match_count == 0) { + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { break; } @@ -722,7 +699,6 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return false; } - match_offset = match_end_offset; replacement_diff_acc += regex_info.replacement.size() - (match_end_offset - match_start_offset); substitute_offset = match_end_offset + replacement_diff_acc; str_after_replace = {runtime_ctx.static_SB.buffer(), static_cast(length_after_replace)}; @@ -739,11 +715,11 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { return true; } -std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val, bool no_empty, bool delim_capture, bool offset_capture) noexcept { +std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bool no_empty, bool delim_capture, bool offset_capture) noexcept { const char* offset{regex_info.subject.data()}; - if (limit_val == 0) { - limit_val = kphp::regex::PREG_NOLIMIT; + if (limit == 0) { + limit = kphp::regex::PREG_NOLIMIT; } const auto& regex_state{RegexInstanceState::get()}; @@ -754,12 +730,19 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val array output{}; matcher m{regex_info, {}}; - if (m.has_error()) { - return std::nullopt; - } - - for (auto match_view_opt{m.next()}; match_view_opt.has_value(); match_view_opt = m.next()) { - pcre2_match_view match_view{*match_view_opt}; + for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { + auto expected_opt_match_view{m.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + std::array buffer{}; + pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); + kphp::log::warning("can't split by pcre2 regex due to match error: {}", buffer.data()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + pcre2_match_view match_view{*opt_match_view}; auto entire_pattern_match_opt{match_view.get_group(0)}; if (!entire_pattern_match_opt.has_value()) [[unlikely]] { @@ -767,10 +750,6 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val } auto entire_pattern_match_sv{*entire_pattern_match_opt}; - if (!(limit_val == kphp::regex::PREG_NOLIMIT || limit_val > 1)) { - break; - } - if (const auto size{entire_pattern_match_sv.data() - offset}; !no_empty || size != 0) { auto val{string{offset, static_cast(size)}}; @@ -782,13 +761,11 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val } output.emplace_back(std::move(output_val)); - if (limit_val != kphp::regex::PREG_NOLIMIT) { - limit_val--; - } + ++out_parts_count; } if (delim_capture) { - for (auto i{1uz}; i < match_view.size(); i++) { + for (size_t i{1}; i < match_view.size(); i++) { auto sv_opt{match_view.get_group(i)}; auto sv{sv_opt.value_or(std::string_view{})}; const auto size{sv.size()}; @@ -815,13 +792,9 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit_val offset = std::next(entire_pattern_match_sv.data(), entire_pattern_match_sv.size()); } - if (m.has_error()) [[unlikely]] { - return std::nullopt; - } - const auto size{regex_info.subject.size() - std::distance(regex_info.subject.data(), offset)}; if (!no_empty || size != 0) { - auto val{string{offset, static_cast(size)}}; + string val{offset, static_cast(size)}; mixed output_val; if (offset_capture) { @@ -854,11 +827,22 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt if (!collect_group_names(regex_info)) [[unlikely]] { return false; } - auto match_count_opt{match_regex(regex_info, offset, regex_info.match_options)}; - if (!match_count_opt.has_value()) [[unlikely]] { + + const auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(regex_info.regex_code != nullptr && regex_state.match_context); + + int32_t ret_code{pcre2_match_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), offset, + regex_info.match_options, regex_state.regex_pcre2_match_data.get(), regex_state.match_context.get())}; + // From https://www.pcre.org/current/doc/html/pcre2_match.html + // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set + // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. + if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { + std::array buffer{}; + pcre2_get_error_message_8(ret_code, reinterpret_cast(buffer.data()), buffer.size()); + kphp::log::warning("can't match by pcre2 regex due to error: {}", buffer.data()); return false; } - regex_info.match_count = *match_count_opt; + regex_info.match_count = ret_code != PCRE2_ERROR_NOMATCH ? ret_code : 0; std::optional> matches{}; if (opt_matches.has_value()) { @@ -911,19 +895,21 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } matcher m{regex_info, static_cast(offset)}; - if (m.has_error()) { - return false; - } - for (auto match_view_opt{m.next()}; match_view_opt.has_value(); match_view_opt = m.next()) { - pcre2_match_view match_view{*match_view_opt}; + auto expected_opt_match_view{m.next()}; + while (expected_opt_match_view.has_value() && expected_opt_match_view->has_value()) { + pcre2_match_view match_view{**expected_opt_match_view}; regex_info.match_count = match_view.size(); set_all_matches(regex_info, flags, matches); if (regex_info.match_count > 0) { ++entire_match_count; } + expected_opt_match_view = m.next(); } - if (m.has_error()) [[unlikely]] { + if (!expected_opt_match_view.has_value()) [[unlikely]] { + std::array buffer{}; + pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); + kphp::log::warning("can't find all matches due to match error: {}", buffer.data()); return false; } @@ -967,9 +953,10 @@ Optional f$preg_replace(const string& pattern, const string& replacement RegexInfo regex_info{pattern, {subject.c_str(), subject.size()}, {pcre2_replacement.c_str(), pcre2_replacement.size()}}; - bool success{compile_regex(regex_info)}; - success &= replace_regex(regex_info, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)); - if (!success) [[unlikely]] { + if (!compile_regex(regex_info)) [[unlikely]] { + return {}; + } + if (!replace_regex(regex_info, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))) { return {}; } count = regex_info.replace_count; From 55ccacf1a1574a9ae51bc71b8ec0ea46f1ed567b Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:24:00 +0300 Subject: [PATCH 36/68] names --- .../stdlib/string/regex-functions.cpp | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 1d62d1536f..60de7b3cd1 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -671,9 +671,9 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { PCRE2_SIZE length_after_replace{buffer_length}; string str_after_replace{regex_info.subject.data(), static_cast(regex_info.subject.size())}; - matcher m{regex_info, {}}; + matcher pcre2_matcher{regex_info, {}}; for (; regex_info.replace_count < limit; ++regex_info.replace_count) { - auto expected_opt_match_view{m.next()}; + auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { std::array buffer{}; pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); @@ -729,9 +729,9 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo array output{}; - matcher m{regex_info, {}}; + matcher pcre2_matcher{regex_info, {}}; for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { - auto expected_opt_match_view{m.next()}; + auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { std::array buffer{}; pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); @@ -748,9 +748,9 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo if (!entire_pattern_match_opt.has_value()) [[unlikely]] { return std::nullopt; } - auto entire_pattern_match_sv{*entire_pattern_match_opt}; + auto entire_pattern_match_string_view{*entire_pattern_match_opt}; - if (const auto size{entire_pattern_match_sv.data() - offset}; !no_empty || size != 0) { + if (const auto size{entire_pattern_match_string_view.data() - offset}; !no_empty || size != 0) { auto val{string{offset, static_cast(size)}}; mixed output_val; @@ -766,20 +766,20 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo if (delim_capture) { for (size_t i{1}; i < match_view.size(); i++) { - auto sv_opt{match_view.get_group(i)}; - auto sv{sv_opt.value_or(std::string_view{})}; - const auto size{sv.size()}; + auto submatch_opt{match_view.get_group(i)}; + auto string_view{submatch_opt.value_or(std::string_view{})}; + const auto size{string_view.size()}; if (!no_empty || size != 0) { string val; - if (sv_opt.has_value()) [[likely]] { - val = string{sv.data(), static_cast(size)}; + if (submatch_opt.has_value()) [[likely]] { + val = string{string_view.data(), static_cast(size)}; } mixed output_val; if (offset_capture) { output_val = array::create( std::move(val), - sv_opt.transform([®ex_info](auto sv) noexcept { return static_cast(sv.data() - regex_info.subject.data()); }).value_or(-1)); + submatch_opt.transform([®ex_info](auto string_view) noexcept { return static_cast(string_view.data() - regex_info.subject.data()); }).value_or(-1)); } else { output_val = std::move(val); } @@ -789,7 +789,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo } } - offset = std::next(entire_pattern_match_sv.data(), entire_pattern_match_sv.size()); + offset = std::next(entire_pattern_match_string_view.data(), entire_pattern_match_string_view.size()); } const auto size{regex_info.subject.size() - std::distance(regex_info.subject.data(), offset)}; @@ -894,9 +894,9 @@ Optional f$preg_match_all(const string& pattern, const string& subject, } } - matcher m{regex_info, static_cast(offset)}; + matcher pcre2_matcher{regex_info, static_cast(offset)}; - auto expected_opt_match_view{m.next()}; + auto expected_opt_match_view{pcre2_matcher.next()}; while (expected_opt_match_view.has_value() && expected_opt_match_view->has_value()) { pcre2_match_view match_view{**expected_opt_match_view}; regex_info.match_count = match_view.size(); @@ -904,7 +904,7 @@ Optional f$preg_match_all(const string& pattern, const string& subject, if (regex_info.match_count > 0) { ++entire_match_count; } - expected_opt_match_view = m.next(); + expected_opt_match_view = pcre2_matcher.next(); } if (!expected_opt_match_view.has_value()) [[unlikely]] { std::array buffer{}; From 6aefbc670411860e0101c92f68ea9d4471ed9574 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:27:52 +0300 Subject: [PATCH 37/68] opt in names --- .../stdlib/string/regex-functions.cpp | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 60de7b3cd1..9a83c65922 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -191,13 +191,13 @@ class preg_replacement_parser { case '\\': { // \1 - auto back_reference_opt{try_get_backref(preg_replacement).transform([this](auto br) noexcept -> replacement_term { + auto opt_back_reference{try_get_backref(preg_replacement).transform([this](auto br) noexcept -> replacement_term { auto digits_end_pos{br.size()}; preg_replacement = preg_replacement.substr(digits_end_pos); return br; })}; - if (back_reference_opt.has_value()) { - return *back_reference_opt; + if (opt_back_reference.has_value()) { + return *opt_back_reference; } else { auto c{preg_replacement.front()}; if (c == '$' || c == '\\') { @@ -289,8 +289,8 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } // check runtime cache - if (auto ref_opt{regex_state.get_compiled_regex(regex_info.regex)}; ref_opt.has_value()) { - auto& [compile_options, regex_code]{ref_opt->get()}; + if (auto opt_ref{regex_state.get_compiled_regex(regex_info.regex)}; opt_ref.has_value()) { + auto& [compile_options, regex_code]{opt_ref->get()}; regex_info.compile_options = compile_options; regex_info.regex_code = std::addressof(regex_code); return true; @@ -744,11 +744,11 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo } pcre2_match_view match_view{*opt_match_view}; - auto entire_pattern_match_opt{match_view.get_group(0)}; - if (!entire_pattern_match_opt.has_value()) [[unlikely]] { + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { return std::nullopt; } - auto entire_pattern_match_string_view{*entire_pattern_match_opt}; + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; if (const auto size{entire_pattern_match_string_view.data() - offset}; !no_empty || size != 0) { auto val{string{offset, static_cast(size)}}; @@ -766,20 +766,22 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo if (delim_capture) { for (size_t i{1}; i < match_view.size(); i++) { - auto submatch_opt{match_view.get_group(i)}; - auto string_view{submatch_opt.value_or(std::string_view{})}; + auto opt_submatch{match_view.get_group(i)}; + auto string_view{opt_submatch.value_or(std::string_view{})}; const auto size{string_view.size()}; if (!no_empty || size != 0) { string val; - if (submatch_opt.has_value()) [[likely]] { + if (opt_submatch.has_value()) [[likely]] { val = string{string_view.data(), static_cast(size)}; } mixed output_val; if (offset_capture) { - output_val = array::create( - std::move(val), - submatch_opt.transform([®ex_info](auto string_view) noexcept { return static_cast(string_view.data() - regex_info.subject.data()); }).value_or(-1)); + output_val = array::create(std::move(val), opt_submatch + .transform([®ex_info](auto string_view) noexcept { + return static_cast(string_view.data() - regex_info.subject.data()); + }) + .value_or(-1)); } else { output_val = std::move(val); } From c74ed3d7e698dd200aa4f023b0dae29d0c3710f5 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:38:58 +0300 Subject: [PATCH 38/68] const RegexInfo& --- .../stdlib/string/regex-functions.cpp | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 9a83c65922..2e1c660737 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -473,7 +473,7 @@ bool collect_group_names(RegexInfo& regex_info) noexcept { class matcher { public: matcher(const RegexInfo& info, size_t match_from) noexcept - : m_regex_info{std::addressof(info)}, + : m_regex_info{info}, m_match_options{info.match_options}, m_current_offset{match_from} { kphp::log::assertion(info.regex_code != nullptr); @@ -484,17 +484,15 @@ class matcher { } std::expected, int32_t> next() noexcept { - const auto& ri{*m_regex_info}; - const auto& regex_state{RegexInstanceState::get()}; - kphp::log::assertion(ri.regex_code != nullptr && regex_state.match_context); + kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { // Try to find match - int32_t ret_code{pcre2_match_8(ri.regex_code, reinterpret_cast(ri.subject.data()), ri.subject.size(), m_current_offset, m_match_options, - m_match_data, regex_state.match_context.get())}; + int32_t ret_code{pcre2_match_8(m_regex_info.regex_code, reinterpret_cast(m_regex_info.subject.data()), m_regex_info.subject.size(), + m_current_offset, m_match_options, m_match_data, regex_state.match_context.get())}; // From https://www.pcre.org/current/doc/html/pcre2_match.html // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. @@ -505,15 +503,16 @@ class matcher { if (match_count == 0) { // If match is not found - if (m_match_options == ri.match_options || m_current_offset == ri.subject.size()) { + if (m_match_options == m_regex_info.match_options || m_current_offset == m_regex_info.subject.size()) { // Here we are sure that there are no more matches here return std::nullopt; } // Here we know that we were looking for a non-empty and anchored match, // and we're going to try searching from the next character with the default options. ++m_current_offset; - m_current_offset = static_cast(ri.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, ri.subject) : m_current_offset; - m_match_options = ri.match_options; + m_current_offset = + static_cast(m_regex_info.compile_options & PCRE2_UTF) ? skip_utf8_subsequent_bytes(m_current_offset, m_regex_info.subject) : m_current_offset; + m_match_options = m_regex_info.match_options; continue; } @@ -527,14 +526,14 @@ class matcher { m_match_options |= PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } else { // Else use default options - m_match_options = ri.match_options; + m_match_options = m_regex_info.match_options; } - return pcre2_match_view{ri.subject, ovector, match_count}; + return pcre2_match_view{m_regex_info.subject, ovector, match_count}; } } private: - const RegexInfo* const m_regex_info{nullptr}; + const RegexInfo& m_regex_info; uint64_t m_match_options{}; PCRE2_SIZE m_current_offset{}; pcre2_match_data_8* m_match_data{nullptr}; @@ -716,8 +715,6 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { } std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bool no_empty, bool delim_capture, bool offset_capture) noexcept { - const char* offset{regex_info.subject.data()}; - if (limit == 0) { limit = kphp::regex::PREG_NOLIMIT; } @@ -730,6 +727,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo array output{}; matcher pcre2_matcher{regex_info, {}}; + const char* offset{regex_info.subject.data()}; for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { From 7f487e6f067a8313abcea698ed2ba89afed2299b Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:42:13 +0300 Subject: [PATCH 39/68] remove unused include --- runtime-light/stdlib/string/regex-functions.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 2e1c660737..30c1bb9178 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include From bd80df995585a0dca3210afe9b0adba47b533304 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:43:23 +0300 Subject: [PATCH 40/68] remove defaults --- runtime-light/stdlib/string/regex-functions.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 30c1bb9178..6b2992aa39 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -76,11 +76,6 @@ class pcre2_match_view { m_ovector_ptr{ovector}, m_num_groups{ret_code} {} - pcre2_match_view(const pcre2_match_view&) noexcept = default; - pcre2_match_view(pcre2_match_view&&) noexcept = default; - pcre2_match_view& operator=(const pcre2_match_view&) noexcept = default; - pcre2_match_view& operator=(pcre2_match_view&&) noexcept = default; - int32_t size() const noexcept { return m_num_groups; } From dfdd5319de6082976a110b6ba0254a17d46ff045 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:45:27 +0300 Subject: [PATCH 41/68] const ovector --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 6b2992aa39..b2582f0752 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -71,7 +71,7 @@ struct RegexInfo final { class pcre2_match_view { public: - pcre2_match_view(std::string_view subject, PCRE2_SIZE* ovector, int32_t ret_code) noexcept + pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, int32_t ret_code) noexcept : m_subject_data{subject}, m_ovector_ptr{ovector}, m_num_groups{ret_code} {} @@ -481,7 +481,7 @@ class matcher { const auto& regex_state{RegexInstanceState::get()}; kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); - auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; + const auto* const ovector{pcre2_get_ovector_pointer_8(m_match_data)}; while (true) { // Try to find match From 28c0ad231e26caf0eb26de7f972bc206bc80fe9e Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:51:14 +0300 Subject: [PATCH 42/68] using std::distance --- runtime-light/stdlib/string/regex-functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index b2582f0752..df2d625012 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -742,12 +742,12 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo } auto entire_pattern_match_string_view{*opt_entire_pattern_match}; - if (const auto size{entire_pattern_match_string_view.data() - offset}; !no_empty || size != 0) { + if (const auto size{std::distance(offset, entire_pattern_match_string_view.data())}; !no_empty || size != 0) { auto val{string{offset, static_cast(size)}}; mixed output_val; if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset - regex_info.subject.data())); + output_val = array::create(std::move(val), static_cast(std::distance(regex_info.subject.data(), offset))); } else { output_val = std::move(val); } @@ -771,7 +771,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo if (offset_capture) { output_val = array::create(std::move(val), opt_submatch .transform([®ex_info](auto string_view) noexcept { - return static_cast(string_view.data() - regex_info.subject.data()); + return static_cast(std::distance(regex_info.subject.data(), string_view.data())); }) .value_or(-1)); } else { @@ -792,7 +792,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo mixed output_val; if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset - regex_info.subject.data())); + output_val = array::create(std::move(val), static_cast(std::distance(regex_info.subject.data(), offset))); } else { output_val = std::move(val); } From 88c5f7b5ad72741b9723afec4cc8fa070c60b0e3 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 22:58:48 +0300 Subject: [PATCH 43/68] use size_t offset --- runtime-light/stdlib/string/regex-functions.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index df2d625012..8fec54234b 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -721,7 +721,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo array output{}; matcher pcre2_matcher{regex_info, {}}; - const char* offset{regex_info.subject.data()}; + size_t offset{}; for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { @@ -742,12 +742,12 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo } auto entire_pattern_match_string_view{*opt_entire_pattern_match}; - if (const auto size{std::distance(offset, entire_pattern_match_string_view.data())}; !no_empty || size != 0) { - auto val{string{offset, static_cast(size)}}; + if (const auto size{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) - offset}; !no_empty || size != 0) { + auto val{string{std::next(regex_info.subject.data(), offset), static_cast(size)}}; mixed output_val; if (offset_capture) { - output_val = array::create(std::move(val), static_cast(std::distance(regex_info.subject.data(), offset))); + output_val = array::create(std::move(val), static_cast(offset)); } else { output_val = std::move(val); } @@ -783,16 +783,16 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo } } - offset = std::next(entire_pattern_match_string_view.data(), entire_pattern_match_string_view.size()); + offset = std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) + entire_pattern_match_string_view.size(); } - const auto size{regex_info.subject.size() - std::distance(regex_info.subject.data(), offset)}; + const auto size{regex_info.subject.size() - offset}; if (!no_empty || size != 0) { - string val{offset, static_cast(size)}; + string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; mixed output_val; if (offset_capture) { - output_val = array::create(std::move(val), static_cast(std::distance(regex_info.subject.data(), offset))); + output_val = array::create(std::move(val), static_cast(offset)); } else { output_val = std::move(val); } From 02bc3ecf474d898f23c7a7a57c9b58e35e05d92b Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 23:04:28 +0300 Subject: [PATCH 44/68] small refactoring --- runtime-light/stdlib/string/regex-functions.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 8fec54234b..b49d76a5eb 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -720,8 +720,8 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo array output{}; - matcher pcre2_matcher{regex_info, {}}; size_t offset{}; + matcher pcre2_matcher{regex_info, {}}; for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { @@ -734,6 +734,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo if (!opt_match_view.has_value()) { break; } + pcre2_match_view match_view{*opt_match_view}; auto opt_entire_pattern_match{match_view.get_group(0)}; From 1fb1aab98b7815c3fdca979559af07fa0285ae45 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 23:06:59 +0300 Subject: [PATCH 45/68] remove unused include --- runtime-light/stdlib/string/regex-state.h | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index ae47ad9d69..6f59c5249f 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "common/mixin/not_copyable.h" From 25ac6771ec7cf0c10665b5634b44cbcd72bb1799 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 23:37:05 +0300 Subject: [PATCH 46/68] using regex_pcre2_code_t = std::unique_ptr; --- runtime-light/stdlib/string/regex-functions.cpp | 11 ++++++----- runtime-light/stdlib/string/regex-include.h | 2 +- runtime-light/stdlib/string/regex-state.cpp | 5 +++-- runtime-light/stdlib/string/regex-state.h | 4 ++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index b49d76a5eb..d0c7f90d56 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -48,7 +48,7 @@ struct RegexInfo final { // number of groups including entire match uint32_t capture_count{}; // compiled regex - regex_pcre2_code_t regex_code{nullptr}; + pcre2_code_8* regex_code{nullptr}; // vector of group names regex_pcre2_group_names_t group_names; @@ -286,7 +286,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { if (auto opt_ref{regex_state.get_compiled_regex(regex_info.regex)}; opt_ref.has_value()) { auto& [compile_options, regex_code]{opt_ref->get()}; regex_info.compile_options = compile_options; - regex_info.regex_code = std::addressof(regex_code); + regex_info.regex_code = regex_code.get(); return true; } @@ -419,7 +419,8 @@ bool compile_regex(RegexInfo& regex_info) noexcept { int32_t error_number{}; PCRE2_SIZE error_offset{}; regex_pcre2_code_t regex_code{pcre2_compile_8(reinterpret_cast(regex_body.data()), regex_body.size(), regex_info.compile_options, - std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get())}; + std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get()), + pcre2_code_free_8}; if (!regex_code) [[unlikely]] { std::array buffer{}; pcre2_get_error_message_8(error_number, reinterpret_cast(buffer.data()), buffer.size()); @@ -427,10 +428,10 @@ bool compile_regex(RegexInfo& regex_info) noexcept { return false; } + regex_info.regex_code = regex_code.get(); // add compiled code to runtime cache - regex_state.add_compiled_regex(regex_info.regex, compile_options, *regex_code); + regex_state.add_compiled_regex(regex_info.regex, compile_options, std::move(regex_code)); - regex_info.regex_code = regex_code; return true; } diff --git a/runtime-light/stdlib/string/regex-include.h b/runtime-light/stdlib/string/regex-include.h index b6924163b9..b0670a06ed 100644 --- a/runtime-light/stdlib/string/regex-include.h +++ b/runtime-light/stdlib/string/regex-include.h @@ -13,4 +13,4 @@ using regex_pcre2_general_context_t = std::unique_ptr; using regex_pcre2_match_context_t = std::unique_ptr; using regex_pcre2_match_data_t = std::unique_ptr; -using regex_pcre2_code_t = pcre2_code_8*; +using regex_pcre2_code_t = std::unique_ptr; diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index e560c36252..a365d1b4ea 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -56,8 +56,9 @@ std::optional> } std::optional> RegexInstanceState::add_compiled_regex(string regex, uint32_t compile_options, - pcre2_code_8& regex_code) noexcept { - return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = regex_code}).first->second; + regex_pcre2_code_t regex_code) noexcept { + return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = std::move(regex_code)}) + .first->second; } RegexInstanceState& RegexInstanceState::get() noexcept { diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 6f59c5249f..37a7cab3df 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -25,7 +25,7 @@ struct RegexInstanceState final : private vk::not_copyable { // PCRE compile options of the regex uint32_t compile_options{}; // compiled regex - pcre2_code_8& regex_code; + regex_pcre2_code_t regex_code; }; kphp::stl::unordered_map regex_pcre2_code_cache; @@ -43,7 +43,7 @@ struct RegexInstanceState final : private vk::not_copyable { std::optional> get_compiled_regex(const string& regex) const noexcept; - std::optional> add_compiled_regex(string regex, uint32_t compile_options, pcre2_code_8& regex_code) noexcept; + std::optional> add_compiled_regex(string regex, uint32_t compile_options, regex_pcre2_code_t regex_code) noexcept; static RegexInstanceState& get() noexcept; }; From 290e81baf78b479ab53eaa8a40594c0ad45547bd Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 23:41:55 +0300 Subject: [PATCH 47/68] inline regex code cache --- runtime-light/stdlib/string/regex-state.cpp | 13 ------------- runtime-light/stdlib/string/regex-state.h | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index a365d1b4ea..c633286957 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -48,19 +48,6 @@ RegexInstanceState::RegexInstanceState() noexcept } } -std::optional> RegexInstanceState::get_compiled_regex(const string& regex) const noexcept { - if (const auto it{regex_pcre2_code_cache.find(regex)}; it != regex_pcre2_code_cache.end()) { - return it->second; - } - return std::nullopt; -} - -std::optional> RegexInstanceState::add_compiled_regex(string regex, uint32_t compile_options, - regex_pcre2_code_t regex_code) noexcept { - return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = std::move(regex_code)}) - .first->second; -} - RegexInstanceState& RegexInstanceState::get() noexcept { return InstanceState::get().regex_instance_state; } diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 37a7cab3df..a479dc73d4 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -41,9 +41,18 @@ struct RegexInstanceState final : private vk::not_copyable { RegexInstanceState() noexcept; - std::optional> get_compiled_regex(const string& regex) const noexcept; - - std::optional> add_compiled_regex(string regex, uint32_t compile_options, regex_pcre2_code_t regex_code) noexcept; + std::optional> get_compiled_regex(const string& regex) const noexcept { + if (const auto it{regex_pcre2_code_cache.find(regex)}; it != regex_pcre2_code_cache.end()) { + return it->second; + } + return std::nullopt; + } + + std::optional> add_compiled_regex(string regex, uint32_t compile_options, + regex_pcre2_code_t regex_code) noexcept { + return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = std::move(regex_code)}) + .first->second; + } static RegexInstanceState& get() noexcept; }; From bda7c7458f634a66797a168c4146ccb2ece53a1a Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 23:43:11 +0300 Subject: [PATCH 48/68] move size_t offset down as much as possible --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index d0c7f90d56..8fe5ac8b86 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -721,8 +721,8 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo array output{}; - size_t offset{}; matcher pcre2_matcher{regex_info, {}}; + size_t offset{}; for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { From 4f66884b7df617de94eeba1ab5fe0319a46b9bb8 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Mon, 8 Dec 2025 23:49:43 +0300 Subject: [PATCH 49/68] auto -> string --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 8fe5ac8b86..d3cae5c472 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -745,7 +745,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo auto entire_pattern_match_string_view{*opt_entire_pattern_match}; if (const auto size{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data()) - offset}; !no_empty || size != 0) { - auto val{string{std::next(regex_info.subject.data(), offset), static_cast(size)}}; + string val{std::next(regex_info.subject.data(), offset), static_cast(size)}; mixed output_val; if (offset_capture) { From 8ce46e18c77510aa313f5f7927d875afd6ea8fc1 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 16:01:02 +0300 Subject: [PATCH 50/68] squash --- .../stdlib/string/regex-functions.cpp | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index d3cae5c472..887c59e3cb 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -267,6 +267,13 @@ class preg_replacement_parser { } }; +template +void log_regex_error(const char (&msg)[N], int32_t regex_error) noexcept { + std::array buffer{}; + pcre2_get_error_message_8(regex_error, reinterpret_cast(buffer.data()), buffer.size()); + kphp::log::warning("{}: {}", msg, buffer.data()); +} + bool compile_regex(RegexInfo& regex_info) noexcept { const vk::final_action finalizer{[®ex_info]() noexcept { if (regex_info.regex_code != nullptr) [[likely]] { @@ -654,9 +661,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(regex_info.replace_count, reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("pcre2_substitute error {}", buffer.data()); + log_regex_error("pcre2_substitute error {}", regex_info.replace_count); return false; } } else { // replace only 'limit' times @@ -669,9 +674,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { for (; regex_info.replace_count < limit; ++regex_info.replace_count) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't replace by pcre2 regex due to match error: {}", buffer.data()); + log_regex_error("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; @@ -726,9 +729,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't split by pcre2 regex due to match error: {}", buffer.data()); + log_regex_error("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); return std::nullopt; } auto opt_match_view{*expected_opt_match_view}; @@ -761,21 +762,22 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo if (delim_capture) { for (size_t i{1}; i < match_view.size(); i++) { auto opt_submatch{match_view.get_group(i)}; - auto string_view{opt_submatch.value_or(std::string_view{})}; - const auto size{string_view.size()}; + auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; + const auto size{submatch_string_view.size()}; if (!no_empty || size != 0) { string val; if (opt_submatch.has_value()) [[likely]] { - val = string{string_view.data(), static_cast(size)}; + val = string{submatch_string_view.data(), static_cast(size)}; } mixed output_val; if (offset_capture) { - output_val = array::create(std::move(val), opt_submatch - .transform([®ex_info](auto string_view) noexcept { - return static_cast(std::distance(regex_info.subject.data(), string_view.data())); - }) - .value_or(-1)); + output_val = + array::create(std::move(val), opt_submatch + .transform([®ex_info](auto submatch_string_view) noexcept { + return static_cast(std::distance(regex_info.subject.data(), submatch_string_view.data())); + }) + .value_or(-1)); } else { output_val = std::move(val); } @@ -833,9 +835,7 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(ret_code, reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't match by pcre2 regex due to error: {}", buffer.data()); + log_regex_error("can't match by pcre2 regex due to error: {}", ret_code); return false; } regex_info.match_count = ret_code != PCRE2_ERROR_NOMATCH ? ret_code : 0; @@ -903,9 +903,7 @@ Optional f$preg_match_all(const string& pattern, const string& subject, expected_opt_match_view = pcre2_matcher.next(); } if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - pcre2_get_error_message_8(expected_opt_match_view.error(), reinterpret_cast(buffer.data()), buffer.size()); - kphp::log::warning("can't find all matches due to match error: {}", buffer.data()); + log_regex_error("can't find all matches due to match error: {}", expected_opt_match_view.error()); return false; } From e60fe9bca675acd8b5214422db2efcf585bea71f Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 16:10:26 +0300 Subject: [PATCH 51/68] fix REPLACE_BUFFER_SIZE --- runtime-light/stdlib/string/regex-state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index a479dc73d4..da1775eb7a 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -32,7 +32,7 @@ struct RegexInstanceState final : private vk::not_copyable { public: static constexpr size_t OVECTOR_SIZE{MAX_SUBPATTERNS_COUNT + 1}; - static constexpr size_t REPLACE_BUFFER_SIZE{16U * 1024U}; + static constexpr size_t REPLACE_BUFFER_SIZE{size_t{16U} * size_t{1024U}}; const regex_pcre2_general_context_t regex_pcre2_general_context; const regex_pcre2_compile_context_t compile_context; From 5d442239aa15dcfd8e55218d461df0c72c9f4a97 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 16:19:22 +0300 Subject: [PATCH 52/68] fix includes --- runtime-light/stdlib/string/regex-state.cpp | 4 ---- runtime-light/stdlib/string/regex-state.h | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index c633286957..dbf005549d 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -4,11 +4,7 @@ #include "runtime-light/stdlib/string/regex-state.h" -#include -#include - #include "runtime-common/core/allocator/script-malloc-interface.h" -#include "runtime-common/core/runtime-core.h" #include "runtime-light/state/instance-state.h" #include "runtime-light/stdlib/diagnostics/logs.h" #include "runtime-light/stdlib/string/regex-include.h" diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index da1775eb7a..ea61a9cf9f 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "common/mixin/not_copyable.h" #include "runtime-common/core/allocator/script-allocator.h" From 368004e05cf513dd58aa8cf9e1414255efa20bb3 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 16:21:12 +0300 Subject: [PATCH 53/68] ret_code -> num_groups --- runtime-light/stdlib/string/regex-functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 887c59e3cb..8bffe16c55 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -71,10 +71,10 @@ struct RegexInfo final { class pcre2_match_view { public: - pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, int32_t ret_code) noexcept + pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, int32_t num_groups) noexcept : m_subject_data{subject}, m_ovector_ptr{ovector}, - m_num_groups{ret_code} {} + m_num_groups{num_groups} {} int32_t size() const noexcept { return m_num_groups; From 11496f3d8293f9250014cb80c03818a21c70a399 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 16:25:01 +0300 Subject: [PATCH 54/68] regex_body type --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 8bffe16c55..50b63255c0 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -343,7 +343,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { } uint32_t compile_options{}; - auto regex_body{std::string_view{regex_info.regex.c_str(), regex_info.regex.size()}}; + std::string_view regex_body{regex_info.regex.c_str(), regex_info.regex.size()}; // remove start delimiter regex_body.remove_prefix(1); From 9351c5e19c160797de295026a272c338cf40ebab Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 17:46:51 +0300 Subject: [PATCH 55/68] fixes --- .../stdlib/string/regex-functions.cpp | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 50b63255c0..380cd2ce89 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -71,7 +71,7 @@ struct RegexInfo final { class pcre2_match_view { public: - pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, int32_t num_groups) noexcept + pcre2_match_view(std::string_view subject, const PCRE2_SIZE* ovector, size_t num_groups) noexcept : m_subject_data{subject}, m_ovector_ptr{ovector}, m_num_groups{num_groups} {} @@ -88,7 +88,7 @@ class pcre2_match_view { kphp::log::assertion(m_ovector_ptr); // ovector is an array of offset pairs PCRE2_SIZE start{m_ovector_ptr[2 * i]}; - PCRE2_SIZE end{m_ovector_ptr[2 * i + 1]}; + PCRE2_SIZE end{m_ovector_ptr[(2 * i) + 1]}; if (start == PCRE2_UNSET) { return std::nullopt; @@ -100,7 +100,7 @@ class pcre2_match_view { private: std::string_view m_subject_data; const PCRE2_SIZE* m_ovector_ptr; - int32_t m_num_groups; + size_t m_num_groups; }; template @@ -124,11 +124,11 @@ bool correct_offset(int64_t& offset, std::string_view subject) noexcept { return offset <= subject.size(); } -int64_t skip_utf8_subsequent_bytes(int64_t offset, const std::string_view subject) noexcept { +int64_t skip_utf8_subsequent_bytes(size_t offset, const std::string_view subject) noexcept { // all multibyte utf8 runes consist of subsequent bytes, // these subsequent bytes start with 10 bit pattern // 0xc0 selects the two most significant bits, then we compare it to 0x80 (0b10000000) - while (offset < static_cast(subject.size()) && ((static_cast(subject[offset])) & 0xc0) == 0x80) { + while (offset < subject.size() && ((static_cast(subject[offset])) & 0xc0) == 0x80) { offset++; } return offset; @@ -291,7 +291,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { // check runtime cache if (auto opt_ref{regex_state.get_compiled_regex(regex_info.regex)}; opt_ref.has_value()) { - auto& [compile_options, regex_code]{opt_ref->get()}; + const auto& [compile_options, regex_code]{opt_ref->get()}; regex_info.compile_options = compile_options; regex_info.regex_code = regex_code.get(); return true; @@ -682,9 +682,14 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { break; } - const auto* ovector{pcre2_get_ovector_pointer_8(regex_state.regex_pcre2_match_data.get())}; - const auto match_start_offset{ovector[0]}; - const auto match_end_offset{ovector[1]}; + auto match_view{*opt_match_view}; + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { + return false; + } + auto entire_pattern_match_string_view{*opt_entire_pattern_match}; + const auto match_start_offset{std::distance(regex_info.subject.data(), entire_pattern_match_string_view.data())}; + const auto match_end_offset{match_start_offset + entire_pattern_match_string_view.size()}; length_after_replace = buffer_length; if (auto replace_one_ret_code{pcre2_substitute_8( From 63508245e386ac039933a30594f8e344b97ac43c Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 17:55:20 +0300 Subject: [PATCH 56/68] fix --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 380cd2ce89..7bc10986fc 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -501,7 +501,7 @@ class matcher { if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { return std::unexpected{ret_code}; } - auto match_count{ret_code != PCRE2_ERROR_NOMATCH ? ret_code : 0}; + size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; if (match_count == 0) { // If match is not found From 176485fdeb1a012522dfcd3b084f89c883fcb2c7 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 19:40:24 +0300 Subject: [PATCH 57/68] add "// NOLINT" --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 7bc10986fc..6c5b43c581 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -245,7 +245,7 @@ class preg_replacement_parser { } return *this; } - iterator operator++(int) noexcept { + iterator operator++(int) noexcept { // NOLINT iterator temp{*this}; ++(*this); return temp; From 41343bb66de95ca782ecd5b81e8603c4f9d5c8d3 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 19:58:30 +0300 Subject: [PATCH 58/68] check pcre2_get_error_message_8 return code --- .../stdlib/string/regex-functions.cpp | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 6c5b43c581..2ad232a15b 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -270,7 +270,20 @@ class preg_replacement_parser { template void log_regex_error(const char (&msg)[N], int32_t regex_error) noexcept { std::array buffer{}; - pcre2_get_error_message_8(regex_error, reinterpret_cast(buffer.data()), buffer.size()); + auto ret_code{pcre2_get_error_message_8(regex_error, reinterpret_cast(buffer.data()), buffer.size())}; + if (ret_code < 0) [[unlikely]] { + switch (ret_code) { + case PCRE2_ERROR_BADDATA: + kphp::log::warning("unknown regex error code: {}", regex_error); + return; + case PCRE2_ERROR_NOMEMORY: + kphp::log::warning("[truncated] {}: {}", msg, buffer.data()); + return; + default: + kphp::log::warning("unsupported regex error code: {}", ret_code); + return; + } + } kphp::log::warning("{}: {}", msg, buffer.data()); } @@ -430,7 +443,20 @@ bool compile_regex(RegexInfo& regex_info) noexcept { pcre2_code_free_8}; if (!regex_code) [[unlikely]] { std::array buffer{}; - pcre2_get_error_message_8(error_number, reinterpret_cast(buffer.data()), buffer.size()); + auto ret_code{pcre2_get_error_message_8(error_number, reinterpret_cast(buffer.data()), buffer.size())}; + if (ret_code < 0) [[unlikely]] { + switch (ret_code) { + case PCRE2_ERROR_BADDATA: + kphp::log::warning("unknown regex error code: {}", error_number); + return false; + case PCRE2_ERROR_NOMEMORY: + kphp::log::warning("[truncated] can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); + return false; + default: + kphp::log::warning("unsupported regex error code: {}", ret_code); + return false; + } + } kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); return false; } From 146461839e6f61383b6541bd217e7c169707ca37 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 20:04:20 +0300 Subject: [PATCH 59/68] fix usage of log_regex_error --- runtime-light/stdlib/string/regex-functions.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 2ad232a15b..7291bd4c9d 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -687,7 +687,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - log_regex_error("pcre2_substitute error {}", regex_info.replace_count); + log_regex_error("pcre2_substitute error", regex_info.replace_count); return false; } } else { // replace only 'limit' times @@ -700,7 +700,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { for (; regex_info.replace_count < limit; ++regex_info.replace_count) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + log_regex_error("can't replace by pcre2 regex due to match error:", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; @@ -760,7 +760,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + log_regex_error("can't split by pcre2 regex due to match error:", expected_opt_match_view.error()); return std::nullopt; } auto opt_match_view{*expected_opt_match_view}; @@ -866,7 +866,7 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - log_regex_error("can't match by pcre2 regex due to error: {}", ret_code); + log_regex_error("can't match by pcre2 regex due to error:", ret_code); return false; } regex_info.match_count = ret_code != PCRE2_ERROR_NOMATCH ? ret_code : 0; @@ -934,7 +934,7 @@ Optional f$preg_match_all(const string& pattern, const string& subject, expected_opt_match_view = pcre2_matcher.next(); } if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't find all matches due to match error: {}", expected_opt_match_view.error()); + log_regex_error("can't find all matches due to match error:", expected_opt_match_view.error()); return false; } From b3ba1905e7c010d11b801a7aaf14dc7435cf6816 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 20:13:08 +0300 Subject: [PATCH 60/68] rewrite preg_match using matcher --- .../stdlib/string/regex-functions.cpp | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 7291bd4c9d..3e629c111a 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -700,7 +700,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { for (; regex_info.replace_count < limit; ++regex_info.replace_count) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't replace by pcre2 regex due to match error:", expected_opt_match_view.error()); + log_regex_error("can't replace by pcre2 regex due to match error", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; @@ -760,7 +760,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't split by pcre2 regex due to match error:", expected_opt_match_view.error()); + log_regex_error("can't split by pcre2 regex due to match error", expected_opt_match_view.error()); return std::nullopt; } auto opt_match_view{*expected_opt_match_view}; @@ -860,16 +860,23 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt const auto& regex_state{RegexInstanceState::get()}; kphp::log::assertion(regex_info.regex_code != nullptr && regex_state.match_context); - int32_t ret_code{pcre2_match_8(regex_info.regex_code, reinterpret_cast(regex_info.subject.data()), regex_info.subject.size(), offset, - regex_info.match_options, regex_state.regex_pcre2_match_data.get(), regex_state.match_context.get())}; - // From https://www.pcre.org/current/doc/html/pcre2_match.html - // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set - // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. - if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - log_regex_error("can't match by pcre2 regex due to error:", ret_code); + auto expected_opt_match_view{matcher{regex_info, static_cast(offset)}.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + log_regex_error("can't match by pcre2 regex due to error", expected_opt_match_view.error()); + return false; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + return 0; + } + + pcre2_match_view match_view{*opt_match_view}; + + auto opt_entire_pattern_match{match_view.get_group(0)}; + if (!opt_entire_pattern_match.has_value()) [[unlikely]] { return false; } - regex_info.match_count = ret_code != PCRE2_ERROR_NOMATCH ? ret_code : 0; + regex_info.match_count = opt_entire_pattern_match->size(); std::optional> matches{}; if (opt_matches.has_value()) { @@ -934,7 +941,7 @@ Optional f$preg_match_all(const string& pattern, const string& subject, expected_opt_match_view = pcre2_matcher.next(); } if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't find all matches due to match error:", expected_opt_match_view.error()); + log_regex_error("can't find all matches due to match error", expected_opt_match_view.error()); return false; } From 21aa388a4e1a57ca2587db74a80aac22fa61a1ac Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Tue, 9 Dec 2025 20:28:25 +0300 Subject: [PATCH 61/68] fix --- runtime-light/stdlib/string/regex-functions.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 3e629c111a..6f78cbf039 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -866,17 +866,8 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt return false; } auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - return 0; - } - - pcre2_match_view match_view{*opt_match_view}; - auto opt_entire_pattern_match{match_view.get_group(0)}; - if (!opt_entire_pattern_match.has_value()) [[unlikely]] { - return false; - } - regex_info.match_count = opt_entire_pattern_match->size(); + regex_info.match_count = opt_match_view.transform(&pcre2_match_view::size).value_or(0); std::optional> matches{}; if (opt_matches.has_value()) { From 0211fe2a9af8eccf074e0d9bbaa18a0cb9e48bc4 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 10 Dec 2025 16:13:18 +0300 Subject: [PATCH 62/68] fix pcre2_substitute error logging --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 6f78cbf039..d279d754a1 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -723,7 +723,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error {}", replace_one_ret_code); + log_regex_error("pcre2_substitute error", replace_one_ret_code); return false; } From 18a0c41e43abe5ce220e7bcef3fb9f2b39c16113 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 10 Dec 2025 18:09:57 +0300 Subject: [PATCH 63/68] regex_error_to_buffer --- .../stdlib/string/regex-functions.cpp | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index d279d754a1..41a490e238 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -267,24 +267,22 @@ class preg_replacement_parser { } }; -template -void log_regex_error(const char (&msg)[N], int32_t regex_error) noexcept { - std::array buffer{}; +bool regex_error_to_buffer(std::array& buffer, int32_t regex_error) noexcept { auto ret_code{pcre2_get_error_message_8(regex_error, reinterpret_cast(buffer.data()), buffer.size())}; + return true; if (ret_code < 0) [[unlikely]] { switch (ret_code) { case PCRE2_ERROR_BADDATA: kphp::log::warning("unknown regex error code: {}", regex_error); - return; + return false; case PCRE2_ERROR_NOMEMORY: - kphp::log::warning("[truncated] {}: {}", msg, buffer.data()); - return; + kphp::log::warning("[truncated]: {}", buffer.data()); + return false; default: kphp::log::warning("unsupported regex error code: {}", ret_code); - return; + return false; } } - kphp::log::warning("{}: {}", msg, buffer.data()); } bool compile_regex(RegexInfo& regex_info) noexcept { @@ -443,21 +441,9 @@ bool compile_regex(RegexInfo& regex_info) noexcept { pcre2_code_free_8}; if (!regex_code) [[unlikely]] { std::array buffer{}; - auto ret_code{pcre2_get_error_message_8(error_number, reinterpret_cast(buffer.data()), buffer.size())}; - if (ret_code < 0) [[unlikely]] { - switch (ret_code) { - case PCRE2_ERROR_BADDATA: - kphp::log::warning("unknown regex error code: {}", error_number); - return false; - case PCRE2_ERROR_NOMEMORY: - kphp::log::warning("[truncated] can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); - return false; - default: - kphp::log::warning("unsupported regex error code: {}", ret_code); - return false; - } + if (regex_error_to_buffer(buffer, error_number)) { + kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); } - kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); return false; } @@ -687,7 +673,10 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - log_regex_error("pcre2_substitute error", regex_info.replace_count); + std::array buffer{}; + if (regex_error_to_buffer(buffer, regex_info.replace_count)) { + kphp::log::warning("pcre2_substitute error: {}", buffer.data()); + } return false; } } else { // replace only 'limit' times @@ -700,7 +689,10 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { for (; regex_info.replace_count < limit; ++regex_info.replace_count) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't replace by pcre2 regex due to match error", expected_opt_match_view.error()); + std::array buffer{}; + if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { + kphp::log::warning("can't replace by pcre2 regex due to match error: {}", buffer.data()); + } return false; } auto opt_match_view{*expected_opt_match_view}; @@ -723,7 +715,10 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - log_regex_error("pcre2_substitute error", replace_one_ret_code); + std::array buffer{}; + if (regex_error_to_buffer(buffer, replace_one_ret_code)) { + kphp::log::warning("pcre2_substitute error: {}", buffer.data()); + } return false; } @@ -760,7 +755,10 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't split by pcre2 regex due to match error", expected_opt_match_view.error()); + std::array buffer{}; + if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { + kphp::log::warning("can't split by pcre2 regex due to match error: {}", buffer.data()); + } return std::nullopt; } auto opt_match_view{*expected_opt_match_view}; @@ -862,7 +860,10 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt auto expected_opt_match_view{matcher{regex_info, static_cast(offset)}.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't match by pcre2 regex due to error", expected_opt_match_view.error()); + std::array buffer{}; + if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { + kphp::log::warning("can't match by pcre2 regex due to error: {}", buffer.data()); + } return false; } auto opt_match_view{*expected_opt_match_view}; @@ -932,7 +933,10 @@ Optional f$preg_match_all(const string& pattern, const string& subject, expected_opt_match_view = pcre2_matcher.next(); } if (!expected_opt_match_view.has_value()) [[unlikely]] { - log_regex_error("can't find all matches due to match error", expected_opt_match_view.error()); + std::array buffer{}; + if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { + kphp::log::warning("can't find all matches due to match error: {}", buffer.data()); + } return false; } From 97d531c6baf8c59561cbe25cbd748fe227e91b21 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 10 Dec 2025 19:19:47 +0300 Subject: [PATCH 64/68] formatter --- .../stdlib/string/regex-functions.cpp | 88 ++++++++----------- runtime-light/stdlib/string/regex-functions.h | 6 ++ 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 41a490e238..c5091ee1ec 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -29,9 +29,36 @@ #include "runtime-light/stdlib/string/regex-include.h" #include "runtime-light/stdlib/string/regex-state.h" -namespace { +namespace std { + +template<> +struct formatter { + template + constexpr auto parse(ParseContext& ctx) const noexcept { + return ctx.begin(); + } + + template + auto format(kphp::regex::pcre2_error error, FmtContext& ctx) const noexcept { + std::array buffer{}; + auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; + if (ret_code < 0) [[unlikely]] { + switch (ret_code) { + case PCRE2_ERROR_BADDATA: + return format_to(ctx.out(), "unknown error ({})", error.code); + case PCRE2_ERROR_NOMEMORY: + return format_to(ctx.out(), "[truncated] {}", buffer.data()); + default: + kphp::log::error("unsupported regex error code: {}", ret_code); + } + } + return format_to(ctx.out(), "{}", buffer.data()); + } +}; + +} // namespace std -constexpr size_t ERROR_BUFFER_LENGTH{256}; +namespace { enum class trailing_unmatch : uint8_t { skip, include }; @@ -267,24 +294,6 @@ class preg_replacement_parser { } }; -bool regex_error_to_buffer(std::array& buffer, int32_t regex_error) noexcept { - auto ret_code{pcre2_get_error_message_8(regex_error, reinterpret_cast(buffer.data()), buffer.size())}; - return true; - if (ret_code < 0) [[unlikely]] { - switch (ret_code) { - case PCRE2_ERROR_BADDATA: - kphp::log::warning("unknown regex error code: {}", regex_error); - return false; - case PCRE2_ERROR_NOMEMORY: - kphp::log::warning("[truncated]: {}", buffer.data()); - return false; - default: - kphp::log::warning("unsupported regex error code: {}", ret_code); - return false; - } - } -} - bool compile_regex(RegexInfo& regex_info) noexcept { const vk::final_action finalizer{[®ex_info]() noexcept { if (regex_info.regex_code != nullptr) [[likely]] { @@ -440,10 +449,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get()), pcre2_code_free_8}; if (!regex_code) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, error_number)) { - kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, buffer.data()); - } + kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, kphp::regex::pcre2_error{.code = error_number}); return false; } @@ -497,7 +503,7 @@ class matcher { kphp::log::assertion(m_match_data); } - std::expected, int32_t> next() noexcept { + std::expected, kphp::regex::pcre2_error> next() noexcept { const auto& regex_state{RegexInstanceState::get()}; kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); @@ -511,7 +517,7 @@ class matcher { // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - return std::unexpected{ret_code}; + return std::unexpected{kphp::regex::pcre2_error{.code = ret_code}}; } size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; @@ -673,10 +679,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, regex_info.replace_count)) { - kphp::log::warning("pcre2_substitute error: {}", buffer.data()); - } + kphp::log::warning("pcre2_substitute error: {}", kphp::regex::pcre2_error{.code = static_cast(regex_info.replace_count)}); return false; } } else { // replace only 'limit' times @@ -689,10 +692,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { for (; regex_info.replace_count < limit; ++regex_info.replace_count) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { - kphp::log::warning("can't replace by pcre2 regex due to match error: {}", buffer.data()); - } + kphp::log::warning("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; @@ -715,10 +715,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, replace_one_ret_code)) { - kphp::log::warning("pcre2_substitute error: {}", buffer.data()); - } + kphp::log::warning("pcre2_substitute error: {}", kphp::regex::pcre2_error{.code = replace_one_ret_code}); return false; } @@ -755,10 +752,7 @@ std::optional> split_regex(RegexInfo& regex_info, int64_t limit, bo for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { auto expected_opt_match_view{pcre2_matcher.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { - kphp::log::warning("can't split by pcre2 regex due to match error: {}", buffer.data()); - } + kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); return std::nullopt; } auto opt_match_view{*expected_opt_match_view}; @@ -860,10 +854,7 @@ Optional f$preg_match(const string& pattern, const string& subject, Opt auto expected_opt_match_view{matcher{regex_info, static_cast(offset)}.next()}; if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { - kphp::log::warning("can't match by pcre2 regex due to error: {}", buffer.data()); - } + kphp::log::warning("can't match by pcre2 regex due to error: {}", expected_opt_match_view.error()); return false; } auto opt_match_view{*expected_opt_match_view}; @@ -933,10 +924,7 @@ Optional f$preg_match_all(const string& pattern, const string& subject, expected_opt_match_view = pcre2_matcher.next(); } if (!expected_opt_match_view.has_value()) [[unlikely]] { - std::array buffer{}; - if (regex_error_to_buffer(buffer, expected_opt_match_view.error())) { - kphp::log::warning("can't find all matches due to match error: {}", buffer.data()); - } + kphp::log::warning("can't find all matches due to match error: {}", expected_opt_match_view.error()); return false; } diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index a00d44c83d..1d0f049f40 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -18,6 +18,8 @@ namespace kphp::regex { +constexpr size_t ERROR_BUFFER_LENGTH{256}; + inline constexpr int64_t PREG_NO_ERROR = 0; inline constexpr int64_t PREG_INTERNAL_ERROR = 1; inline constexpr int64_t PREG_BACKTRACK_LIMIT_ERROR = 2; @@ -36,6 +38,10 @@ inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); inline constexpr int64_t PREG_NOLIMIT = -1; +struct pcre2_error { + int32_t code{}; +}; + } // namespace kphp::regex namespace regex_impl_ { From e52fd209422ff2c22fbd6c77fdc6614f8325a9ed Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Wed, 10 Dec 2025 21:24:24 +0300 Subject: [PATCH 65/68] move to details namespace --- runtime-light/stdlib/string/regex-functions.cpp | 16 ++++++++-------- runtime-light/stdlib/string/regex-functions.h | 14 +++++++++----- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index c5091ee1ec..fe7c5b0edc 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -32,15 +32,15 @@ namespace std { template<> -struct formatter { +struct formatter { template constexpr auto parse(ParseContext& ctx) const noexcept { return ctx.begin(); } template - auto format(kphp::regex::pcre2_error error, FmtContext& ctx) const noexcept { - std::array buffer{}; + auto format(details::pcre2_error error, FmtContext& ctx) const noexcept { + std::array buffer{}; auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; if (ret_code < 0) [[unlikely]] { switch (ret_code) { @@ -449,7 +449,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get()), pcre2_code_free_8}; if (!regex_code) [[unlikely]] { - kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, kphp::regex::pcre2_error{.code = error_number}); + kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, details::pcre2_error{.code = error_number}); return false; } @@ -503,7 +503,7 @@ class matcher { kphp::log::assertion(m_match_data); } - std::expected, kphp::regex::pcre2_error> next() noexcept { + std::expected, details::pcre2_error> next() noexcept { const auto& regex_state{RegexInstanceState::get()}; kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); @@ -517,7 +517,7 @@ class matcher { // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - return std::unexpected{kphp::regex::pcre2_error{.code = ret_code}}; + return std::unexpected{details::pcre2_error{.code = ret_code}}; } size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; @@ -679,7 +679,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", kphp::regex::pcre2_error{.code = static_cast(regex_info.replace_count)}); + kphp::log::warning("pcre2_substitute error: {}", details::pcre2_error{.code = static_cast(regex_info.replace_count)}); return false; } } else { // replace only 'limit' times @@ -715,7 +715,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", kphp::regex::pcre2_error{.code = replace_one_ret_code}); + kphp::log::warning("pcre2_substitute error: {}", details::pcre2_error{.code = replace_one_ret_code}); return false; } diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index 1d0f049f40..6e9507067a 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -16,10 +16,18 @@ #include "runtime-light/coroutine/type-traits.h" #include "runtime-light/stdlib/diagnostics/logs.h" -namespace kphp::regex { +namespace details { constexpr size_t ERROR_BUFFER_LENGTH{256}; +struct pcre2_error { + int32_t code{}; +}; + +} // namespace details + +namespace kphp::regex { + inline constexpr int64_t PREG_NO_ERROR = 0; inline constexpr int64_t PREG_INTERNAL_ERROR = 1; inline constexpr int64_t PREG_BACKTRACK_LIMIT_ERROR = 2; @@ -38,10 +46,6 @@ inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); inline constexpr int64_t PREG_NOLIMIT = -1; -struct pcre2_error { - int32_t code{}; -}; - } // namespace kphp::regex namespace regex_impl_ { From c4f000f4a9af74dd1a72932fccf4670acaf46039 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 11 Dec 2025 00:38:27 +0300 Subject: [PATCH 66/68] fix --- .../stdlib/string/regex-functions.cpp | 18 ++++++++++-------- runtime-light/stdlib/string/regex-functions.h | 6 ++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index fe7c5b0edc..274e6bcb6f 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -32,15 +32,17 @@ namespace std { template<> -struct formatter { +struct formatter { + constexpr static size_t ERROR_BUFFER_LENGTH{256}; + template constexpr auto parse(ParseContext& ctx) const noexcept { return ctx.begin(); } template - auto format(details::pcre2_error error, FmtContext& ctx) const noexcept { - std::array buffer{}; + auto format(kphp::regex::details::pcre2_error error, FmtContext& ctx) const noexcept { + std::array buffer{}; auto ret_code{pcre2_get_error_message_8(error.code, reinterpret_cast(buffer.data()), buffer.size())}; if (ret_code < 0) [[unlikely]] { switch (ret_code) { @@ -449,7 +451,7 @@ bool compile_regex(RegexInfo& regex_info) noexcept { std::addressof(error_number), std::addressof(error_offset), regex_state.compile_context.get()), pcre2_code_free_8}; if (!regex_code) [[unlikely]] { - kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, details::pcre2_error{.code = error_number}); + kphp::log::warning("can't compile pcre2 regex due to error at offset {}: {}", error_offset, kphp::regex::details::pcre2_error{.code = error_number}); return false; } @@ -503,7 +505,7 @@ class matcher { kphp::log::assertion(m_match_data); } - std::expected, details::pcre2_error> next() noexcept { + std::expected, kphp::regex::details::pcre2_error> next() noexcept { const auto& regex_state{RegexInstanceState::get()}; kphp::log::assertion(m_regex_info.regex_code != nullptr && regex_state.match_context); @@ -517,7 +519,7 @@ class matcher { // The return from pcre2_match() is one more than the highest numbered capturing pair that has been set // (for example, 1 if there are no captures), zero if the vector of offsets is too small, or a negative error code for no match and other errors. if (ret_code < 0 && ret_code != PCRE2_ERROR_NOMATCH) [[unlikely]] { - return std::unexpected{details::pcre2_error{.code = ret_code}}; + return std::unexpected{kphp::regex::details::pcre2_error{.code = ret_code}}; } size_t match_count{ret_code != PCRE2_ERROR_NOMATCH ? static_cast(ret_code) : 0}; @@ -679,7 +681,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(output_length)); if (regex_info.replace_count < 0) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", details::pcre2_error{.code = static_cast(regex_info.replace_count)}); + kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = static_cast(regex_info.replace_count)}); return false; } } else { // replace only 'limit' times @@ -715,7 +717,7 @@ bool replace_regex(RegexInfo& regex_info, uint64_t limit) noexcept { regex_info.replace_options, nullptr, regex_state.match_context.get(), reinterpret_cast(regex_info.replacement.data()), regex_info.replacement.size(), reinterpret_cast(runtime_ctx.static_SB.buffer()), std::addressof(length_after_replace))}; replace_one_ret_code != 1) [[unlikely]] { - kphp::log::warning("pcre2_substitute error: {}", details::pcre2_error{.code = replace_one_ret_code}); + kphp::log::warning("pcre2_substitute error: {}", kphp::regex::details::pcre2_error{.code = replace_one_ret_code}); return false; } diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index 6e9507067a..a2a52d3479 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -16,9 +16,9 @@ #include "runtime-light/coroutine/type-traits.h" #include "runtime-light/stdlib/diagnostics/logs.h" -namespace details { +namespace kphp::regex { -constexpr size_t ERROR_BUFFER_LENGTH{256}; +namespace details { struct pcre2_error { int32_t code{}; @@ -26,8 +26,6 @@ struct pcre2_error { } // namespace details -namespace kphp::regex { - inline constexpr int64_t PREG_NO_ERROR = 0; inline constexpr int64_t PREG_INTERNAL_ERROR = 1; inline constexpr int64_t PREG_BACKTRACK_LIMIT_ERROR = 2; From da142f2d6059ba909086b27b93225d9621faa555 Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 11 Dec 2025 00:41:07 +0300 Subject: [PATCH 67/68] format --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 274e6bcb6f..6dcfcbb5bb 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -34,7 +34,7 @@ namespace std { template<> struct formatter { constexpr static size_t ERROR_BUFFER_LENGTH{256}; - + template constexpr auto parse(ParseContext& ctx) const noexcept { return ctx.begin(); From 1c7c02c863147eba9b2a14649df6bcccb84bb48d Mon Sep 17 00:00:00 2001 From: Karim Shamazov Date: Thu, 11 Dec 2025 00:43:56 +0300 Subject: [PATCH 68/68] swap constexpr and static --- runtime-light/stdlib/string/regex-functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 6dcfcbb5bb..0b4420d496 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -33,7 +33,7 @@ namespace std { template<> struct formatter { - constexpr static size_t ERROR_BUFFER_LENGTH{256}; + static constexpr size_t ERROR_BUFFER_LENGTH{256}; template constexpr auto parse(ParseContext& ctx) const noexcept {