From 1765f8c2ffc90dc02836c31a7a203b2db77dff9c Mon Sep 17 00:00:00 2001 From: David Grudl Date: Wed, 27 Oct 2021 01:21:23 +0200 Subject: [PATCH] Strings: added support for UTF8 offsets in regexp --- src/Utils/Strings.php | 60 +++++++++++++++++++++++++++-- tests/Utils/Strings.match().phpt | 13 ++++++- tests/Utils/Strings.matchAll().phpt | 17 ++++++++ tests/Utils/Strings.replace().phpt | 6 +++ tests/Utils/Strings.split().phpt | 30 ++++++++++++--- 5 files changed, 114 insertions(+), 12 deletions(-) diff --git a/src/Utils/Strings.php b/src/Utils/Strings.php index 164475574..f820700fd 100644 --- a/src/Utils/Strings.php +++ b/src/Utils/Strings.php @@ -474,11 +474,17 @@ public static function split( string $pattern, bool|int $captureOffset = false, bool $skipEmpty = false, + bool $utf8 = false, ): array { $flags = is_int($captureOffset) && $captureOffset // back compatibility ? $captureOffset : ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0); - return self::pcre('preg_split', [$pattern, $subject, -1, $flags | PREG_SPLIT_DELIM_CAPTURE]); + $pattern .= $utf8 ? 'u' : ''; + $m = self::pcre('preg_split', [$pattern, $subject, -1, $flags | PREG_SPLIT_DELIM_CAPTURE]); + if ($utf8 && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) { + return self::bytesToChars($subject, [$m])[0]; + } + return $m; } @@ -491,16 +497,25 @@ public static function match( bool|int $captureOffset = false, int $offset = 0, bool $unmatchedAsNull = false, + bool $utf8 = false, ): ?array { $flags = is_int($captureOffset) && $captureOffset // back compatibility ? $captureOffset : ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0); + if ($utf8) { + $offset = strlen(self::substring($subject, 0, $offset)); + $pattern .= 'u'; + } if ($offset > strlen($subject)) { return null; } - return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset]) - ? $m - : null; + if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) { + return null; + } + if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) { + return self::bytesToChars($subject, [$m])[0]; + } + return $m; } @@ -515,10 +530,15 @@ public static function matchAll( int $offset = 0, bool $unmatchedAsNull = false, bool $patternOrder = false, + bool $utf8 = false, ): array { $flags = is_int($captureOffset) && $captureOffset // back compatibility ? $captureOffset : ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0); + if ($utf8) { + $offset = strlen(self::substring($subject, 0, $offset)); + $pattern .= 'u'; + } if ($offset > strlen($subject)) { return []; } @@ -527,6 +547,9 @@ public static function matchAll( ($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER), $offset, ]); + if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) { + return self::bytesToChars($subject, $m); + } return $m; } @@ -541,12 +564,19 @@ public static function replace( int $limit = -1, bool $captureOffset = false, bool $unmatchedAsNull = false, + bool $utf8 = false, ): string { if (is_object($replacement) || is_array($replacement)) { if (!is_callable($replacement, false, $textual)) { throw new Nette\InvalidStateException("Callback '$textual' is not callable."); } $flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0); + if ($utf8) { + $pattern .= 'u'; + if ($captureOffset) { + $replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]); + } + } return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]); } elseif (is_array($pattern) && is_string(key($pattern))) { @@ -554,10 +584,32 @@ public static function replace( $pattern = array_keys($pattern); } + if ($utf8) { + $pattern = array_map(fn($item) => $item . 'u', (array) $pattern); + } + return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]); } + private static function bytesToChars(string $s, array $groups): array + { + $lastBytes = $lastChars = 0; + foreach ($groups as &$matches) { + foreach ($matches as &$match) { + if ($match[1] > $lastBytes) { + $lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes)); + } elseif ($match[1] < $lastBytes) { + $lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1])); + } + $lastBytes = $match[1]; + $match[1] = $lastChars; + } + } + return $groups; + } + + /** @internal */ public static function pcre(string $func, array $args) { diff --git a/tests/Utils/Strings.match().phpt b/tests/Utils/Strings.match().phpt index 5b4196180..34799a885 100644 --- a/tests/Utils/Strings.match().phpt +++ b/tests/Utils/Strings.match().phpt @@ -19,13 +19,22 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#')); Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#')); -Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE)); -Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true)); +Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE)); +Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true)); +Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true)); Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', unmatchedAsNull: true)); Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', 0, 0, unmatchedAsNull: true)); // $flags = 0 Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', offset: 2)); +Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', offset: 2)); + +Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8: true, offset: 2)); + +Assert::same(['žluťoučký'], Strings::match('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier + +Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true, offset: 2)); + Assert::null(Strings::match('hello world!', '', offset: 50)); Assert::null(Strings::match('', '', offset: 1)); diff --git a/tests/Utils/Strings.matchAll().phpt b/tests/Utils/Strings.matchAll().phpt index 257887f95..002b43599 100644 --- a/tests/Utils/Strings.matchAll().phpt +++ b/tests/Utils/Strings.matchAll().phpt @@ -45,14 +45,31 @@ Assert::same([ [['u', 3], ['u', 7], ['', 11], ['', 15]], ], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER)); +Assert::same([ + [['lu', 1], ['l', 1], ['u', 2]], + [['ou', 4], ['o', 4], ['u', 5]], + [['k', 7], ['k', 7], ['', 8]], + [['k', 10], ['k', 10], ['', 11]], +], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8: true)); + Assert::same([ [['lu', 2], ['ou', 6], ['k', 10], ['k', 14]], [['l', 2], ['o', 6], ['k', 10], ['k', 14]], [['u', 3], ['u', 7], ['', 11], ['', 15]], ], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true)); +Assert::same([ + [['lu', 1], ['ou', 4], ['k', 7], ['k', 10]], + [['l', 1], ['o', 4], ['k', 7], ['k', 10]], + [['u', 2], ['u', 5], ['', 8], ['', 11]], +], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true, utf8: true)); + Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2)); +Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2, utf8: true)); + +Assert::same([['žluťoučký'], ['kůň']], Strings::matchAll('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier + Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2)); Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', offset: 2, patternOrder: true)); diff --git a/tests/Utils/Strings.replace().phpt b/tests/Utils/Strings.replace().phpt index 8510afb4a..047efeb11 100644 --- a/tests/Utils/Strings.replace().phpt +++ b/tests/Utils/Strings.replace().phpt @@ -37,4 +37,10 @@ Assert::same(' !', Strings::replace('hello world!', ['#\w#'])); // flags & callback Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode($m[0]), captureOffset: true)); +Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true, utf8: true)); Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true); + +// utf-8 without modifier +Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', fn() => '*', utf8: true)); +Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', '*', utf8: true)); +Assert::same('* *', Strings::replace('žluťoučký kůň', ['#\w+#'], '*', utf8: true)); diff --git a/tests/Utils/Strings.split().phpt b/tests/Utils/Strings.split().phpt index 3a57a4c4e..3d17f6f72 100644 --- a/tests/Utils/Strings.split().phpt +++ b/tests/Utils/Strings.split().phpt @@ -46,9 +46,27 @@ Assert::same([ ], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE)); Assert::same([ - ['a', 0], - [',', 1], - ['b', 3], - [',', 4], - ['c', 6], -], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true)); + ['ž', 0], + ['lu', 2], + ['ť', 4], + ['ou', 6], + ['č', 8], + ['k', 10], + ['ý ', 11], + ['k', 14], + ['ůň', 15], +], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true)); + +Assert::same([ + ['ž', 0], + ['lu', 1], + ['ť', 3], + ['ou', 4], + ['č', 6], + ['k', 7], + ['ý ', 8], + ['k', 10], + ['ůň', 11], +], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8: true)); + +Assert::same(['', ' ', ''], Strings::split('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier