From 1765f8c2ffc90dc02836c31a7a203b2db77dff9c Mon Sep 17 00:00:00 2001
From: David Grudl <david@grudl.com>
Date: Wed, 27 Oct 2021 01:21:23 +0200
Subject: [PATCH] Strings: added support for UTF8 offsets in regexp

---
 src/Utils/Strings.php               | 60 +++++++++++++++++++++++++++--
 tests/Utils/Strings.match().phpt    | 13 ++++++-
 tests/Utils/Strings.matchAll().phpt | 17 ++++++++
 tests/Utils/Strings.replace().phpt  |  6 +++
 tests/Utils/Strings.split().phpt    | 30 ++++++++++++---
 5 files changed, 114 insertions(+), 12 deletions(-)

diff --git a/src/Utils/Strings.php b/src/Utils/Strings.php
index 164475574..f820700fd 100644
--- a/src/Utils/Strings.php
+++ b/src/Utils/Strings.php
@@ -474,11 +474,17 @@ public static function split(
 		string $pattern,
 		bool|int $captureOffset = false,
 		bool $skipEmpty = false,
+		bool $utf8 = false,
 	): array {
 		$flags = is_int($captureOffset) && $captureOffset // back compatibility
 			? $captureOffset
 			: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
-		return self::pcre('preg_split', [$pattern, $subject, -1, $flags | PREG_SPLIT_DELIM_CAPTURE]);
+		$pattern .= $utf8 ? 'u' : '';
+		$m = self::pcre('preg_split', [$pattern, $subject, -1, $flags | PREG_SPLIT_DELIM_CAPTURE]);
+		if ($utf8 && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) {
+			return self::bytesToChars($subject, [$m])[0];
+		}
+		return $m;
 	}
 
 
@@ -491,16 +497,25 @@ public static function match(
 		bool|int $captureOffset = false,
 		int $offset = 0,
 		bool $unmatchedAsNull = false,
+		bool $utf8 = false,
 	): ?array {
 		$flags = is_int($captureOffset) && $captureOffset // back compatibility
 			? $captureOffset
 			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
+		if ($utf8) {
+			$offset = strlen(self::substring($subject, 0, $offset));
+			$pattern .= 'u';
+		}
 		if ($offset > strlen($subject)) {
 			return null;
 		}
-		return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
-			? $m
-			: null;
+		if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
+			return null;
+		}
+		if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
+			return self::bytesToChars($subject, [$m])[0];
+		}
+		return $m;
 	}
 
 
@@ -515,10 +530,15 @@ public static function matchAll(
 		int $offset = 0,
 		bool $unmatchedAsNull = false,
 		bool $patternOrder = false,
+		bool $utf8 = false,
 	): array {
 		$flags = is_int($captureOffset) && $captureOffset // back compatibility
 			? $captureOffset
 			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
+		if ($utf8) {
+			$offset = strlen(self::substring($subject, 0, $offset));
+			$pattern .= 'u';
+		}
 		if ($offset > strlen($subject)) {
 			return [];
 		}
@@ -527,6 +547,9 @@ public static function matchAll(
 			($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
 			$offset,
 		]);
+		if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
+			return self::bytesToChars($subject, $m);
+		}
 		return $m;
 	}
 
@@ -541,12 +564,19 @@ public static function replace(
 		int $limit = -1,
 		bool $captureOffset = false,
 		bool $unmatchedAsNull = false,
+		bool $utf8 = false,
 	): string {
 		if (is_object($replacement) || is_array($replacement)) {
 			if (!is_callable($replacement, false, $textual)) {
 				throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
 			}
 			$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
+			if ($utf8) {
+				$pattern .= 'u';
+				if ($captureOffset) {
+					$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
+				}
+			}
 			return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
 
 		} elseif (is_array($pattern) && is_string(key($pattern))) {
@@ -554,10 +584,32 @@ public static function replace(
 			$pattern = array_keys($pattern);
 		}
 
+		if ($utf8) {
+			$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
+		}
+
 		return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
 	}
 
 
+	/** Converts the byte offset in every [text, offset] pair of $groups to a UTF-8 character offset within $s. */
+	private static function bytesToChars(string $s, array $groups): array
+	{
+		$lastBytes = $lastChars = 0;
+		foreach ($groups as &$matches) {
+			foreach ($matches as &$match) {
+				if ($match[1] >= 0) { // PCRE reports -1 for an unmatched subpattern: keep it, never feed it to substr()
+					$chars = self::length(substr($s, min($lastBytes, $match[1]), abs($match[1] - $lastBytes)));
+					$lastChars += $match[1] > $lastBytes ? $chars : -$chars; // offsets may also step backwards
+					$lastBytes = $match[1];
+					$match[1] = $lastChars;
+				}
+			}
+		}
+		return $groups;
+	}
+
+
 	/** @internal */
 	public static function pcre(string $func, array $args)
 	{
diff --git a/tests/Utils/Strings.match().phpt b/tests/Utils/Strings.match().phpt
index 5b4196180..34799a885 100644
--- a/tests/Utils/Strings.match().phpt
+++ b/tests/Utils/Strings.match().phpt
@@ -19,13 +19,22 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));
 
 Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));
 
-Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
-Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
+Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE));
+Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true));
 
+Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true));
 Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', unmatchedAsNull: true));
 Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', 0, 0, unmatchedAsNull: true)); // $flags = 0
 
 Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', offset: 2));
 
+Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', offset: 2));
+
+Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8: true, offset: 2));
+
+Assert::same(['žluťoučký'], Strings::match('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
+
+Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true, offset: 2));
+
 Assert::null(Strings::match('hello world!', '', offset: 50));
 Assert::null(Strings::match('', '', offset: 1));
diff --git a/tests/Utils/Strings.matchAll().phpt b/tests/Utils/Strings.matchAll().phpt
index 257887f95..002b43599 100644
--- a/tests/Utils/Strings.matchAll().phpt
+++ b/tests/Utils/Strings.matchAll().phpt
@@ -45,14 +45,31 @@ Assert::same([
 	[['u', 3], ['u', 7], ['', 11], ['', 15]],
 ], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));
 
+Assert::same([
+	[['lu', 1], ['l', 1], ['u', 2]],
+	[['ou', 4], ['o', 4], ['u', 5]],
+	[['k', 7], ['k', 7], ['', 8]],
+	[['k', 10], ['k', 10], ['', 11]],
+], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8: true));
+
 Assert::same([
 	[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
 	[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
 	[['u', 3], ['u', 7], ['', 11], ['', 15]],
 ], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));
 
+Assert::same([
+	[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
+	[['l', 1], ['o', 4], ['k', 7], ['k', 10]],
+	[['u', 2], ['u', 5], ['', 8], ['', 11]],
+], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true, utf8: true));
+
 Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2));
 
+Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2, utf8: true));
+
+Assert::same([['žluťoučký'], ['kůň']], Strings::matchAll('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
+
 Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
 Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', offset: 2, patternOrder: true));
 
diff --git a/tests/Utils/Strings.replace().phpt b/tests/Utils/Strings.replace().phpt
index 8510afb4a..047efeb11 100644
--- a/tests/Utils/Strings.replace().phpt
+++ b/tests/Utils/Strings.replace().phpt
@@ -37,4 +37,10 @@ Assert::same(' !', Strings::replace('hello world!', ['#\w#']));
 
 // flags & callback
 Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode($m[0]), captureOffset: true));
+Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true, utf8: true));
 Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);
+
+// utf-8 without modifier
+Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', fn() => '*', utf8: true));
+Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', '*', utf8: true));
+Assert::same('* *', Strings::replace('žluťoučký kůň', ['#\w+#'], '*', utf8: true));
diff --git a/tests/Utils/Strings.split().phpt b/tests/Utils/Strings.split().phpt
index 3a57a4c4e..3d17f6f72 100644
--- a/tests/Utils/Strings.split().phpt
+++ b/tests/Utils/Strings.split().phpt
@@ -46,9 +46,27 @@ Assert::same([
 ], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));
 
 Assert::same([
-	['a', 0],
-	[',', 1],
-	['b', 3],
-	[',', 4],
-	['c', 6],
-], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
+	['ž', 0],
+	['lu', 2],
+	['ť', 4],
+	['ou', 6],
+	['č', 8],
+	['k', 10],
+	['ý ', 11],
+	['k', 14],
+	['ůň', 15],
+], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));
+
+Assert::same([
+	['ž', 0],
+	['lu', 1],
+	['ť', 3],
+	['ou', 4],
+	['č', 6],
+	['k', 7],
+	['ý ', 8],
+	['k', 10],
+	['ůň', 11],
+], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8: true));
+
+Assert::same(['', ' ', ''], Strings::split('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier