From 75b1867cceda6c2a8e4d3f876050a12e8ca3b671 Mon Sep 17 00:00:00 2001 From: Ayesh Karunaratne Date: Wed, 5 Jun 2024 21:48:32 +0700 Subject: [PATCH] [PHP 8.4][Intl] Add `grapheme_str_split` Add a polyfill for the `grapheme_str_split` function added in PHP 8.4. Requires PHP 7.3, because the polyfill is based on `\X` Regex, and it only works properly on PCRE2, which [only comes with PHP 7.3+](https://php.watch/versions/7.3/pcre2). Further, there are some cases in which the polyfill cannot correctly split complex characters (such as two consecutive country flag Emojis). This is now fixed upstream in PCRE2 (https://github.com/PCRE2Project/pcre2/issues/410). However, this change will likely only make it to PHP 8.4. References: - [RFC: Grapheme cluster for `str_split` function: `grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split) - [PHP.Watch: PHP 8.4: New `grapheme_str_split` function](https://php.watch/versions/8.4/grapheme_str_split) --- README.md | 1 + src/Intl/Grapheme/Grapheme.php | 33 ++++++++++++++++++++++++++++ src/Intl/Grapheme/README.md | 1 + src/Intl/Grapheme/bootstrap.php | 4 ++++ src/Intl/Grapheme/bootstrap73.php | 17 ++++++++++++++ src/Intl/Grapheme/bootstrap80.php | 3 +++ src/Php84/Php84.php | 29 ++++++++++++++++++++++++ src/Php84/README.md | 1 + src/Php84/bootstrap.php | 4 ++++ src/Php84/bootstrap73.php | 21 ++++++++++++++++++ tests/Intl/Grapheme/GraphemeTest.php | 24 ++++++++++++++++++++ tests/Php84/Php84Test.php | 20 +++++++++++++++++ 12 files changed, 158 insertions(+) create mode 100644 src/Intl/Grapheme/bootstrap73.php create mode 100644 src/Php84/bootstrap73.php diff --git a/README.md b/README.md index 370956e57..dd6ee0d9d 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ Polyfills are provided for: - the `Date*Exception/Error` classes introduced in PHP 8.3; - the `SQLite3Exception` class introduced in PHP 8.3; - the `mb_ucfirst` and `mb_lcfirst` functions introduced in PHP 8.4; +- the `grapheme_str_split` function introduced in PHP 8.4 (requires PHP >= 7.3); It is strongly recommended to upgrade your PHP 
version and/or install the missing extensions whenever possible. This polyfill should be used only when there is no diff --git a/src/Intl/Grapheme/Grapheme.php b/src/Intl/Grapheme/Grapheme.php index 5373f1685..b28697355 100644 --- a/src/Intl/Grapheme/Grapheme.php +++ b/src/Intl/Grapheme/Grapheme.php @@ -26,6 +26,7 @@ * - grapheme_strrpos - Find position (in grapheme units) of last occurrence of a string * - grapheme_strstr - Returns part of haystack string from the first occurrence of needle to the end of haystack * - grapheme_substr - Return part of a string + * - grapheme_str_split - Splits a string into an array of individual or chunks of graphemes. * * @author Nicolas Grekas * @@ -191,6 +192,38 @@ public static function grapheme_strstr($s, $needle, $beforeNeedle = false) return mb_strstr($s, $needle, $beforeNeedle, 'UTF-8'); } + public static function grapheme_str_split($s, $len = 1) { + if ($len < 1 || $len > 1073741823) { /* 0 must be rejected as well: the message says "greater than 0" and array_chunk() needs a length >= 1 */ + if (80000 > \PHP_VERSION_ID) { + return false; + } + + throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.'); + } + + if ($s === '') { + return []; + } + + preg_match_all('/\X/u', $s, $matches); /* \X matches one extended grapheme cluster */ + + if (empty($matches[0])) { + return false; + } + + if ($len === 1) { + return $matches[0]; + } + + $chunks = array_chunk($matches[0], $len); + + array_walk($chunks, static function(&$value) { + $value = implode('', $value); + }); + + return $chunks; + } + private static function grapheme_position($s, $needle, $offset, $mode) { $needle = (string) $needle; diff --git a/src/Intl/Grapheme/README.md b/src/Intl/Grapheme/README.md index f55d92c5c..8e936ad7f 100644 --- a/src/Intl/Grapheme/README.md +++ b/src/Intl/Grapheme/README.md @@ -21,6 +21,7 @@ This component provides a partial, native PHP implementation of the - [`grapheme_strstr`](https://php.net/grapheme_strstr): Returns part of haystack string from the first occurrence of needle to the end of haystack - 
[`grapheme_substr`](https://php.net/grapheme_substr): Return part of a string +- [`grapheme_str_split`](https://php.net/grapheme_str_split): Splits a string into an array of individual graphemes or chunks of graphemes. More information can be found in the [main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md). diff --git a/src/Intl/Grapheme/bootstrap.php b/src/Intl/Grapheme/bootstrap.php index a9ea03c7e..a53c335f2 100644 --- a/src/Intl/Grapheme/bootstrap.php +++ b/src/Intl/Grapheme/bootstrap.php @@ -56,3 +56,7 @@ function grapheme_strstr($haystack, $needle, $beforeNeedle = false) { return p\G if (!function_exists('grapheme_substr')) { function grapheme_substr($string, $offset, $length = null) { return p\Grapheme::grapheme_substr($string, $offset, $length); } } + +if (\PHP_VERSION_ID >= 70300) { + require __DIR__.'/bootstrap73.php'; +} diff --git a/src/Intl/Grapheme/bootstrap73.php b/src/Intl/Grapheme/bootstrap73.php new file mode 100644 index 000000000..488202afd --- /dev/null +++ b/src/Intl/Grapheme/bootstrap73.php @@ -0,0 +1,17 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +use Symfony\Polyfill\Intl\Grapheme as p; /* delegate to this component's own Grapheme class, not to the separate Php84 polyfill package */ + +if (!function_exists('grapheme_str_split') && function_exists('grapheme_substr')) { + function grapheme_str_split(string $string, int $length = 1) { return p\Grapheme::grapheme_str_split($string, $length); } +} + diff --git a/src/Intl/Grapheme/bootstrap80.php b/src/Intl/Grapheme/bootstrap80.php index b8c078677..e746b75e7 100644 --- a/src/Intl/Grapheme/bootstrap80.php +++ b/src/Intl/Grapheme/bootstrap80.php @@ -48,3 +48,6 @@ function grapheme_strstr(?string $haystack, ?string $needle, ?bool $beforeNeedle if (!function_exists('grapheme_substr')) { function grapheme_substr(?string $string, ?int $offset, ?int $length = null): string|false { return p\Grapheme::grapheme_substr((string) $string, (int) $offset, $length); } } +if (!function_exists('grapheme_str_split')) { + function grapheme_str_split(string $string, int $length = 1): array|false { return p\Grapheme::grapheme_str_split($string, $length); } +} diff --git a/src/Php84/Php84.php b/src/Php84/Php84.php index c8a9cf160..92df16a70 100644 --- a/src/Php84/Php84.php +++ b/src/Php84/Php84.php @@ -63,4 +63,33 @@ public static function mb_lcfirst(string $string, ?string $encoding = null): str return $firstChar . 
mb_substr($string, 1, null, $encoding); } + + public static function grapheme_str_split(string $string, int $length = 1) + { + if ($length < 1 || $length > 1073741823) { /* 0 must be rejected as well: the message says "greater than 0" and array_chunk() needs a length >= 1 */ + throw new \ValueError('grapheme_str_split(): Argument #2 ($length) must be greater than 0 and less than or equal to 1073741823.'); + } + + if ($string === '') { + return []; + } + + preg_match_all('/\X/u', $string, $matches); /* \X matches one extended grapheme cluster */ + + if (empty($matches[0])) { + return false; + } + + if ($length === 1) { + return $matches[0]; + } + + $chunks = array_chunk($matches[0], $length); + + array_walk($chunks, static function(&$value) { + $value = implode('', $value); + }); + + return $chunks; + } } diff --git a/src/Php84/README.md b/src/Php84/README.md index 77d249bed..ba13db76b 100644 --- a/src/Php84/README.md +++ b/src/Php84/README.md @@ -4,6 +4,7 @@ Symfony Polyfill / Php84 This component provides features added to PHP 8.4 core: - [`mb_ucfirst` and `mb_lcfirst`](https://wiki.php.net/rfc/mb_ucfirst) +- [`grapheme_str_split`](https://wiki.php.net/rfc/grapheme_str_split) More information can be found in the [main Polyfill README](https://github.com/symfony/polyfill/blob/main/README.md). diff --git a/src/Php84/bootstrap.php b/src/Php84/bootstrap.php index f73ba3d42..5f6a0a762 100644 --- a/src/Php84/bootstrap.php +++ b/src/Php84/bootstrap.php @@ -23,3 +23,7 @@ function mb_ucfirst($string, ?string $encoding = null): string { return p\Php84: if (!function_exists('mb_lcfirst')) { function mb_lcfirst($string, ?string $encoding = null): string { return p\Php84::mb_lcfirst($string, $encoding); } } + +if (\PHP_VERSION_ID >= 70300) { + require __DIR__.'/bootstrap73.php'; +} diff --git a/src/Php84/bootstrap73.php b/src/Php84/bootstrap73.php new file mode 100644 index 000000000..f0b3ea3b2 --- /dev/null +++ b/src/Php84/bootstrap73.php @@ -0,0 +1,21 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +use Symfony\Polyfill\Php84 as p; + +if (\PHP_VERSION_ID >= 80400) { + return; +} + +if (!function_exists('grapheme_str_split') && function_exists('grapheme_substr')) { + function grapheme_str_split(string $string, int $length = 1) { return p\Php84::grapheme_str_split($string, $length); } +} + diff --git a/tests/Intl/Grapheme/GraphemeTest.php b/tests/Intl/Grapheme/GraphemeTest.php index befe1e36f..9be4730e4 100644 --- a/tests/Intl/Grapheme/GraphemeTest.php +++ b/tests/Intl/Grapheme/GraphemeTest.php @@ -207,4 +207,28 @@ public function testGraphemeStrstr() $this->assertSame('국어', grapheme_strstr('한국어', '국')); $this->assertSame('ÉJÀ', grapheme_stristr('DÉJÀ', 'é')); } + + /** + * @dataProvider graphemeStrSplitDataProvider + * @requires PHP 7.3 + */ + public function testGraphemeStrSplit(string $string, int $length, array $expectedValues) { + $this->assertSame($expectedValues, grapheme_str_split($string, $length)); + } + + public static function graphemeStrSplitDataProvider(): array { + $return = [ + ['', 1, []], + ['PHP', 1, ['P', 'H', 'P']], + ['你好', 1, ['你', '好']], + ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']], + ['สวัสดี', 2, ['สวั', 'สดี']], + ['土下座🙇‍♀を', 1, ["土", "下", "座", "🙇‍♀", "を"]], + ]; + // https://github.com/PCRE2Project/pcre2/issues/410 + if (\defined('PCRE_VERSION_MAJOR') && (PCRE_VERSION_MAJOR > 10 || (10 === PCRE_VERSION_MAJOR && PCRE_VERSION_MINOR >= 44))) { /* PCRE2 10.44 or newer; the constants only exist on PHP >= 7.3 */ + $return[] = ['土下座🙇‍♀を', 1, ["土", "下", "座", "🙇‍♀", "を"]]; + } + return $return; + } } diff --git a/tests/Php84/Php84Test.php b/tests/Php84/Php84Test.php index c66f402df..4c0c6ea1b 100644 --- a/tests/Php84/Php84Test.php +++ b/tests/Php84/Php84Test.php @@ -68,4 +68,24 @@ public static function lcFirstDataProvider(): array { ["ß", "ß"], ]; } + + /** + * @dataProvider graphemeStrSplitDataProvider + * @requires PHP 7.3 + */ + public function testGraphemeStrSplit(string $string, int $length, array $expectedValues) { + $this->assertSame($expectedValues, grapheme_str_split($string, $length)); + } + + public static function graphemeStrSplitDataProvider(): array { + return [ + ['', 1, []], + 
['PHP', 1, ['P', 'H', 'P']], + ['你好', 1, ['你', '好']], + ['අයේෂ්', 1, ['අ', 'යේ', 'ෂ්']], + ['สวัสดี', 2, ['สวั', 'สดี']], + ['土下座🙇‍♀を', 1, ["土", "下", "座", "🙇‍♀", "を"]], + // ['👭🏻👰🏿‍♂️', 2, ['👭🏻', '👰🏿‍♂️']], // https://github.com/PCRE2Project/pcre2/issues/410 + ]; + } }