Skip to content

Commit

Permalink
Merge branch 'PHP-8.1' into PHP-8.2
Browse files Browse the repository at this point in the history
* PHP-8.1:
  Use different mblen_table for different SJIS variants
  Correct entry for 0x80,0xFD-FF in SJIS multi-byte character length table
  • Loading branch information
alexdowad committed Jan 6, 2023
2 parents de633c3 + 3152b7b commit 1751f34
Show file tree
Hide file tree
Showing 7 changed files with 167 additions and 33 deletions.
6 changes: 3 additions & 3 deletions ext/mbstring/libmbfl/filters/mbfilter_sjis.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_sjis_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);

const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xEF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand All @@ -49,14 +49,14 @@ const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};

static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};
Expand Down
4 changes: 2 additions & 2 deletions ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
#include "unicode_table_jis2004.h"
#include "unicode_table_jis.h"

extern const unsigned char mblen_table_sjis[];
extern const unsigned char mblen_table_sjis_mobile[];
extern const unsigned char mblen_table_eucjp[];

static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
Expand All @@ -62,7 +62,7 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
"SJIS-2004",
"Shift_JIS",
mbfl_encoding_sjis2004_aliases,
mblen_table_sjis,
mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjis2004_wchar,
&vtbl_wchar_sjis2004,
Expand Down
21 changes: 19 additions & 2 deletions ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,24 @@

#include "sjis_mac2uni.h"

extern const unsigned char mblen_table_sjis[];
/*
 * Character length table for the SJIS-mac encoding.
 *
 * Holds 256 entries, one per possible leading byte value, laid out 16 per row.
 * An entry of 2 marks a byte which starts a 2-byte character (0x81-0x9F and
 * 0xE0-0xED); every other byte value, including 0x80 and 0xEE-0xFF, has an
 * entry of 1 and so is counted as a character on its own.
 */
const unsigned char mblen_table_sjismac[] = { /* 0x81-0x9F,0xE0-0xED */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x80-0x8F: 0x80 itself is NOT a leading byte */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0-0xCF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0-0xDF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, /* 0xE0-0xEF: only 0xE0-0xED start a 2-byte character */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 /* 0xF0-0xFF */
};

static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_sjis_mac_wchar_flush(mbfl_convert_filter *filter);
Expand All @@ -49,7 +66,7 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
"SJIS-mac",
"Shift_JIS",
mbfl_encoding_sjis_mac_aliases,
mblen_table_sjis,
mblen_table_sjismac,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjis_mac_wchar,
&vtbl_wchar_sjis_mac,
Expand Down
26 changes: 22 additions & 4 deletions ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,26 @@

#include "emoji2uni.h"

/*
 * Character length table shared by the mobile SJIS variants (DOCOMO, KDDI,
 * SoftBank); SJIS-2004 also refers to it through an extern declaration.
 *
 * Holds 256 entries, one per possible leading byte value, laid out 16 per row.
 * An entry of 2 marks a byte which starts a 2-byte character (0x81-0x9F and
 * 0xE0-0xFC); every other byte value, including 0x80 and 0xFD-0xFF, has an
 * entry of 1 and so is counted as a character on its own.
 */
const unsigned char mblen_table_sjis_mobile[] = { /* 0x81-0x9F,0xE0-0xFC */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x80-0x8F: 0x80 itself is NOT a leading byte */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0-0xCF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0-0xDF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0-0xEF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 /* 0xF0-0xFF: 0xFD-0xFF are NOT leading bytes */
};

extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n);
extern const unsigned char mblen_table_sjis[];

static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
Expand All @@ -55,7 +73,7 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
"SJIS-Mobile#DOCOMO",
"Shift_JIS",
mbfl_encoding_sjis_docomo_aliases,
mblen_table_sjis,
mblen_table_sjis_mobile,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjis_docomo_wchar,
&vtbl_wchar_sjis_docomo,
Expand All @@ -68,7 +86,7 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
"SJIS-Mobile#KDDI",
"Shift_JIS",
mbfl_encoding_sjis_kddi_aliases,
mblen_table_sjis,
mblen_table_sjis_mobile,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjis_kddi_wchar,
&vtbl_wchar_sjis_kddi,
Expand All @@ -81,7 +99,7 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
"SJIS-Mobile#SOFTBANK",
"Shift_JIS",
mbfl_encoding_sjis_sb_aliases,
mblen_table_sjis,
mblen_table_sjis_mobile,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjis_sb_wchar,
&vtbl_wchar_sjis_sb,
Expand Down
24 changes: 24 additions & 0 deletions ext/mbstring/tests/mb_str_split_jp.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,17 @@ if(end($array) !== $enc){
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
}

/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
echo "== Regression test for SJIS byte 0x80 ==\n";
foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'] as $encoding) {
$array = mb_str_split("\x80\xA1abc\x80\xA1", 2, $encoding);
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";

// Also try bytes 0xFD, 0xFE, and 0xFF
$array = mb_str_split("abc\xFD\xFE\xFFab\xFD\xFE\xFF", 2, $encoding);
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
}

?>
--EXPECT--
BIG-5: a4e9 a5bb
Expand All @@ -80,3 +91,16 @@ UTF-16LE: e565 2c67
UTF-32BE: 000065e5 0000672c
UTF-32LE: e5650000 2c670000
UTF-8: e697a5 e69cac
== Regression test for SJIS byte 0x80 ==
SJIS: [80a1, 6162, 6380, a1]
SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-2004: [80a1, 6162, 6380, a1]
SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
MacJapanese: [80a1, 6162, 6380, a1]
MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
72 changes: 53 additions & 19 deletions ext/mbstring/tests/mb_strlen.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -13,43 +13,59 @@ include_once('common.inc');
mb_detect_order('auto');

// Test string
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
$euc_jp = mb_convert_encoding("0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。", 'EUC-JP', 'UTF-8');
$ascii = 'abcdefghijklmnopqrstuvwxyz;]=#0123456789';

// ASCII
echo "== ASCII ==\n";
print mb_strlen($ascii,'ASCII') . "\n";
print strlen($ascii) . "\n";
print mb_strlen($ascii,'ASCII') . "\n";
print strlen($ascii) . "\n";

// EUC-JP
echo "== EUC-JP ==\n";
print mb_strlen($euc_jp,'EUC-JP') . "\n";
print mb_strlen($euc_jp,'EUC-JP') . "\n";
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
print strlen($euc_jp) . "\n";
print strlen($euc_jp) . "\n";

// SJIS
echo "== SJIS ==\n";
$sjis = mb_convert_encoding($euc_jp, 'SJIS','EUC-JP');
print mb_strlen($sjis,'SJIS') . "\n";
print mb_strlen($sjis,'SJIS') . "\n";
mb_internal_encoding('SJIS') or print("mb_internal_encoding() failed\n");
print strlen($sjis) . "\n";
print strlen($sjis) . "\n";
print "-- Testing illegal bytes 0x80,0xFD-FF --\n";
// mb_strlen used to wrongly treat 0x80 as the starting byte of a 2-byte SJIS character
print mb_strlen("\x80\xA1", 'SJIS') . "\n";
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS') . "\n";

echo "== MacJapanese ==\n";
print mb_strlen("\x80\xA1", 'MacJapanese') . "\n";
print mb_strlen("abc\xFD\xFE\xFF", 'MacJapanese') . "\n";

echo "== SJIS-2004 ==\n";
print mb_strlen("\x80\xA1", 'SJIS-2004') . "\n";
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-2004') . "\n";

echo "== SJIS-Mobile#DOCOMO ==\n";
print mb_strlen("\x80\xA1", 'SJIS-Mobile#DOCOMO') . "\n";
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#DOCOMO') . "\n";

echo "== SJIS-Mobile#KDDI ==\n";
print mb_strlen("\x80\xA1", 'SJIS-Mobile#KDDI') . "\n";
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#KDDI') . "\n";

echo "== SJIS-Mobile#SoftBank ==\n";
print mb_strlen("\x80\xA1", 'SJIS-Mobile#SoftBank') . "\n";
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#SoftBank') . "\n";

// JIS
// Note: either mb_convert_encoding() or strlen() has a problem here
echo "== JIS ==\n";
$jis = mb_convert_encoding($euc_jp, 'JIS','EUC-JP');
print mb_strlen($jis,'JIS') . "\n";
print mb_strlen($jis,'JIS') . "\n";
mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n");
print strlen($jis) . "\n";
print strlen($jis) . "\n";

// UTF-8
// Note: either mb_convert_encoding() or strlen() has a problem here
echo "== UTF-8 ==\n";
$utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP');
print mb_strlen($utf8,'UTF-8') . "\n";
print mb_strlen($utf8,'UTF-8') . "\n";
mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
print strlen($utf8) . "\n";

print strlen($utf8) . "\n";

// Wrong Parameters
echo "== WRONG PARAMETERS ==\n";
Expand All @@ -72,6 +88,24 @@ try {
== SJIS ==
43
72
-- Testing illegal bytes 0x80,0xFD-FF --
2
6
== MacJapanese ==
2
6
== SJIS-2004 ==
2
6
== SJIS-Mobile#DOCOMO ==
2
6
== SJIS-Mobile#KDDI ==
2
6
== SJIS-Mobile#SoftBank ==
2
6
== JIS ==
43
90
Expand Down
47 changes: 44 additions & 3 deletions ext/mbstring/tests/mb_substr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ ini_set('include_path','.');
include_once('common.inc');

// EUC-JP
$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA3\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3";
$euc_jp = mb_convert_encoding('0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。', 'EUC-JP', 'UTF-8');
// SJIS
$sjis = "\x93\xFA\x96{\x8C\xEA\x83e\x83L\x83X\x83g\x82\xC5\x82\xB7\x81B01234\x82T\x82U\x82V\x82W\x82X\x81B";
$sjis = mb_convert_encoding('日本語テキストです。0123456789。', 'SJIS', 'UTF-8');
// ISO-2022-JP
$iso2022jp = "\x1B\$B\x21\x21!r\x1B(BABC";
// GB-18030
$gb18030 = "\xC3\xDC\xC2\xEB\xD3\xC3\xBB\xA7\xC3\xFB\xC3\xDC\xC2\xEB\xC3\xFB\xB3\xC6\xC3\xFB\xB3\xC6";
$gb18030 = mb_convert_encoding('密码用户名密码名称名称', 'GB18030', 'UTF-8');
// HZ
$hz = "The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.";
// UTF-8
Expand All @@ -40,6 +40,29 @@ print "2: " . bin2hex(mb_substr($sjis, -1, null, 'SJIS')) . "\n";
print "3: " . bin2hex(mb_substr($sjis, -5, 3, 'SJIS')) . "\n";
print "4: " . bin2hex(mb_substr($sjis, 1, null, 'SJIS')) . "\n";
print "5:" . bin2hex(mb_substr($sjis, 10, 0, 'SJIS')) . "\n";
echo "-- Testing illegal SJIS byte 0x80 --\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS')) . "\n";

echo "SJIS-2004:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-2004')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-2004')) . "\n";

echo "MacJapanese:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'MacJapanese')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'MacJapanese')) . "\n";

echo "SJIS-Mobile#DOCOMO:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#DOCOMO')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#DOCOMO')) . "\n";

echo "SJIS-Mobile#KDDI:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#KDDI')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#KDDI')) . "\n";

echo "SJIS-Mobile#SoftBank:\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";

echo "ISO-2022-JP:\n";
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
Expand Down Expand Up @@ -98,6 +121,24 @@ SJIS:
3: 825582568257
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
5:
-- Testing illegal SJIS byte 0x80 --
6380
806162
SJIS-2004:
6380
806162
MacJapanese:
6380
806162
SJIS-Mobile#DOCOMO:
6380
806162
SJIS-Mobile#KDDI:
6380
806162
SJIS-Mobile#SoftBank:
6380
806162
ISO-2022-JP:
1: 1b2442212121721b284241
2: 43
Expand Down

0 comments on commit 1751f34

Please sign in to comment.