1--TEST--
2mb_str_split() tests for the japanese language
3--EXTENSIONS--
4mbstring
5--INI--
6output_handler=
7--FILE--
8<?php
ini_set('include_path', '.');
include_once 'common.inc';

/* Fixture shared by the checks below: a two-character string plus its length in characters. */
$string = "日本";             /* 2 chars */
$len = 2;

/* Encodings exercised by this test; the order determines the order of the output lines. */
$charset = [
    "BIG-5",
    "EUC-JP",
    "ISO-2022-JP",
    "SJIS",
    "UTF-16BE",
    "UTF-16LE",
    "UTF-32BE",
    "UTF-32LE",
    "UTF-8"
];

/* An empty input must produce an empty array, whatever the encoding or split length. */
foreach ($charset as $cs) {
    foreach ([1, 2] as $splitLength) {
        $pieces = mb_str_split("", $splitLength, $cs);
        if (count($pieces) === 0) {
            continue;
        }
        echo "Empty $cs string should convert to empty array!\n";
    }
}
33
/*
 * For every encoding: convert the fixture from UTF-8, verify the number of
 * chunks for each split length from 1 to $len, then print the hex bytes of
 * each single-character chunk so they can be compared against --EXPECT--.
 */
foreach ($charset as $cs) {
    $encoded = mb_convert_encoding($string, $cs, "UTF-8");
    $chars = mb_str_split($encoded, 1, $cs);

    /* Splitting $len characters into chunks of $chunk must give ceil($len / $chunk) pieces. */
    for ($chunk = 1; $chunk <= $len; $chunk++) {
        $cnt = count(mb_str_split($encoded, $chunk, $cs));
        $ceil = ceil($len / $chunk);
        if ($cnt == $ceil) {
            continue;
        }
        echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
    }

    /* Hex-dump the first $len single-character chunks on one line. */
    echo "$cs:";
    $pos = 0;
    while ($pos < $len) {
        $hex = unpack("H*", $chars[$pos])[1];
        echo " " . $hex;
        $pos++;
    }
    echo "\n";
}
54
/*
 * Long string test: repeat the two-character fixture $size times, convert it
 * to ISO-2022-JP and split it into chunks of $len characters. Nothing is
 * printed on success; diagnostics are printed only when a check fails.
 */
$size = 50000;
$long = str_repeat($string, $size); /* 50k x 2 chars = 1e5 chars */
$enc = mb_convert_encoding($long, "ISO-2022-JP", "UTF-8");
$array = mb_str_split($enc, $len, "ISO-2022-JP");
$count = count($array);

/* check array size: one chunk per repetition of the fixture */
if ($size !== $count) {
    printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
}

/* compare initial string and last array element after splitting */
$enc = mb_convert_encoding($string, "ISO-2022-JP", "UTF-8");
$last = end($array); /* fetched once and reused for both the comparison and the report */
if ($last !== $enc) {
    /*
     * Keep the format on a single source line: a double-quoted literal that
     * wraps across lines embeds the line break and the indentation in the
     * emitted message.
     */
    printf(
        "Long string splitting error: last array element: %s expected: %s\n",
        unpack("H*", $last)[1],
        unpack("H*", $enc)[1]
    );
}
71
/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
echo "== Regression test for SJIS byte 0x80 ==\n";

$sjisVariants = ['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'];
$sjisInputs = [
    "\x80\xA1abc\x80\xA1",
    // Also try bytes 0xFD, 0xFE, and 0xFF
    "abc\xFD\xFE\xFFab\xFD\xFE\xFF",
];

/* Split each input into 2-character chunks and print every chunk as hex. */
foreach ($sjisVariants as $encoding) {
    foreach ($sjisInputs as $input) {
        $array = mb_str_split($input, 2, $encoding);
        $hexChunks = array_map('bin2hex', $array);
        echo "$encoding: [" . implode(', ', $hexChunks) . "]\n";
    }
}
82
83/*
84Some MacJapanese characters map to a sequence of several Unicode codepoints. Examples:
85
860x85AB  0xF862+0x0058+0x0049+0x0049+0x0049  # roman numeral thirteen
870x85AC  0xF861+0x0058+0x0049+0x0056 # roman numeral fourteen
880x85AD  0xF860+0x0058+0x0056    # roman numeral fifteen
890x85BF  0xF862+0x0078+0x0069+0x0069+0x0069  # small roman numeral thirteen
900x85C0  0xF861+0x0078+0x0069+0x0076 # small roman numeral fourteen
910x85C1  0xF860+0x0078+0x0076    # small roman numeral fifteen
92
93Even though they map to multiple codepoints, mb_str_split treats these as ONE character each
94*/
95
echo "== MacJapanese characters which map to 3-5 codepoints each ==\n";

/* Split length 1: each element of the result is printed as hex. */
$singles = mb_str_split("abc\x85\xAB\x85\xAC\x85\xAD", 1, 'MacJapanese');
echo "[" . implode(', ', array_map('bin2hex', $singles)) . "]\n";

/* Split length 2, using the small roman numeral variants. */
$pairs = mb_str_split("abc\x85\xBF\x85\xC0\x85\xC1", 2, 'MacJapanese');
echo "[" . implode(', ', array_map('bin2hex', $pairs)) . "]\n";
99
100?>
101--EXPECT--
102BIG-5: a4e9 a5bb
103EUC-JP: c6fc cbdc
104ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842
105SJIS: 93fa 967b
106UTF-16BE: 65e5 672c
107UTF-16LE: e565 2c67
108UTF-32BE: 000065e5 0000672c
109UTF-32LE: e5650000 2c670000
110UTF-8: e697a5 e69cac
111== Regression test for SJIS byte 0x80 ==
112SJIS: [80a1, 6162, 6380, a1]
113SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
114SJIS-2004: [80a1, 6162, 6380, a1]
115SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
116MacJapanese: [80a1, 6162, 6380, a1]
117MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
118SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
119SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
120SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
121SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
122SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
123SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
124== MacJapanese characters which map to 3-5 codepoints each ==
125[61, 62, 63, 85ab, 85ac, 85ad]
126[6162, 6385bf, 85c085c1]
127