1--TEST--
2mb_str_split() tests for the Japanese language
3--EXTENSIONS--
4mbstring
5--INI--
6output_handler=
7--FILE--
8<?php
/* Look for include files in the current directory, then pull in common.inc. */
ini_set('include_path', '.');
include_once 'common.inc';

/* Two-character Japanese sample and its length in characters. */
$string = '日本';
$len = 2;

/* Encodings iterated by the checks that follow. */
$charset = [
    'BIG-5',
    'EUC-JP',
    'ISO-2022-JP',
    'SJIS',
    'UTF-16BE',
    'UTF-16LE',
    'UTF-32BE',
    'UTF-32LE',
    'UTF-8',
];
25
/*
 * Start with the degenerate case: splitting an empty string must give an
 * empty array in every encoding, for chunk lengths 1 and 2 alike.
 */
foreach ($charset as $cs) {
    foreach ([1, 2] as $chunkLength) {
        $pieces = mb_str_split("", $chunkLength, $cs);
        if (count($pieces) !== 0) {
            echo "Empty $cs string should convert to empty array!\n";
        }
    }
}
33
/*
 * For each encoding: convert the UTF-8 sample, confirm that every chunk
 * length from 1 to $len yields the expected number of chunks, then print the
 * single-character chunks as hex.
 */
foreach ($charset as $cs) {
    $enc = mb_convert_encoding($string, $cs, 'UTF-8');

    /* The expected chunk count is ceil($len / chunk length); report mismatches. */
    $i = 1;
    while ($i <= $len) {
        $ceil = ceil($len / $i);
        $cnt = count(mb_str_split($enc, $i, $cs));
        if ($ceil != $cnt) {
            echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n";
        }
        ++$i;
    }

    /* One output line per encoding: its name followed by each character in hex. */
    $split = mb_str_split($enc, 1, $cs);
    echo "$cs:";
    for ($i = 0; $i < $len; ++$i) {
        [1 => $hex] = unpack('H*', $split[$i]);
        echo ' ' . $hex;
    }
    echo "\n";
}
54
/*
 * Long-string check: the sample is repeated 50,000 times, encoded as
 * ISO-2022-JP and split into chunks of $len characters.
 */
$size = 50000;
$long = str_repeat($string, $size);
$encodedLong = mb_convert_encoding($long, 'ISO-2022-JP', 'UTF-8');
$chunks = mb_str_split($encodedLong, $len, 'ISO-2022-JP');
$count = count($chunks);

/* The number of chunks must match the number of repetitions. */
if ($count !== $size) {
    printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
}

/* The final chunk must be identical to the sample encoded on its own. */
$expectedLast = mb_convert_encoding($string, 'ISO-2022-JP', 'UTF-8');
$actualLast = end($chunks);
if ($actualLast !== $expectedLast) {
    printf("Long string splitting error:
        last array element: %s expected: %s\n", unpack('H*', $actualLast)[1], unpack('H*', $expectedLast)[1]);
}
71
/*
 * Regression check: byte 0x80 used to be mistaken for the lead byte of a
 * 2-byte character in the SJIS family. Bytes 0xFD, 0xFE and 0xFF are
 * exercised as well.
 */
echo "== Regression test for SJIS byte 0x80 ==\n";

$sjisVariants = [
    'SJIS',
    'SJIS-2004',
    'MacJapanese',
    'SJIS-Mobile#DOCOMO',
    'SJIS-Mobile#KDDI',
    'SJIS-Mobile#SoftBank',
];
$inputs = [
    "\x80\xA1abc\x80\xA1",
    "abc\xFD\xFE\xFFab\xFD\xFE\xFF",
];

foreach ($sjisVariants as $encoding) {
    foreach ($inputs as $bytes) {
        /* Split into 2-character chunks and show each chunk as hex. */
        $array = mb_str_split($bytes, 2, $encoding);
        $hexChunks = array_map('bin2hex', $array);
        echo "$encoding: [" . implode(', ', $hexChunks) . "]\n";
    }
}
82
83?>
84--EXPECT--
85BIG-5: a4e9 a5bb
86EUC-JP: c6fc cbdc
87ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842
88SJIS: 93fa 967b
89UTF-16BE: 65e5 672c
90UTF-16LE: e565 2c67
91UTF-32BE: 000065e5 0000672c
92UTF-32LE: e5650000 2c670000
93UTF-8: e697a5 e69cac
94== Regression test for SJIS byte 0x80 ==
95SJIS: [80a1, 6162, 6380, a1]
96SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
97SJIS-2004: [80a1, 6162, 6380, a1]
98SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
99MacJapanese: [80a1, 6162, 6380, a1]
100MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
101SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
102SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
103SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
104SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
105SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
106SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
107