1--TEST--
2mb_str_split() tests for more text encodings
3--EXTENSIONS--
4mbstring
5--FILE--
6<?php
// Split an 8-byte UCS-2BE string into chunks of two characters each and
// print the first two chunks as hex.
$ucs2Chunks = mb_str_split("\x00\x01\x02\x03\x04\x05\x06\x07", 2, "UCS-2BE");
printf("[%s, %s]\n", bin2hex($ucs2Chunks[0]), bin2hex($ucs2Chunks[1]));
9
// HZ: for every chunk length from 1 up to the character count of the sample,
// split the HZ-encoded text, dump each chunk as hex, and verify that gluing
// the chunks back together decodes to the original UTF-8 text.
$utf8Text = "test カタカナ 汉字";

echo "== HZ ==\n";
$hzText = mb_convert_encoding($utf8Text, 'HZ', 'UTF-8');
$charCount = mb_strlen($utf8Text, 'UTF-8');
for ($chunkLen = 1; $chunkLen <= $charCount; ++$chunkLen) {
	$pieces = mb_str_split($hzText, $chunkLen, 'HZ');
	$hexPieces = array_map('bin2hex', $pieces);
	echo "[" . implode(', ', $hexPieces) . "]\n";

	$roundTrip = mb_convert_encoding(implode($pieces), 'UTF-8', 'HZ');
	if ($roundTrip === $utf8Text) {
		continue;
	}
	// Abort the whole test on the first chunk length that fails to round-trip
	die("Expected " . $utf8Text . "; got " . $roundTrip);
}
22
// BIG-5: for every chunk length from 1 up to the character count of the
// sample, split the BIG-5-encoded text, dump each chunk as hex, and verify
// that the concatenated chunks decode back to the original UTF-8 text.
$utf8Text = "test カタカナ 漢字";

echo "== BIG-5 ==\n";
$big5Text = mb_convert_encoding($utf8Text, 'BIG-5', 'UTF-8');
$charCount = mb_strlen($utf8Text, 'UTF-8');
for ($chunkLen = 1; $chunkLen <= $charCount; ++$chunkLen) {
	$pieces = mb_str_split($big5Text, $chunkLen, 'BIG-5');
	$hexPieces = array_map('bin2hex', $pieces);
	echo "[" . implode(', ', $hexPieces) . "]\n";

	$roundTrip = mb_convert_encoding(implode($pieces), 'UTF-8', 'BIG-5');
	if ($roundTrip === $utf8Text) {
		continue;
	}
	// Abort the whole test on the first chunk length that fails to round-trip
	die("Expected " . $utf8Text . "; got " . $roundTrip);
}
35
// ISO-8859-1: for every chunk length from 1 up to the character count of the
// sample, split the ISO-8859-1-encoded text, dump each chunk as hex, and
// verify that the concatenated chunks decode back to the original UTF-8 text.
$utf8Text = "test ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ";

echo "== ISO-8859-1 ==\n";
$latin1Text = mb_convert_encoding($utf8Text, 'ISO-8859-1', 'UTF-8');
$charCount = mb_strlen($utf8Text, 'UTF-8');
for ($chunkLen = 1; $chunkLen <= $charCount; ++$chunkLen) {
	$pieces = mb_str_split($latin1Text, $chunkLen, 'ISO-8859-1');
	$hexPieces = array_map('bin2hex', $pieces);
	echo "[" . implode(', ', $hexPieces) . "]\n";

	$roundTrip = mb_convert_encoding(implode($pieces), 'UTF-8', 'ISO-8859-1');
	if ($roundTrip === $utf8Text) {
		continue;
	}
	// Abort the whole test on the first chunk length that fails to round-trip
	die("Expected " . $utf8Text . "; got " . $roundTrip);
}
48
echo "== Regression tests ==\n";

// Regression 1: the old mb_str_split implementation read bytes through char*
// rather than unsigned char*. A byte with the MSB set was therefore
// sign-extended into a negative int (e.g. 0xCA turned into 0xFFFFFFCA).
$jisChunks = mb_str_split("\xCA\xCA\xCA\xCA", 2, "JIS");
$jisHex = array_map('bin2hex', $jisChunks);
echo "[" . implode(', ', $jisHex) . "]\n";

// Regression 2: when flushing the final (incomplete) chunk, the old
// implementation only returned it if the last input byte decoded to some
// codepoint. That does not always hold: the last byte may belong to a control
// sequence that changes decoder state without producing any codepoint.
$hzChunks = mb_str_split("Z~", 2, "HZ");
$hzHex = array_map('bin2hex', $hzChunks);
echo "[" . implode(', ', $hzHex) . "]\n";

// Regression 3: a huge chunk length used to make the old implementation run
// the PHP interpreter out of memory; only the ValueError message is printed.
try {
	mb_str_split("abc", 1234567890, "UTF-8");
} catch (ValueError $tooLarge) {
	echo $tooLarge->getMessage(), \PHP_EOL;
}
72
73?>
74--EXPECT--
75[00010203, 04050607]
76== HZ ==
77[74, 65, 73, 74, 20, 7e7b252b7e7d, 7e7b253f7e7d, 7e7b252b7e7d, 7e7b254a7e7d, 20, 7e7b3a3a7e7d, 7e7b57567e7d]
78[7465, 7374, 207e7b252b7e7d, 7e7b253f252b7e7d, 7e7b254a7e7d20, 7e7b3a3a57567e7d]
79[746573, 74207e7b252b7e7d, 7e7b253f252b254a7e7d, 207e7b3a3a57567e7d]
80[74657374, 207e7b252b253f252b7e7d, 7e7b254a7e7d207e7b3a3a57567e7d]
81[7465737420, 7e7b252b253f252b254a7e7d20, 7e7b3a3a57567e7d]
82[74657374207e7b252b7e7d, 7e7b253f252b254a7e7d207e7b3a3a57567e7d]
83[74657374207e7b252b253f7e7d, 7e7b252b254a7e7d207e7b3a3a57567e7d]
84[74657374207e7b252b253f252b7e7d, 7e7b254a7e7d207e7b3a3a57567e7d]
85[74657374207e7b252b253f252b254a7e7d, 207e7b3a3a57567e7d]
86[74657374207e7b252b253f252b254a7e7d20, 7e7b3a3a57567e7d]
87[74657374207e7b252b253f252b254a7e7d207e7b3a3a7e7d, 7e7b57567e7d]
88[74657374207e7b252b253f252b254a7e7d207e7b3a3a57567e7d]
89== BIG-5 ==
90[74, 65, 73, 74, 20, c743, c757, c743, c762, 20, ba7e, a672]
91[7465, 7374, 20c743, c757c743, c76220, ba7ea672]
92[746573, 7420c743, c757c743c762, 20ba7ea672]
93[74657374, 20c743c757c743, c76220ba7ea672]
94[7465737420, c743c757c743c76220, ba7ea672]
95[7465737420c743, c757c743c76220ba7ea672]
96[7465737420c743c757, c743c76220ba7ea672]
97[7465737420c743c757c743, c76220ba7ea672]
98[7465737420c743c757c743c762, 20ba7ea672]
99[7465737420c743c757c743c76220, ba7ea672]
100[7465737420c743c757c743c76220ba7e, a672]
101[7465737420c743c757c743c76220ba7ea672]
102== ISO-8859-1 ==
103[74, 65, 73, 74, 20, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf]
104[7465, 7374, 20c0, c1c2, c3c4, c5c6, c7c8, c9ca, cbcc, cdce, cf]
105[746573, 7420c0, c1c2c3, c4c5c6, c7c8c9, cacbcc, cdcecf]
106[74657374, 20c0c1c2, c3c4c5c6, c7c8c9ca, cbcccdce, cf]
107[7465737420, c0c1c2c3c4, c5c6c7c8c9, cacbcccdce, cf]
108[7465737420c0, c1c2c3c4c5c6, c7c8c9cacbcc, cdcecf]
109[7465737420c0c1, c2c3c4c5c6c7c8, c9cacbcccdcecf]
110[7465737420c0c1c2, c3c4c5c6c7c8c9ca, cbcccdcecf]
111[7465737420c0c1c2c3, c4c5c6c7c8c9cacbcc, cdcecf]
112[7465737420c0c1c2c3c4, c5c6c7c8c9cacbcccdce, cf]
113[7465737420c0c1c2c3c4c5, c6c7c8c9cacbcccdcecf]
114[7465737420c0c1c2c3c4c5c6, c7c8c9cacbcccdcecf]
115[7465737420c0c1c2c3c4c5c6c7, c8c9cacbcccdcecf]
116[7465737420c0c1c2c3c4c5c6c7c8, c9cacbcccdcecf]
117[7465737420c0c1c2c3c4c5c6c7c8c9, cacbcccdcecf]
118[7465737420c0c1c2c3c4c5c6c7c8c9ca, cbcccdcecf]
119[7465737420c0c1c2c3c4c5c6c7c8c9cacb, cccdcecf]
120[7465737420c0c1c2c3c4c5c6c7c8c9cacbcc, cdcecf]
121[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccd, cecf]
122[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccdce, cf]
123[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccdcecf]
124== Regression tests ==
125[1b28494a4a1b2842, 1b28494a4a1b2842]
126[5a]
127mb_str_split(): Argument #2 ($length) is too large
128