--TEST--
mb_str_split() tests for more text encodings
--EXTENSIONS--
mbstring
--FILE--
<?php

/**
 * Prints a list of binary strings as "[hex, hex, ...]" followed by a newline.
 */
function dumpChunks(array $chunks): void
{
    echo "[", implode(', ', array_map('bin2hex', $chunks)), "]\n";
}

/**
 * Prints a "== ENCODING ==" header, converts $utf8 into $encoding, and then
 * splits the converted string with every chunk length from 1 up to the number
 * of characters in $utf8.
 *
 * Each chunk list is dumped as hex. After every split the chunks are glued
 * back together and converted to UTF-8 again; the script is aborted if the
 * result differs from $utf8.
 */
function checkAllChunkLengths(string $utf8, string $encoding): void
{
    echo "== ", $encoding, " ==\n";

    $encoded = mb_convert_encoding($utf8, $encoding, 'UTF-8');
    $charCount = mb_strlen($utf8, 'UTF-8');

    for ($chunkLen = 1; $chunkLen <= $charCount; $chunkLen++) {
        $chunks = mb_str_split($encoded, $chunkLen, $encoding);
        dumpChunks($chunks);

        $roundTrip = mb_convert_encoding(implode($chunks), 'UTF-8', $encoding);
        if ($roundTrip !== $utf8) {
            die("Expected " . $utf8 . "; got " . $roundTrip);
        }
    }
}

// Eight bytes of UCS-2BE split two characters at a time; only the first two
// chunks are printed
$pair = mb_str_split("\x00\x01\x02\x03\x04\x05\x06\x07", 2, "UCS-2BE");
echo "[", bin2hex($pair[0]), ", ", bin2hex($pair[1]), "]\n";

checkAllChunkLengths("test カタカナ 汉字", 'HZ');
checkAllChunkLengths("test カタカナ 漢字", 'BIG-5');
checkAllChunkLengths("test ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ", 'ISO-8859-1');

echo "== Regression tests ==\n";

// The old implementation of mb_str_split had a bug due to using char* instead of unsigned char*
// When retrieving a byte with the MSB set, it would sign-extend it to become a negative int
// For example, 0xCA would become 0xFFFFFFCA
dumpChunks(mb_str_split("\xCA\xCA\xCA\xCA", 2, "JIS"));

// Another bug in the old implementation of mb_str_split; when finishing one last
// (incomplete) chunk, it required that the last byte of the input string should decode to
// some codepoint for the last chunk to actually be returned
// This is not always the case; the last byte might be some kind of control sequence which
// affects the decoder state but doesn't actually decode to any codepoint
dumpChunks(mb_str_split("Z~", 2, "HZ"));

// Another problem with the old implementation of mb_str_split: If you passed a huge chunk_len
// argument, it would run the PHP interpreter out of memory
try {
    mb_str_split("abc", 1234567890, "UTF-8");
} catch (ValueError $error) {
    echo $error->getMessage(), \PHP_EOL;
}

?>
--EXPECT--
[00010203, 04050607]
== HZ ==
[74, 65, 73, 74, 20, 7e7b252b7e7d, 7e7b253f7e7d, 7e7b252b7e7d, 7e7b254a7e7d, 20, 7e7b3a3a7e7d, 7e7b57567e7d]
[7465, 7374, 207e7b252b7e7d, 7e7b253f252b7e7d, 7e7b254a7e7d20, 7e7b3a3a57567e7d]
[746573, 74207e7b252b7e7d, 7e7b253f252b254a7e7d, 207e7b3a3a57567e7d]
[74657374, 207e7b252b253f252b7e7d, 7e7b254a7e7d207e7b3a3a57567e7d]
[7465737420, 7e7b252b253f252b254a7e7d20, 7e7b3a3a57567e7d]
[74657374207e7b252b7e7d, 7e7b253f252b254a7e7d207e7b3a3a57567e7d]
[74657374207e7b252b253f7e7d, 7e7b252b254a7e7d207e7b3a3a57567e7d]
[74657374207e7b252b253f252b7e7d, 7e7b254a7e7d207e7b3a3a57567e7d]
[74657374207e7b252b253f252b254a7e7d, 207e7b3a3a57567e7d]
[74657374207e7b252b253f252b254a7e7d20, 7e7b3a3a57567e7d]
[74657374207e7b252b253f252b254a7e7d207e7b3a3a7e7d, 7e7b57567e7d]
[74657374207e7b252b253f252b254a7e7d207e7b3a3a57567e7d]
== BIG-5 ==
[74, 65, 73, 74, 20, c743, c757, c743, c762, 20, ba7e, a672]
[7465, 7374, 20c743, c757c743, c76220, ba7ea672]
[746573, 7420c743, c757c743c762, 20ba7ea672]
[74657374, 20c743c757c743, c76220ba7ea672]
[7465737420, c743c757c743c76220, ba7ea672]
[7465737420c743, c757c743c76220ba7ea672]
[7465737420c743c757, c743c76220ba7ea672]
[7465737420c743c757c743, c76220ba7ea672]
[7465737420c743c757c743c762, 20ba7ea672]
[7465737420c743c757c743c76220, ba7ea672]
[7465737420c743c757c743c76220ba7e, a672]
[7465737420c743c757c743c76220ba7ea672]
== ISO-8859-1 ==
[74, 65, 73, 74, 20, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ca, cb, cc, cd, ce, cf]
[7465, 7374, 20c0, c1c2, c3c4, c5c6, c7c8, c9ca, cbcc, cdce, cf]
[746573, 7420c0, c1c2c3, c4c5c6, c7c8c9, cacbcc, cdcecf]
[74657374, 20c0c1c2, c3c4c5c6, c7c8c9ca, cbcccdce, cf]
[7465737420, c0c1c2c3c4, c5c6c7c8c9, cacbcccdce, cf]
[7465737420c0, c1c2c3c4c5c6, c7c8c9cacbcc, cdcecf]
[7465737420c0c1, c2c3c4c5c6c7c8, c9cacbcccdcecf]
[7465737420c0c1c2, c3c4c5c6c7c8c9ca, cbcccdcecf]
[7465737420c0c1c2c3, c4c5c6c7c8c9cacbcc, cdcecf]
[7465737420c0c1c2c3c4, c5c6c7c8c9cacbcccdce, cf]
[7465737420c0c1c2c3c4c5, c6c7c8c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6, c7c8c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7, c8c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8, c9cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9, cacbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9ca, cbcccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacb, cccdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcc, cdcecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccd, cecf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccdce, cf]
[7465737420c0c1c2c3c4c5c6c7c8c9cacbcccdcecf]
== Regression tests ==
[1b28494a4a1b2842, 1b28494a4a1b2842]
[5a]
mb_str_split(): Argument #2 ($length) is too large