--TEST--
mb_str_split() tests for the japanese language
--EXTENSIONS--
mbstring
--INI--
output_handler=
--FILE--
<?php
/*
 * Exercises mb_str_split() on a two-character Japanese string across several
 * encodings, then on a long ISO-2022-JP string, then on byte sequences that
 * are not valid in the Shift-JIS family, and finally on MacJapanese characters
 * that map to several Unicode codepoints.
 *
 * Consistency checks print a message only when they fail; the hex dumps are
 * always printed and are compared against the EXPECT section.
 */

/* Resolve common.inc relative to the current directory. */
ini_set('include_path','.');
include_once('common.inc');

$string = "日本"; /* 2 chars */
$len = 2;
$charset = [
    "BIG-5",
    "EUC-JP",
    "ISO-2022-JP",
    "SJIS",
    "UTF-16BE",
    "UTF-16LE",
    "UTF-32BE",
    "UTF-32LE",
    "UTF-8"
];

/* Try empty strings first: every chunk length must yield an empty array */
foreach ($charset as $cs) {
    foreach ([1, 2] as $chunkLength) {
        if (mb_str_split("", $chunkLength, $cs) !== []) {
            echo "Empty $cs string should convert to empty array!\n";
        }
    }
}

foreach ($charset as $cs) {
    $enc = mb_convert_encoding($string, $cs, "UTF-8");
    $split = mb_str_split($enc, 1, $cs);

    /* check chunks number: $len characters in chunks of $i => ceil($len / $i) chunks */
    for ($i = 1; $i <= $len; ++$i) {
        /* ceil() returns a float; cast so the comparison with count() can be strict */
        $expected = (int) ceil($len / $i);
        $actual = count(mb_str_split($enc, $i, $cs));
        if ($expected !== $actual) {
            echo "$cs WRONG CHUNKS NUMBER: expected/actual: $expected/$actual\n";
        }
    }

    /* check content: hex dump of every single-character chunk */
    echo "$cs:";
    foreach ($split as $chunk) {
        echo " " . bin2hex($chunk);
    }
    echo "\n";
}

/* long string test */
$size = 50000;
$long = str_repeat($string, $size); /* 50k x 2 chars = 1e5 chars */
$enc = mb_convert_encoding($long, "ISO-2022-JP", "UTF-8");
$array = mb_str_split($enc, $len, "ISO-2022-JP");
$count = count($array);

/* check array size: 1e5 chars in chunks of $len chars => $size chunks */
if ($size !== $count) {
    printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size);
}

/* compare initial string and last array element after splitting */
$enc = mb_convert_encoding($string, "ISO-2022-JP", "UTF-8");
$last = end($array);
if ($last !== $enc) {
    /* explicit \n escape so the message does not depend on source indentation */
    printf(
        "Long string splitting error:\n    last array element: %s expected: %s\n",
        bin2hex($last),
        bin2hex($enc)
    );
}

/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
echo "== Regression test for SJIS byte 0x80 ==\n";
foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'] as $encoding) {
    $array = mb_str_split("\x80\xA1abc\x80\xA1", 2, $encoding);
    echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";

    // Also try bytes 0xFD, 0xFE, and 0xFF
    $array = mb_str_split("abc\xFD\xFE\xFFab\xFD\xFE\xFF", 2, $encoding);
    echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
}

/*
Some MacJapanese characters map to a sequence of several Unicode codepoints. Examples:

0x85AB  0xF862+0x0058+0x0049+0x0049+0x0049  # roman numeral thirteen
0x85AC  0xF861+0x0058+0x0049+0x0056         # roman numeral fourteen
0x85AD  0xF860+0x0058+0x0056                # roman numeral fifteen
0x85BF  0xF862+0x0078+0x0069+0x0069+0x0069  # small roman numeral thirteen
0x85C0  0xF861+0x0078+0x0069+0x0076         # small roman numeral fourteen
0x85C1  0xF860+0x0078+0x0076                # small roman numeral fifteen

Even though they map to multiple codepoints, mb_str_split treats these as ONE character each
*/

echo "== MacJapanese characters which map to 3-5 codepoints each ==\n";
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xAB\x85\xAC\x85\xAD", 1, 'MacJapanese'))), "]\n";
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xBF\x85\xC0\x85\xC1", 2, 'MacJapanese'))), "]\n";

?>
--EXPECT--
BIG-5: a4e9 a5bb
EUC-JP: c6fc cbdc
ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842
SJIS: 93fa 967b
UTF-16BE: 65e5 672c
UTF-16LE: e565 2c67
UTF-32BE: 000065e5 0000672c
UTF-32LE: e5650000 2c670000
UTF-8: e697a5 e69cac
== Regression test for SJIS byte 0x80 ==
SJIS: [80a1, 6162, 6380, a1]
SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-2004: [80a1, 6162, 6380, a1]
SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
MacJapanese: [80a1, 6162, 6380, a1]
MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
== MacJapanese characters which map to 3-5 codepoints each ==
[61, 62, 63, 85ab, 85ac, 85ad]
[6162, 6385bf, 85c085c1]