--TEST--
mb_str_split() tests UTF-8 illegal chars & UTF-16 surrogate pairs
--EXTENSIONS--
mbstring
--INI--
output_handler=
--FILE--
<?php
ini_set('include_path', '.');
include_once 'common.inc';

/* Well-formed UTF-8: ASCII "123" followed by the 4-byte sequence f0 92 80 a9. */
$utf8 = pack("H*", "313233f09280a9");

/* Truncated UTF-8: the same prefix, but the 4-byte sequence stops after f0 92. */
$utf8_bad = pack("H*", "313233f092");

/* Lowest surrogate pair (d800 dc00), in big- and little-endian byte order. */
$utf16_first_be = pack("H*", "d800dc00");
$utf16_first_le = pack("H*", "00d800dc");

/* Highest surrogate pair (dbff dfff), in big- and little-endian byte order. */
$utf16_last_be = pack("H*", "dbffdfff");
$utf16_last_le = pack("H*", "ffdbffdf");

/* Illegal UTF-16 unit pair: it opens with a low surrogate (dc00) instead of a high one. */
$utf16be_char_bad = pack("H*", "dc00dc00");
$utf16le_char_bad = pack("H*", "00dc00dc");

/* Valid inputs: lowest pair followed by highest pair. */
$utf16be = $utf16_first_be . $utf16_last_be;
$utf16le = $utf16_first_le . $utf16_last_le;

/* Invalid inputs: one valid pair followed by the illegal pair. */
$utf16be_bad = $utf16_first_be . $utf16be_char_bad;
$utf16le_bad = $utf16_first_le . $utf16le_char_bad;

/*
 * Each entry pairs the label to print with the argument list handed to
 * mb_str_split(). The two UTF-8 entries pass no encoding argument.
 */
$cases = [
    ["UTF-8:", [$utf8, 2]],
    ["BAD UTF-8:", [$utf8_bad, 2]],
    ["UTF-16BE:", [$utf16be, 1, "UTF-16BE"]],
    ["UTF-16LE:", [$utf16le, 1, "UTF-16LE"]],
    ["BAD UTF-16BE:", [$utf16be_bad, 1, "UTF-16BE"]],
    ["BAD UTF-16LE:", [$utf16le_bad, 1, "UTF-16LE"]],
];

/* Print every chunk as its byte length plus a hex dump, one line per case. */
foreach ($cases as [$label, $arguments]) {
    echo $label;
    /* The split runs only after the label is written, keeping output order stable. */
    foreach (mb_str_split(...$arguments) as $piece) {
        printf(" l:%d v:%s", strlen($piece), bin2hex($piece));
    }
    echo PHP_EOL;
}

/* Dump the result of splitting an empty string under three encodings. */
foreach (["ASCII", "UTF-8", "UTF-16LE"] as $encoding) {
    var_dump(mb_str_split("", 1, $encoding));
}

?>
--EXPECT--
UTF-8: l:2 v:3132 l:5 v:33f09280a9
BAD UTF-8: l:2 v:3132 l:3 v:33f092
UTF-16BE: l:4 v:d800dc00 l:4 v:dbffdfff
UTF-16LE: l:4 v:00d800dc l:4 v:ffdbffdf
BAD UTF-16BE: l:4 v:d800dc00 l:2 v:003f l:2 v:003f
BAD UTF-16LE: l:4 v:00d800dc l:2 v:3f00 l:2 v:3f00
array(0) {
}
array(0) {
}
array(0) {
}