--TEST--
mb_str_split() tests UTF-8 illegal chars & UTF-16 surrogate pairs
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--INI--
output_handler=
mbstring.func_overload=0
--FILE--
<?php
ini_set('include_path','.');
include_once('common.inc');

/**
 * Print every chunk as " l:<byte length> v:<hex dump>" on the current output
 * line, then terminate that line.
 *
 * The caller echoes the line label first, so the label is always written
 * before mb_str_split() runs.
 *
 * @param array $chunks Pieces returned by mb_str_split().
 */
function print_hex_chunks(array $chunks): void
{
    foreach ($chunks as $piece) {
        printf(" l:%d v:%s", strlen($piece), bin2hex($piece));
    }
    echo PHP_EOL;
}

/* "123" followed by the complete 4-byte sequence f0 92 80 a9 */
$utf8 = hex2bin("313233f09280a9");

/* "123" followed by only the first two bytes (f0 92) of that sequence */
$utf8_bad = hex2bin("313233f092");

/* Lowest surrogate pair (U+10000), big- and little-endian */
$utf16_first_be = hex2bin("d800dc00");
$utf16_first_le = hex2bin("00d800dc");

/* Highest surrogate pair (U+10FFFF), big- and little-endian */
$utf16_last_be = hex2bin("dbffdfff");
$utf16_last_le = hex2bin("ffdbffdf");

/* Illegal sequences: each one opens with a low surrogate (dc00) */
$utf16be_char_bad = hex2bin("dc00dc00");
$utf16le_char_bad = hex2bin("00dc00dc");

/* Well-formed inputs: first pair immediately followed by last pair */
$utf16be = $utf16_first_be . $utf16_last_be;
$utf16le = $utf16_first_le . $utf16_last_le;

/* Malformed inputs: a valid pair followed by the illegal sequence */
$utf16be_bad = $utf16_first_be . $utf16be_char_bad;
$utf16le_bad = $utf16_first_le . $utf16le_char_bad;

/* The UTF-8 cases pass no encoding argument, so the default encoding is used */
echo "UTF-8:";
print_hex_chunks(mb_str_split($utf8, 2));

echo "BAD UTF-8:";
print_hex_chunks(mb_str_split($utf8_bad, 2));

/* The UTF-16 cases split one character at a time with an explicit encoding */
echo "UTF-16BE:";
print_hex_chunks(mb_str_split($utf16be, 1, "UTF-16BE"));

echo "UTF-16LE:";
print_hex_chunks(mb_str_split($utf16le, 1, "UTF-16LE"));

echo "BAD UTF-16BE:";
print_hex_chunks(mb_str_split($utf16be_bad, 1, "UTF-16BE"));

echo "BAD UTF-16LE:";
print_hex_chunks(mb_str_split($utf16le_bad, 1, "UTF-16LE"));

/* An empty input is split into an empty array for each of these encodings */
foreach (["ASCII", "UTF-8", "UTF-16LE"] as $encoding) {
    var_dump(mb_str_split("", 1, $encoding));
}

?>
--EXPECT--
UTF-8: l:2 v:3132 l:5 v:33f09280a9
BAD UTF-8: l:2 v:3132 l:3 v:33f092
UTF-16BE: l:4 v:d800dc00 l:4 v:dbffdfff
UTF-16LE: l:4 v:00d800dc l:4 v:ffdbffdf
BAD UTF-16BE: l:4 v:d800dc00 l:2 v:003f l:2 v:003f
BAD UTF-16LE: l:4 v:00d800dc l:2 v:3f00 l:2 v:3f00
array(0) {
}
array(0) {
}
array(0) {
}