#!/usr/bin/env php | +----------------------------------------------------------------------+ */ /* This file prints to stdout the contents of ext/standard/html_tables.h */ /* put together with glue; have patience */ $t = << cs_utf_8 && (cs) < cs_big5) #define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5) static const struct { const char *codeset; uint32_t codeset_len; enum entity_charset charset; } charset_map[] = { { "ISO-8859-1", sizeof("ISO-8859-1")-1, cs_8859_1 }, { "ISO8859-1", sizeof("ISO8859-1")-1, cs_8859_1 }, { "ISO-8859-15", sizeof("ISO-8859-15")-1, cs_8859_15 }, { "ISO8859-15", sizeof("ISO8859-15")-1, cs_8859_15 }, { "utf-8", sizeof("utf-8")-1, cs_utf_8 }, { "cp1252", sizeof("cp1252")-1, cs_cp1252 }, { "Windows-1252", sizeof("Windows-1252")-1, cs_cp1252 }, { "1252", sizeof("1252")-1, cs_cp1252 }, { "BIG5", sizeof("BIG5")-1, cs_big5 }, { "950", sizeof("950")-1, cs_big5 }, { "GB2312", sizeof("GB2312")-1, cs_gb2312 }, { "936", sizeof("936")-1, cs_gb2312 }, { "BIG5-HKSCS", sizeof("BIG5-HKSCS")-1, cs_big5hkscs }, { "Shift_JIS", sizeof("Shift_JIS")-1, cs_sjis }, { "SJIS", sizeof("SJIS")-1, cs_sjis }, { "932", sizeof("932")-1, cs_sjis }, { "SJIS-win", sizeof("SJIS-win")-1, cs_sjis }, { "CP932", sizeof("CP932")-1, cs_sjis }, { "EUCJP", sizeof("EUCJP")-1, cs_eucjp }, { "EUC-JP", sizeof("EUC-JP")-1, cs_eucjp }, { "eucJP-win", sizeof("eucJP-win")-1, cs_eucjp }, { "KOI8-R", sizeof("KOI8-R")-1, cs_koi8r }, { "koi8-ru", sizeof("koi8-ru")-1, cs_koi8r }, { "koi8r", sizeof("koi8r")-1, cs_koi8r }, { "cp1251", sizeof("cp1251")-1, cs_cp1251 }, { "Windows-1251", sizeof("Windows-1251")-1, cs_cp1251 }, { "win-1251", sizeof("win-1251")-1, cs_cp1251 }, { "iso8859-5", sizeof("iso8859-5")-1, cs_8859_5 }, { "iso-8859-5", sizeof("iso-8859-5")-1, cs_8859_5 }, { "cp866", sizeof("cp866")-1, cs_cp866 }, { "866", sizeof("866")-1, cs_cp866 }, { "ibm866", sizeof("ibm866")-1, cs_cp866 }, { "MacRoman", sizeof("MacRoman")-1, cs_macroman } }; /* longest entity name length excluding & and ; */ 
#define LONGEST_ENTITY_LENGTH 31 /* Definitions for mappings *to* Unicode. * The origin charset must have at most 256 code points. * The multi-byte encodings are not supported */ typedef struct { unsigned short uni_cp[64]; } enc_to_uni_stage2; typedef struct { const enc_to_uni_stage2 *inner[4]; } enc_to_uni; /* bits 7-8 bits (only single bytes encodings supported )*/ #define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6) /* bits 1-6 */ #define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)
CODE;

echo $t;

/**
 * Returns the lines of a mapping file.
 *
 * Stops the generator with a message on stderr and exit status 1 when the
 * file cannot be read; without this check an unreadable file would yield
 * tables that contain only the "no mapping" value.
 *
 * @param string $file path of the mapping file
 * @return array the file contents split on "\n"
 */
function read_mapping_lines($file)
{
    $contents = file_get_contents($file);
    if ($contents === false) {
        fwrite(STDERR, "cannot read mapping file $file\n");
        exit(1);
    }
    return explode("\n", $contents);
}

/*
 * Single-byte encodings for which "to Unicode" tables are generated.
 *   ident     - suffix used in the generated C identifiers
 *   enumid    - slot of the encoding in the generated enc_to_uni_index[]
 *   enumident - not read by the code below
 *   name      - name used in the generated comments
 *   file      - mapping file; lines of the form 0xXX<TAB>0xYYYY are used
 */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        /* was "cp1252", a copy of the previous entry */
        "enumident" => "cp1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);

/*
 * Mappings *to* Unicode: for every encoding emit up to four stage 2 tables of
 * 64 code points each (bytes 0x00-0x3F, 0x40-0x7F, 0x80-0xBF, 0xC0-0xFF) and
 * one stage 1 table pointing at them. A stage 2 table identical to one that
 * was already emitted for the same byte range is not emitted again; the
 * stage 1 table then points at the earlier one.
 *
 * $prevStage2[$i] maps the ident of the encoding that emitted a table for
 * byte range $i to that table. Looking the owner up by name (instead of
 * deriving it from a list position) keeps the reference correct for any
 * combination of shared tables and involves no fractional array index.
 */
$prevStage2 = array(array(), array(), array(), array());

foreach ($encodings as $e) {
    echo "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file */
    $map = array();
    $lines = read_mapping_lines($e['file']);
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $map[] = array($matches[1], $matches[2]);
    }

    /* byte value => Unicode code point */
    $mappy = array();
    foreach ($map as $v) {
        $mappy[hexdec($v[0])] = hexdec($v[1]);
    }

    $mstable = array("ident" => $e['ident']);

    /* calculate two-stage tables */
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo "/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    $s2tables_idents = array();
    for ($i = 0; $i < 4; $i++) {
        /* strict comparison: a missing mapping (NULL) must not be treated
         * as equal to a mapping to code point 0 */
        $owner = array_search($mstable[$i], $prevStage2[$i], true);
        if ($owner !== false) {
            $s2tables_idents[$i] = $owner;
            continue;
        }

        $s2tables_idents[$i] = $e["ident"];

        echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$e['ident']}_".
            sprintf("%02X", $i << 6)." = { {\n";
        for ($j = 0; $j < 64; $j++) {
            /* six values per output line */
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                echo sprintf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[$i][$e['ident']] = $mstable[$i];
    }

    echo "/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo "/* {{{ Stage 1 table for {$e['name']} */\n";

    echo "static const enc_to_uni enc_to_uni_{$e['ident']} = { { \t&enc_to_uni_s2_{$s2tables_idents[0]}_00, \t&enc_to_uni_s2_{$s2tables_idents[1]}_40, \t&enc_to_uni_s2_{$s2tables_idents[2]}_80, \t&enc_to_uni_s2_{$s2tables_idents[3]}_C0 } }; ";

    echo "/* end of stage 1 table for {$e['name']} }}} */\n\n";
}

/*
 * Index of the stage 1 tables by enumid. range() pre-fills every slot with
 * its own number; slots that still hold a number after the idents have been
 * stored belong to no encoding and are emitted as NULL.
 */
$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
$a = range(0, $maxencnum);
foreach ($encodings as $e) {
    $a[$e['enumid']] = $e['ident'];
}

echo "/* {{{ Index of tables for encoding conversion */ static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($a as $k => $v) {
    if (is_numeric($v))
        echo "\tNULL,\n";
    else
        echo "\t&enc_to_uni_$v,\n";
}

echo "}; /* }}} */\n";

/*
 * NOTE(review): text is missing from this copy of the script at this point.
 * What remains is "$t = <<" followed by the tail of the first entry of a
 * second encoding list whose entries carry a "range" of byte values. The lost
 * part presumably held a heredoc, its output and the opening of that list
 * (assigned to $encodings, judging by the loop below) - restore it from the
 * complete file. The remainder is reproduced unchanged.
 */
$t = << "iso885915",
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
        "range" => array(0xA4, 0xBE),
    ),
    array(
        "ident" => "win1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
        "range" => array(0x80, 0x9F),
    ),
    array(
        "ident" => "win1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "koi8r",
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "cp866",
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "macroman",
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
        "range" => array(0x80, 0xFF),
    ),
);

/*
 * Mappings *from* Unicode: for every encoding emit a table of
 * (Unicode code point, byte) pairs sorted by code point, restricted to the
 * bytes inside the "range" of the encoding. The text after "#" on the mapping
 * line is lower-cased and emitted as a trailing comment.
 */
foreach ($encodings as $e) {
    echo "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file */
    $map = array();
    $lines = read_mapping_lines($e['file']);
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
    }

    /* Unicode code point => array(byte value, description) */
    $mappy = array();
    foreach ($map as $v) {
        if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
            $mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
    }
    ksort($mappy);

    echo "static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* NOTE(review): this closing line prints an opening fold marker; left
     * as is so that the generated output does not change. */
    echo "/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
}

/* First entity list; the statements at the end of the script load the other
 * lists and jump back to the label below. */
$data = file_get_contents("ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";

again:

$t = <<<'CODE'
/* HTML 5 has many more named entities. * Some of them map to two unicode code points, not one. * We're going to use a three-stage table (with an extra one for the entities * with two code points). */ #define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */ #define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6) #define ENT_STAGE3_INDEX(k) ((k) & 0x3F) #define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k)) /* The default entity may be NULL. Binary search is still possible while is senseless as there are just two rows (see also find_entity_for_char()). 
*/ typedef union { struct { const char *default_entity; unsigned size; /* number of remaining entries in the table */ unsigned short default_entity_len; } leading_entry; struct { const char *entity; unsigned second_cp; /* second code point */ unsigned short entity_len; } normal_entry; } entity_multicodepoint_row; /* blocks of these should start at code points k where k % 0xFC0 == 0 */ typedef struct { char ambiguous; /* if 0 look into entity */ union { struct { const char *entity; /* may be NULL */ unsigned short entity_len; } ent; const entity_multicodepoint_row *multicodepoint_table; } data; } entity_stage3_row; /* Calculate k & 0x3F Use as offset */ typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */ /* Calculate k & 0xFC0 >> 6. Use as offset */ typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */ /* For stage 1, Calculate k & 0xFFF000 >> 3*4. * If larger than 1D, we have no mapping. Otherwise lookup that index */ typedef struct { const entity_stage1_row *ms_table; /* for tables with only basic entities, this member is to be accessed * directly for better performance: */ const entity_stage3_row *table; } entity_table_opt; /* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. 
*/
CODE;

if (!$pass2)
    echo $t;

/*
 * Everything from here to the end of the script runs once per entity list.
 * $data holds the list, $name and $ident label the generated tables, and
 * $pass2 is false on the first run and 1, 2, 3 on the following ones (the
 * statements at the bottom set them and jump back to the "again" label).
 */

/* Abort instead of generating empty tables when the entity list for this
 * pass could not be read ($data comes from file_get_contents()). */
if ($data === false) {
    fwrite(STDERR, "cannot read the entity list for $name\n");
    exit(1);
}

/* Parse the list: every matching line yields
 * array(entity name, first code point in hex[, second code point in hex]). */
$dp = array();

foreach (explode("\n", $data) as $l) {
    if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
        //echo sprintf("\t{\"%-21s 1, 0x%05d},\n", $matches[1].",", $matches[2]);
        $dp[] = array($matches[1], $matches[2], $matches[3]);
    } else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
        $dp[] = array($matches[1], $matches[2]);
    }
}

/* The hash tables further down use the list in file order; the code point
 * tables use it sorted by first code point. */
$origdp = $dp;

usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });

/*
 * $multicp_rows: first code point => rows for that code point, for every
 * first code point that starts at least one two code point entity. A row is
 * keyed by the second code point; the key "default" holds the entity that
 * stands for the first code point alone, if there is one.
 */
$multicp_rows = array();
foreach ($dp as $el) {
    if (count($el) == 3) {
        $multicp_rows[$el[1]] = array();
    }
}

foreach ($dp as $el) {
    if (array_key_exists($el[1], $multicp_rows)) {
        if (count($el) == 3)
            $multicp_rows[$el[1]][$el[2]] = $el[0];
        else
            $multicp_rows[$el[1]]["default"] = $el[0];
    }
}

if ($pass2 < 2)
    echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
    echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

/* lists without two code point entities get no such tables */
if (empty($multicp_rows))
    goto skip_multicp;

ksort($multicp_rows);
foreach ($multicp_rows as &$v) {
    ksort($v);
}
unset($v); /* drop the reference left behind by the loop above */

echo "/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
    echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
        sprintf("%05s", $k), "[] = {", "\n";
    /* leading row: default entity (or NULL), number of rows that follow,
     * length of the default entity */
    if (array_key_exists("default", $v)) {
        if ($v['default'] == 'GT') /* hack to make > translate to &gt; not &GT; */
            $v['default'] = "gt";
        echo "\t{ {", sprintf("\"%-21s", $v["default"].'",'),
            "\t", sprintf("%02d", (count($v) - 1)), ",\t\t",
            sprintf("% 2d", strlen($v["default"])), '} },', "\n";
    } else {
        echo "\t{ {", sprintf("%-22s", 'NULL,'),
            "\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
    }
    unset($v["default"]);
    /* one row per second code point: entity, second code point, length */
    foreach ($v as $l => $w) {
        echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05s", $l), ",\t",
            sprintf("% 2d", strlen($w)), '} },', "\n";
    }
    echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:

if ($pass2 < 2)
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

/*
 * NOTE(review): text is missing from this copy of the script between "<<" and
 * "> 12;" below. The statements that follow read $el and fill $mstable, and
 * the lone closing brace after them has no visible opening, so the lost part
 * presumably held a heredoc and the head of a loop over the entities -
 * restore it from the complete file. The remainder is reproduced unchanged.
 */
$t = <<> 12;
    $s2 = (hexdec($el[1]) & 0xFC0) >> 6;
    $s3 = hexdec($el[1]) & 0x3F;
    /* an empty string marks a code point that has a multi code point table */
    if (array_key_exists($el[1], $multicp_rows)) {
        $mstable[$s1][$s2][$s3] = "";
    } else {
        $mstable[$s1][$s2][$s3] = $el[0];
    }
}

/* Emit one stage 3 table of 64 rows for every pair of stage 1 and stage 2
 * indexes that has at least one entry in $mstable. */
for ($i = 0; $i < 0x1E; $i++) {
    for ($k = 0; $k < 64; $k++) {
        $any3 = false;
        $col3 = array();
        for ($l = 0; $l < 64; $l++) {
            if (isset($mstable[$i][$k][$l])) {
                $any3 = true;
                $col3[$l] = $mstable[$i][$k][$l];
            } else {
                $col3[$l] = null;
            }
        }
        if ($any3) {
            echo "static const entity_stage3_row stage3_table_{$ident}_",
                sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
            foreach ($col3 as $y => $z) {
                /* four rows per output line */
                if ($y == 0) echo "\t";
                elseif ($y % 4 == 0) echo "\n\t";
                else echo " ";

                if ($z === NULL)
                    echo "{0, { {NULL, 0} } },";
                elseif ($z === "QUOT") /* hack to translate " into &quot;, not &QUOT; */
                    echo "{0, { {\"quot\", 4} } },";
                elseif ($z !== "")
                    echo "{0, { {\"$z\", ", strlen($z), "} } },";
                else
                    echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
                        ($i << 12) | ($k << 6) | $y ), ", 0} } },";
            }
            echo "\n};\n\n";
        }
    }
}

if ($pass2 < 2)
    echo "/* end of stage 3 Tables for $name }}} */", "\n\n";

/* passes 2 and 3 skip the code between here and the hashtables label */
if ($pass2 > 1)
    goto hashtables;

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

/*
 * NOTE(review): text is missing from this copy of the script between "<<" and
 * "entity }}}" below (presumably the generation of the stage 2 and stage 1
 * tables and the start of the closing echo) - restore it from the complete
 * file. The remainder is reproduced unchanged.
 */
$t = << entity }}} */\n\n";

/* commented-out; this enabled binary search, which turned out to be
 * significantly slower than the hash tables for html 5 entities.
 * NOTE(review): part of the commented-out code is missing in this copy as
 * well (after "<<" three lines below). */
//echo
//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
//$t = << $entlen*1.2) {
// die("violated assumption for traverse_for_entities");
// }
// }
//
// $k++;
//}
//echo "\n};\n\n";
//
//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
//
//echo
//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";

hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */", "\n\n";

/*
 * NOTE(review): text is missing from this copy of the script between "<<" and
 * "0; $nKeyLength--)" below. What remains is the tail of a loop and of a
 * function body that returns $hash; the code further down calls hashfun(), so
 * this is presumably the end of that function - restore it from the complete
 * file. The remainder is reproduced unchanged.
 */
$t = << 0; $nKeyLength--) {
        $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos++])) & 0xFFFFFFFF;
    }
    return $hash;
}

/* number of buckets: the smallest power of two that is not below 1.5 times
 * the number of entities, and at least 16 */
$numelems = max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;
$hashes = array();

/* Number of bytes that the code point given in hex occupies in UTF-8.
 * Computed directly so that neither the mbstring extension nor its
 * deprecated HTML-ENTITIES pseudo encoding is needed. */
$utf8_length = function ($hex) {
    $cp = hexdec($hex);
    if ($cp < 0x80)
        return 1;
    if ($cp < 0x800)
        return 2;
    if ($cp < 0x10000)
        return 3;
    return 4;
};

foreach ($origdp as $e) {
    $hashes[hashfun($e[0]) & $mask][] = $e;
    if (isset($e[2])) {
        /* entity length including the two characters around the name */
        $entlen = strlen($e[0]) + 2;
        $utf8len = $utf8_length($e[1]) + $utf8_length($e[2]);
        if ($utf8len > $entlen*1.2) {
            die("violated assumption for traverse_for_entities");
        }
    }
}

/* one array per non-empty bucket, terminated by a row with a NULL name */
for ($i = 0; $i < $numelems; $i++) {
    if (empty($hashes[$i]))
        continue;
    echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
    foreach ($hashes[$i] as $h) {
        if (isset($h[2])) {
            echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
                $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
        } else {
            echo sprintf(' {"%s", %d, 0x%05X, 0},',
                $h[0], strlen($h[0]), hexdec($h[1]));
        }
    }
    echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

/* the bucket index; empty slots point at ht_bucket_empty */
echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";
for ($i = 0; $i < $numelems; $i++) {
    /* four entries per output line */
    if ($i == 0) echo "\t";
    elseif ($i % 4 == 0) echo "\n\t";
    else echo " ";
    if (empty($hashes[$i]))
        echo "ht_bucket_empty,";
    else
        echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo "static const entity_ht ent_ht_{$ident} = { ",
    sprintf("0x%X", $numelems), ", ht_buckets_{$ident} };\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";

/* load the next entity list and run the generation again */
if (!$pass2) {
    $data = file_get_contents("ents_html401.txt");
    $pass2 = 1;
    $name = "HTML 4.01";
    $ident = "html4";
    goto again;
} elseif ($pass2 == 1) {
    $data = file_get_contents("ents_basic.txt");
    $pass2 = 2;
    $name = "Basic entities (no apos)";
    $ident = "be_noapos";
    goto again;
} elseif ($pass2 == 2) {
    $data = file_get_contents("ents_basic_apos.txt");
    $pass2 = 3;
    $name = "Basic entities (with apos)";
    $ident = "be_apos";
    goto again;
}

echo "#endif /* HTML_TABLES_H */\n";