<?php
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-2018 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Gustavo Lopes <cataphract@php.net>                          |
   +----------------------------------------------------------------------+
*/

declare(strict_types=1);

/*
 * This script prints to stdout the contents of ext/standard/html_tables.h.
 *
 * Inputs (paths are relative to the current working directory):
 *   - mappings/*.TXT          single-byte charset <-> Unicode mapping files
 *   - ents_html5.txt, ents_html401.txt, ents_basic.txt, ents_basic_apos.txt
 *
 * All inputs are checked before anything is printed; on any failure a message
 * is written to stderr and the script exits with status 1.
 */

/* Line of a mapping file: "0xBYTE<TAB>0xUNICODE". */
const MAPPING_LINE = "/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i";

/* Same, but also requires and captures the trailing "# comment". */
const MAPPING_LINE_WITH_COMMENT = "/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i";

/* C declarations emitted once, before the "from Unicode" maps. */
const FROM_UNICODE_TYPEDEFS = <<<'CODE'

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

/* C declarations emitted once, before the first (HTML5) entity tables. */
const ENTITY_TYPEDEFS = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

/* C declarations emitted once, before the first entity -> codepoint hash table. */
const HASH_TABLE_TYPEDEFS = <<<'CODE'
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

/**
 * Abort generation: report on stderr (never into the generated header on
 * stdout) and exit with a non-zero status so callers can detect the failure.
 */
function fail(string $message)
{
    file_put_contents('php://stderr', "html_table_gen: {$message}\n");
    exit(1);
}

/**
 * Read a whole input file, aborting the run if it cannot be read.
 */
function readInput(string $path): string
{
    $contents = is_readable($path) ? file_get_contents($path) : false;
    if ($contents === false) {
        fail("cannot read input file '{$path}'");
    }

    return $contents;
}

/**
 * Collect the regex matches of every line of a charset mapping file.
 *
 * @param string $pattern MAPPING_LINE or MAPPING_LINE_WITH_COMMENT
 * @return list<array<int, string>> one preg_match() result per matching line
 */
function parseCharsetMapping(string $path, string $pattern): array
{
    $rows = [];
    foreach (explode("\n", readInput($path)) as $line) {
        if (preg_match($pattern, $line, $m)) {
            $rows[] = $m;
        }
    }

    return $rows;
}

/**
 * Lay out table cells as tab-indented rows of $perRow cells separated by a
 * single space. No trailing newline is added.
 *
 * @param list<string> $cells each cell already carries its trailing comma
 */
function formatCells(array $cells, int $perRow): string
{
    $rows = array_map(
        function (array $row): string {
            return implode(' ', $row);
        },
        array_chunk($cells, $perRow)
    );

    return "\t" . implode("\n\t", $rows);
}

/**
 * Number of bytes needed to encode a Unicode scalar value in UTF-8.
 */
function utf8Length(int $codePoint): int
{
    if ($codePoint < 0x80) {
        return 1;
    }
    if ($codePoint < 0x800) {
        return 2;
    }
    if ($codePoint < 0x10000) {
        return 3;
    }

    return 4;
}

/**
 * "Times 33" string hash (seed 5381), reduced to 32 bits after every step.
 * Its value selects the bucket an entity name is emitted into, so it has to
 * agree with the hash the C lookup code applies to entity names.
 */
function hashfun(string $str): int
{
    $hash = 5381;
    $length = strlen($str);

    for ($pos = 0; $pos < $length; $pos++) {
        $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos]))
            & 0xFFFFFFFF;
    }

    return $hash;
}

/**
 * Parse an entity list. Each useful line is "name HEX" or "name HEX HEX"
 * (the latter for entities that expand to two code points).
 *
 * @return list<array{0: string, 1: string, 2?: string}> in file order
 */
function parseEntities(string $data): array
{
    $entities = [];
    foreach (explode("\n", $data) as $line) {
        if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $line, $m)) {
            $entities[] = [$m[1], $m[2], $m[3]];
        } elseif (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $line, $m)) {
            $entities[] = [$m[1], $m[2]];
        }
    }

    return $entities;
}

/**
 * Group the entities whose first code point also starts a two-code-point
 * entity. Result is keyed by the first code point (hex string as found in the
 * input); each row maps second code point => entity name, plus an optional
 * "default" key holding the single-code-point entity name.
 *
 * @param list<array> $sorted entities sorted by first code point
 * @return array<string|int, array<string|int, string>>
 */
function collectMultiCodePointRows(array $sorted): array
{
    $rows = [];
    foreach ($sorted as $entity) {
        if (count($entity) === 3) {
            $rows[$entity[1]] = [];
        }
    }

    foreach ($sorted as $entity) {
        if (!array_key_exists($entity[1], $rows)) {
            continue;
        }
        if (count($entity) === 3) {
            $rows[$entity[1]][$entity[2]] = $entity[0];
        } else {
            $rows[$entity[1]]['default'] = $entity[0];
        }
    }

    return $rows;
}

/**
 * Print one entity_multicodepoint_row table per ambiguous first code point.
 */
function emitMultiCodePointTables(array $rows, string $name, string $ident)
{
    ksort($rows);

    echo "/* {{{ Start of double code point tables for {$name} */", "\n\n";

    foreach ($rows as $firstCodePoint => $row) {
        ksort($row);

        echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
            sprintf('%05s', $firstCodePoint), "[] = {", "\n";

        /* leading entry: default entity (if any) and number of rows that follow */
        if (array_key_exists('default', $row)) {
            /* hack to make > translate to &gt; not &GT; */
            $default = $row['default'] === 'GT' ? 'gt' : $row['default'];
            unset($row['default']);

            echo "\t{ {", sprintf('"%-21s', $default . '",'),
                "\t", sprintf('%02d', count($row)), ",\t\t",
                sprintf('% 2d', strlen($default)), '} },', "\n";
        } else {
            echo "\t{ {", sprintf('%-22s', 'NULL,'),
                "\t", sprintf('%02d', count($row)), ",\t\t0} },\n";
        }

        foreach ($row as $secondCodePoint => $entity) {
            echo "\t{ {", sprintf('"%-21s', $entity . '",'), "\t",
                sprintf('0x%05s', $secondCodePoint), ",\t",
                sprintf('% 2d', strlen($entity)), '} },', "\n";
        }
        echo "};\n";
    }

    echo "\n/* End of double code point tables }}} */", "\n\n";
}

/**
 * Split every code point into its three stage indexes (bits 12+, 6-11, 0-5).
 * The stored value is the entity name, or "" when the code point is ambiguous
 * and must be resolved through its multi code point table. When several
 * entities share a code point, the last one in $sorted wins.
 *
 * @param list<array> $sorted entities sorted by first code point
 * @return array<int, array<int, array<int, string>>>
 */
function buildEntityStages(array $sorted, array $multiCodePointRows): array
{
    $stages = [];
    foreach ($sorted as $entity) {
        $codePoint = (int) hexdec($entity[1]);
        $stage1 = ($codePoint & 0xFFF000) >> 12;
        $stage2 = ($codePoint & 0xFC0) >> 6;
        $stage3 = $codePoint & 0x3F;

        $stages[$stage1][$stage2][$stage3] =
            array_key_exists($entity[1], $multiCodePointRows) ? '' : $entity[0];
    }

    return $stages;
}

/**
 * Print one 64-element stage 3 table for every populated (stage1, stage2) pair.
 * Only stage 1 indexes below 0x1E are covered.
 */
function emitStage3Tables(array $stages, string $ident)
{
    for ($stage1 = 0; $stage1 < 0x1E; $stage1++) {
        for ($stage2 = 0; $stage2 < 64; $stage2++) {
            if (!isset($stages[$stage1][$stage2])) {
                continue;
            }

            $cells = [];
            for ($stage3 = 0; $stage3 < 64; $stage3++) {
                $entity = $stages[$stage1][$stage2][$stage3] ?? null;

                if ($entity === null) {
                    $cells[] = '{0, { {NULL, 0} } },';
                } elseif ($entity === 'QUOT') {
                    /* hack to translate " into &quot;, not &QUOT; */
                    $cells[] = '{0, { {"quot", 4} } },';
                } elseif ($entity !== '') {
                    $cells[] = '{0, { {"' . $entity . '", ' . strlen($entity) . '} } },';
                } else {
                    $codePoint = ($stage1 << 12) | ($stage2 << 6) | $stage3;
                    $cells[] = '{1, { {(void *)multi_cp_' . $ident . '_'
                        . sprintf('%05X', $codePoint) . ', 0} } },';
                }
            }

            echo "static const entity_stage3_row stage3_table_{$ident}_",
                sprintf('%02X%03X', $stage1, $stage2 << 6), "[] = {\n",
                formatCells($cells, 4),
                "\n};\n\n";
        }
    }
}

/**
 * Print the stage 2 tables and the stage 1 table that indexes them.
 *
 * @param bool $firstPass when true, the shared empty_stage2_table is emitted too
 */
function emitStage2AndStage1Tables(array $stages, string $name, string $ident, bool $firstPass)
{
    echo "/* {{{ Stage 2 Tables for {$name} */", "\n\n";

    if ($firstPass) {
        echo "static const entity_stage2_row empty_stage2_table[] = {\n",
            formatCells(array_fill(0, 64, 'empty_stage3_table,'), 4),
            "\n};\n";
    }

    for ($stage1 = 0; $stage1 < 0x1E; $stage1++) {
        if (!isset($stages[$stage1])) {
            continue;
        }

        $cells = [];
        for ($stage2 = 0; $stage2 < 64; $stage2++) {
            $cells[] = isset($stages[$stage1][$stage2])
                ? 'stage3_table_' . $ident . '_'
                    . sprintf('%05X', ($stage1 << 12) | ($stage2 << 6)) . ','
                : 'empty_stage3_table,';
        }

        echo "static const entity_stage2_row stage2_table_{$ident}_",
            sprintf('%02X000', $stage1), "[] = {\n",
            formatCells($cells, 4),
            "\n};\n\n";
    }

    echo "/* end of stage 2 tables for {$name} }}} */", "\n\n";

    echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
    for ($stage1 = 0; $stage1 < 0x1E; $stage1++) {
        echo isset($stages[$stage1])
            ? "\tstage2_table_{$ident}_" . sprintf('%02X000', $stage1) . ",\n"
            : "\tempty_stage2_table,\n";
    }
    echo "};\n\n";

    echo "/* end of {$name} multi-stage table for codepoint -> entity }}} */\n\n";
}

/**
 * Print the entity name -> code point(s) hash table.
 *
 * The bucket count is the smallest power of two >= 1.5 * number of entities,
 * with a minimum of 16. An earlier binary-search table was dropped upstream
 * because it was significantly slower than this hash table for HTML 5.
 *
 * @param list<array> $entities entities in input-file order
 * @param bool $firstPass when true, the shared typedefs are emitted too
 */
function emitHashTable(array $entities, string $name, string $ident, bool $firstPass)
{
    echo "/* {{{ {$name} hash table for entity -> codepoint */", "\n\n";

    if ($firstPass) {
        echo HASH_TABLE_TYPEDEFS;
    }

    $bucketCount = (int) max(pow(2, ceil(log(1.5 * count($entities)) / log(2))), 16);
    $mask = $bucketCount - 1;

    $buckets = [];
    foreach ($entities as $entity) {
        $buckets[hashfun($entity[0]) & $mask][] = $entity;

        if (isset($entity[2])) {
            /* "&name;" must not decode to much more UTF-8 than it occupies */
            $entityLength = strlen($entity[0]) + 2;
            $utf8Length = utf8Length((int) hexdec($entity[1]))
                + utf8Length((int) hexdec($entity[2]));
            if ($utf8Length > $entityLength * 1.2) {
                fail('violated assumption for traverse_for_entities');
            }
        }
    }

    for ($i = 0; $i < $bucketCount; $i++) {
        if (empty($buckets[$i])) {
            continue;
        }

        echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf('%03X', $i), "[] = {";
        foreach ($buckets[$i] as $entity) {
            if (isset($entity[2])) {
                printf(
                    ' {"%s", %d, 0x%05X, 0x%05X},',
                    $entity[0],
                    strlen($entity[0]),
                    hexdec($entity[1]),
                    hexdec($entity[2])
                );
            } else {
                printf(
                    ' {"%s", %d, 0x%05X, 0},',
                    $entity[0],
                    strlen($entity[0]),
                    hexdec($entity[1])
                );
            }
        }
        echo " {NULL, 0, 0, 0} };\n";
    }
    echo "\n";

    $cells = [];
    for ($i = 0; $i < $bucketCount; $i++) {
        $cells[] = empty($buckets[$i])
            ? 'ht_bucket_empty,'
            : "ht_bucket_{$ident}_" . sprintf('%03X', $i) . ',';
    }

    echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n",
        formatCells($cells, 4),
        "\n};\n\n";

    echo "static const entity_ht ent_ht_{$ident} = {\n\t",
        sprintf('0x%X', $bucketCount), ",\n\tht_buckets_{$ident}\n};\n\n";

    echo "/* end of {$name} hash table for entity -> codepoint }}} */\n\n";
}

/**
 * Print every table derived from one entity list.
 *
 * @param int $pass 0-based pass number: pass 0 also emits the shared C
 *                  declarations; passes 0 and 1 get the full multi-stage
 *                  tables, later passes only stage 3 tables and the hash table.
 */
function emitEntityTables(string $data, string $name, string $ident, int $pass)
{
    $firstPass = $pass === 0;
    $multiStage = $pass < 2;

    if ($firstPass) {
        echo ENTITY_TYPEDEFS;
    }

    $entities = parseEntities($data);

    $sorted = $entities;
    usort($sorted, function (array $a, array $b): int {
        return hexdec($a[1]) <=> hexdec($b[1]);
    });

    $multiCodePointRows = collectMultiCodePointRows($sorted);

    if ($multiStage) {
        echo "/* {{{ Start of {$name} multi-stage table for codepoint -> entity */", "\n\n";
    } else {
        echo "/* {{{ Start of {$name} table for codepoint -> entity */", "\n\n";
    }

    if ($multiCodePointRows !== []) {
        emitMultiCodePointTables($multiCodePointRows, $name, $ident);
    }

    if ($multiStage) {
        echo "/* {{{ Stage 3 Tables for {$name} */", "\n\n";
    }

    if ($firstPass) {
        echo "static const entity_stage3_row empty_stage3_table[] = {\n",
            "\t/* 64 elements */\n",
            formatCells(array_fill(0, 64, '{0, { {NULL, 0} } },'), 4),
            "\n};\n";
    }

    $stages = buildEntityStages($sorted, $multiCodePointRows);
    emitStage3Tables($stages, $ident);

    if ($multiStage) {
        echo "/* end of stage 3 Tables for {$name} }}} */", "\n\n";
        emitStage2AndStage1Tables($stages, $name, $ident, $firstPass);
    }

    emitHashTable($entities, $name, $ident, $firstPass);
}

/* ------------------------------------------------------------------------ */
/* Input descriptions                                                        */
/* ------------------------------------------------------------------------ */

/* "enumid" is the position of the charset in enum entity_charset below. */
$toUnicodeEncodings = [
    [
        'ident' => 'iso88591',
        'enumid' => 1,
        'name' => 'ISO-8859-1',
        'file' => 'mappings/8859-1.TXT',
    ],
    [
        'ident' => 'iso88595',
        'enumid' => 5,
        'name' => 'ISO-8859-5',
        'file' => 'mappings/8859-5.TXT',
    ],
    [
        'ident' => 'iso885915',
        'enumid' => 3,
        'name' => 'ISO-8859-15',
        'file' => 'mappings/8859-15.TXT',
    ],
    [
        'ident' => 'win1252',
        'enumid' => 2,
        'enumident' => 'cp1252',
        'name' => 'Windows-1252',
        'file' => 'mappings/CP1252.TXT',
    ],
    [
        'ident' => 'win1251',
        'enumid' => 4,
        'enumident' => 'cp1251',
        'name' => 'Windows-1251',
        'file' => 'mappings/CP1251.TXT',
    ],
    [
        'ident' => 'koi8r',
        'enumid' => 8,
        'name' => 'KOI8-R',
        'file' => 'mappings/KOI8-R.TXT',
    ],
    [
        'ident' => 'cp866',
        'enumid' => 6,
        'name' => 'CP-866',
        'file' => 'mappings/CP866.TXT',
    ],
    [
        'ident' => 'macroman',
        'enumid' => 7,
        'name' => 'MacRoman',
        'file' => 'mappings/ROMAN.TXT',
    ],
];

/* "range" is the inclusive byte range for which a reverse map is emitted. */
$fromUnicodeEncodings = [
    [
        'ident' => 'iso885915',
        'name' => 'ISO-8859-15',
        'file' => 'mappings/8859-15.TXT',
        'range' => [0xA4, 0xBE],
    ],
    [
        'ident' => 'win1252',
        'name' => 'Windows-1252',
        'file' => 'mappings/CP1252.TXT',
        'range' => [0x80, 0x9F],
    ],
    [
        'ident' => 'win1251',
        'name' => 'Windows-1251',
        'file' => 'mappings/CP1251.TXT',
        'range' => [0x80, 0xFF],
    ],
    [
        'ident' => 'koi8r',
        'name' => 'KOI8-R',
        'file' => 'mappings/KOI8-R.TXT',
        'range' => [0x80, 0xFF],
    ],
    [
        'ident' => 'cp866',
        'name' => 'CP-866',
        'file' => 'mappings/CP866.TXT',
        'range' => [0x80, 0xFF],
    ],
    [
        'ident' => 'macroman',
        'name' => 'MacRoman',
        'file' => 'mappings/ROMAN.TXT',
        'range' => [0x80, 0xFF],
    ],
];

/* Processed in this order; the list index is the pass number. */
$entityPasses = [
    ['file' => 'ents_html5.txt', 'name' => 'HTML5', 'ident' => 'html5'],
    ['file' => 'ents_html401.txt', 'name' => 'HTML 4.01', 'ident' => 'html4'],
    ['file' => 'ents_basic.txt', 'name' => 'Basic entities (no apos)', 'ident' => 'be_noapos'],
    ['file' => 'ents_basic_apos.txt', 'name' => 'Basic entities (with apos)', 'ident' => 'be_apos'],
];

/* Refuse to print a partial header: check every input before any output. */
$inputFiles = array_unique(array_merge(
    array_column($toUnicodeEncodings, 'file'),
    array_column($fromUnicodeEncodings, 'file'),
    array_column($entityPasses, 'file')
));
foreach ($inputFiles as $inputFile) {
    if (!is_readable($inputFile)) {
        fail("cannot read input file '{$inputFile}'");
    }
}

/* ------------------------------------------------------------------------ */
/* Header preamble (put together with glue; have patience)                   */
/* ------------------------------------------------------------------------ */

/* %s is replaced by the current year. */
$headerTemplate = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

/* \$Id$ */

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
	cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
	cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
	cs_numelems /* used to count the number of charsets */
	};
#define CHARSET_UNICODE_COMPAT(cs) ((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs) ((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1", sizeof("ISO-8859-1")-1, cs_8859_1 },
	{ "ISO8859-1", sizeof("ISO8859-1")-1, cs_8859_1 },
	{ "ISO-8859-15", sizeof("ISO-8859-15")-1, cs_8859_15 },
	{ "ISO8859-15", sizeof("ISO8859-15")-1, cs_8859_15 },
	{ "utf-8", sizeof("utf-8")-1, cs_utf_8 },
	{ "cp1252", sizeof("cp1252")-1, cs_cp1252 },
	{ "Windows-1252", sizeof("Windows-1252")-1, cs_cp1252 },
	{ "1252", sizeof("1252")-1, cs_cp1252 },
	{ "BIG5", sizeof("BIG5")-1, cs_big5 },
	{ "950", sizeof("950")-1, cs_big5 },
	{ "GB2312", sizeof("GB2312")-1, cs_gb2312 },
	{ "936", sizeof("936")-1, cs_gb2312 },
	{ "BIG5-HKSCS", sizeof("BIG5-HKSCS")-1, cs_big5hkscs },
	{ "Shift_JIS", sizeof("Shift_JIS")-1, cs_sjis },
	{ "SJIS", sizeof("SJIS")-1, cs_sjis },
	{ "932", sizeof("932")-1, cs_sjis },
	{ "SJIS-win", sizeof("SJIS-win")-1, cs_sjis },
	{ "CP932", sizeof("CP932")-1, cs_sjis },
	{ "EUCJP", sizeof("EUCJP")-1, cs_eucjp },
	{ "EUC-JP", sizeof("EUC-JP")-1, cs_eucjp },
	{ "eucJP-win", sizeof("eucJP-win")-1, cs_eucjp },
	{ "KOI8-R", sizeof("KOI8-R")-1, cs_koi8r },
	{ "koi8-ru", sizeof("koi8-ru")-1, cs_koi8r },
	{ "koi8r", sizeof("koi8r")-1, cs_koi8r },
	{ "cp1251", sizeof("cp1251")-1, cs_cp1251 },
	{ "Windows-1251", sizeof("Windows-1251")-1, cs_cp1251 },
	{ "win-1251", sizeof("win-1251")-1, cs_cp1251 },
	{ "iso8859-5", sizeof("iso8859-5")-1, cs_8859_5 },
	{ "iso-8859-5", sizeof("iso-8859-5")-1, cs_8859_5 },
	{ "cp866", sizeof("cp866")-1, cs_cp866 },
	{ "866", sizeof("866")-1, cs_cp866 },
	{ "ibm866", sizeof("ibm866")-1, cs_cp866 },
	{ "MacRoman", sizeof("MacRoman")-1, cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
	unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
	const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

printf($headerTemplate, date('Y'));

/* ------------------------------------------------------------------------ */
/* Mappings *to* Unicode: two-stage tables, 4 blocks of 64 bytes per charset */
/* ------------------------------------------------------------------------ */

/*
 * Identical 64-entry blocks are emitted once and shared between charsets.
 * $emittedStage2[$n] is a block already written and $emittedStage2Owner[$n]
 * is the ident of the charset under whose name it was written.
 */
$emittedStage2 = [];
$emittedStage2Owner = [];

foreach ($toUnicodeEncodings as $encoding) {
    echo "/* {{{ Mappings *to* Unicode for {$encoding['name']} */\n\n";

    /* byte value => Unicode code point; later lines override earlier ones */
    $byteToUnicode = [];
    foreach (parseCharsetMapping($encoding['file'], MAPPING_LINE) as $m) {
        $byteToUnicode[hexdec($m[1])] = hexdec($m[2]);
    }

    echo "/* {{{ Stage 2 tables for {$encoding['name']} */\n\n";

    /* ident whose stage 2 table serves each of the 4 blocks of this charset */
    $stage2Owner = [];
    for ($block = 0; $block < 4; $block++) {
        $table = [];
        for ($offset = 0; $offset < 64; $offset++) {
            $table[$offset] = $byteToUnicode[($block << 6) | $offset] ?? null;
        }

        $known = array_search($table, $emittedStage2, true);
        if ($known !== false) {
            $stage2Owner[$block] = $emittedStage2Owner[$known];
            continue;
        }
        $stage2Owner[$block] = $encoding['ident'];

        $cells = array_map(
            function ($codePoint): string {
                /* 0xFFFF is a special value; it indicates no mapping */
                return $codePoint === null ? '0xFFFF,' : sprintf('0x%04X,', $codePoint);
            },
            $table
        );

        echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$encoding['ident']}_",
            sprintf('%02X', $block << 6), " = { {\n",
            formatCells($cells, 6),
            "\n} };\n\n";

        $emittedStage2[] = $table;
        $emittedStage2Owner[] = $encoding['ident'];
    }

    echo "/* end of stage 2 tables for {$encoding['name']} }}} */\n\n";

    echo "/* {{{ Stage 1 table for {$encoding['name']} */\n";

    echo "static const enc_to_uni enc_to_uni_{$encoding['ident']} = { {\n",
        "\t&enc_to_uni_s2_{$stage2Owner[0]}_00,\n",
        "\t&enc_to_uni_s2_{$stage2Owner[1]}_40,\n",
        "\t&enc_to_uni_s2_{$stage2Owner[2]}_80,\n",
        "\t&enc_to_uni_s2_{$stage2Owner[3]}_C0 }\n",
        "};\n";

    echo "/* end of stage 1 table for {$encoding['name']} }}} */\n\n";
}

/* One slot per enum value up to the highest "enumid"; NULL where no table exists. */
$identByEnumId = array_column($toUnicodeEncodings, 'ident', 'enumid');
$highestEnumId = max(array_keys($identByEnumId));

echo "/* {{{ Index of tables for encoding conversion */\n",
    "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

for ($enumId = 0; $enumId <= $highestEnumId; $enumId++) {
    echo isset($identByEnumId[$enumId])
        ? "\t&enc_to_uni_{$identByEnumId[$enumId]},\n"
        : "\tNULL,\n";
}

echo "};\n/* }}} */\n";

/* ------------------------------------------------------------------------ */
/* Mappings *from* Unicode: sorted (code point, byte) lists                  */
/* ------------------------------------------------------------------------ */

echo FROM_UNICODE_TYPEDEFS;

foreach ($fromUnicodeEncodings as $encoding) {
    echo "/* {{{ Mappings *from* Unicode for {$encoding['name']} */\n";

    list($firstByte, $lastByte) = $encoding['range'];

    /* code point => [byte, lower-cased description from the mapping file] */
    $unicodeToByte = [];
    foreach (parseCharsetMapping($encoding['file'], MAPPING_LINE_WITH_COMMENT) as $m) {
        $byte = hexdec($m[1]);
        if ($byte >= $firstByte && $byte <= $lastByte) {
            $unicodeToByte[hexdec($m[2])] = [$byte, strtolower(rtrim($m[3]))];
        }
    }
    ksort($unicodeToByte);

    echo "static const uni_to_enc unimap_{$encoding['ident']}[] = {\n";

    foreach ($unicodeToByte as $codePoint => list($byte, $description)) {
        printf("\t{ 0x%04X, 0x%02X },\t/* %s */\n", $codePoint, $byte, $description);
    }
    echo "};\n";

    /* NOTE: the unbalanced "{{{" below is kept so the generated header does not change */
    echo "/* {{{ end of mappings *from* Unicode for {$encoding['name']} */\n\n";
}

/* ------------------------------------------------------------------------ */
/* Entity tables: HTML5, HTML 4.01, basic entities without and with &apos;   */
/* ------------------------------------------------------------------------ */

foreach ($entityPasses as $pass => $entitySet) {
    emitEntityTables(
        readInput($entitySet['file']),
        $entitySet['name'],
        $entitySet['ident'],
        $pass
    );
}

echo "#endif /* HTML_TABLES_H */\n";