#!/usr/bin/env php
<?php
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Gustavo Lopes <cataphract@php.net>                          |
   +----------------------------------------------------------------------+
*/

/* This file prints to stdout the contents of ext/standard/html_tables.h */
/* put together with glue; have patience */

/*
 * Overview of what is written to stdout, in order:
 *   1. a fixed C prologue (charset enum, charset_map, enc_to_uni typedefs);
 *   2. two-stage "to Unicode" tables for every single-byte encoding, built
 *      from the mappings/*.TXT files; identical 64-entry blocks are shared;
 *   3. sorted "from Unicode" tables for the same encodings;
 *   4. for each entity list (ents_html5.txt, ents_html401.txt,
 *      ents_basic.txt, ents_basic_apos.txt): codepoint -> entity tables and
 *      an entity -> codepoint hash table.
 *
 * All input paths are relative to the current working directory.
 */

/**
 * Reads a whole input file, aborting the generator if it cannot be read.
 *
 * Without this check a missing file would only raise a warning and the
 * script would go on to emit empty (all 0xFFFF) tables.
 *
 * @param string $path path of the file, relative to the working directory
 * @return string the file contents
 */
function read_input_file($path)
{
    $contents = is_file($path) ? file_get_contents($path) : false;
    if ($contents === false) {
        fwrite(STDERR, "html_table_gen: cannot read input file '$path'\n");
        exit(1);
    }
    return $contents;
}

/**
 * Returns the number of bytes the UTF-8 encoding of a code point occupies.
 *
 * @param int $cp Unicode code point (expected to be a valid scalar value)
 * @return int 1 to 4
 */
function utf8_encoded_length($cp)
{
    if ($cp < 0x80) {
        return 1;
    }
    if ($cp < 0x800) {
        return 2;
    }
    if ($cp < 0x10000) {
        return 3;
    }
    return 4;
}

$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs) ((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs) ((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1", sizeof("ISO-8859-1")-1, cs_8859_1 },
	{ "ISO8859-1", sizeof("ISO8859-1")-1, cs_8859_1 },
	{ "ISO-8859-15", sizeof("ISO-8859-15")-1, cs_8859_15 },
	{ "ISO8859-15", sizeof("ISO8859-15")-1, cs_8859_15 },
	{ "utf-8", sizeof("utf-8")-1, cs_utf_8 },
	{ "cp1252", sizeof("cp1252")-1, cs_cp1252 },
	{ "Windows-1252", sizeof("Windows-1252")-1, cs_cp1252 },
	{ "1252", sizeof("1252")-1, cs_cp1252 },
	{ "BIG5", sizeof("BIG5")-1, cs_big5 },
	{ "950", sizeof("950")-1, cs_big5 },
	{ "GB2312", sizeof("GB2312")-1, cs_gb2312 },
	{ "936", sizeof("936")-1, cs_gb2312 },
	{ "BIG5-HKSCS", sizeof("BIG5-HKSCS")-1, cs_big5hkscs },
	{ "Shift_JIS", sizeof("Shift_JIS")-1, cs_sjis },
	{ "SJIS", sizeof("SJIS")-1, cs_sjis },
	{ "932", sizeof("932")-1, cs_sjis },
	{ "SJIS-win", sizeof("SJIS-win")-1, cs_sjis },
	{ "CP932", sizeof("CP932")-1, cs_sjis },
	{ "EUCJP", sizeof("EUCJP")-1, cs_eucjp },
	{ "EUC-JP", sizeof("EUC-JP")-1, cs_eucjp },
	{ "eucJP-win", sizeof("eucJP-win")-1, cs_eucjp },
	{ "KOI8-R", sizeof("KOI8-R")-1, cs_koi8r },
	{ "koi8-ru", sizeof("koi8-ru")-1, cs_koi8r },
	{ "koi8r", sizeof("koi8r")-1, cs_koi8r },
	{ "cp1251", sizeof("cp1251")-1, cs_cp1251 },
	{ "Windows-1251", sizeof("Windows-1251")-1, cs_cp1251 },
	{ "win-1251", sizeof("win-1251")-1, cs_cp1251 },
	{ "iso8859-5", sizeof("iso8859-5")-1, cs_8859_5 },
	{ "iso-8859-5", sizeof("iso-8859-5")-1, cs_8859_5 },
	{ "cp866", sizeof("cp866")-1, cs_cp866 },
	{ "866", sizeof("866")-1, cs_cp866 },
	{ "ibm866", sizeof("ibm866")-1, cs_cp866 },
	{ "MacRoman", sizeof("MacRoman")-1, cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
	unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
	const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

echo $t;

/* Single-byte encodings for the *to* Unicode tables. "enumid" is the slot
 * the encoding takes in enc_to_uni_index (see enum entity_charset above). */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        "enumident" => "cp1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);

/* Stage 2 tables emitted so far and, at the same index, the C symbol each
 * one was emitted under. A later encoding with an identical 64-entry block
 * reuses that symbol instead of emitting a duplicate table. */
$prevStage2 = array();
$prevStage2Names = array();

foreach ($encodings as $e) {
    echo "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file */
    $map = array();
    $lines = explode("\n", read_input_file($e['file']));
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $map[] = array($matches[1], $matches[2]);
    }

    /* byte value => Unicode code point */
    $mappy = array();
    foreach ($map as $v) {
        $mappy[hexdec($v[0])] = hexdec($v[1]);
    }

    $mstable = array("ident" => $e['ident']);
    /* calculate two-stage tables */
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo "/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    /* C symbol of the stage 2 table used for each of the four blocks */
    $s2tables_names = array();
    for ($i = 0; $i < 4; $i++) {
        /* strict comparison: NULL (no mapping) must not match 0 (U+0000) */
        $found = array_search($mstable[$i], $prevStage2, true);
        if ($found !== false) {
            $s2tables_names[$i] = $prevStage2Names[$found];
            continue;
        }

        $s2tables_names[$i] = "enc_to_uni_s2_{$e['ident']}_" .
            sprintf("%02X", $i << 6);

        echo "static const enc_to_uni_stage2 {$s2tables_names[$i]} = { {\n";
        for ($j = 0; $j < 64; $j++) {
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                echo sprintf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[] = $mstable[$i];
        $prevStage2Names[] = $s2tables_names[$i];
    }

    echo "/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo "/* {{{ Stage 1 table for {$e['name']} */\n";

    echo "static const enc_to_uni enc_to_uni_{$e['ident']} = { {\n",
        "\t&{$s2tables_names[0]},\n",
        "\t&{$s2tables_names[1]},\n",
        "\t&{$s2tables_names[2]},\n",
        "\t&{$s2tables_names[3]} }\n",
        "};\n";

    echo "/* end of stage 1 table for {$e['name']} }}} */\n\n";
}

/* Index of the stage 1 tables by enumid; slots without a table stay NULL */
$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
$a = array_fill(0, $maxencnum + 1, null);
foreach ($encodings as $e) {
    $a[$e['enumid']] = $e['ident'];
}

echo "/* {{{ Index of tables for encoding conversion */\n",
    "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($a as $v) {
    if ($v === null)
        echo "\tNULL,\n";
    else
        echo "\t&enc_to_uni_$v,\n";
}

echo "};\n/* }}} */\n";

$t = <<<CODE

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/* Encodings for the *from* Unicode tables; only bytes inside "range"
 * (inclusive) are put in the table. */
$encodings = array(
    array(
        "ident" => "iso885915",
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
        "range" => array(0xA4, 0xBE),
    ),
    array(
        "ident" => "win1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
        "range" => array(0x80, 0x9F),
    ),
    array(
        "ident" => "win1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "koi8r",
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "cp866",
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "macroman",
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
        "range" => array(0x80, 0xFF),
    ),
);

foreach ($encodings as $e) {
    echo "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file */
    $map = array();
    $lines = explode("\n", read_input_file($e['file']));
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
    }

    /* Unicode code point => array(byte value, lowercased description) */
    $mappy = array();
    foreach ($map as $v) {
        $byte = hexdec($v[0]);
        if ($byte >= $e['range'][0] && $byte <= $e['range'][1])
            $mappy[hexdec($v[1])] = array($byte, strtolower($v[2]));
    }
    ksort($mappy);

    echo "static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* NOTE: this marker opens a fold instead of closing one; it is kept
     * verbatim so that the generated header does not change. */
    echo "/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
}

/* The rest of the script runs once per entity list: $pass2 is 0 for HTML5,
 * 1 for HTML 4.01, 2 and 3 for the basic lists. The shared C typedefs and
 * empty tables are only emitted on pass 0. */
$data = read_input_file("ents_html5.txt");
$pass2 = 0;
$name = "HTML5";
$ident = "html5";
again:

$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
    echo $t;

/* Each element: array(entity name, first code point (hex)[, second code
 * point (hex)]) */
$dp = array();

foreach (explode("\n", $data) as $l) {
    if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
        //echo sprintf("\t{\"%-21s 1, 0x%05d},\n", $matches[1].",", $matches[2]);
        $dp[] = array($matches[1], $matches[2], $matches[3]);
    } else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
        $dp[] = array($matches[1], $matches[2]);
    }
}

$origdp = $dp;

usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });

/* First code points that start at least one two-code-point entity */
$multicp_rows = array();
foreach ($dp as $el) {
    if (count($el) == 3) {
        $multicp_rows[$el[1]] = array();
    }
}

/* For those, collect second code point => entity, plus the "default"
 * entity used when the first code point stands alone */
foreach ($dp as $el) {
    if (array_key_exists($el[1], $multicp_rows)) {
        if (count($el) == 3)
            $multicp_rows[$el[1]][$el[2]] = $el[0];
        else
            $multicp_rows[$el[1]]["default"] = $el[0];
    }
}

if ($pass2 < 2)
    echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
    echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

if (empty($multicp_rows))
    goto skip_multicp;

ksort($multicp_rows);
foreach ($multicp_rows as &$v) { ksort($v); }
unset($v);

echo "/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
    echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
        sprintf("%05s", $k), "[] = {", "\n";
    if (array_key_exists("default", $v)) {
        if ($v['default'] === 'GT') /* hack to make > translate to &gt; not &GT; */
            $v['default'] = "gt";
        echo "\t{ {", sprintf("\"%-21s", $v["default"].'",'),
            "\t", sprintf("%02d", (count($v) - 1)), ",\t\t",
            sprintf("% 2d", strlen($v["default"])), '} },', "\n";
    } else {
        echo "\t{ {", sprintf("%-22s", 'NULL,'),
            "\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
    }
    unset($v["default"]);
    foreach ($v as $l => $w) {
        echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05s", $l), ",\t",
            sprintf("% 2d", strlen($w)), '} },', "\n";
    }
    echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:

if ($pass2 < 2)
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage3_row empty_stage3_table[] = {
	/* 64 elements */
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
};

CODE;

if (!$pass2)
    echo $t;

/* [stage 1][stage 2][stage 3] => entity name, or "" when the code point
 * has a double code point table */
$mstable = array();
foreach ($dp as $el) {
    $s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
    $s2 = (hexdec($el[1]) & 0xFC0) >> 6;
    $s3 = hexdec($el[1]) & 0x3F;
    if (array_key_exists($el[1], $multicp_rows)) {
        $mstable[$s1][$s2][$s3] = "";
    } else {
        $mstable[$s1][$s2][$s3] = $el[0];
    }
}

for ($i = 0; $i < 0x1E; $i++) {
    for ($k = 0; $k < 64; $k++) {
        $any3 = false;
        $col3 = array();
        for ($l = 0; $l < 64; $l++) {
            if (isset($mstable[$i][$k][$l])) {
                $any3 = true;
                $col3[$l] = $mstable[$i][$k][$l];
            } else {
                $col3[$l] = null;
            }
        }
        if ($any3) {
            echo "static const entity_stage3_row stage3_table_{$ident}_",
                sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
            foreach ($col3 as $y => $z) {
                if ($y == 0) echo "\t";
                elseif ($y % 4 == 0) echo "\n\t";
                else echo " ";
                if ($z === NULL)
                    echo "{0, { {NULL, 0} } },";
                elseif ($z === "QUOT") /* hack to translate " into &quot;, not &QUOT; */
                    echo "{0, { {\"quot\", 4} } },";
                elseif ($z !== "")
                    echo "{0, { {\"$z\", ", strlen($z), "} } },";
                else
                    echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
                        ($i << 12) | ($k << 6) | $y ), ", 0} } },";

            }
            echo "\n};\n\n";
        }
    }
}

if ($pass2 < 2)
    echo "/* end of stage 3 Tables for $name }}} */", "\n\n";

/* the basic entity lists only get the stage 3 table and the hash table */
if ($pass2 > 1)
    goto hashtables;

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage2_row empty_stage2_table[] = {
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
};

CODE;

if (!$pass2)
    echo $t;

for ($i = 0; $i < 0x1E; $i++) {
    $any = false;
    for ($k = 0; $k < 64; $k++) {
        if (isset($mstable[$i][$k]))
            $any = true;
    }
    if ($any) {
        echo "static const entity_stage2_row stage2_table_{$ident}_",
            sprintf("%02X000", $i), "[] = {\n";
        for ($k = 0; $k < 64; $k++) {
            if ($k == 0) echo "\t";
            elseif ($k % 4 == 0) echo "\n\t";
            else echo " ";
            if (isset($mstable[$i][$k])) {
                echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
            } else {
                echo "empty_stage3_table", ",";
            }
        }
        echo "\n};\n\n";
    }
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
    if (isset($mstable[$i]))
        echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
    else
        echo "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";

/* commented-out; this enabled binary search, which turned out to be
 * significantly slower than the hash tables for html 5 entities */
//echo
//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";

//$t = <<<CODE
//typedef struct {
//	const char *entity;
//	unsigned short entity_len;
//	unsigned int codepoint1;
//	unsigned int codepoint2;
//} entity_cp_map;
//
//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
//
//static const entity_cp_map html5_ent_cp_map[] = {
//
//CODE;
//echo $t;
//
//$dp = $origdp;
//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
//	return $d==0?strcmp($a[0], $b[0]):$d; });
//
//$k = 0;
//foreach ($dp as $o) {
//	if ($k == 0) echo "\t";
//	elseif ($k % 3 == 0) echo "\n\t";
//	else echo " ";
//	if (isset($o[2]))
//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
//			hexdec($o[1]), hexdec($o[2]));
//	else
//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
//			hexdec($o[1]));
//
//	if (isset($o[2])) {
//		$entlen = strlen($o[0]) + 2;
//		$utf8len = strlen(
//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
//		if ($utf8len > $entlen*1.2) {
//			die("violated assumption for traverse_for_entities");
//		}
//	}
//
//	$k++;
//}
//echo "\n};\n\n";
//
//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
//
//echo
//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";

hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */", "\n\n";

$t = <<<CODE
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

if (!$pass2)
    echo $t;

/**
 * Hashes an entity name: starting from 5381, every byte updates the hash to
 * (hash * 33 + byte), truncated to 32 bits.
 *
 * @param string $str entity name
 * @return int hash value in the range 0..0xFFFFFFFF
 */
function hashfun($str)
{
    $hash = 5381;
    $nKeyLength = strlen($str);

    for ($pos = 0; $pos < $nKeyLength; $pos++) {
        $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos]))
            & 0xFFFFFFFF;
    }
    return $hash;
}

/* Smallest power of two >= 1.5 * number of entities, but at least 16 */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;
$hashes = array();
foreach ($origdp as $e) {
    $hashes[hashfun($e[0]) & $mask][] = $e;
    if (isset($e[2])) {
        /* sanity check: the UTF-8 form of a two-code-point entity must not
         * exceed 1.2 times the length of "&name;" */
        $entlen = strlen($e[0]) + 2;
        $utf8len = utf8_encoded_length(hexdec($e[1])) +
            utf8_encoded_length(hexdec($e[2]));
        if ($utf8len > $entlen*1.2) {
            fwrite(STDERR, "violated assumption for traverse_for_entities\n");
            exit(1);
        }
    }
}

for ($i = 0; $i < $numelems; $i++) {
    if (empty($hashes[$i]))
        continue;
    echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
    foreach ($hashes[$i] as $h) {
        if (isset($h[2])) {
            echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
                $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
        } else {
            echo sprintf(' {"%s", %d, 0x%05X, 0},',
                $h[0], strlen($h[0]), hexdec($h[1]));
        }
    }
    echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

for ($i = 0; $i < $numelems; $i++) {
    if ($i == 0) echo "\t";
    elseif ($i % 4 == 0) echo "\n\t";
    else echo " ";
    if (empty($hashes[$i]))
        echo "ht_bucket_empty,";
    else
        echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo "static const entity_ht ent_ht_{$ident} = {\n",
    "\t", sprintf("0x%X", $numelems), ",\n",
    "\tht_buckets_{$ident}\n",
    "};\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";

/* Move on to the next entity list, if any */
if (!$pass2) {
    $data = read_input_file("ents_html401.txt");
    $pass2 = 1;
    $name = "HTML 4.01";
    $ident = "html4";
    goto again;
} elseif ($pass2 == 1) {
    $data = read_input_file("ents_basic.txt");
    $pass2 = 2;
    $name = "Basic entities (no apos)";
    $ident = "be_noapos";
    goto again;
} elseif ($pass2 == 2) {
    $data = read_input_file("ents_basic_apos.txt");
    $pass2 = 3;
    $name = "Basic entities (with apos)";
    $ident = "be_apos";
    goto again;
}

echo "#endif /* HTML_TABLES_H */\n";