#!/usr/bin/env php
<?php
/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Gustavo Lopes <cataphract@php.net>                          |
   +----------------------------------------------------------------------+
*/

/* This file prints to stdout the contents of ext/standard/html_tables.h */
/* put together with glue; have patience */

/*
 * Overall flow of the script:
 *   1. print the fixed C preamble (charset enum, charset_map, typedefs);
 *   2. for each single-byte encoding, print two-stage tables mapping the
 *      encoding's bytes *to* Unicode, then an index of those tables;
 *   3. for a subset of encodings, print sorted tables mapping *from* Unicode;
 *   4. starting at the "again" label, run once per entity list (HTML5,
 *      HTML 4.01, basic entities without/with apos) and print the
 *      codepoint -> entity tables and the entity -> codepoint hash table.
 * All input files are opened with relative paths, so the script has to be
 * run from the directory that contains them.
 */

/*
 * Returns the contents of an input data file.
 * Aborts generation (message on stderr, exit status 1) when the file cannot
 * be read, instead of silently generating tables from empty input.
 */
function read_input_file($path)
{
    $contents = file_get_contents($path);
    if ($contents === false) {
        fwrite(STDERR, "html_table_gen.php: cannot read input file '$path'\n");
        exit(1);
    }
    return $contents;
}

$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		sizeof("ISO-8859-1")-1,		cs_8859_1 },
	{ "ISO8859-1",		sizeof("ISO8859-1")-1,		cs_8859_1 },
	{ "ISO-8859-15",	sizeof("ISO-8859-15")-1,	cs_8859_15 },
	{ "ISO8859-15",		sizeof("ISO8859-15")-1,		cs_8859_15 },
	{ "utf-8",			sizeof("utf-8")-1,			cs_utf_8 },
	{ "cp1252",			sizeof("cp1252")-1,			cs_cp1252 },
	{ "Windows-1252",	sizeof("Windows-1252")-1,	cs_cp1252 },
	{ "1252",			sizeof("1252")-1,			cs_cp1252 },
	{ "BIG5",			sizeof("BIG5")-1,			cs_big5 },
	{ "950",			sizeof("950")-1,			cs_big5 },
	{ "GB2312",			sizeof("GB2312")-1,			cs_gb2312 },
	{ "936",			sizeof("936")-1,			cs_gb2312 },
	{ "BIG5-HKSCS",		sizeof("BIG5-HKSCS")-1,		cs_big5hkscs },
	{ "Shift_JIS",		sizeof("Shift_JIS")-1,		cs_sjis },
	{ "SJIS",			sizeof("SJIS")-1,			cs_sjis },
	{ "932",			sizeof("932")-1,			cs_sjis },
	{ "SJIS-win",		sizeof("SJIS-win")-1,		cs_sjis },
	{ "CP932",			sizeof("CP932")-1,			cs_sjis },
	{ "EUCJP",			sizeof("EUCJP")-1,			cs_eucjp },
	{ "EUC-JP",			sizeof("EUC-JP")-1,			cs_eucjp },
	{ "eucJP-win",		sizeof("eucJP-win")-1,		cs_eucjp },
	{ "KOI8-R",			sizeof("KOI8-R")-1,			cs_koi8r },
	{ "koi8-ru",		sizeof("koi8-ru")-1,		cs_koi8r },
	{ "koi8r",			sizeof("koi8r")-1,			cs_koi8r },
	{ "cp1251",			sizeof("cp1251")-1,			cs_cp1251 },
	{ "Windows-1251",	sizeof("Windows-1251")-1,	cs_cp1251 },
	{ "win-1251",		sizeof("win-1251")-1,		cs_cp1251 },
	{ "iso8859-5",		sizeof("iso8859-5")-1,		cs_8859_5 },
	{ "iso-8859-5",		sizeof("iso-8859-5")-1,		cs_8859_5 },
	{ "cp866",			sizeof("cp866")-1,			cs_cp866 },
	{ "866",			sizeof("866")-1,			cs_cp866 },
	{ "ibm866",			sizeof("ibm866")-1,			cs_cp866 },
	{ "MacRoman",		sizeof("MacRoman")-1,		cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
	unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
	const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

echo $t;

/* Single-byte encodings for which "to Unicode" tables are generated.
 * "enumid" is the position of the matching cs_* constant in the
 * entity_charset enum printed above; it selects the slot used in
 * enc_to_uni_index[] further down. */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        "enumident" => "cp1251", /* was a copy-pasted "cp1252"; the key is not read by this script */
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);

/* Stage 2 tables emitted so far, grouped by 64-code-point block:
 * $prevStage2[block index][ident of the emitting encoding] = table.
 * Recording the owner explicitly lets a later encoding reference an
 * identical table by its real C name. */
$prevStage2 = array();

foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file */
    $map = array();
    $lines = explode("\n", read_input_file($e['file']));
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $map[] = array($matches[1], $matches[2]);
    }

    /* byte value => Unicode code point */
    $mappy = array();
    foreach ($map as $v) { $mappy[hexdec($v[0])] = hexdec($v[1]); }

    $mstable = array("ident" => $e['ident']);
    /* calculate two-stage tables */
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo
"/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    $s2tables_idents = array();
    for ($i = 0; $i < 4; $i++) {
        /* Reuse an identical table that an earlier encoding already emitted
         * for this same block. The comparison is strict so that an unmapped
         * slot (NULL) never matches a slot mapped to U+0000. */
        $owner = isset($prevStage2[$i])
            ? array_search($mstable[$i], $prevStage2[$i], true)
            : false;
        if ($owner !== false) {
            $s2tables_idents[$i] = $owner;
            continue;
        }

        $s2tables_idents[$i] = $e["ident"];

        echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$e['ident']}_".
            sprintf("%02X", $i << 6)." = { {\n";
        for ($j = 0; $j < 64; $j++) {
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                echo sprintf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[$i][$e['ident']] = $mstable[$i];
    }

    echo
"/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo
"/* {{{ Stage 1 table for {$e['name']} */\n";

    echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&enc_to_uni_s2_{$s2tables_idents[0]}_00,
\t&enc_to_uni_s2_{$s2tables_idents[1]}_40,
\t&enc_to_uni_s2_{$s2tables_idents[2]}_80,
\t&enc_to_uni_s2_{$s2tables_idents[3]}_C0 }
};
";

    echo
"/* end of stage 1 table for {$e['name']} }}} */\n\n";
}

/* Build the index: slot number = enumid. Slots that keep their numeric
 * placeholder from range() have no table and are printed as NULL. */
$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
$a = range(0, $maxencnum);
foreach ($encodings as $e) { $a[$e['enumid']] = $e['ident']; }

    echo
"/* {{{ Index of tables for encoding conversion */
static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($a as $k => $v) {
    if (is_numeric($v))
        echo "\tNULL,\n";
    else
        echo "\t&enc_to_uni_$v,\n";
}

    echo
"};
/* }}} */\n";

$t = <<<CODE

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/* Encodings for which "from Unicode" tables are generated. Only bytes
 * inside "range" (inclusive) are put into the table. */
$encodings = array(
    array(
        "ident" => "iso885915",
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
        "range" => array(0xA4, 0xBE),
    ),
    array(
        "ident" => "win1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
        "range" => array(0x80, 0x9F),
    ),
    array(
        "ident" => "win1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "koi8r",
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "cp866",
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "macroman",
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
        "range" => array(0x80, 0xFF),
    ),
);

foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file */
    $map = array();
    $lines = explode("\n", read_input_file($e['file']));
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
    }

    /* Unicode code point => array(byte value, lower-cased description),
     * sorted by code point */
    $mappy = array();
    foreach ($map as $v) {
        if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
            $mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
    }
    ksort($mappy);

    echo
"static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    echo
"/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
}

/* Entity passes. $pass2 is false on the first (HTML5) pass and 1, 2, 3 on the
 * following ones; the code after the hash table output reloads $data, $name
 * and $ident and jumps back to "again". Typedefs and the shared empty tables
 * are printed on the first pass only. */
$data = read_input_file("ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
    echo $t;

/* Parse the entity list. Each entry is array(name, hex code point) or, for
 * entities that expand to two code points, array(name, hex cp1, hex cp2). */
$dp = array();

foreach (explode("\n", $data) as $l) {
    if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
        $dp[] = array($matches[1], $matches[2], $matches[3]);
    } else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
        $dp[] = array($matches[1], $matches[2]);
    }
}

/* the hash table output needs the entries in file order */
$origdp = $dp;

usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });

/* First code points that start at least one two-code-point entity. For each:
 * second code point => entity name, plus an optional "default" entry holding
 * the entity that stands for the first code point on its own. */
$multicp_rows = array();
foreach ($dp as $el) {
    if (count($el) == 3) {
        $multicp_rows[$el[1]] = array();
    }
}

foreach ($dp as $el) {
    if (array_key_exists($el[1], $multicp_rows)) {
        if (count($el) == 3)
            $multicp_rows[$el[1]][$el[2]] = $el[0];
        else
            $multicp_rows[$el[1]]["default"] = $el[0];
    }
}

if ($pass2 < 2)
    echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
    echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

if (empty($multicp_rows))
    goto skip_multicp;

ksort($multicp_rows);
foreach ($multicp_rows as &$v) { ksort($v); }
unset($v);

echo
"/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
    echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
        sprintf("%05s", $k), "[] = {", "\n";
    /* leading row: default entity (or NULL) and number of rows that follow */
    if (array_key_exists("default", $v)) {
        if ($v['default'] === 'GT') /* hack: emit the lower-case "gt" entity name, not "GT" */
            $v['default'] = "gt";
        echo "\t{ {", sprintf("\"%-21s", $v["default"].'",'),
            "\t", sprintf("%02d", (count($v) - 1)), ",\t\t",
            sprintf("% 2d", strlen($v["default"])), '} },', "\n";
    } else {
        echo "\t{ {", sprintf("%-22s", 'NULL,'),
            "\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
    }
    unset($v["default"]);
    foreach ($v as $l => $w) {
        echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05s", $l), ",\t",
            sprintf("% 2d", strlen($w)), '} },', "\n";
    }
    echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:

if ($pass2 < 2)
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage3_row empty_stage3_table[] = {
	/* 64 elements */
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
};

CODE;

if (!$pass2)
    echo $t;

/* $mstable[stage 1 index][stage 2 index][stage 3 index] = entity name, or ""
 * when the code point has a double code point table instead. The index
 * arithmetic mirrors the ENT_STAGE*_INDEX macros printed above. */
$mstable = array();
foreach ($dp as $el) {
    $s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
    $s2 = (hexdec($el[1]) & 0xFC0) >> 6;
    $s3 = hexdec($el[1]) & 0x3F;
    if (array_key_exists($el[1], $multicp_rows)) {
        $mstable[$s1][$s2][$s3] = "";
    } else {
        $mstable[$s1][$s2][$s3] = $el[0];
    }
}

for ($i = 0; $i < 0x1E; $i++) {
    for ($k = 0; $k < 64; $k++) {
        $any3 = false;
        $col3 = array();
        for ($l = 0; $l < 64; $l++) {
            if (isset($mstable[$i][$k][$l])) {
                $any3 = true;
                $col3[$l] = $mstable[$i][$k][$l];
            } else {
                $col3[$l] = null;
            }
        }
        /* blocks without any entity share empty_stage3_table */
        if ($any3) {
            echo "static const entity_stage3_row stage3_table_{$ident}_",
                sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
            foreach ($col3 as $y => $z) {
                if ($y == 0) echo "\t";
                elseif ($y % 4 == 0) echo "\n\t";
                else echo " ";
                if ($z === NULL)
                    echo "{0, { {NULL, 0} } },";
                elseif ($z === "QUOT") /* hack: emit the lower-case "quot" entity name, not "QUOT" */
                    echo "{0, { {\"quot\", 4} } },";
                elseif ($z !== "")
                    echo "{0, { {\"$z\", ", strlen($z), "} } },";
                else
                    echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
                        ($i << 12) | ($k << 6) | $y ), ", 0} } },";

            }
            echo "\n};\n\n";
        }
    }
}

if ($pass2 < 2)
    echo "/* end of stage 3 Tables for $name }}} */", "\n\n";

/* the basic entity passes only get stage 3 tables and a hash table */
if ($pass2 > 1)
    goto hashtables;

echo
"/* {{{ Stage 2 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage2_row empty_stage2_table[] = {
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
};

CODE;

if (!$pass2)
    echo $t;

for ($i = 0; $i < 0x1E; $i++) {
    $any = false;
    for ($k = 0; $k < 64; $k++) {
        if (isset($mstable[$i][$k]))
            $any = true;
    }
    if ($any) {
        echo "static const entity_stage2_row stage2_table_{$ident}_",
            sprintf("%02X000", $i), "[] = {\n";
        for ($k = 0; $k < 64; $k++) {
            if ($k == 0) echo "\t";
            elseif ($k % 4 == 0) echo "\n\t";
            else echo " ";
            if (isset($mstable[$i][$k])) {
                echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
            } else {
                echo "empty_stage3_table", ",";
            }
        }
        echo "\n};\n\n";
    }
}

echo
"/* end of stage 2 tables for $name }}} */", "\n\n";

echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
    if (isset($mstable[$i]))
        echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
    else
        echo "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo
"/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";

/* commented-out; this enabled binary search, which turned out to be
 * significantly slower than the hash tables for html 5 entities */
//echo
//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";

//$t = <<<CODE
//typedef struct {
//	const char *entity;
//	unsigned short entity_len;
//	unsigned int codepoint1;
//	unsigned int codepoint2;
//} entity_cp_map;
//
//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
//
//static const entity_cp_map html5_ent_cp_map[] = {
//
//CODE;
//echo $t;
//
//$dp = $origdp;
//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
//	return $d==0?strcmp($a[0], $b[0]):$d; });
//
//$k = 0;
//foreach ($dp as $o) {
//	if ($k == 0) echo "\t";
//	elseif ($k % 3 == 0) echo "\n\t";
//	else echo " ";
//	if (isset($o[2]))
//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
//			hexdec($o[1]), hexdec($o[2]));
//	else
//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
//			hexdec($o[1]));
//
//	if (isset($o[2])) {
//		$entlen = strlen($o[0]) + 2;
//		$utf8len = strlen(
//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
//		if ($utf8len > $entlen*1.2) {
//			die("violated assumption for traverse_for_entities");
//		}
//	}
//
//	$k++;
//}
//echo "\n};\n\n";
//
//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
//
//echo
//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";

hashtables:

echo
"/* {{{ $name hash table for entity -> codepoint */", "\n\n";

$t = <<<CODE
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

/* the hash table typedefs held in $t are only printed on the first pass */
if (!$pass2)
    echo $t;

/*
 * Hash used to place entity names into buckets: seed 5381, then
 * hash = hash * 33 + byte for every byte of $str, kept to 32 bits.
 * NOTE(review): presumably this has to agree with the hash the C side
 * computes when it looks entities up in ent_ht_* -- confirm before changing.
 *
 * $str  entity name
 * Returns an integer in the range 0..0xFFFFFFFF.
 */
function hashfun($str)
{
    $hash = 5381;
    $len = strlen($str);

    for ($pos = 0; $pos < $len; $pos++) {
        $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos]))
            & 0xFFFFFFFF;
    }
    return $hash;
}

/* Number of buckets: smallest power of two >= 1.5 * number of entities, but
 * at least 16. Cast to int because pow() with a float exponent yields a
 * float, and the value is used as a bit mask and a loop bound. */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))), 16);
$mask = $numelems - 1;
$hashes = array();
foreach ($origdp as $e) {
    $hashes[hashfun($e[0]) & $mask][] = $e;
    if (isset($e[2])) {
        /* Sanity check for two-code-point entities: the UTF-8 encoding of
         * both code points must not exceed 1.2 times the length of the
         * entity text (name plus '&' and ';').
         * mb_chr() replaces the "HTML-ENTITIES" mbstring pseudo-encoding,
         * which is deprecated as of PHP 8.2. */
        $entlen = strlen($e[0]) + 2;
        $utf8len = strlen(
            mb_chr(hexdec($e[1]), "UTF-8") . mb_chr(hexdec($e[2]), "UTF-8"));
        if ($utf8len > $entlen*1.2) {
            /* stdout is the generated header, so report on stderr and make
             * the failure visible through the exit status */
            fwrite(STDERR, "violated assumption for traverse_for_entities\n");
            exit(1);
        }
    }
}

/* one NULL-terminated array per non-empty bucket */
for ($i = 0; $i < $numelems; $i++) {
    if (empty($hashes[$i]))
        continue;
    echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
    foreach ($hashes[$i] as $h) {
        if (isset($h[2])) {
            echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
                $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
        } else {
            echo sprintf(' {"%s", %d, 0x%05X, 0},',
                $h[0], strlen($h[0]), hexdec($h[1]));
        }
    }
    echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

echo
"static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

/* bucket index; empty buckets all point at the shared ht_bucket_empty */
for ($i = 0; $i < $numelems; $i++) {
    if ($i == 0) echo "\t";
    elseif ($i % 4 == 0) echo "\n\t";
    else echo " ";
    if (empty($hashes[$i]))
        echo "ht_bucket_empty,";
    else
        echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo
"static const entity_ht ent_ht_{$ident} = {\n\t", sprintf("0x%X", $numelems), ",\n\tht_buckets_{$ident}\n};\n\n";

echo
"/* end of $name hash table for entity -> codepoint }}} */\n\n";

/* Entity lists still to be processed, keyed by the number of the pass that
 * has just finished (0 = the first pass, during which $pass2 is false).
 * Each row: input file, display name, identifier used in C symbol names. */
$nextPasses = array(
    0 => array("ents_html401.txt", "HTML 4.01", "html4"),
    1 => array("ents_basic.txt", "Basic entities (no apos)", "be_noapos"),
    2 => array("ents_basic_apos.txt", "Basic entities (with apos)", "be_apos"),
);

$finishedPass = (int) $pass2;
if (isset($nextPasses[$finishedPass])) {
    list($nextFile, $name, $ident) = $nextPasses[$finishedPass];
    $data = file_get_contents($nextFile);
    if ($data === false) {
        fwrite(STDERR, "html_table_gen.php: cannot read input file '$nextFile'\n");
        exit(1);
    }
    $pass2 = $finishedPass + 1;
    goto again;
}

echo "#endif /* HTML_TABLES_H */\n";