1<?php
2/*
3   +----------------------------------------------------------------------+
4   | PHP Version 7                                                        |
5   +----------------------------------------------------------------------+
6   | Copyright (c) 1997-2018 The PHP Group                                |
7   +----------------------------------------------------------------------+
8   | This source file is subject to version 3.01 of the PHP license,      |
9   | that is bundled with this package in the file LICENSE, and is        |
10   | available through the world-wide-web at the following url:           |
11   | http://www.php.net/license/3_01.txt                                  |
12   | If you did not receive a copy of the PHP license and are unable to   |
13   | obtain it through the world-wide-web, please send a note to          |
14   | license@php.net so we can mail you a copy immediately.               |
15   +----------------------------------------------------------------------+
16   | Authors: Gustavo Lopes  <cataphract@php.net>                         |
17   +----------------------------------------------------------------------+
18*/
19
20/* This file prints to stdout the contents of ext/standard/html_tables.h */
21/* put together with glue; have patience */
22
/* Static preamble of html_tables.h: license banner, include guard, the
 * entity_charset enum, the charset name map and the helper types/macros for
 * the mappings *to* Unicode.
 * The text is a printf() template; its only placeholder (%s) receives the
 * end year of the copyright notice.
 * ENT_ENC_TO_UNI_STAGE1 parenthesizes its argument so that it also expands
 * correctly when called with a compound expression. */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		sizeof("ISO-8859-1")-1,		cs_8859_1 },
	{ "ISO8859-1",		sizeof("ISO8859-1")-1,		cs_8859_1 },
	{ "ISO-8859-15",	sizeof("ISO-8859-15")-1,	cs_8859_15 },
	{ "ISO8859-15",		sizeof("ISO8859-15")-1,		cs_8859_15 },
	{ "utf-8",			sizeof("utf-8")-1,			cs_utf_8 },
	{ "cp1252", 		sizeof("cp1252")-1, 		cs_cp1252 },
	{ "Windows-1252",	sizeof("Windows-1252")-1,	cs_cp1252 },
	{ "1252",			sizeof("1252")-1,			cs_cp1252 },
	{ "BIG5",			sizeof("BIG5")-1,			cs_big5 },
	{ "950",			sizeof("950")-1,			cs_big5 },
	{ "GB2312",			sizeof("GB2312")-1,			cs_gb2312 },
	{ "936",			sizeof("936")-1,			cs_gb2312 },
	{ "BIG5-HKSCS",		sizeof("BIG5-HKSCS")-1,		cs_big5hkscs },
	{ "Shift_JIS",		sizeof("Shift_JIS")-1,		cs_sjis },
	{ "SJIS",			sizeof("SJIS")-1,			cs_sjis },
	{ "932",			sizeof("932")-1,			cs_sjis },
	{ "SJIS-win",		sizeof("SJIS-win")-1,		cs_sjis },
	{ "CP932",			sizeof("CP932")-1,			cs_sjis },
	{ "EUCJP",			sizeof("EUCJP")-1,			cs_eucjp },
	{ "EUC-JP",			sizeof("EUC-JP")-1,			cs_eucjp },
	{ "eucJP-win",		sizeof("eucJP-win")-1,		cs_eucjp },
	{ "KOI8-R",			sizeof("KOI8-R")-1,			cs_koi8r },
	{ "koi8-ru",		sizeof("koi8-ru")-1,		cs_koi8r },
	{ "koi8r",			sizeof("koi8r")-1,			cs_koi8r },
	{ "cp1251",			sizeof("cp1251")-1,			cs_cp1251 },
	{ "Windows-1251",	sizeof("Windows-1251")-1,	cs_cp1251 },
	{ "win-1251",		sizeof("win-1251")-1,		cs_cp1251 },
	{ "iso8859-5",		sizeof("iso8859-5")-1,		cs_8859_5 },
	{ "iso-8859-5",		sizeof("iso-8859-5")-1,		cs_8859_5 },
	{ "cp866",			sizeof("cp866")-1,			cs_cp866 },
	{ "866",			sizeof("866")-1,			cs_cp866 },
	{ "ibm866",			sizeof("ibm866")-1,			cs_cp866 },
	{ "MacRoman",		sizeof("MacRoman")-1,		cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
    unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
    const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) (((k) & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

/* Emit the preamble, stamped with the current year. */
printf($t, date("Y"));
124
/* Single-byte encodings for which a mapping *to* Unicode is generated.
 *   ident     - suffix used in the generated C identifiers
 *   enumid    - position of the charset in enum entity_charset; used as the
 *               slot in the generated enc_to_uni_index array
 *   enumident - enum constant name without the cs_ prefix (not read by the
 *               code visible in this script; kept accurate for reference)
 *   name      - human-readable name used in generated comments
 *   file      - mapping file that is parsed */
$encodings = [
    [
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ],
    [
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ],
    [
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ],
    [
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ],
    [
        "ident" => "win1251",
        "enumid" => 4,
        /* was "cp1252": copy-paste slip from the entry above */
        "enumident" => "cp1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ],
    [
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ],
    [
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ],
    [
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ],
];
177
/* Stage 2 tables emitted so far and, in parallel, the C symbol each of them
 * was emitted under. Identical 64-entry blocks are emitted once and shared by
 * every later encoding that needs the same block. */
$prevStage2 = [];
$prevStage2Names = [];

foreach ($encodings as $e) {
    echo "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file */
    $contents = file_get_contents($e['file']);
    if ($contents === false) {
        /* a missing file would otherwise silently produce an all-0xFFFF table */
        fwrite(STDERR, "cannot read mapping file {$e['file']}\n");
        exit(1);
    }

    /* byte value => Unicode code point, taken from lines "0xHH<TAB>0xHHHH" */
    $mappy = [];
    foreach (explode("\n", $contents) as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches)) {
            $mappy[hexdec($matches[1])] = hexdec($matches[2]);
        }
    }

    /* calculate two-stage tables: byte = (stage 1 index << 6) | stage 2 index;
     * null marks a byte without mapping */
    $mstable = ["ident" => $e['ident']];
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = $mappy[$cp] ?? null;
        }
    }

    echo "/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    /* C symbol of the stage 2 table used for each of the four blocks */
    $s2tables_names = [];
    for ($i = 0; $i < 4; $i++) {
        /* strict search: an unmapped slot (null) must not compare equal to U+0000 */
        $known = array_keys($prevStage2, $mstable[$i], true);
        if ($known !== []) {
            /* reuse the table under exactly the name it was emitted with */
            $s2tables_names[$i] = $prevStage2Names[$known[0]];
            continue;
        }

        $s2tables_names[$i] = "enc_to_uni_s2_{$e['ident']}_" . sprintf("%02X", $i << 6);

        echo "static const enc_to_uni_stage2 {$s2tables_names[$i]} = { {\n";
        for ($j = 0; $j < 64; $j++) {
            /* six entries per output line */
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== null)
                printf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[] = $mstable[$i];
        $prevStage2Names[] = $s2tables_names[$i];
    }

    echo "/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo "/* {{{ Stage 1 table for {$e['name']} */\n";

    echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&{$s2tables_names[0]},
\t&{$s2tables_names[1]},
\t&{$s2tables_names[2]},
\t&{$s2tables_names[3]} }
};
";

    echo "/* end of stage 1 table for {$e['name']} }}} */\n\n";
}
250
/* Emit enc_to_uni_index: one slot per enum value up to the highest enumid in
 * $encodings. Slots whose charset has no table stay null and are printed as NULL. */
$enumIds = array_map(function($enc) { return $enc['enumid']; }, $encodings);
$tableByEnumId = array_fill(0, max($enumIds) + 1, null);
foreach ($encodings as $enc) {
    $tableByEnumId[$enc['enumid']] = $enc['ident'];
}

echo "/* {{{ Index of tables for encoding conversion */\n",
     "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($tableByEnumId as $tableIdent) {
    echo $tableIdent === null ? "\tNULL,\n" : "\t&enc_to_uni_$tableIdent,\n";
}

echo "};\n", "/* }}} */\n";
269
/* Type used by the generated tables that map *from* Unicode. */
$t = <<<'CODE'

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/* Encodings for which a mapping *from* Unicode is generated.
 *   ident / name / file - as for the to-Unicode list
 *   range               - inclusive [first, last] byte values; only bytes in
 *                         this range are put into the generated table */
$encodings = [
    [
        "ident" => "iso885915",
        "name"  => "ISO-8859-15",
        "file"  => "mappings/8859-15.TXT",
        "range" => [0xA4, 0xBE],
    ],
    [
        "ident" => "win1252",
        "name"  => "Windows-1252",
        "file"  => "mappings/CP1252.TXT",
        "range" => [0x80, 0x9F],
    ],
    [
        "ident" => "win1251",
        "name"  => "Windows-1251",
        "file"  => "mappings/CP1251.TXT",
        "range" => [0x80, 0xFF],
    ],
    [
        "ident" => "koi8r",
        "name"  => "KOI8-R",
        "file"  => "mappings/KOI8-R.TXT",
        "range" => [0x80, 0xFF],
    ],
    [
        "ident" => "cp866",
        "name"  => "CP-866",
        "file"  => "mappings/CP866.TXT",
        "range" => [0x80, 0xFF],
    ],
    [
        "ident" => "macroman",
        "name"  => "MacRoman",
        "file"  => "mappings/ROMAN.TXT",
        "range" => [0x80, 0xFF],
    ],
];
322
/* For every encoding emit unimap_<ident>: { code point, byte } pairs sorted by
 * code point, each annotated with the description found in the mapping file. */
foreach ($encodings as $e) {
    echo "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file */
    $contents = file_get_contents($e['file']);
    if ($contents === false) {
        /* a missing file would otherwise silently produce an empty table */
        fwrite(STDERR, "cannot read mapping file {$e['file']}\n");
        exit(1);
    }

    list($firstByte, $lastByte) = $e['range'];

    /* code point => [byte value, lower-cased description]; lines look like
     * "0xHH<TAB>0xHHHH<whitespace># description" */
    $mappy = [];
    foreach (explode("\n", $contents) as $l) {
        if (!preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches)) {
            continue;
        }
        $byte = hexdec($matches[1]);
        if ($byte >= $firstByte && $byte <= $lastByte) {
            $mappy[hexdec($matches[2])] = [$byte, strtolower(rtrim($matches[3]))];
        }
    }
    ksort($mappy);

    echo "static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* closing fold marker (the section is opened with "{{{" above) */
    echo "/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}
354
/* Everything from the "again" label downwards is executed once per entity
 * list. The first pass handles the HTML5 list; $pass2 stays false during it
 * and is set to 1, 2, 3 for the following passes by the code at the end of
 * the script, which then jumps back to "again". */
$data = file_get_contents("ents_html5.txt");
if ($data === false) {
    /* without the list every generated entity table would silently be empty */
    fwrite(STDERR, "cannot read ents_html5.txt\n");
    exit(1);
}
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

/* Type definitions and index macros shared by all entity tables. */
$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

/* the definitions are emitted only once, during the first pass */
if (!$pass2) {
    echo $t;
}
422
/* Parse the entity list in $data. A usable line is "name cp" or "name cp cp2"
 * (hexadecimal code points); each becomes [name, cp] or [name, cp, cp2]. */
$dp = [];

foreach (explode("\n", $data) as $l) {
    /* try the two-code-point form first, then fall back to the single one */
    if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)
        || preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
        $dp[] = array_slice($matches, 1);
    }
}

/* keep the file order; it is used later when the hash table is built */
$origdp = $dp;

/* order by (first) code point */
usort($dp, function($x, $y) { return hexdec($x[1]) <=> hexdec($y[1]); });

/* First pass: note every first code point that starts a two-code-point entity. */
$multicp_rows = [];
foreach ($dp as $el) {
    if (count($el) == 3) {
        $multicp_rows[$el[1]] = [];
    }
}

/* Second pass: collect, per such code point, the entities keyed by their
 * second code point, plus the single-code-point entity under "default". */
foreach ($dp as $el) {
    if (!key_exists($el[1], $multicp_rows)) {
        continue;
    }
    $slot = (count($el) == 3) ? $el[2] : "default";
    $multicp_rows[$el[1]][$slot] = $el[0];
}
453
/* Section header: the first two passes produce multi-stage tables, the later
 * ones a single flat table. */
if ($pass2 < 2)
    echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
    echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

/* Emit one multi_cp_<ident>_<CODEPOINT> table per first code point that
 * starts at least one two-code-point entity. Nothing is emitted when the
 * current entity list has no such entities. */
if (!empty($multicp_rows)) {
    ksort($multicp_rows);
    foreach ($multicp_rows as &$v) { ksort($v); }
    unset($v);

    echo "/* {{{ Start of double code point tables for $name */", "\n\n";

    foreach ($multicp_rows as $k => $v) {
        /* The symbol is built from the numeric value (5 upper-case hex digits),
         * the same way the stage 3 rows that reference it format it, so the
         * two cannot diverge on the spelling used in the input file. */
        echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
            sprintf("%05X", hexdec($k)), "[] = {", "\n";

        $default = isset($v["default"]) ? $v["default"] : null;
        unset($v["default"]);
        /* from here on count($v) is the number of two-code-point rows */

        /* leading row: default entity (or NULL), number of rows that follow, name length */
        if ($default !== null) {
            if ($default == 'GT') /* hack to make > translate to &gt; not GT; */
                $default = "gt";
            echo "\t{ {", sprintf("\"%-21s", $default.'",'),
                "\t", sprintf("%02d", count($v)), ",\t\t",
                sprintf("% 2d", strlen($default)), '} },', "\n";
        } else {
            echo "\t{ {", sprintf("%-22s", 'NULL,'),
                "\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
        }

        /* normal rows: entity name, second code point, name length */
        foreach ($v as $l => $w) {
            echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05X", hexdec($l)), ",\t",
                sprintf("% 2d", strlen($w)), '} },', "\n";
        }
        echo "};\n";
    }
    echo "\n/* End of double code point tables }}} */", "\n\n";
}
492
if ($pass2 < 2)
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

/* Shared all-empty stage 3 block: 16 rows of 4 entries = 64 elements. */
$emptyStage3Row = "\t" . implode(" ", array_fill(0, 4, "{0, { {NULL, 0} } },")) . "\n";
$t = "static const entity_stage3_row empty_stage3_table[] = {\n"
   . "\t/* 64 elements */\n"
   . str_repeat($emptyStage3Row, 16)
   . "};\n";

/* emitted only once, during the first pass */
if (!$pass2)
    echo $t;

/* $mstable[stage 1 index][stage 2 index][stage 3 index] holds the entity name
 * for a code point, or "" when the code point has a multi-code-point table. */
$mstable = [];
foreach ($dp as $el) {
    $cp = hexdec($el[1]);
    $mstable[($cp & 0xFFF000) >> 12][($cp & 0xFC0) >> 6][$cp & 0x3F] =
        key_exists($el[1], $multicp_rows) ? "" : $el[0];
}

$joinStage3Row = function($row) { return implode(" ", $row); };

for ($i = 0; $i < 0x1E; $i++) {
    for ($k = 0; $k < 64; $k++) {
        /* only blocks containing at least one entity get their own table */
        if (!isset($mstable[$i][$k])) {
            continue;
        }

        $cells = [];
        for ($y = 0; $y < 64; $y++) {
            $z = isset($mstable[$i][$k][$y]) ? $mstable[$i][$k][$y] : null;
            if ($z === null) {
                $cells[] = "{0, { {NULL, 0} } },";
            } elseif ($z === "QUOT") {
                /* hack to translate " into &quot;, not &QUOT; */
                $cells[] = "{0, { {\"quot\", 4} } },";
            } elseif ($z !== "") {
                $cells[] = "{0, { {\"$z\", " . strlen($z) . "} } },";
            } else {
                /* ambiguous code point: refer to its multi-code-point table */
                $cells[] = "{1, { {(void *)" . sprintf("multi_cp_{$ident}_%05X",
                    ($i << 12) | ($k << 6) | $y) . ", 0} } },";
            }
        }

        echo "static const entity_stage3_row stage3_table_{$ident}_",
            sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
        /* four entries per output line */
        echo "\t", implode("\n\t", array_map($joinStage3Row, array_chunk($cells, 4)));
        echo "\n};\n\n";
    }
}

if ($pass2 < 2)
    echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
571
/* The last two passes have no stage 1/2 tables; continue with the hash table. */
if ($pass2 > 1)
    goto hashtables;

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

/* Shared stage 2 block whose 64 entries (16 rows of 4) all point at the empty
 * stage 3 table. */
$emptyStage2Row = "\t" . implode(" ", array_fill(0, 4, "empty_stage3_table,")) . "\n";
$t = "static const entity_stage2_row empty_stage2_table[] = {\n"
   . str_repeat($emptyStage2Row, 16)
   . "};\n";

/* emitted only once, during the first pass */
if (!$pass2)
    echo $t;

$joinStage2Row = function($row) { return implode(" ", $row); };

for ($i = 0; $i < 0x1E; $i++) {
    /* a stage 2 table is needed only if some block below it has entities */
    if (!isset($mstable[$i])) {
        continue;
    }

    $cells = [];
    for ($k = 0; $k < 64; $k++) {
        $cells[] = isset($mstable[$i][$k])
            ? sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)) . ","
            : "empty_stage3_table" . ",";
    }

    echo "static const entity_stage2_row stage2_table_{$ident}_",
        sprintf("%02X000", $i), "[] = {\n";
    /* four entries per output line */
    echo "\t", implode("\n\t", array_map($joinStage2Row, array_chunk($cells, 4)));
    echo "\n};\n\n";
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

/* Stage 1: one entry per 0x1000 code points, up to index 0x1D. */
echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
    echo isset($mstable[$i])
        ? "\t" . sprintf("stage2_table_{$ident}_%02X000", $i) . ",\n"
        : "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
640
641/* commented-out; this enabled binary search, which turned out to be
642 * significantly slower than the hash tables for html 5 entities */
643//echo
644//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
645
646//$t = <<<CODE
647//typedef struct {
648//	const char *entity;
649//	unsigned short entity_len;
650//	unsigned int codepoint1;
651//	unsigned int codepoint2;
652//} entity_cp_map;
653//
654//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
655//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
656//
657//static const entity_cp_map html5_ent_cp_map[] = {
658//
659//CODE;
660//echo $t;
661//
662//$dp = $origdp;
663//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
664//	return $d==0?strcmp($a[0], $b[0]):$d; });
665//
666//$k = 0;
667//foreach ($dp as $o) {
668//	if ($k == 0) echo "\t";
669//	elseif ($k % 3 == 0) echo "\n\t";
670//	else echo " ";
671//	if (isset($o[2]))
672//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
673//			hexdec($o[1]), hexdec($o[2]));
674//	else
675//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
676//			hexdec($o[1]));
677//
678//	if (isset($o[2])) {
679//		$entlen = strlen($o[0]) + 2;
680//		$utf8len = strlen(
681//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
682//		if ($utf8len > $entlen*1.2) {
683//			die("violated assumption for traverse_for_entities");
684//		}
685//	}
686//
687//	$k++;
688//}
689//echo "\n};\n\n";
690//
691//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
692//
693//echo
694//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
695
hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */\n\n";

/* Types of the entity -> code point hash tables plus the shared empty bucket. */
$t = <<<'CODE'
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

/* emitted only once, during the first pass */
if (!$pass2) {
    echo $t;
}
722
/* Hash a string: start at 5381 and, for every byte, replace the hash with
 * hash * 33 + byte, keeping only the low 32 bits.
 * The result selects the bucket an entity name is placed in.
 * NOTE(review): presumably this has to match the hash the C lookup code
 * computes at run time - confirm before changing it.
 *
 * $str - byte string to hash
 * returns the hash as a non-negative integer (5381 for the empty string) */
function hashfun($str)
{
    $hash = 5381;
    $length = strlen($str);

    for ($pos = 0; $pos < $length; $pos++) {
        /* (hash << 5) + hash == hash * 33 */
        $timesThirtyThree = (int)(((int)($hash << 5)) + $hash);
        $hash = (int)($timesThirtyThree + ord($str[$pos])) & 0xFFFFFFFF;
    }

    return $hash;
}
737
/* Number of buckets: the smallest power of two that is >= 1.5 * number of
 * entities, but at least 16. Cast to int because pow() returns a float here. */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;

/* Number of bytes a code point occupies in UTF-8. Computed arithmetically so
 * that no deprecated mbstring "HTML-ENTITIES" conversion (whose notice would
 * end up in the generated header) is needed. */
$utf8Length = static function($cp) {
    if ($cp < 0x80) return 1;
    if ($cp < 0x800) return 2;
    if ($cp < 0x10000) return 3;
    return 4;
};

/* bucket index => list of entities, in file order */
$hashes = [];
foreach ($origdp as $e) {
    $hashes[hashfun($e[0]) & $mask][] = $e;
    if (isset($e[2])) {
        /* sanity check for two-code-point entities: the UTF-8 form must not be
         * more than 1.2 times as long as "&name;" */
        $entlen = strlen($e[0]) + 2;
        $utf8len = $utf8Length(hexdec($e[1])) + $utf8Length(hexdec($e[2]));
        if ($utf8len > $entlen*1.2) {
            /* report on stderr with a failure status instead of writing the
             * message into the generated header */
            fwrite(STDERR, "violated assumption for traverse_for_entities\n");
            exit(1);
        }
    }
}

/* One NULL-terminated array per non-empty bucket. */
for ($i = 0; $i < $numelems; $i++) {
    if (empty($hashes[$i]))
        continue;
    echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
    foreach ($hashes[$i] as $h) {
        if (isset($h[2])) {
            printf(' {"%s", %d, 0x%05X, 0x%05X},',
                $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
        } else {
            printf(' {"%s", %d, 0x%05X, 0},',
                $h[0], strlen($h[0]), hexdec($h[1]));
        }
    }
    echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

/* Bucket index; empty buckets share ht_bucket_empty. Four entries per line. */
echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

for ($i = 0; $i < $numelems; $i++) {
    if ($i == 0) echo "\t";
    elseif ($i % 4 == 0) echo "\n\t";
    else echo " ";
    if (empty($hashes[$i]))
        echo "ht_bucket_empty,";
    else
        echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo "static const entity_ht ent_ht_{$ident} = {\n\t",
    sprintf("0x%X", $numelems), ",\n\tht_buckets_{$ident}\n};\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";
792
/* Entity lists processed after the first one, keyed by the pass that has just
 * finished (0 = first pass): [next $pass2 value, file, $name, $ident]. */
$nextPasses = [
    0 => [1, "ents_html401.txt",    "HTML 4.01",                  "html4"],
    1 => [2, "ents_basic.txt",      "Basic entities (no apos)",   "be_noapos"],
    2 => [3, "ents_basic_apos.txt", "Basic entities (with apos)", "be_apos"],
];

if (isset($nextPasses[(int) $pass2])) {
    list($nextPass, $entityFile, $name, $ident) = $nextPasses[(int) $pass2];

    $data = file_get_contents($entityFile);
    if ($data === false) {
        /* without the list the tables of this pass would silently be empty */
        fwrite(STDERR, "cannot read $entityFile\n");
        exit(1);
    }

    $pass2 = $nextPass;
    goto again;
}

/* all passes done: close the include guard */
echo "#endif /* HTML_TABLES_H */\n";
814