1<?php
2/*
3   +----------------------------------------------------------------------+
4   | PHP Version 7                                                        |
5   +----------------------------------------------------------------------+
6   | Copyright (c) 1997-2018 The PHP Group                                |
7   +----------------------------------------------------------------------+
8   | This source file is subject to version 3.01 of the PHP license,      |
9   | that is bundled with this package in the file LICENSE, and is        |
10   | available through the world-wide-web at the following url:           |
11   | http://www.php.net/license/3_01.txt                                  |
12   | If you did not receive a copy of the PHP license and are unable to   |
13   | obtain it through the world-wide-web, please send a note to          |
14   | license@php.net so we can mail you a copy immediately.               |
15   +----------------------------------------------------------------------+
16   | Authors: Gustavo Lopes  <cataphract@php.net>                         |
17   +----------------------------------------------------------------------+
18*/
19
20/* This file prints to stdout the contents of ext/standard/html_tables.h */
21/* put together with glue; have patience */
22
/*
 * Fixed preamble of the generated header: licence banner, include guard,
 * the entity_charset enum, the charset name map and the type/macro
 * definitions for the "to Unicode" two-stage tables.
 *
 * The text is used as a printf() format string: its single %s receives the
 * current year, so the template must not contain any other '%' character.
 *
 * ENT_ENC_TO_UNI_STAGE1 parenthesises its argument, like
 * ENT_ENC_TO_UNI_STAGE2, so that an argument such as `a | b` expands correctly.
 */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

/* \$Id$ */

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		sizeof("ISO-8859-1")-1,		cs_8859_1 },
	{ "ISO8859-1",		sizeof("ISO8859-1")-1,		cs_8859_1 },
	{ "ISO-8859-15",	sizeof("ISO-8859-15")-1,	cs_8859_15 },
	{ "ISO8859-15",		sizeof("ISO8859-15")-1,		cs_8859_15 },
	{ "utf-8",			sizeof("utf-8")-1,			cs_utf_8 },
	{ "cp1252", 		sizeof("cp1252")-1, 		cs_cp1252 },
	{ "Windows-1252",	sizeof("Windows-1252")-1,	cs_cp1252 },
	{ "1252",			sizeof("1252")-1,			cs_cp1252 },
	{ "BIG5",			sizeof("BIG5")-1,			cs_big5 },
	{ "950",			sizeof("950")-1,			cs_big5 },
	{ "GB2312",			sizeof("GB2312")-1,			cs_gb2312 },
	{ "936",			sizeof("936")-1,			cs_gb2312 },
	{ "BIG5-HKSCS",		sizeof("BIG5-HKSCS")-1,		cs_big5hkscs },
	{ "Shift_JIS",		sizeof("Shift_JIS")-1,		cs_sjis },
	{ "SJIS",			sizeof("SJIS")-1,			cs_sjis },
	{ "932",			sizeof("932")-1,			cs_sjis },
	{ "SJIS-win",		sizeof("SJIS-win")-1,		cs_sjis },
	{ "CP932",			sizeof("CP932")-1,			cs_sjis },
	{ "EUCJP",			sizeof("EUCJP")-1,			cs_eucjp },
	{ "EUC-JP",			sizeof("EUC-JP")-1,			cs_eucjp },
	{ "eucJP-win",		sizeof("eucJP-win")-1,		cs_eucjp },
	{ "KOI8-R",			sizeof("KOI8-R")-1,			cs_koi8r },
	{ "koi8-ru",		sizeof("koi8-ru")-1,		cs_koi8r },
	{ "koi8r",			sizeof("koi8r")-1,			cs_koi8r },
	{ "cp1251",			sizeof("cp1251")-1,			cs_cp1251 },
	{ "Windows-1251",	sizeof("Windows-1251")-1,	cs_cp1251 },
	{ "win-1251",		sizeof("win-1251")-1,		cs_cp1251 },
	{ "iso8859-5",		sizeof("iso8859-5")-1,		cs_8859_5 },
	{ "iso-8859-5",		sizeof("iso-8859-5")-1,		cs_8859_5 },
	{ "cp866",			sizeof("cp866")-1,			cs_cp866 },
	{ "866",			sizeof("866")-1,			cs_cp866 },
	{ "ibm866",			sizeof("ibm866")-1,			cs_cp866 },
	{ "MacRoman",		sizeof("MacRoman")-1,		cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
    unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
    const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 (only single-byte encodings supported) */
#define ENT_ENC_TO_UNI_STAGE1(k) (((k) & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

/* Substitute the current year into the copyright line and emit the preamble. */
printf($t, date("Y"));
126
/*
 * Single-byte encodings for which "to Unicode" tables are generated.
 *
 *  ident     - suffix used in the generated C symbol names
 *  enumid    - slot of the encoding in the generated enc_to_uni_index[]
 *              array; the values match the position of the charset in
 *              enum entity_charset in the header preamble
 *  enumident - not read by this script; kept as a hint to the matching
 *              cs_* enum name where it differs from ident
 *  name      - human-readable name used in the generated comments
 *  file      - mapping table to read, relative to the working directory
 */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        /* was "cp1252": copy-paste slip from the Windows-1252 entry */
        "enumident" => "cp1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);
179
/*
 * Emit the two-stage "to Unicode" tables for every encoding in $encodings.
 *
 * Each encoding's 256 code points are split into four 64-entry stage 2
 * tables. A stage 2 table identical to one emitted earlier is not emitted
 * again; the stage 1 table points at the earlier symbol instead.
 *
 * $prevStage2 holds the stage 2 tables emitted so far and $prevStage2Symbols
 * holds, index for index, the C symbol under which each was emitted. Keeping
 * the symbol itself avoids having to guess the owning encoding from the
 * table's position in the list.
 */
$prevStage2 = array();
$prevStage2Symbols = array();

foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file */
    $contents = file_get_contents($e['file']);
    if ($contents === false)
        die("could not read mapping file {$e['file']}");

    /* byte value => Unicode code point; undefined bytes have no entry */
    $mappy = array();
    foreach (explode("\n", $contents) as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $mappy[hexdec($matches[1])] = hexdec($matches[2]);
    }

    /* calculate two-stage tables */
    $mstable = array();
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo
"/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    /* C symbol of the stage 2 table used for each of the four stages */
    $s2tables = array();
    for ($i = 0; $i < 4; $i++) {
        /* strict search: an unmapped slot (NULL) must not match U+0000 */
        $known = array_keys($prevStage2, $mstable[$i], true);
        if ($known !== array()) {
            $s2tables[$i] = $prevStage2Symbols[$known[0]];
            continue;
        }

        $symbol = "enc_to_uni_s2_{$e['ident']}_" . sprintf("%02X", $i << 6);
        $s2tables[$i] = $symbol;

        echo "static const enc_to_uni_stage2 {$symbol} = { {\n";
        for ($j = 0; $j < 64; $j++) {
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                echo sprintf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[] = $mstable[$i];
        $prevStage2Symbols[] = $symbol;
    }

    echo
"/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo
"/* {{{ Stage 1 table for {$e['name']} */\n";

    echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&{$s2tables[0]},
\t&{$s2tables[1]},
\t&{$s2tables[2]},
\t&{$s2tables[3]} }
};
";

    echo
"/* end of stage 1 table for {$e['name']} }}} */\n\n";
}
252
/*
 * Emit enc_to_uni_index[]: one slot per enum value from 0 up to the highest
 * "enumid" in $encodings. Slots start out holding their own number and are
 * overwritten with the encoding's ident where a table exists, so a slot that
 * is still numeric has no table and is emitted as NULL.
 */
$highestEnumId = max(array_column($encodings, 'enumid'));
$indexSlots = range(0, $highestEnumId);
foreach ($encodings as $encoding) {
    $indexSlots[$encoding['enumid']] = $encoding['ident'];
}

echo "/* {{{ Index of tables for encoding conversion */\n",
    "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($indexSlots as $slot) {
    echo is_numeric($slot) ? "\tNULL,\n" : "\t&enc_to_uni_$slot,\n";
}

echo "};\n", "/* }}} */\n";
271
/* Type used by the generated "from Unicode" tables. */
$t = <<<CODE

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/*
 * Encodings for which "from Unicode" tables are generated.
 * "range" is the inclusive range of byte values taken from the mapping
 * file; bytes outside it are left out of the generated table.
 */
$encodings = array(
    array(
        "ident" => "iso885915",
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
        "range" => array(0xA4, 0xBE),
    ),
    array(
        "ident" => "win1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
        "range" => array(0x80, 0x9F),
    ),
    array(
        "ident" => "win1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "koi8r",
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "cp866",
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "macroman",
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
        "range" => array(0x80, 0xFF),
    ),
);

foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file */
    $contents = file_get_contents($e['file']);
    if ($contents === false)
        die("could not read mapping file {$e['file']}");

    list($firstByte, $lastByte) = $e['range'];

    /* Unicode code point => array(byte value, lower-cased description);
     * when several bytes map to one code point the last line read wins */
    $mappy = array();
    foreach (explode("\n", $contents) as $l) {
        if (!preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            continue;
        $byte = hexdec($matches[1]);
        if ($byte >= $firstByte && $byte <= $lastByte)
            $mappy[hexdec($matches[2])] = array($byte, strtolower(rtrim($matches[3])));
    }
    /* the table is emitted in ascending code point order */
    ksort($mappy);

    echo
"static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* closes the fold opened above (this used to emit a second "{{{") */
    echo
"/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}
356
/*
 * Entity table generation runs as several passes over the code between the
 * "again" label and the end of the script, one per entity list. $data holds
 * the entity list of the current pass, $name and $ident its display name and
 * C symbol infix, and $pass2 the number of passes already completed (false
 * while the first, HTML 5, pass is running).
 */
$data = file_get_contents("ents_html5.txt");
if ($data === false)
    die("could not read ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

/* Type definitions shared by all entity tables; printed on the first pass only. */
$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
    echo $t;
424
/*
 * Parse the entity list of the current pass into $dp and emit the tables for
 * code points that start a two-code-point entity.
 *
 * Each $dp element is array(name, cp1) or array(name, cp1, cp2). Code points
 * are normalised to upper-case, zero-padded 5-digit hex as they are read:
 * the multi_cp_* tables below are declared using this text, while the stage 3
 * tables refer to them through sprintf("%05X"), so both spellings must agree
 * whatever case or padding the input file uses.
 */
$dp = array();

foreach (explode("\n", $data) as $l) {
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1],
			sprintf("%05X", hexdec($matches[2])),
			sprintf("%05X", hexdec($matches[3])));
	} else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1], sprintf("%05X", hexdec($matches[2])));
	}
}

/* the hash tables are built from the entities in file order */
$origdp = $dp;

/* order by first code point */
usort($dp, function($a, $b) { return hexdec($a[1]) <=> hexdec($b[1]); });

/* first code point => array(second code point or "default" => entity name),
 * for every first code point that starts at least one two-code-point entity */
$multicp_rows = array();
foreach ($dp as $el) {
	if (count($el) == 3) {
		$multicp_rows[$el[1]] = array();
	}
}

foreach ($dp as $el) {
	if (!array_key_exists($el[1], $multicp_rows))
		continue;
	if (count($el) == 3)
		$multicp_rows[$el[1]][$el[2]] = $el[0];
	else
		$multicp_rows[$el[1]]["default"] = $el[0];
}

if ($pass2 < 2)
    echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
    echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

if (empty($multicp_rows))
    goto skip_multicp;

/* SORT_STRING: keys such as "020E5" must be ordered as text, not be read as
 * numbers in exponent notation */
ksort($multicp_rows, SORT_STRING);
foreach ($multicp_rows as &$v) { ksort($v, SORT_STRING); }
unset($v);

echo
"/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
	echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
		sprintf("%05s", $k), "[] = {", "\n";
	/* leading row: entity for the first code point on its own (if any) and
	 * the number of rows that follow */
	if (array_key_exists("default", $v)) {
        if ($v['default'] == 'GT') /* hack to make > translate to &gt; not GT; */
            $v['default'] = "gt";
		echo "\t{ {", sprintf("\"%-21s", $v["default"].'",'),
			"\t", sprintf("%02d", (count($v) - 1)), ",\t\t",
            sprintf("% 2d", strlen($v["default"])), '} },', "\n";
	} else {
		echo "\t{ {", sprintf("%-22s", 'NULL,'),
			"\t", sprintf("%02d", count($v)), ",\t\t0} },\n";
	}
	unset($v["default"]);
	/* one row per second code point */
	foreach ($v as $l => $w) {
		echo "\t{ {", sprintf("\"%-21s", $w.'",'), "\t", sprintf("0x%05s", $l), ",\t",
            sprintf("% 2d", strlen($w)), '} },', "\n";
	}
	echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:
494
/*
 * Stage 3 tables: one 64-entry table per block of 64 code points that
 * contains at least one entity of the current pass.
 */
if ($pass2 < 2) {
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";
}

/* Shared all-empty stage 3 table; printed on the first pass only. */
$t = <<<CODE
static const entity_stage3_row empty_stage3_table[] = {
	/* 64 elements */
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
};

CODE;

if (!$pass2) {
    echo $t;
}

/* $mstable[stage1][stage2][stage3] is the entity name for that code point,
 * or "" when the code point has a double code point table instead. */
$mstable = array();
foreach ($dp as $el) {
	$codePoint = hexdec($el[1]);
	$mstable[($codePoint & 0xFFF000) >> 12][($codePoint & 0xFC0) >> 6][$codePoint & 0x3F] =
		key_exists($el[1], $multicp_rows) ? "" : $el[0];
}

for ($stage1 = 0; $stage1 < 0x1E; $stage1++) {
	for ($stage2 = 0; $stage2 < 64; $stage2++) {
		/* a block is only ever created together with one of its entries,
		 * so a block that exists always holds at least one entity */
		if (!isset($mstable[$stage1][$stage2])) {
			continue;
		}

		$cells = array();
		for ($stage3 = 0; $stage3 < 64; $stage3++) {
			$entity = isset($mstable[$stage1][$stage2][$stage3])
				? $mstable[$stage1][$stage2][$stage3]
				: null;

			if ($entity === null) {
				$cells[] = "{0, { {NULL, 0} } },";
			} elseif ($entity === "QUOT") {
				/* hack to translate " into &quot;, not &QUOT; */
				$cells[] = "{0, { {\"quot\", 4} } },";
			} elseif ($entity !== "") {
				$cells[] = "{0, { {\"$entity\", " . strlen($entity) . "} } },";
			} else {
				/* ambiguous: refer to the double code point table */
				$cells[] = "{1, { {(void *)" . sprintf("multi_cp_{$ident}_%05X",
					($stage1 << 12) | ($stage2 << 6) | $stage3) . ", 0} } },";
			}
		}

		echo "static const entity_stage3_row stage3_table_{$ident}_",
			sprintf("%02X%03X", $stage1, $stage2 << 6), "[] = {\n";
		/* four cells per line, each line indented by one tab */
		$rows = array();
		foreach (array_chunk($cells, 4) as $chunk) {
			$rows[] = implode(" ", $chunk);
		}
		echo "\t", implode("\n\t", $rows);
		echo "\n};\n\n";
	}
}
570
if ($pass2 < 2) {
    echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
}

/* From the third pass on only the stage 3 table and the hash table are
 * generated; skip the stage 2 and stage 1 tables. */
if ($pass2 > 1) {
    goto hashtables;
}

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

/* Shared all-empty stage 2 table; printed on the first pass only. */
$t = <<<CODE
static const entity_stage2_row empty_stage2_table[] = {
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
};

CODE;

if (!$pass2) {
    echo $t;
}

/* One stage 2 table per stage 1 index that has any entity; each of its 64
 * slots names the matching stage 3 table or the shared empty one. */
for ($stage1 = 0; $stage1 < 0x1E; $stage1++) {
	if (!isset($mstable[$stage1])) {
		continue;
	}

	$slots = array();
	for ($stage2 = 0; $stage2 < 64; $stage2++) {
		$slots[] = isset($mstable[$stage1][$stage2])
			? sprintf("stage3_table_{$ident}_%05X", ($stage1 << 12) | ($stage2 << 6))
			: "empty_stage3_table";
	}

	echo "static const entity_stage2_row stage2_table_{$ident}_",
		sprintf("%02X000", $stage1), "[] = {\n";
	/* four slots per line, every slot followed by a comma */
	$rows = array();
	foreach (array_chunk($slots, 4) as $chunk) {
		$rows[] = implode(", ", $chunk) . ",";
	}
	echo "\t", implode("\n\t", $rows);
	echo "\n};\n\n";
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

/* Stage 1 table: one row per stage 1 index below 0x1E. */
echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($stage1 = 0; $stage1 < 0x1E; $stage1++) {
	echo isset($mstable[$stage1])
		? "\t" . sprintf("stage2_table_{$ident}_%02X000", $stage1) . ",\n"
		: "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
642
643/* commented-out; this enabled binary search, which turned out to be
644 * significantly slower than the hash tables for html 5 entities */
645//echo
646//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
647
648//$t = <<<CODE
649//typedef struct {
650//	const char *entity;
651//	unsigned short entity_len;
652//	unsigned int codepoint1;
653//	unsigned int codepoint2;
654//} entity_cp_map;
655//
656//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
657//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
658//
659//static const entity_cp_map html5_ent_cp_map[] = {
660//
661//CODE;
662//echo $t;
663//
664//$dp = $origdp;
665//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
666//	return $d==0?strcmp($a[0], $b[0]):$d; });
667//
668//$k = 0;
669//foreach ($dp as $o) {
670//	if ($k == 0) echo "\t";
671//	elseif ($k % 3 == 0) echo "\n\t";
672//	else echo " ";
673//	if (isset($o[2]))
674//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
675//			hexdec($o[1]), hexdec($o[2]));
676//	else
677//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
678//			hexdec($o[1]));
679//
680//	if (isset($o[2])) {
681//		$entlen = strlen($o[0]) + 2;
682//		$utf8len = strlen(
683//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
684//		if ($utf8len > $entlen*1.2) {
685//			die("violated assumption for traverse_for_entities");
686//		}
687//	}
688//
689//	$k++;
690//}
691//echo "\n};\n\n";
692//
693//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
694//
695//echo
696//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
697
hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */\n\n";

/* Types used by the entity -> code point hash tables, plus the shared empty
 * bucket; printed on the first pass only. */
$hashTableTypes = <<<CODE
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

if (!$pass2) {
    echo $hashTableTypes;
}
724
/*
 * Hash an entity name: start from 5381 and, for each byte, multiply the
 * running value by 33 (as a shift-and-add), add the byte and keep the low
 * 32 bits. An empty string hashes to 5381.
 */
function hashfun($str)
{
	$hash = 5381;
	$length = strlen($str);

	for ($index = 0; $index < $length; $index++) {
		$shifted = (int)($hash << 5);
		$timesThirtyThree = (int)($shifted + $hash);
		$hash = (int)($timesThirtyThree + ord($str[$index])) & 0xFFFFFFFF;
	}

	return $hash;
}
739
/*
 * Emit the entity -> code point hash table for the current pass.
 *
 * The number of buckets is the smallest power of two that is at least 1.5
 * times the number of entities, and never less than 16. The bucket of an
 * entity is hashfun(name) masked with (number of buckets - 1).
 */
$numelems = (int)max(pow(2, ceil(log(1.5*count($origdp))/log(2))), 16);
$mask = $numelems - 1;
$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;
	if (isset($e[2])) {
		/* A two-code-point entity, written as "&name;", must not expand to
		 * more than 1.2 times its own length in UTF-8. mb_chr() encodes the
		 * code points directly; converting from the "HTML-ENTITIES"
		 * pseudo-encoding is deprecated as of PHP 8.2. */
		$entlen = strlen($e[0]) + 2;
		$utf8len = strlen(
			mb_chr(hexdec($e[1]), "UTF-8") . mb_chr(hexdec($e[2]), "UTF-8"));
		if ($utf8len > $entlen*1.2) {
			die("violated assumption for traverse_for_entities");
		}
	}
}

/* one NULL-terminated array per non-empty bucket */
for ($i = 0; $i < $numelems; $i++) {
	if (empty($hashes[$i]))
		continue;
	echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
	foreach ($hashes[$i] as $h) {
		if (isset($h[2])) {
			echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
				$h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
		} else {
			echo sprintf(' {"%s", %d, 0x%05X, 0},',
				$h[0], strlen($h[0]), hexdec($h[1]));
		}
	}
	echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

/* bucket index: every slot names its bucket array or the shared empty one */
echo
"static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

for ($i = 0; $i < $numelems; $i++) {
	if ($i == 0) echo "\t";
	elseif ($i % 4 == 0) echo "\n\t";
	else echo " ";
	if (empty($hashes[$i]))
		echo "ht_bucket_empty,";
	else
		echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo
"static const entity_ht ent_ht_{$ident} = {
	", sprintf("0x%X", $numelems), ",
	ht_buckets_{$ident}
};\n\n";

echo
"/* end of $name hash table for entity -> codepoint }}} */\n\n";
794
/*
 * Pass sequencing. The entry keyed by the number of passes completed so far
 * describes the entity list to process next; once no entry is left the
 * include guard is closed and the script ends.
 */
$followUpPasses = array(
    0 => array("file" => "ents_html401.txt",    "name" => "HTML 4.01",                  "ident" => "html4"),
    1 => array("file" => "ents_basic.txt",      "name" => "Basic entities (no apos)",   "ident" => "be_noapos"),
    2 => array("file" => "ents_basic_apos.txt", "name" => "Basic entities (with apos)", "ident" => "be_apos"),
);

/* $pass2 is false during the first pass, which counts as 0 completed passes */
$completedPasses = (int)$pass2;
if (isset($followUpPasses[$completedPasses])) {
    $nextPass = $followUpPasses[$completedPasses];
    $data = file_get_contents($nextPass["file"]);
    if ($data === false)
        die("could not read {$nextPass['file']}");
    $pass2 = $completedPasses + 1;
    $name = $nextPass["name"];
    $ident = $nextPass["ident"];
    goto again;
}

echo "#endif /* HTML_TABLES_H */\n";
816