1<?php
2/*
3   +----------------------------------------------------------------------+
4   | PHP Version 5                                                        |
5   +----------------------------------------------------------------------+
6   | Copyright (c) 1997-2010 The PHP Group                                |
7   +----------------------------------------------------------------------+
8   | This source file is subject to version 3.01 of the PHP license,      |
9   | that is bundled with this package in the file LICENSE, and is        |
10   | available through the world-wide-web at the following url:           |
11   | http://www.php.net/license/3_01.txt                                  |
12   | If you did not receive a copy of the PHP license and are unable to   |
13   | obtain it through the world-wide-web, please send a note to          |
14   | license@php.net so we can mail you a copy immediately.               |
15   +----------------------------------------------------------------------+
16   | Authors: Gustavo Lopes  <cataphract@php.net>                         |
17   +----------------------------------------------------------------------+
18*/
19
20/* This file prints to stdout the contents of ext/standard/html_tables.h */
21/* put together with glue; have patience */
22
/* Fixed prologue of the generated header: license banner, include guard,
 * charset enum and charset name map, plus the types and macros used by the
 * single-byte "to Unicode" tables. The text is a printf-style template whose
 * only placeholder is %s (the copyright end year).
 * ENT_ENC_TO_UNI_STAGE1 parenthesises its argument so that it expands safely
 * for compound expressions, like ENT_ENC_TO_UNI_STAGE2 already does. */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

/* \$Id$ */

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		cs_8859_1 },
	{ "ISO8859-1",		cs_8859_1 },
	{ "ISO-8859-15",	cs_8859_15 },
	{ "ISO8859-15",		cs_8859_15 },
	{ "utf-8",			cs_utf_8 },
	{ "cp1252", 		cs_cp1252 },
	{ "Windows-1252",	cs_cp1252 },
	{ "1252",			cs_cp1252 },
	{ "BIG5",			cs_big5 },
	{ "950",			cs_big5 },
	{ "GB2312",			cs_gb2312 },
	{ "936",			cs_gb2312 },
	{ "BIG5-HKSCS",		cs_big5hkscs },
	{ "Shift_JIS",		cs_sjis },
	{ "SJIS",			cs_sjis },
	{ "932",			cs_sjis },
	{ "SJIS-win",		cs_sjis },
	{ "CP932",			cs_sjis },
	{ "EUCJP",			cs_eucjp },
	{ "EUC-JP",			cs_eucjp },
	{ "eucJP-win",		cs_eucjp },
	{ "KOI8-R",			cs_koi8r },
	{ "koi8-ru",		cs_koi8r },
	{ "koi8r",			cs_koi8r },
	{ "cp1251",			cs_cp1251 },
	{ "Windows-1251",	cs_cp1251 },
	{ "win-1251",		cs_cp1251 },
	{ "iso8859-5",		cs_8859_5 },
	{ "iso-8859-5",		cs_8859_5 },
	{ "cp866",			cs_cp866 },
	{ "866",			cs_cp866 },
	{ "ibm866",			cs_cp866 },
	{ "MacRoman",		cs_macroman },
	{ NULL }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
    unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
    const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) (((k) & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

/* fill in the current year and print the prologue */
echo sprintf($t, date("Y"));
126
/* Single-byte encodings for which "to Unicode" tables are generated.
 *   ident     - suffix used in the names of the generated C symbols
 *   enumid    - slot of the encoding in the generated enc_to_uni_index[];
 *               the values follow the order of enum entity_charset as it
 *               is printed in the header prologue
 *   enumident - optional; not read by the generation code in this script
 *   name      - human readable name, only used in generated comments
 *   file      - mapping file, one "0xXX<TAB>0xYYYY" pair per line */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        "enumident" => "cp1251", /* was "cp1252": copy-paste slip */
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);
179
/* Stage 2 tables emitted so far, grouped by stage index (0-3) and keyed by
 * the ident of the encoding that emitted them. An encoding whose 64-entry
 * block is identical to one already emitted for the same stage reuses that
 * table instead of printing a duplicate. Recording the owner explicitly
 * replaces the former "$encodings[$t[0]/5]" lookup, which used a fractional
 * float as an array index and did not really identify the owning encoding. */
$prevStage2 = array(array(), array(), array(), array());

foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file: keep every "0xXX<TAB>0xYYYY" (byte, code point) pair */
    $map = array();
    $lines = explode("\n", file_get_contents($e['file']));
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $map[] = array($matches[1], $matches[2]);
    }

    /* byte value => Unicode code point */
    $mappy = array();
    foreach ($map as $v) { $mappy[hexdec($v[0])] = hexdec($v[1]); }

    $mstable = array("ident" => $e['ident']);
    /* calculate two-stage tables: stage 1 is selected by the two high bits
     * of the byte, stage 2 by the six low bits; NULL means "no mapping" */
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo
"/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    $s2tables_idents = array();
    for ($i = 0; $i < 4; $i++) {
        /* strict comparison: an unmapped slot (NULL) must not be treated
         * as equal to a slot mapped to U+0000 */
        $owner = array_search($mstable[$i], $prevStage2[$i], true);
        if ($owner !== false) {
            $s2tables_idents[$i] = $owner;
            continue;
        }

        $s2tables_idents[$i] = $e["ident"];

        echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$e['ident']}_".
            sprintf("%02X", $i << 6)." = { {\n";
        for ($j = 0; $j < 64; $j++) {
            /* six values per output line */
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                echo sprintf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[$i][$e['ident']] = $mstable[$i];
    }

    echo
"/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo
"/* {{{ Stage 1 table for {$e['name']} */\n";

    /* the stage 1 table points at the four stage 2 tables, own or shared */
    echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&enc_to_uni_s2_{$s2tables_idents[0]}_00,
\t&enc_to_uni_s2_{$s2tables_idents[1]}_40,
\t&enc_to_uni_s2_{$s2tables_idents[2]}_80,
\t&enc_to_uni_s2_{$s2tables_idents[3]}_C0 }
};
";

    echo
"/* end of stage 1 table for {$e['name']} }}} */\n\n";
}
252
/* Build the enumid => ident lookup. Slots that no encoding claims keep the
 * numeric placeholder produced by range(), which is how they are recognised
 * (and printed as NULL) below. */
$enumIds = array();
foreach ($encodings as $e) {
    $enumIds[] = $e['enumid'];
}
$maxencnum = max($enumIds);

$a = range(0, $maxencnum);
foreach ($encodings as $e) {
    $a[$e['enumid']] = $e['ident'];
}

$indexTable = "/* {{{ Index of tables for encoding conversion */\n"
    . "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";
foreach ($a as $v) {
    $indexTable .= is_numeric($v) ? "\tNULL,\n" : "\t&enc_to_uni_$v,\n";
}
$indexTable .= "};\n/* }}} */\n";

echo $indexTable;
271
/* C type shared by all "from Unicode" tables, printed verbatim. */
$t = implode("\n", array(
    "",
    "/* Definitions for mappings *from* Unicode */",
    "",
    "typedef struct {",
    "\tunsigned short un_code_point; /* we don't need bigger */",
    "\tunsigned char cs_code; /* currently, we only have maps to single-byte encodings */",
    "} uni_to_enc;",
    "",
    "",
));

echo $t;
285
/* Encodings for which "from Unicode" tables are generated. Each row is
 * (ident, name, mapping file, first byte, last byte); only bytes inside
 * the inclusive [first, last] range end up in the generated table. */
$encodings = array();
foreach (array(
    array("iso885915", "ISO-8859-15",  "mappings/8859-15.TXT", 0xA4, 0xBE),
    array("win1252",   "Windows-1252", "mappings/CP1252.TXT",  0x80, 0x9F),
    array("win1251",   "Windows-1251", "mappings/CP1251.TXT",  0x80, 0xFF),
    array("koi8r",     "KOI8-R",       "mappings/KOI8-R.TXT",  0x80, 0xFF),
    array("cp866",     "CP-866",       "mappings/CP866.TXT",   0x80, 0xFF),
    array("macroman",  "MacRoman",     "mappings/ROMAN.TXT",   0x80, 0xFF),
) as $row) {
    $encodings[] = array(
        "ident" => $row[0],
        "name"  => $row[1],
        "file"  => $row[2],
        "range" => array($row[3], $row[4]),
    );
}
324
/* For every encoding print unimap_<ident>[]: the (Unicode code point,
 * encoded byte) pairs for the bytes inside the configured range, sorted by
 * code point. The trailing comment of each entry is the lower-cased
 * description taken from the mapping file. */
foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file: "0xXX<TAB>0xYYYY   # description" */
    $map = array();
    $lines = explode("\n", file_get_contents($e['file']));
    foreach ($lines as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
    }

    /* code point => (byte, description), restricted to the byte range */
    $mappy = array();
    foreach ($map as $v) {
        if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
            $mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
    }
    ksort($mappy);

    echo
"static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* close the fold opened above, in the same way as the other sections */
    echo
"/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}
356
/* Everything from the "again" label down to the pass dispatcher at the end
 * of the script runs once per entity list. $pass2 records the pass that is
 * in progress: false for this first (HTML5) pass, then 1, 2 and 3 as set by
 * the dispatcher before it jumps back here.
 *   $data  - raw contents of the entity list being processed
 *   $name  - label used in the generated comments
 *   $ident - suffix used in the names of the generated C symbols */
$data = file_get_contents("ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:
362
/* C macros and types used by the codepoint -> entity tables. The text is a
 * nowdoc, so it is emitted verbatim without any interpolation. It is
 * rebuilt on every pass but printed on the first pass only (see below). */
$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* Table should be organized with a leading row telling the size of
 * the table and the default entity (maybe NULL) and the rest being
 * normal rows ordered by code point so that we can do a binary search */
typedef union {
	struct {
		unsigned size; /* number of remaining entries in the table */
		const char *default_entity;
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		unsigned second_cp; /* second code point */
		const char *entity;
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

/* the definitions are shared by all entity sets: print them only once */
if (!$pass2)
    echo $t;
425
/* Parse the entity list. Each useful line reads "name codepoint" or
 * "name codepoint second-codepoint" (code points in hexadecimal); the
 * entries are kept as (name, cp) or (name, cp, second cp) arrays. */
$dp = array();

foreach (explode("\n", $data) as $l) {
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1], $matches[2], $matches[3]);
		continue;
	}
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches))
		$dp[] = array($matches[1], $matches[2]);
}

/* the hash tables are built from the entries in file order */
$origdp = $dp;

/* order by (first) code point for the multi-stage tables */
usort($dp, function($left, $right) { return hexdec($left[1])-hexdec($right[1]); });

/* First pass: reserve a row for every code point that starts at least one
 * two-code-point entity. */
$multicp_rows = array();
foreach ($dp as $el) {
	if (count($el) == 3)
		$multicp_rows[$el[1]] = array();
}

/* Second pass: file every entity of those code points under its second
 * code point, or under "default" when it consists of one code point only. */
foreach ($dp as $el) {
	if (!isset($multicp_rows[$el[1]]))
		continue;
	$slot = (count($el) == 3) ? $el[2] : "default";
	$multicp_rows[$el[1]][$slot] = $el[0];
}
456
/* Opening comment of the codepoint -> entity section; the two basic entity
 * passes ($pass2 >= 2) do not get a multi-stage table. */
if ($pass2 < 2)
    echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
    echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

/* nothing to print when no entity of this set spans two code points */
if (empty($multicp_rows))
    goto skip_multicp;

ksort($multicp_rows);
foreach ($multicp_rows as &$v) { ksort($v); }
unset($v);

echo
"/* {{{ Start of double code point tables for $name */", "\n\n";

/* One table per first code point: a leading row holding the number of
 * normal rows and the single-code-point ("default") entity, if any,
 * followed by one row per second code point. */
foreach ($multicp_rows as $k => $v) {
	/* Format the parsed value rather than the raw text of the data file:
	 * the stage 3 tables refer to these symbols as "%05X" of the numeric
	 * code point, so both spellings must be derived the same way. */
	echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
		sprintf("%05X", hexdec($k)), "[] = {", "\n";
	if (key_exists("default", $v)) {
        if ($v['default'] == 'GT') /* hack to make > translate to &gt; not GT; */
            $v['default'] = "gt";
		echo "\t{ {", sprintf("%02d", count($v) - 1),
			",\t\t", sprintf("\"%-21s", $v["default"].'",'), "\t",
            sprintf("% 2d", strlen($v["default"])), '} },', "\n";
	} else {
		echo "\t{ {", sprintf("%02d", count($v)),
			",\t\t", sprintf("%-22s", 'NULL'), ",\t0} },\n";
	}
	unset($v["default"]);
	foreach ($v as $l => $w) {
		/* $l is the second code point, $w the entity name */
		echo "\t{ {", sprintf("0x%05X", hexdec($l)), ",\t", sprintf("\"%-21s", $w.'",'), "\t",
            sprintf("% 2d", strlen($w)), '} },', "\n";
	}
	echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:
495
if ($pass2 < 2)
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

/* Shared all-empty stage 3 table: 16 lines of 4 empty cells each. */
$emptyStage3Line = "\t" . implode(" ", array_fill(0, 4, "{0, { {NULL, 0} } },")) . "\n";
$t = "static const entity_stage3_row empty_stage3_table[] = {\n"
    . "\t/* 64 elements */\n"
    . str_repeat($emptyStage3Line, 16)
    . "};\n";

/* every entity set refers to the same empty table: print it only once */
if (!$pass2)
    echo $t;
524
/* $mstable[stage1][stage2][stage3] holds the entity name of a code point,
 * or "" when the code point has a double code point table of its own. */
$mstable = array();
foreach ($dp as $el) {
	$codepoint = hexdec($el[1]);
	$cell = key_exists($el[1], $multicp_rows) ? "" : $el[0];
	$mstable[($codepoint & 0xFFF000) >> 12][($codepoint & 0xFC0) >> 6][$codepoint & 0x3F] = $cell;
}

/* Print one 64-cell stage 3 table for every block that contains at least
 * one entity, four cells per output line. */
for ($i = 0; $i < 0x1E; $i++) {
	for ($k = 0; $k < 64; $k++) {
		if (!isset($mstable[$i][$k]))
			continue;

		$cells = array();
		for ($l = 0; $l < 64; $l++) {
			$z = isset($mstable[$i][$k][$l]) ? $mstable[$i][$k][$l] : NULL;
			if ($z === NULL) {
				$cells[] = "{0, { {NULL, 0} } },";
			} elseif ($z === "QUOT") {
				/* hack to translate " into &quot;, not &QUOT; */
				$cells[] = "{0, { {\"quot\", 4} } },";
			} elseif ($z !== "") {
				$cells[] = "{0, { {\"$z\", " . strlen($z) . "} } },";
			} else {
				/* ambiguous code point: point at its double code point table */
				$cells[] = "{1, { {(void *)" . sprintf("multi_cp_{$ident}_%05X",
					($i << 12) | ($k << 6) | $l ) . "} } },";
			}
		}

		$outLines = array();
		foreach (array_chunk($cells, 4) as $chunk)
			$outLines[] = implode(" ", $chunk);

		echo "static const entity_stage3_row stage3_table_{$ident}_",
			sprintf("%02X%03X", $i, $k << 6), "[] = {\n",
			"\t", implode("\n\t", $outLines),
			"\n};\n\n";
	}
}

/* the basic entity passes only need the stage 3 table and the hash table */
if ($pass2 > 1)
    goto hashtables;

echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
577
echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

/* Shared all-empty stage 2 table: 16 lines of 4 pointers to the empty
 * stage 3 table. */
$emptyStage2Line = "\t" . implode(" ", array_fill(0, 4, "empty_stage3_table,")) . "\n";
$t = "static const entity_stage2_row empty_stage2_table[] = {\n"
    . str_repeat($emptyStage2Line, 16)
    . "};\n";

if (!$pass2)
    echo $t;

/* One stage 2 table for every stage 1 slot that has entities; each of its
 * 64 cells names the matching stage 3 table or the shared empty one. */
for ($i = 0; $i < 0x1E; $i++) {
	if (!isset($mstable[$i]))
		continue;

	$cells = array();
	for ($k = 0; $k < 64; $k++) {
		if (isset($mstable[$i][$k]))
			$cells[] = sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)) . ",";
		else
			$cells[] = "empty_stage3_table" . ",";
	}

	$outLines = array();
	foreach (array_chunk($cells, 4) as $chunk)
		$outLines[] = implode(" ", $chunk);

	echo "static const entity_stage2_row stage2_table_{$ident}_",
		sprintf("%02X000", $i), "[] = {\n",
		"\t", implode("\n\t", $outLines),
		"\n};\n\n";
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

/* Stage 1 table: one entry per 0x1000 code points up to 0x1DFFF. */
$stage1 = "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
	$stage1 .= isset($mstable[$i])
		? "\t" . sprintf("stage2_table_{$ident}_%02X000", $i) . ",\n"
		: "\tempty_stage2_table,\n";
}
echo $stage1, "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
643
644/* commented-out; this enabled binary search, which turned out to be
645 * significantly slower than the hash tables for html 5 entities */
646//echo
647//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
648
649//$t = <<<CODE
650//typedef struct {
651//	const char *entity;
652//	unsigned short entity_len;
653//	unsigned int codepoint1;
654//	unsigned int codepoint2;
655//} entity_cp_map;
656//
657//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
658//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
659//
660//static const entity_cp_map html5_ent_cp_map[] = {
661//
662//CODE;
663//echo $t;
664//
665//$dp = $origdp;
666//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
667//	return $d==0?strcmp($a[0], $b[0]):$d; });
668//
669//$k = 0;
670//foreach ($dp as $o) {
671//	if ($k == 0) echo "\t";
672//	elseif ($k % 3 == 0) echo "\n\t";
673//	else echo " ";
674//	if (isset($o[2]))
675//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
676//			hexdec($o[1]), hexdec($o[2]));
677//	else
678//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
679//			hexdec($o[1]));
680//
681//	if (isset($o[2])) {
682//		$entlen = strlen($o[0]) + 2;
683//		$utf8len = strlen(
684//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
685//		if ($utf8len > $entlen*1.2) {
686//			die("violated assumption for traverse_for_entities");
687//		}
688//	}
689//
690//	$k++;
691//}
692//echo "\n};\n\n";
693//
694//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
695//
696//echo
697//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
698
hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */", "\n\n";

/* C types of the entity -> codepoint hash tables, plus the shared bucket
 * used for every empty slot. */
$t = implode("\n", array(
	"typedef struct {",
	"\tconst char *entity;",
	"\tunsigned short entity_len;",
	"\tunsigned int codepoint1;",
	"\tunsigned int codepoint2;",
	"} entity_cp_map;",
	"",
	"typedef const entity_cp_map *entity_ht_bucket;",
	"",
	"typedef struct {",
	"\tunsigned num_elems; /* power of 2 */",
	"\tconst entity_ht_bucket *buckets; /* .num_elems elements */",
	"} entity_ht;",
	"",
	"static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };",
	"",
));

/* the types are the same for every entity set: print them only once */
if (!$pass2)
    echo $t;
725
/**
 * Hash a string: start from 5381 and, for every byte, replace the hash with
 * hash * 33 + byte (written as a shift plus an addition), truncated to
 * 32 bits after each step.
 *
 * @param string $str string to hash; the empty string hashes to 5381
 * @return int the hash, masked with 0xFFFFFFFF
 */
function hashfun($str)
{
	$hash = 5381;
	$length = strlen($str);

	for ($offset = 0; $offset < $length; $offset++) {
		$timesThirtyThree = (int)(((int)($hash << 5)) + $hash);
		$hash = (int)($timesThirtyThree + ord($str[$offset])) & 0xFFFFFFFF;
	}

	return $hash;
}
740
/* Size of the hash table: the smallest power of two that is at least 1.5
 * times the number of entities, with a minimum of 16. */
$numelems = (int)max(pow(2, ceil(log(1.5*count($origdp))/log(2))), 16);
$mask = $numelems - 1;

/* Distribute the entities over the buckets, in file order. */
$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;
	if (isset($e[2])) {
		/* Sanity check for two-code-point entities: the UTF-8 encoding of
		 * both code points must not be longer than 1.2 times the length of
		 * "&name;". The UTF-8 length is computed directly from the code
		 * points, which avoids mbstring's deprecated "HTML-ENTITIES"
		 * pseudo-encoding. */
		$entlen = strlen($e[0]) + 2;
		$utf8len = 0;
		foreach (array($e[1], $e[2]) as $hexcp) {
			$cp = hexdec($hexcp);
			if ($cp < 0x80)
				$utf8len += 1;
			elseif ($cp < 0x800)
				$utf8len += 2;
			elseif ($cp < 0x10000)
				$utf8len += 3;
			else
				$utf8len += 4;
		}
		if ($utf8len > $entlen*1.2) {
			/* report on stderr and fail with a non-zero status, so that
			 * the message does not end up inside a truncated header that
			 * looks like a successful run */
			file_put_contents("php://stderr",
				"violated assumption for traverse_for_entities\n");
			exit(1);
		}
	}
}
755
/* Print every non-empty bucket as an array of entity_cp_map entries that is
 * terminated by an all-zero entry. */
for ($i = 0; $i < $numelems; $i++) {
	if (empty($hashes[$i]))
		continue;

	$bucket = "static const entity_cp_map ht_bucket_{$ident}_" . sprintf("%03X", $i) . "[] = {";
	foreach ($hashes[$i] as $h) {
		$second = isset($h[2]) ? sprintf("0x%05X", hexdec($h[2])) : "0";
		$bucket .= sprintf(' {"%s", %d, 0x%05X, %s},',
			$h[0], strlen($h[0]), hexdec($h[1]), $second);
	}
	echo $bucket, " {NULL, 0, 0, 0} };\n";
}
echo "\n";

/* Array of bucket pointers, four per output line; empty slots share
 * ht_bucket_empty. */
$slots = array();
for ($i = 0; $i < $numelems; $i++) {
	$slots[] = empty($hashes[$i])
		? "ht_bucket_empty,"
		: "ht_bucket_{$ident}_" . sprintf("%03X", $i) . ",";
}

$slotLines = array();
foreach (array_chunk($slots, 4) as $chunk)
	$slotLines[] = implode(" ", $chunk);

echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n",
	"\t", implode("\n\t", $slotLines),
	"\n};\n\n";

/* The hash table itself: number of buckets and the bucket array. */
echo "static const entity_ht ent_ht_{$ident} = {\n",
	"\t", sprintf("0x%X", $numelems), ",\n",
	"\tht_buckets_{$ident}\n",
	"};\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";
795
/* Entity lists that follow the pass that has just finished, indexed by the
 * number of that pass (0 is the initial HTML5 pass, for which $pass2 is
 * false): data file, name for the comments, ident for the C symbols. */
$nextPasses = array(
    0 => array("ents_html401.txt",    "HTML 4.01",                  "html4"),
    1 => array("ents_basic.txt",      "Basic entities (no apos)",   "be_noapos"),
    2 => array("ents_basic_apos.txt", "Basic entities (with apos)", "be_apos"),
);

$finishedPass = (int)$pass2;
if (isset($nextPasses[$finishedPass])) {
    list($nextFile, $name, $ident) = $nextPasses[$finishedPass];
    $data = file_get_contents($nextFile);
    $pass2 = $finishedPass + 1;
    goto again;
}

/* all four entity sets are done: close the include guard */
echo "#endif /* HTML_TABLES_H */\n";
817