1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this part: Marcus Boerger <helly@php.net>
22  *
23  */
24 /*
25  * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #include <string.h>
31 #include "mbfilter.h"
32 #include "mbfilter_htmlent.h"
33 #include "html_entities.h"
34 
35 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
36 static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37 
/*
 * Per-code-point classification table for values 0x00..0xFF, consulted by
 * the encoders in this file.
 *
 *   0 - emitted unchanged
 *   1 - emitted as an HTML entity (all of 0x80..0xFF)
 *   2 - marks '&', '<' and '>'; the encoders in this file only test for the
 *       value 1, so these are emitted unchanged as well
 *
 * Entries not listed explicitly are zero-initialised.
 */
static const int htmlentitifieds[256] = {
	['&'] = 2,
	['<'] = 2,
	['>'] = 2,

	[0x80] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0x90] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xA0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xB0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xC0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xD0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xE0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xF0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
56 
/* NULL-terminated list of alias names, referenced from the mbfl_encoding_html_ent initializer. */
57 static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
58 
/*
 * Encoding descriptor for "HTML-ENTITIES".
 *
 * The initializer is positional; the meaning of each slot is fixed by the
 * mbfl_encoding declaration, which is not part of this file (presumably in
 * mbfilter.h -- confirm there before reordering anything). Visible here: the
 * encoding id, the name string given twice, the alias list, the two filter
 * vtables (HTML => wchar and wchar => HTML) and the two buffer-based
 * conversion functions defined later in this file.
 */
59 const mbfl_encoding mbfl_encoding_html_ent = {
60 	mbfl_no_encoding_html_ent,
61 	"HTML-ENTITIES",
62 	"HTML-ENTITIES",
63 	mbfl_encoding_html_ent_aliases,
64 	NULL,
65 	MBFL_ENCTYPE_GL_UNSAFE,
66 	&vtbl_html_wchar,
67 	&vtbl_wchar_html,
68 	mb_htmlent_to_wchar,
69 	mb_wchar_to_htmlent,
70 	NULL,
71 	NULL,
72 };
73 
/*
 * Filter vtable for wchar => HTML-ENTITIES.
 *
 * Positional initializer: source encoding id, destination encoding id, then
 * function slots. Judging by the functions supplied, the slots are
 * constructor, destructor (none), per-character function, flush function and
 * copy function (none) -- the authoritative layout is the
 * mbfl_convert_vtbl declaration, which is not visible in this file.
 */
74 const struct mbfl_convert_vtbl vtbl_wchar_html = {
75 	mbfl_no_encoding_wchar,
76 	mbfl_no_encoding_html_ent,
77 	mbfl_filt_conv_common_ctor,
78 	NULL,
79 	mbfl_filt_conv_html_enc,
80 	mbfl_filt_conv_html_enc_flush,
81 	NULL,
82 };
83 
/*
 * Filter vtable for HTML-ENTITIES => wchar.
 *
 * Positional initializer: source encoding id, destination encoding id, then
 * the decoder's constructor, destructor, per-character function, flush
 * function and copy function, all defined in this file. The decoder supplies
 * its own ctor/dtor/copy because it keeps a heap buffer in filter->opaque.
 * The authoritative slot layout is the mbfl_convert_vtbl declaration, which
 * is not visible in this file.
 */
84 const struct mbfl_convert_vtbl vtbl_html_wchar = {
85 	mbfl_no_encoding_html_ent,
86 	mbfl_no_encoding_wchar,
87 	mbfl_filt_conv_html_dec_ctor,
88 	mbfl_filt_conv_html_dec_dtor,
89 	mbfl_filt_conv_html_dec,
90 	mbfl_filt_conv_html_dec_flush,
91 	mbfl_filt_conv_html_dec_copy,
92 };
93 
94 
/*
 * Evaluate `statement` once; if its value is negative, return -1 from the
 * enclosing function. Only usable inside functions that return int.
 */
95 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
96 
97 /*
98  * any => HTML
99  */
/*
 * Encode one code point for HTML-ENTITIES output.
 *
 * Code points below 256 whose htmlentitifieds[] entry is not 1 are passed to
 * the output function unchanged. Everything else is written as "&name;" when
 * mbfl_html_entity_list has an entry with a matching code, and as the decimal
 * numeric reference "&#NNN;" otherwise.
 *
 * Returns 0 on success, or -1 as soon as the output function reports a
 * negative result (output emitted up to that point is not rolled back).
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const size_t table_size = sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]);
	const mbfl_html_entity_entry *entry;

	/* A negative c converts to a huge size_t, so it always takes the entity path. */
	if ((size_t)c < table_size && htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Linear search; stops on the first matching code or on the NULL-name sentinel. */
	entry = &mbfl_html_entity_list[0];
	while (entry->name != NULL && entry->code != c) {
		entry++;
	}

	if (entry->name != NULL) {
		const char *name = entry->name;

		while (*name != '\0') {
			CK((*filter->output_function)((int)*name, filter->data));
			name++;
		}
	} else {
		/* No named entity: emit '#' followed by the value in decimal. */
		char digits[16];
		char *cursor = digits + sizeof(digits);
		unsigned int remaining = (unsigned int)c;

		CK((*filter->output_function)('#', filter->data));

		/* Digits are produced least-significant first, so fill from the end. */
		do {
			*--cursor = (char)('0' + remaining % 10);
			remaining /= 10;
		} while (remaining != 0);

		while (cursor != digits + sizeof(digits)) {
			CK((*filter->output_function)(*cursor, filter->data));
			cursor++;
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return 0;
}
145 
/*
 * Flush handler for the wchar => HTML-ENTITIES filter.
 *
 * The encoder keeps no pending data, so this only clears status/opaque and
 * forwards the flush to the next stage when one is installed. The downstream
 * result is ignored; the function always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}
	return 0;
}
157 
158 /*
159  * HTML => any
160  */
/*
 * Size bound for the decoder's pending-entity buffer. The buffer itself is
 * allocated with one extra byte (html_enc_buffer_size+1), and
 * mbfl_filt_conv_html_dec gives up on a sequence once status+1 reaches this
 * value.
 */
161 #define html_enc_buffer_size	16
/* Characters mbfl_filt_conv_html_dec accepts (via strchr) inside a pending "&...;" sequence. */
162 static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
163 
/*
 * Constructor for the HTML-ENTITIES => wchar filter.
 *
 * Resets the state and allocates the buffer (html_enc_buffer_size + 1 bytes)
 * that collects a pending "&..." sequence. The buffer contents are left
 * uninitialised; status == 0 means nothing is pending.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	void *pending_buf;

	filter->status = 0;

	pending_buf = emalloc(html_enc_buffer_size + 1);
	filter->opaque = pending_buf;
}
169 
/*
 * Destructor for the HTML-ENTITIES => wchar filter: releases the pending
 * entity buffer (if any) and leaves the filter with status 0 and a NULL
 * opaque pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *pending_buf = filter->opaque;

	filter->status = 0;
	filter->opaque = NULL;

	if (pending_buf != NULL) {
		efree(pending_buf);
	}
}
179 
/*
 * Decode one input byte of HTML-ENTITIES text (HTML => any).
 *
 * State: filter->opaque is the pending-sequence buffer allocated by the
 * constructor, and filter->status is the number of bytes of a pending "&..."
 * sequence stored in it (0 when no sequence is being collected).
 *
 *  - Outside a sequence, '&' starts one and every other byte is passed on.
 *  - Inside a sequence, ';' terminates it: "&#NNN;" / "&#xHHH;" are decoded
 *    as numeric references (values up to 0x10FFFF), anything else is looked
 *    up by name in mbfl_html_entity_list. A sequence that cannot be decoded
 *    is emitted verbatim, including the ';'.
 *  - Any other byte is appended to the sequence. If the byte is not a legal
 *    entity character, if the buffer limit is reached, or if '#' appears
 *    anywhere but directly after '&', the collected bytes are emitted
 *    verbatim. An '&' in that position immediately starts a new sequence.
 *
 * Returns 0 on success. A negative result from the output function is
 * reported as -1 (CK); in the verbatim-flush loops any non-zero result is
 * returned as is.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos;
	unsigned int ent = 0;
	const mbfl_html_entity_entry *entity;
	unsigned char *buffer = (unsigned char*)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return 0;
	}

	if (c == ';') {
		/*
		 * buffer[1] has only been written for this sequence when status > 1;
		 * for "&;" it would be uninitialised or left over from an earlier entity.
		 */
		if (filter->status > 1 && buffer[1] == '#') {
			/* numeric entity: "&#" digits, or "&#x" / "&#X" hex digits */
			unsigned int base = 10;
			int valid = 1;

			pos = 2;
			if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
				base = 16;
				pos = 3;
			}
			if (pos >= filter->status) {
				/* "&#;" or "&#x;": no digits at all */
				valid = 0;
			}
			for (; pos < filter->status; pos++) {
				int v = buffer[pos];

				if (v >= '0' && v <= '9') {
					v = v - '0';
				} else if (base == 16 && v >= 'A' && v <= 'F') {
					v = v - 'A' + 10;
				} else if (base == 16 && v >= 'a' && v <= 'f') {
					v = v - 'a' + 10;
				} else {
					valid = 0;
					break;
				}
				ent = ent * base + (unsigned int)v;
				/*
				 * Range-check after every digit. ent never exceeds 0x10FFFF
				 * before the multiply, so the accumulation cannot wrap around
				 * and turn an out-of-range reference into a valid code point.
				 */
				if (ent > 0x10FFFF) {
					valid = 0;
					break;
				}
			}

			if (valid) {
				CK((*filter->output_function)(ent, filter->data));
			} else {
				/* not decodable: pass the raw text through, terminator included */
				for (pos = 0; pos < filter->status; pos++) {
					CK((*filter->output_function)(buffer[pos], filter->data));
				}
				CK((*filter->output_function)(c, filter->data));
			}
			filter->status = 0;
		} else {
			/* named entity */
			buffer[filter->status] = 0;
			for (entity = &mbfl_html_entity_list[0]; entity->name; entity++) {
				if (!strcmp((const char*)buffer+1, entity->name)) {
					ent = entity->code;
					break;
				}
			}
			if (ent) {
				/* decoded */
				CK((*filter->output_function)(ent, filter->data));
				filter->status = 0;
			} else {
				/* failure */
				buffer[filter->status++] = ';';
				buffer[filter->status] = 0;

				/* flush fragments */
				pos = 0;
				while (filter->status--) {
					int e = (*filter->output_function)(buffer[pos++], filter->data);
					if (e != 0)
						return e;
				}
				filter->status = 0;
			}
		}
		return 0;
	}

	/* add character */
	buffer[filter->status++] = (unsigned char)c;

	/*
	 * strchr() also matches the string's terminating NUL, so c == 0 (and any
	 * value outside the ASCII range, which would be truncated to char) must be
	 * rejected before the lookup. Otherwise a NUL byte is stored as part of the
	 * entity name and the strcmp() above compares a truncated name.
	 */
	if (c <= 0 || c > 0x7F || !strchr(html_entity_chars, c)
			|| filter->status+1 == html_enc_buffer_size
			|| (c == '#' && filter->status > 2)) {
		/* illegal character or end of buffer */
		if (c == '&')
			filter->status--;	/* the '&' is not flushed; it opens the next sequence */
		buffer[filter->status] = 0;

		pos = 0;
		while (filter->status--) {
			int e = (*filter->output_function)(buffer[pos++], filter->data);
			if (e != 0)
				return e;
		}
		filter->status = 0;

		if (c == '&') {
			buffer[filter->status++] = '&';
		}
	}
	return 0;
}
306 
/*
 * Flush handler for the HTML-ENTITIES => wchar filter.
 *
 * Emits the bytes of an unterminated "&..." sequence verbatim, resets the
 * state, and forwards the flush to the next stage when one is installed.
 * All pending bytes are attempted even if the output function fails.
 *
 * Returns 0, or the last non-zero result from the output function. The
 * downstream flush result is ignored.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *pending = (const unsigned char *)filter->opaque;
	int remaining = filter->status;
	int result = 0;

	filter->status = 0;

	/* flush fragments */
	for (int idx = 0; remaining != 0; remaining--, idx++) {
		int rc = (*filter->output_function)(pending[idx], filter->data);

		if (rc != 0) {
			result = rc;
		}
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return result;
}
330 
/*
 * Copy handler for the HTML-ENTITIES => wchar filter.
 *
 * Copies every field of *src into *dest, then gives dest its own
 * pending-entity buffer holding a byte-for-byte copy of src's, so the two
 * filters do not share storage. Whatever dest->opaque pointed to before the
 * call is overwritten without being freed.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const size_t buf_bytes = html_enc_buffer_size + 1;

	*dest = *src;

	dest->opaque = emalloc(buf_bytes);
	memcpy(dest->opaque, src->opaque, buf_bytes);
}
337 
/*
 * Return true when c may appear between '&' and ';' in an entity:
 * an ASCII digit, an ASCII letter, or '#'.
 */
static bool is_html_entity_char(unsigned char c)
{
	if (c == '#') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	return c >= 'A' && c <= 'Z';
}
342 
mb_htmlent_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)343 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
344 {
345 	unsigned char *p = *in, *e = p + *in_len;
346 	uint32_t *out = buf, *limit = buf + bufsize;
347 
348 	while (p < e && out < limit) {
349 		unsigned char c = *p++;
350 
351 		if (c == '&') {
352 			/* Find terminating ; for HTML entity */
353 			unsigned char *terminator = p;
354 			while (terminator < e && is_html_entity_char(*terminator))
355 				terminator++;
356 			if (terminator < e && *terminator == ';') {
357 				if (*p == '#' && (e - p) >= 2) {
358 					/* Numeric entity */
359 					unsigned int value = 0;
360 					unsigned char *digits = p + 1;
361 					if (*digits == 'x' || *digits == 'X') {
362 						/* Hexadecimal */
363 						digits++;
364 						if (digits == terminator) {
365 							goto bad_entity;
366 						}
367 						while (digits < terminator) {
368 							unsigned char digit = *digits++;
369 							if (digit >= '0' && digit <= '9') {
370 								value = (value * 16) + (digit - '0');
371 							} else if (digit >= 'A' && digit <= 'F') {
372 								value = (value * 16) + (digit - 'A' + 10);
373 							} else if (digit >= 'a' && digit <= 'f') {
374 								value = (value * 16) + (digit - 'a' + 10);
375 							} else {
376 								goto bad_entity;
377 							}
378 						}
379 					} else {
380 						/* Decimal */
381 						if (digits == terminator) {
382 							goto bad_entity;
383 						}
384 						while (digits < terminator) {
385 							unsigned char digit = *digits++;
386 							if (digit >= '0' && digit <= '9') {
387 								value = (value * 10) + (digit - '0');
388 							} else {
389 								goto bad_entity;
390 							}
391 						}
392 					}
393 					if (value > 0x10FFFF) {
394 						goto bad_entity;
395 					}
396 					*out++ = value;
397 					p = terminator + 1;
398 					goto next_iteration;
399 				} else if (terminator > p && terminator < e) {
400 					/* Named entity */
401 					mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
402 					while (entity->name) {
403 						if (!strncmp((char*)p, entity->name, terminator - p) && strlen(entity->name) == terminator - p) {
404 							*out++ = entity->code;
405 							p = terminator + 1;
406 							goto next_iteration;
407 						}
408 						entity++;
409 					}
410 				}
411 			}
412 			/* Either we didn't find ;, or the name of the entity was not recognized */
413 bad_entity:
414 			*out++ = '&';
415 			while (p < terminator && out < limit) {
416 				*out++ = *p++;
417 			}
418 			if (terminator < e && *terminator == ';' && out < limit) {
419 				*out++ = *p++;
420 			}
421 		} else {
422 			*out++ = c;
423 		}
424 
425 next_iteration: ;
426 	}
427 
428 	*in_len = e - p;
429 	*in = p;
430 	return out - buf;
431 }
432 
/*
 * Convert `len` code points from `in` to HTML-ENTITIES text appended to `buf`.
 *
 * Code points below 256 whose htmlentitifieds[] entry is not 1 are written
 * as a single byte. All others are written as "&name;" when
 * mbfl_html_entity_list has a matching code, or as the decimal numeric
 * reference "&#NNN;" otherwise. `end` is not used.
 */
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* One byte per input code point is reserved up front; entities reserve more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		uint32_t w = *in++;
		len--;	/* from here on, len counts the code points still to come */

		if (w < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[w] != 1) {
			/* Fast path for most ASCII characters */
			out = mb_convert_buf_add(out, w);
			continue;
		}

		out = mb_convert_buf_add(out, '&');

		/* See if there is a matching named entity */
		const mbfl_html_entity_entry *match = &mbfl_html_entity_list[0];
		while (match->name && w != match->code) {
			match++;
		}

		if (match->name) {
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + strlen(match->name));
			for (const char *ch = match->name; *ch; ch++) {
				out = mb_convert_buf_add(out, *ch);
			}
			out = mb_convert_buf_add(out, ';');
			continue;
		}

		/* There is no matching named entity; emit a numeric entity instead.
		 * 12 = '#' + at most 10 decimal digits of a 32-bit value + ';' */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12);
		out = mb_convert_buf_add(out, '#');

		unsigned char digits[12];
		unsigned char *first = digits + sizeof(digits);
		/* Least-significant digit first, filled from the end; yields "0" for w == 0 */
		do {
			*(--first) = "0123456789"[w % 10];
			w /= 10;
		} while (w);
		for (; first < digits + sizeof(digits); first++) {
			out = mb_convert_buf_add(out, *first);
		}

		out = mb_convert_buf_add(out, ';');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
488