1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this part: Marcus Boerger <helly@php.net>
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #include <string.h>
31 #include "mbfilter.h"
32 #include "mbfilter_htmlent.h"
33 #include "html_entities.h"
34 
/*
 * Forward declarations of the buffer-based converters defined at the end of
 * this file. They are declared here because mbfl_encoding_html_ent (below)
 * stores pointers to them.
 */
static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37 
/*
 * Per-code-point lookup table for code points 0..255, indexed by the code
 * point itself.
 *
 *   1 - the encoders in this file emit the code point as an entity
 *       (all of 0x80..0xFF);
 *   0 - emitted verbatim;
 *   2 - set for '&' (0x26), '<' (0x3C) and '>' (0x3E). The encoders in this
 *       file only test for "!= 1", so these are also emitted verbatim; the
 *       0/2 distinction is not consulted anywhere in this file.
 */
static const int htmlentitifieds[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 - 0x0F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 - 0x1F */
  0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F; 0x26 is '&' */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, /* 0x30 - 0x3F; 0x3C is '<', 0x3E is '>' */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 - 0x8F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 - 0x9F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 - 0xAF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 - 0xBF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 - 0xCF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 - 0xDF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 - 0xEF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1  /* 0xF0 - 0xFF */
};
56 
/* NULL-terminated list of alternative names, referenced by mbfl_encoding_html_ent. */
static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
58 
/*
 * Encoding descriptor for the HTML-ENTITIES pseudo-encoding.
 *
 * NOTE(review): the initializer is positional and the declaration of
 * mbfl_encoding is not visible in this file; the per-field notes below are
 * inferred from the values stored and should be confirmed against mbfilter.h.
 */
const mbfl_encoding mbfl_encoding_html_ent = {
	mbfl_no_encoding_html_ent,
	"HTML-ENTITIES", /* presumably the canonical name */
	"HTML-ENTITIES", /* presumably the MIME name */
	mbfl_encoding_html_ent_aliases,
	NULL,
	MBFL_ENCTYPE_GL_UNSAFE,
	&vtbl_html_wchar, /* stream filter: HTML-ENTITIES => wchar */
	&vtbl_wchar_html, /* stream filter: wchar => HTML-ENTITIES */
	mb_htmlent_to_wchar, /* buffer-based decoder defined in this file */
	mb_wchar_to_htmlent, /* buffer-based encoder defined in this file */
	NULL
};
72 
/*
 * Stream-filter table for the wchar => HTML-ENTITIES direction.
 *
 * NOTE(review): slot meanings are inferred from the names of the functions
 * installed here; confirm against the declaration of struct mbfl_convert_vtbl.
 */
const struct mbfl_convert_vtbl vtbl_wchar_html = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_html_ent,
	mbfl_filt_conv_common_ctor,
	NULL, /* no dedicated destructor: the encoder allocates nothing in this file */
	mbfl_filt_conv_html_enc,
	mbfl_filt_conv_html_enc_flush,
	NULL,
};
82 
/*
 * Stream-filter table for the HTML-ENTITIES => wchar direction. The decoder
 * keeps a heap-allocated scratch buffer in filter->opaque, which is why it
 * installs its own constructor, destructor and copy routines.
 *
 * NOTE(review): slot order is inferred from the function names; confirm
 * against the declaration of struct mbfl_convert_vtbl.
 */
const struct mbfl_convert_vtbl vtbl_html_wchar = {
	mbfl_no_encoding_html_ent,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_html_dec_ctor,
	mbfl_filt_conv_html_dec_dtor,
	mbfl_filt_conv_html_dec,
	mbfl_filt_conv_html_dec_flush,
	mbfl_filt_conv_html_dec_copy,
};
92 
93 
/*
 * Evaluate `statement`; if its result is negative, return -1 from the
 * enclosing function. Only usable inside functions that return int.
 */
#define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
95 
96 /*
97  * any => HTML
98  */
/*
 * Stream-filter callback for wchar => HTML-ENTITIES.
 *
 * Writes the code point `c` to filter->output_function in one of three forms:
 *   - verbatim, when `c` indexes htmlentitifieds[] and its entry is not 1;
 *   - "&name;", when mbfl_html_entity_list has an entry whose code equals `c`;
 *   - "&#NNN;", with `c` reinterpreted as unsigned and printed in decimal,
 *     otherwise.
 *
 * Returns 0 on success, or -1 as soon as the output function returns a
 * negative value.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const mbfl_html_entity_entry *entry;
	const char *name;
	char digits[16];
	char *cursor;
	unsigned int value;

	/*
	 * The comparison is performed in an unsigned type, so a negative `c`
	 * compares as a very large value and never reaches the table lookup.
	 */
	if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Linear search of the NULL-name-terminated entity list. */
	for (entry = mbfl_html_entity_list; entry->name != NULL; entry++) {
		if (entry->code == c) {
			break;
		}
	}

	if (entry->name != NULL) {
		/* Named entity */
		for (name = entry->name; *name != '\0'; name++) {
			CK((*filter->output_function)((int)*name, filter->data));
		}
	} else {
		/* Numeric entity: build the decimal digits right-to-left */
		CK((*filter->output_function)('#', filter->data));

		value = (unsigned int)c;
		cursor = digits + sizeof(digits);
		do {
			*(--cursor) = (char)('0' + value % 10);
			value /= 10;
		} while (value != 0);

		while (cursor < digits + sizeof(digits)) {
			CK((*filter->output_function)(*cursor++, filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return 0;
}
144 
/*
 * Flush callback for the wchar => HTML-ENTITIES filter.
 *
 * Resets the filter's status and opaque fields, then invokes the downstream
 * flush function if one is installed. Its result is not inspected; this
 * function always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
156 
157 /*
158  * HTML => any
159  */
/*
 * Size parameter of the decoder's scratch buffer. The constructor and copy
 * routines allocate html_enc_buffer_size+1 bytes, and mbfl_filt_conv_html_dec
 * gives up on a pending entity once status+1 reaches this value.
 */
#define html_enc_buffer_size	16
/* Characters that mbfl_filt_conv_html_dec accepts inside a pending entity (looked up with strchr). */
static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
162 
/*
 * Constructor for the HTML-ENTITIES => wchar filter.
 *
 * Allocates the scratch buffer (html_enc_buffer_size+1 bytes, contents not
 * initialized) that holds a partially received entity, stores it in
 * filter->opaque, and marks the filter as "not inside an entity" (status 0).
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = emalloc(html_enc_buffer_size + 1);
}
168 
/*
 * Destructor for the HTML-ENTITIES => wchar filter.
 *
 * Releases the scratch buffer held in filter->opaque (if any) and leaves the
 * filter with status 0 and a NULL opaque pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	filter->status = 0;

	if (filter->opaque != NULL) {
		efree((void*)filter->opaque);
		filter->opaque = NULL;
	}
}
178 
/*
 * Parse the bytes in [p, end) as an unsigned number in the given radix
 * (10 or 16).
 *
 * Returns the parsed value, or (unsigned int)-1 when the range is empty, when
 * a byte is not a digit of that radix, or when the value exceeds 0x10FFFF.
 * The range check runs after every digit, so the accumulator can never wrap.
 */
static unsigned int html_dec_parse_codepoint(const unsigned char *p, const unsigned char *end, unsigned int radix)
{
	unsigned int value = 0;

	if (p == end) {
		return (unsigned int)-1;
	}

	for (; p < end; p++) {
		unsigned int digit;

		if (*p >= '0' && *p <= '9') {
			digit = *p - '0';
		} else if (radix == 16 && *p >= 'A' && *p <= 'F') {
			digit = *p - 'A' + 10;
		} else if (radix == 16 && *p >= 'a' && *p <= 'f') {
			digit = *p - 'a' + 10;
		} else {
			return (unsigned int)-1;
		}

		value = value * radix + digit;
		if (value > 0x10FFFF) {
			/* Further digits can only make it larger; reject before it can overflow */
			return (unsigned int)-1;
		}
	}

	return value;
}

/*
 * Stream-filter callback for HTML-ENTITIES => wchar.
 *
 * filter->status is the number of bytes of a pending entity currently held in
 * the scratch buffer (filter->opaque); 0 means "not inside an entity". The
 * buffer always starts with '&'.
 *
 *   - Outside an entity, '&' starts buffering and any other value is passed
 *     straight to filter->output_function.
 *   - ';' ends the pending entity. "&#NNN;" and "&#xHHH;" are decoded
 *     numerically and named entities are looked up in mbfl_html_entity_list.
 *     If decoding fails (bad digits, value above 0x10FFFF, unknown name) the
 *     buffered bytes and the ';' are emitted unchanged.
 *   - Any other character is appended to the buffer. If it is not a legal
 *     entity character, if the buffer is full, or if it is a '#' anywhere but
 *     directly after '&', the buffered bytes are emitted unchanged; a '&' in
 *     that position starts a new pending entity instead of being emitted.
 *
 * Returns 0 on success, or the error result of the output function.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos;
	unsigned int ent = 0;
	const mbfl_html_entity_entry *entity;
	unsigned char *buffer = (unsigned char*)filter->opaque;

	if (!filter->status) {
		if (c == '&' ) {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
	} else if (c == ';') {
		/* buffer[1] holds entity data only when status > 1; do not read it for "&;" */
		if (filter->status > 1 && buffer[1] == '#') {
			/* numeric entity */
			const unsigned char *end = buffer + filter->status;

			if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
				ent = html_dec_parse_codepoint(buffer + 3, end, 16);
			} else {
				ent = html_dec_parse_codepoint(buffer + 2, end, 10);
			}

			if (ent < 0x110000) {
				CK((*filter->output_function)(ent, filter->data));
			} else {
				/* not decodable: pass the raw text through */
				for (pos = 0; pos < filter->status; pos++) {
					CK((*filter->output_function)(buffer[pos], filter->data));
				}
				CK((*filter->output_function)(c, filter->data));
			}
			filter->status = 0;
		} else {
			/* named entity */
			buffer[filter->status] = 0;
			for (entity = mbfl_html_entity_list; entity->name; entity++) {
				if (!strcmp((const char*)buffer+1, entity->name)) {
					ent = entity->code;
					break;
				}
			}
			if (ent) {
				/* decoded */
				CK((*filter->output_function)(ent, filter->data));
				filter->status = 0;
			} else {
				/* failure */
				buffer[filter->status++] = ';';
				buffer[filter->status] = 0;

				/* flush fragments */
				pos = 0;
				while (filter->status--) {
					int e = (*filter->output_function)(buffer[pos++], filter->data);
					if (e != 0)
						return e;
				}
				filter->status = 0;
			}
		}
	} else {
		/* add character */
		buffer[filter->status++] = c;
		/*
		 * Check the character just added. strchr() also matches the string's
		 * terminating NUL (and converts its argument to char), so values
		 * outside 1..0x7F are rejected explicitly; otherwise a NUL byte would
		 * be accepted as part of an entity name and then silently dropped.
		 */
		if (c <= 0 || c > 0x7F || !strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
		{
			/* illegal character or end of buffer */
			if (c=='&')
				filter->status--;
			buffer[filter->status] = 0;

			pos = 0;
			while (filter->status--) {
				int e = (*filter->output_function)(buffer[pos++], filter->data);
				if (e != 0)
					return e;
			}
			filter->status = 0;

			if (c=='&')
			{
				/* the '&' was not emitted; it opens a new pending entity */
				buffer[filter->status++] = '&';
			}
		}
	}
	return 0;
}
305 
/*
 * Flush callback for the HTML-ENTITIES => wchar filter.
 *
 * Emits the bytes of any unterminated entity still held in the scratch buffer
 * unchanged, resets the filter's status to 0, and then invokes the downstream
 * flush function if one is installed.
 *
 * Emission continues after an output error. The return value is the last
 * non-zero result from the output function, or 0 if every call returned 0.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *cursor = (unsigned char*)filter->opaque;
	int pending = filter->status;
	int result = 0;

	filter->status = 0;

	/* flush fragments */
	for (; pending != 0; pending--) {
		int rc = (*filter->output_function)(*cursor++, filter->data);
		if (rc != 0) {
			result = rc;
		}
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return result;
}
329 
/*
 * Copy routine for the HTML-ENTITIES => wchar filter.
 *
 * Copies every field of `src` into `dest`, then gives `dest` its own scratch
 * buffer holding a byte-for-byte copy of the source buffer, so the two
 * filters do not share the allocation.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const size_t scratch_size = html_enc_buffer_size + 1;

	*dest = *src;
	dest->opaque = emalloc(scratch_size);
	memcpy(dest->opaque, src->opaque, scratch_size);
}
336 
/*
 * Returns true if `c` may appear between the '&' and the ';' of an entity:
 * an ASCII digit, an ASCII letter of either case, or '#'.
 */
static bool is_html_entity_char(unsigned char c)
{
	if (c == '#') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}

	/* Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z' and leaves 'a'..'z' unchanged */
	unsigned char folded = c | 0x20;
	return folded >= 'a' && folded <= 'z';
}
341 
mb_htmlent_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)342 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
343 {
344 	unsigned char *p = *in, *e = p + *in_len;
345 	uint32_t *out = buf, *limit = buf + bufsize;
346 
347 	while (p < e && out < limit) {
348 		unsigned char c = *p++;
349 
350 		if (c == '&') {
351 			/* Find terminating ; for HTML entity */
352 			unsigned char *terminator = p;
353 			while (terminator < e && is_html_entity_char(*terminator))
354 				terminator++;
355 			if (terminator < e && *terminator == ';') {
356 				if (*p == '#' && (e - p) >= 2) {
357 					/* Numeric entity */
358 					unsigned int value = 0;
359 					unsigned char *digits = p + 1;
360 					if (*digits == 'x' || *digits == 'X') {
361 						/* Hexadecimal */
362 						digits++;
363 						if (digits == terminator) {
364 							goto bad_entity;
365 						}
366 						while (digits < terminator) {
367 							unsigned char digit = *digits++;
368 							if (digit >= '0' && digit <= '9') {
369 								value = (value * 16) + (digit - '0');
370 							} else if (digit >= 'A' && digit <= 'F') {
371 								value = (value * 16) + (digit - 'A' + 10);
372 							} else if (digit >= 'a' && digit <= 'f') {
373 								value = (value * 16) + (digit - 'a' + 10);
374 							} else {
375 								goto bad_entity;
376 							}
377 						}
378 					} else {
379 						/* Decimal */
380 						if (digits == terminator) {
381 							goto bad_entity;
382 						}
383 						while (digits < terminator) {
384 							unsigned char digit = *digits++;
385 							if (digit >= '0' && digit <= '9') {
386 								value = (value * 10) + (digit - '0');
387 							} else {
388 								goto bad_entity;
389 							}
390 						}
391 					}
392 					if (value > 0x10FFFF) {
393 						goto bad_entity;
394 					}
395 					*out++ = value;
396 					p = terminator + 1;
397 					goto next_iteration;
398 				} else if (terminator > p && terminator < e) {
399 					/* Named entity */
400 					mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
401 					while (entity->name) {
402 						if (!strncmp((char*)p, entity->name, terminator - p) && strlen(entity->name) == terminator - p) {
403 							*out++ = entity->code;
404 							p = terminator + 1;
405 							goto next_iteration;
406 						}
407 						entity++;
408 					}
409 				}
410 			}
411 			/* Either we didn't find ;, or the name of the entity was not recognized */
412 bad_entity:
413 			*out++ = '&';
414 			while (p < terminator && out < limit) {
415 				*out++ = *p++;
416 			}
417 			if (terminator < e && *terminator == ';' && out < limit) {
418 				*out++ = *p++;
419 			}
420 		} else {
421 			*out++ = c;
422 		}
423 
424 next_iteration: ;
425 	}
426 
427 	*in_len = e - p;
428 	*in = p;
429 	return out - buf;
430 }
431 
/*
 * Buffer-based encoder for wchar => HTML-ENTITIES.
 *
 * Appends the encoding of `len` code points from `in` to `buf`. A code point
 * that indexes htmlentitifieds[] with an entry other than 1 is written as a
 * single byte. Anything else becomes "&name;" when mbfl_html_entity_list has
 * a matching code, or "&#NNN;" in decimal otherwise. `end` is not used.
 *
 * Space accounting: one byte per input code point is reserved up front, and
 * the '&' of an entity uses the byte reserved for the current code point.
 * Before the rest of an entity is written, room is reserved for it plus one
 * byte for each code point still to come (`len`).
 */
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		uint32_t w = *in++;
		len--;

		if (w < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[w] != 1) {
			/* Fast path for most ASCII characters */
			out = mb_convert_buf_add(out, w);
			continue;
		}

		out = mb_convert_buf_add(out, '&');

		/* See if there is a matching named entity */
		const char *name = NULL;
		for (const mbfl_html_entity_entry *entity = mbfl_html_entity_list; entity->name; entity++) {
			if (w == entity->code) {
				name = entity->name;
				break;
			}
		}

		if (name != NULL) {
			/* Name, ';', and one byte for each remaining code point */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + strlen(name));
			while (*name) {
				out = mb_convert_buf_add(out, *name++);
			}
		} else {
			/* No named entity: '#', up to 10 decimal digits and ';' fit in 12 bytes */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12);
			out = mb_convert_buf_add(out, '#');

			unsigned char digits[12];
			unsigned char *cursor = digits + sizeof(digits);
			do {
				*(--cursor) = "0123456789"[w % 10];
				w /= 10;
			} while (w);
			while (cursor < digits + sizeof(digits)) {
				out = mb_convert_buf_add(out, *cursor++);
			}
		}

		out = mb_convert_buf_add(out, ';');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
487