1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this part: Marcus Boerger <helly@php.net>
22 *
23 */
24 /*
25 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #include <string.h>
31 #include "mbfilter.h"
32 #include "mbfilter_htmlent.h"
33 #include "html_entities.h"
34
35 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
36 static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37
/*
 * Classification of code points 0x00-0xFF, used by the encoders in this file
 * to decide whether a character may be written as-is:
 *   0 - no special handling
 *   1 - must be written as an entity (every value in 0x80-0xFF)
 *   2 - the markup characters '&', '<' and '>'
 * The encoders in this file only test for the value 1, so entries marked 2
 * are currently written unchanged, exactly like entries marked 0.
 * Entries without an explicit initializer are zero.
 */
static const int htmlentitifieds[256] = {
	[0x26] = 2, /* '&' */
	[0x3C] = 2, /* '<' */
	[0x3E] = 2, /* '>' */
	[0x80] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0x90] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xA0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xB0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xC0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xD0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xE0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	[0xF0] = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
56
/* NULL-terminated list of alternative names for this encoding. */
57 static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
58
/*
 * Descriptor for the HTML-ENTITIES pseudo-encoding. It ties together the
 * encoding id, its names and aliases, the two per-character filter vtables
 * defined below, and the two buffer-based converters defined at the end of
 * this file.
 * NOTE(review): the initializer is positional; what each slot means is
 * defined by the mbfl_encoding declaration (not visible in this file), so
 * the per-slot remarks below are assumptions to confirm against that header.
 */
59 const mbfl_encoding mbfl_encoding_html_ent = {
60 mbfl_no_encoding_html_ent, /* encoding identifier */
61 "HTML-ENTITIES", /* presumably the canonical name - confirm */
62 "HTML-ENTITIES", /* presumably the MIME name - confirm */
63 mbfl_encoding_html_ent_aliases, /* alias list defined just above */
64 NULL,
65 MBFL_ENCTYPE_GL_UNSAFE,
66 &vtbl_html_wchar, /* per-character filter: HTML entities -> wchar */
67 &vtbl_wchar_html, /* per-character filter: wchar -> HTML entities */
68 mb_htmlent_to_wchar, /* buffer converter: HTML entities -> wchar */
69 mb_wchar_to_htmlent, /* buffer converter: wchar -> HTML entities */
70 NULL
71 };
72
/*
 * Filter vtable for the wchar -> HTML-ENTITIES direction. The encoder keeps
 * no private buffer, so it uses the common constructor and leaves two slots
 * NULL.
 * NOTE(review): slot order is presumably from, to, ctor, dtor, filter, flush,
 * copy (it matches the function names used in vtbl_html_wchar) - confirm
 * against struct mbfl_convert_vtbl.
 */
73 const struct mbfl_convert_vtbl vtbl_wchar_html = {
74 mbfl_no_encoding_wchar,
75 mbfl_no_encoding_html_ent,
76 mbfl_filt_conv_common_ctor,
77 NULL,
78 mbfl_filt_conv_html_enc,
79 mbfl_filt_conv_html_enc_flush,
80 NULL,
81 };
82
/*
 * Filter vtable for the HTML-ENTITIES -> wchar direction. The decoder keeps a
 * heap-allocated scratch buffer in filter->opaque, which is why it supplies
 * its own constructor, destructor and copy functions (all defined below).
 */
83 const struct mbfl_convert_vtbl vtbl_html_wchar = {
84 mbfl_no_encoding_html_ent,
85 mbfl_no_encoding_wchar,
86 mbfl_filt_conv_html_dec_ctor, /* allocates the scratch buffer */
87 mbfl_filt_conv_html_dec_dtor, /* frees the scratch buffer */
88 mbfl_filt_conv_html_dec, /* consumes one input character */
89 mbfl_filt_conv_html_dec_flush, /* emits any unfinished entity verbatim */
90 mbfl_filt_conv_html_dec_copy, /* duplicates the scratch buffer */
91 };
92
93
/*
 * Evaluate `statement` once; if its result is negative, return -1 from the
 * enclosing function. Only usable inside functions that return int.
 */
94 #define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
95
96 /*
97 * any => HTML
98 */
/*
 * wchar => HTML-ENTITIES, one character per call.
 *
 * Characters below 256 whose htmlentitifieds[] class is not 1 are passed to
 * the output function unchanged. Everything else is written as "&name;" when
 * mbfl_html_entity_list has an entry for the code point, and as a decimal
 * numeric reference "&#NNN;" otherwise.
 *
 * Returns 0 on success, -1 as soon as the output function reports an error.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const size_t table_len = sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]);
	const mbfl_html_entity_entry *entry;

	/* A negative c becomes a huge size_t here and therefore takes the entity path. */
	if ((size_t)c < table_len && htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Find the first named entity for this code point; stops on the NULL-named sentinel. */
	for (entry = mbfl_html_entity_list; entry->name != NULL; entry++) {
		if (c == entry->code) {
			break;
		}
	}

	if (entry->name != NULL) {
		const char *s;

		for (s = entry->name; *s != '\0'; s++) {
			CK((*filter->output_function)((int)*s, filter->data));
		}
	} else {
		/* 3 decimal digits per byte is always enough room for an unsigned int. */
		char digits[3 * sizeof(unsigned int)];
		unsigned int rest = (unsigned int)c;
		int count = 0;

		CK((*filter->output_function)('#', filter->data));

		/* Digits come out least-significant first; do/while so that 0 yields "0". */
		do {
			digits[count++] = "0123456789"[rest % 10];
			rest /= 10;
		} while (rest);

		while (count > 0) {
			CK((*filter->output_function)(digits[--count], filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return 0;
}
144
/*
 * Flush hook for the wchar => HTML-ENTITIES filter. The encoder holds no
 * pending data, so this only clears the filter state and forwards the flush
 * to the next stage when one is installed. The result of the downstream
 * flush is not inspected; this function always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return 0;
}
156
157 /*
158 * HTML => any
159 */
/*
 * Capacity bound for the decoder's entity scratch buffer; the buffer itself
 * is allocated with one extra byte (html_enc_buffer_size+1).
 */
160 #define html_enc_buffer_size 16
/*
 * Characters mbfl_filt_conv_html_dec accepts between '&' and ';' while it is
 * collecting an entity.
 */
161 static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
162
/*
 * Constructor for the HTML-ENTITIES => wchar filter: starts in the idle
 * state (status 0) and allocates the scratch buffer in which a partially
 * received entity is collected. The buffer is stored in filter->opaque.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	void *scratch;

	filter->status = 0;
	scratch = emalloc(html_enc_buffer_size + 1);
	filter->opaque = scratch;
}
168
/*
 * Destructor for the HTML-ENTITIES => wchar filter: releases the scratch
 * buffer (if one is attached) and resets the filter to the idle state.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *scratch = (void*)filter->opaque;

	filter->status = 0;
	filter->opaque = NULL;

	if (scratch) {
		efree(scratch);
	}
}
178
/*
 * Emit the first `count` bytes of the entity scratch buffer unchanged and put
 * the decoder back into its idle state. Returns 0 on success, or the first
 * non-zero result reported by the output function.
 */
static int html_dec_emit_pending(mbfl_convert_filter *filter, const unsigned char *buffer, int count)
{
	int pos;

	/* Reset first so an output error cannot leave a stale byte count behind. */
	filter->status = 0;

	for (pos = 0; pos < count; pos++) {
		int e = (*filter->output_function)(buffer[pos], filter->data);
		if (e != 0) {
			return e;
		}
	}
	return 0;
}

/*
 * Parse a buffered numeric entity of the form "&#DDD" or "&#xHHH" (`len`
 * bytes, terminating ';' not included). Returns the code point, or a value
 * >= 0x110000 when there are no digits, a digit is invalid for the base, or
 * the number is outside the Unicode range.
 */
static unsigned int html_dec_parse_numeric(const unsigned char *buffer, int len)
{
	const unsigned int invalid = (unsigned int)-1;
	unsigned int base = 10;
	unsigned int value = 0;
	int pos = 2;

	if (len > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
		base = 16;
		pos = 3;
	}
	if (pos >= len) {
		return invalid; /* "&#;" or "&#x;" */
	}

	for (; pos < len; pos++) {
		int ch = buffer[pos];
		unsigned int digit;

		if (ch >= '0' && ch <= '9') {
			digit = (unsigned int)(ch - '0');
		} else if (base == 16 && ch >= 'A' && ch <= 'F') {
			digit = (unsigned int)(ch - 'A' + 10);
		} else if (base == 16 && ch >= 'a' && ch <= 'f') {
			digit = (unsigned int)(ch - 'a' + 10);
		} else {
			return invalid;
		}

		value = value * base + digit;
		/*
		 * Checking after every digit keeps `value` <= 0x10FFFF before the next
		 * multiplication, so the accumulator can never wrap around and turn an
		 * over-long number into a small, valid-looking code point.
		 */
		if (value > 0x10FFFF) {
			return invalid;
		}
	}
	return value;
}

/*
 * HTML-ENTITIES => wchar, one input byte per call.
 *
 * filter->status holds the number of bytes of a pending entity collected in
 * the scratch buffer (filter->opaque); 0 means no entity is in progress.
 * A '&' starts collecting. A ';' ends the entity, which is then decoded as a
 * numeric reference ("&#65;", "&#x41;") or looked up by name in
 * mbfl_html_entity_list. Anything that cannot be decoded - unknown name,
 * malformed or out-of-range number, illegal character, '#' in the wrong
 * place, or an entity too long for the buffer - is written out unchanged.
 *
 * Returns 0 on success, or a non-zero value as soon as the output function
 * reports an error; in that case the decoder has been reset to idle.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	unsigned char *buffer = (unsigned char*)filter->opaque;
	int len = filter->status;
	int legal;
	int e;

	if (len == 0) {
		if (c == '&') {
			buffer[0] = '&';
			filter->status = 1;
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return 0;
	}

	if (c == ';') {
		unsigned int ent = 0;
		const mbfl_html_entity_entry *entity;

		/* buffer[1] only holds data of this entity once at least two bytes are stored. */
		if (len > 1 && buffer[1] == '#') {
			ent = html_dec_parse_numeric(buffer, len);
			if (ent < 0x110000) {
				filter->status = 0;
				CK((*filter->output_function)(ent, filter->data));
				return 0;
			}
			/* Not a usable number: pass "&#...;" through untouched. */
			e = html_dec_emit_pending(filter, buffer, len);
			if (e != 0) {
				return e;
			}
			CK((*filter->output_function)(c, filter->data));
			return 0;
		}

		/* Named entity: terminate the collected name and look it up. */
		buffer[len] = 0;
		for (entity = mbfl_html_entity_list; entity->name; entity++) {
			if (!strcmp((const char*)buffer + 1, entity->name)) {
				ent = entity->code;
				break;
			}
		}
		if (ent) {
			filter->status = 0;
			CK((*filter->output_function)(ent, filter->data));
			return 0;
		}

		/* Unknown name: pass "&name;" through untouched. */
		buffer[len] = ';';
		return html_dec_emit_pending(filter, buffer, len + 1);
	}

	/*
	 * strchr() also matches the string terminator, and converts its argument
	 * to char, so restrict the lookup to 1..0x7F first; otherwise a NUL byte
	 * would be accepted as part of an entity name and truncate the name
	 * comparison above.
	 */
	legal = c > 0 && c < 0x80 && strchr(html_entity_chars, c) != NULL;

	buffer[len++] = c;
	if (legal && len + 1 != html_enc_buffer_size && !(c == '#' && len > 2)) {
		filter->status = len;
		return 0;
	}

	/*
	 * Illegal character, misplaced '#', or buffer full: give up on this entity
	 * and write out what was collected. A '&' is not written; it starts a new
	 * entity instead.
	 */
	if (c == '&') {
		len--;
	}
	e = html_dec_emit_pending(filter, buffer, len);
	if (e != 0) {
		return e;
	}
	if (c == '&') {
		buffer[0] = '&';
		filter->status = 1;
	}
	return 0;
}
305
/*
 * Flush hook for the HTML-ENTITIES => wchar filter. Any bytes of an
 * unfinished entity still sitting in the scratch buffer are written out
 * unchanged, the decoder is reset to idle, and the flush is forwarded to the
 * next stage when one is installed.
 *
 * Output errors do not stop the loop; the last non-zero result seen is
 * returned (0 if every byte was accepted). The result of the downstream
 * flush is not inspected.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *pending = (unsigned char*)filter->opaque;
	int count = filter->status;
	int result = 0;
	int i;

	filter->status = 0;

	for (i = 0; i < count; i++) {
		int rc = filter->output_function(pending[i], filter->data);
		if (rc != 0) {
			result = rc;
		}
	}

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}

	return result;
}
329
/*
 * Copy hook for the HTML-ENTITIES => wchar filter. All fields are copied from
 * `src` to `dest`, then `dest` receives its own scratch buffer holding a copy
 * of the source buffer's bytes, so the two filters do not share storage.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const size_t scratch_size = html_enc_buffer_size + 1;

	*dest = *src;
	dest->opaque = emalloc(scratch_size);
	memcpy(dest->opaque, src->opaque, scratch_size);
}
336
/*
 * Tell whether `c` may appear between the '&' and the ';' of an entity:
 * an ASCII letter, an ASCII digit, or '#'.
 */
static bool is_html_entity_char(unsigned char c)
{
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}
	return c == '#';
}
341
mb_htmlent_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)342 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
343 {
344 unsigned char *p = *in, *e = p + *in_len;
345 uint32_t *out = buf, *limit = buf + bufsize;
346
347 while (p < e && out < limit) {
348 unsigned char c = *p++;
349
350 if (c == '&') {
351 /* Find terminating ; for HTML entity */
352 unsigned char *terminator = p;
353 while (terminator < e && is_html_entity_char(*terminator))
354 terminator++;
355 if (terminator < e && *terminator == ';') {
356 if (*p == '#' && (e - p) >= 2) {
357 /* Numeric entity */
358 unsigned int value = 0;
359 unsigned char *digits = p + 1;
360 if (*digits == 'x' || *digits == 'X') {
361 /* Hexadecimal */
362 digits++;
363 if (digits == terminator) {
364 goto bad_entity;
365 }
366 while (digits < terminator) {
367 unsigned char digit = *digits++;
368 if (digit >= '0' && digit <= '9') {
369 value = (value * 16) + (digit - '0');
370 } else if (digit >= 'A' && digit <= 'F') {
371 value = (value * 16) + (digit - 'A' + 10);
372 } else if (digit >= 'a' && digit <= 'f') {
373 value = (value * 16) + (digit - 'a' + 10);
374 } else {
375 goto bad_entity;
376 }
377 }
378 } else {
379 /* Decimal */
380 if (digits == terminator) {
381 goto bad_entity;
382 }
383 while (digits < terminator) {
384 unsigned char digit = *digits++;
385 if (digit >= '0' && digit <= '9') {
386 value = (value * 10) + (digit - '0');
387 } else {
388 goto bad_entity;
389 }
390 }
391 }
392 if (value > 0x10FFFF) {
393 goto bad_entity;
394 }
395 *out++ = value;
396 p = terminator + 1;
397 goto next_iteration;
398 } else if (terminator > p && terminator < e) {
399 /* Named entity */
400 mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
401 while (entity->name) {
402 if (!strncmp((char*)p, entity->name, terminator - p) && strlen(entity->name) == terminator - p) {
403 *out++ = entity->code;
404 p = terminator + 1;
405 goto next_iteration;
406 }
407 entity++;
408 }
409 }
410 }
411 /* Either we didn't find ;, or the name of the entity was not recognized */
412 bad_entity:
413 *out++ = '&';
414 while (p < terminator && out < limit) {
415 *out++ = *p++;
416 }
417 if (terminator < e && *terminator == ';' && out < limit) {
418 *out++ = *p++;
419 }
420 } else {
421 *out++ = c;
422 }
423
424 next_iteration: ;
425 }
426
427 *in_len = e - p;
428 *in = p;
429 return out - buf;
430 }
431
/*
 * wchar => HTML-ENTITIES, buffer-based.
 *
 * Appends the encoding of `len` code points from `in` to `buf`. Code points
 * below 256 whose htmlentitifieds[] class is not 1 are written as a single
 * byte. All others become "&name;" when mbfl_html_entity_list has an entry
 * for them, and a decimal numeric reference "&#NNN;" otherwise. `end` is not
 * used.
 *
 * The output buffer is pre-sized for one byte per input code point and grown
 * on demand before each entity is written.
 */
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	const size_t table_len = sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]);
	unsigned char *out, *limit;

	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len > 0) {
		uint32_t w = *in++;
		/* From here on `len` counts the code points still to come after this one */
		len--;

		if (w < table_len && htmlentitifieds[w] != 1) {
			/* Fast path for most ASCII characters */
			out = mb_convert_buf_add(out, w);
			continue;
		}

		out = mb_convert_buf_add(out, '&');

		/* Look for the first named entity with this code point */
		const mbfl_html_entity_entry *match = mbfl_html_entity_list;
		while (match->name && !(w == match->code)) {
			match++;
		}

		if (match->name) {
			/* Room for the name, the ';' and one byte per remaining code point */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + strlen(match->name));
			for (const char *s = match->name; *s; s++) {
				out = mb_convert_buf_add(out, *s);
			}
			out = mb_convert_buf_add(out, ';');
			continue;
		}

		/* No named entity: '#', at most 10 decimal digits and ';' fit in 12 bytes */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12);
		out = mb_convert_buf_add(out, '#');

		unsigned char digits[12];
		size_t ndigits = 0;
		/* Collected least-significant first; do/while so that 0 yields "0" */
		do {
			digits[ndigits++] = "0123456789"[w % 10];
			w /= 10;
		} while (w);
		while (ndigits > 0) {
			out = mb_convert_buf_add(out, digits[--ndigits]);
		}

		out = mb_convert_buf_add(out, ';');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
487