1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this part: Marcus Boerger <helly@php.net>
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #include <string.h>
31 #include "mbfilter.h"
32 #include "mbfilter_htmlent.h"
33 #include "html_entities.h"
34
35 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
36 static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
37
/*
 * Encoder classification table, indexed by values 0x00-0xFF.
 *
 * The encoders in this file copy a value through unchanged unless its entry
 * here is 1; an entry of 1 makes them emit an entity instead.  '&', '<' and
 * '>' carry the separate marker 2, which the code in this file handles the
 * same way as 0 (i.e. the character is passed through).
 */
static const int htmlentitifieds[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
56
/* NULL-terminated list of alias names referenced by the HTML-ENTITIES encoding descriptor */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
58
59 const mbfl_encoding mbfl_encoding_html_ent = {
60 mbfl_no_encoding_html_ent,
61 "HTML-ENTITIES",
62 "HTML-ENTITIES",
63 mbfl_encoding_html_ent_aliases,
64 NULL,
65 MBFL_ENCTYPE_GL_UNSAFE,
66 &vtbl_html_wchar,
67 &vtbl_wchar_html,
68 mb_htmlent_to_wchar,
69 mb_wchar_to_htmlent,
70 NULL,
71 NULL,
72 };
73
74 const struct mbfl_convert_vtbl vtbl_wchar_html = {
75 mbfl_no_encoding_wchar,
76 mbfl_no_encoding_html_ent,
77 mbfl_filt_conv_common_ctor,
78 NULL,
79 mbfl_filt_conv_html_enc,
80 mbfl_filt_conv_html_enc_flush,
81 NULL,
82 };
83
84 const struct mbfl_convert_vtbl vtbl_html_wchar = {
85 mbfl_no_encoding_html_ent,
86 mbfl_no_encoding_wchar,
87 mbfl_filt_conv_html_dec_ctor,
88 mbfl_filt_conv_html_dec_dtor,
89 mbfl_filt_conv_html_dec,
90 mbfl_filt_conv_html_dec_flush,
91 mbfl_filt_conv_html_dec_copy,
92 };
93
94
/*
 * Evaluate `statement` once; if the result is negative, make the enclosing
 * function return -1.  Only usable inside functions returning an integer.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
96
97 /*
98 * any => HTML
99 */
/*
 * wchar => HTML-ENTITIES, one character per call.
 *
 * A value below 256 whose htmlentitifieds[] entry is not 1 is forwarded
 * unchanged.  Every other value is written as "&name;" when
 * mbfl_html_entity_list has an entry with that code, and as a decimal
 * "&#NNN;" reference otherwise.
 *
 * Returns 0 on success, or -1 as soon as the output function returns a
 * negative value.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const mbfl_html_entity_entry *entry;
	int named = 0;

	/* Pass-through case */
	if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
			htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return 0;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Use the entity name if the table knows this code */
	for (entry = &mbfl_html_entity_list[0]; entry->name != NULL; entry++) {
		if (c != entry->code) {
			continue;
		}
		for (char *name = entry->name; *name != '\0'; name++) {
			CK((*filter->output_function)((int)*name, filter->data));
		}
		named = 1;
		break;
	}

	if (!named) {
		/* No name available: write the value in decimal, built back to front */
		char digits[16];
		size_t first = sizeof(digits);
		unsigned int uc = (unsigned int)c;

		CK((*filter->output_function)('#', filter->data));

		do {
			digits[--first] = "0123456789"[uc % 10];
			uc /= 10;
		} while (uc);

		for (size_t i = first; i < sizeof(digits); i++) {
			CK((*filter->output_function)(digits[i], filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return 0;
}
145
/*
 * Flush for the wchar => HTML-ENTITIES filter.
 *
 * Resets the filter's status and opaque pointer, then invokes the downstream
 * flush function when one is set.  Always returns 0; the downstream result is
 * not inspected.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = NULL;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
157
158 /*
159 * HTML => any
160 */
/*
 * Size bound for the pending-entity buffer kept in filter->opaque by the
 * HTML-ENTITIES decoder; the buffer itself is allocated with one extra byte.
 */
#define html_enc_buffer_size 16

/* Characters the decoder accepts between '&' and ';' */
static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
163
/*
 * Constructor for the HTML-ENTITIES => wchar filter: starts with status 0
 * and allocates the buffer (html_enc_buffer_size + 1 bytes) that the decoder
 * keeps in filter->opaque.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = emalloc(html_enc_buffer_size + 1);
}
169
/*
 * Destructor for the HTML-ENTITIES => wchar filter: resets status and
 * releases the buffer held in filter->opaque, leaving the pointer NULL.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	filter->status = 0;

	if (filter->opaque != NULL) {
		efree((void*)filter->opaque);
		filter->opaque = NULL;
	}
}
179
/*
 * HTML-ENTITIES => wchar, one input value per call.
 *
 * filter->opaque is the pending-entity buffer and filter->status is the
 * number of values currently stored in it (0 = not inside an entity).
 * Input is copied through until '&' is seen.  From then on values are
 * buffered until ';' completes the entity, or until a value arrives that
 * cannot belong to one (or the buffer fills up), in which case the buffered
 * text is written out unchanged.
 *
 * A completed entity is decoded as follows:
 *   "&#NNN;" / "&#xHHH;"  the code point, provided it is <= 0x10FFFF
 *   "&name;"              the code of the matching mbfl_html_entity_list entry
 * Anything that fails to decode is written out literally.
 *
 * Returns 0 on success.  A negative result from the output function yields
 * -1 in the CK() paths; the fragment-flushing loops return the output
 * function's own non-zero result.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	/* Marker for an unusable numeric reference; it fails the range test below */
	const unsigned int invalid = (unsigned int)-1;
	int pos;
	unsigned int ent = 0;
	mbfl_html_entity_entry *entity;
	unsigned char *buffer = (unsigned char*)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			/* Possible start of an entity; begin buffering */
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
	} else {
		if (c == ';') {
			/* status > 1: buffer[1] holds nothing meaningful for a bare "&;" */
			if (filter->status > 1 && buffer[1] == '#') {
				if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
					if (filter->status > 3) {
						/* hexadecimal numeric entity */
						for (pos = 3; pos < filter->status; pos++) {
							int v = buffer[pos];
							if (v >= '0' && v <= '9') {
								v = v - '0';
							} else if (v >= 'A' && v <= 'F') {
								v = v - 'A' + 10;
							} else if (v >= 'a' && v <= 'f') {
								v = v - 'a' + 10;
							} else {
								ent = invalid;
								break;
							}
							ent = ent * 16 + v;
							if (ent > 0x10FFFF) {
								/* Out of range; stop before more digits can wrap the
								 * accumulator back to a small, valid-looking value */
								ent = invalid;
								break;
							}
						}
					} else {
						/* "&#x;" has no digits */
						ent = invalid;
					}
				} else {
					/* decimal numeric entity */
					if (filter->status > 2) {
						for (pos = 2; pos < filter->status; pos++) {
							int v = buffer[pos];
							if (v >= '0' && v <= '9') {
								v = v - '0';
							} else {
								ent = invalid;
								break;
							}
							ent = ent * 10 + v;
							if (ent > 0x10FFFF) {
								/* Same overflow protection as the hexadecimal case */
								ent = invalid;
								break;
							}
						}
					} else {
						/* "&#;" has no digits */
						ent = invalid;
					}
				}
				if (ent < 0x110000) {
					CK((*filter->output_function)(ent, filter->data));
				} else {
					/* Not decodable: write the buffered text and the ';' literally */
					for (pos = 0; pos < filter->status; pos++) {
						CK((*filter->output_function)(buffer[pos], filter->data));
					}
					CK((*filter->output_function)(c, filter->data));
				}
				filter->status = 0;
			} else {
				/* named entity */
				buffer[filter->status] = 0;
				entity = (mbfl_html_entity_entry *)mbfl_html_entity_list;
				while (entity->name) {
					if (!strcmp((const char*)buffer+1, entity->name)) {
						ent = entity->code;
						break;
					}
					entity++;
				}
				if (ent) {
					/* decoded */
					CK((*filter->output_function)(ent, filter->data));
					filter->status = 0;
				} else {
					/* failure */
					buffer[filter->status++] = ';';
					buffer[filter->status] = 0;

					/* flush fragments */
					pos = 0;
					while (filter->status--) {
						int e = (*filter->output_function)(buffer[pos++], filter->data);
						if (e != 0)
							return e;
					}
					filter->status = 0;
				}
			}
		} else {
			/* add character */
			buffer[filter->status++] = c;
			/*
			 * Check the character just added.  NUL is tested separately because
			 * strchr() also matches the terminator of html_entity_chars, which
			 * would otherwise let a NUL byte pass as an entity character and cut
			 * the name short in the strcmp() lookup above.
			 */
			if (c == '\0' || !strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
			{
				/* illegal character or end of buffer */
				if (c=='&')
					filter->status--; /* keep the new '&' out of the flushed text */
				buffer[filter->status] = 0;

				pos = 0;
				while (filter->status--) {
					int e = (*filter->output_function)(buffer[pos++], filter->data);
					if (e != 0)
						return e;
				}
				filter->status = 0;

				if (c=='&')
				{
					/* the '&' may itself start a new entity */
					buffer[filter->status++] = '&';
				}
			}
		}
	}
	return 0;
}
306
/*
 * Flush for the HTML-ENTITIES => wchar filter.
 *
 * Writes out, unchanged, whatever is still held in the pending-entity buffer,
 * resets the filter status and then calls the downstream flush function if
 * one is set.  All buffered values are attempted even if the output function
 * reports a problem; the last non-zero result it returned is the return
 * value (0 when there was none).
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	unsigned char *pending = (unsigned char*)filter->opaque;
	int remaining = filter->status;
	int result = 0;

	filter->status = 0;

	/* flush fragments */
	for (int i = 0; remaining != 0; remaining--, i++) {
		int rc = (*filter->output_function)(pending[i], filter->data);
		if (rc != 0) {
			result = rc;
		}
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return result;
}
330
/*
 * Copy function for the HTML-ENTITIES => wchar filter: copies every member
 * of *src into *dest, then gives dest its own newly allocated entity buffer
 * holding a copy of src's buffer contents.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const size_t buffer_bytes = html_enc_buffer_size + 1;

	*dest = *src;
	dest->opaque = emalloc(buffer_bytes);
	memcpy(dest->opaque, src->opaque, buffer_bytes);
}
337
/* True for the bytes allowed between '&' and ';': ASCII digits, ASCII letters and '#' */
static bool is_html_entity_char(unsigned char c)
{
	if (c == '#') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	return c >= 'a' && c <= 'z';
}
342
mb_htmlent_to_wchar(unsigned char ** in,size_t * in_len,uint32_t * buf,size_t bufsize,unsigned int * state)343 static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
344 {
345 unsigned char *p = *in, *e = p + *in_len;
346 uint32_t *out = buf, *limit = buf + bufsize;
347
348 while (p < e && out < limit) {
349 unsigned char c = *p++;
350
351 if (c == '&') {
352 /* Find terminating ; for HTML entity */
353 unsigned char *terminator = p;
354 while (terminator < e && is_html_entity_char(*terminator))
355 terminator++;
356 if (terminator < e && *terminator == ';') {
357 if (*p == '#' && (e - p) >= 2) {
358 /* Numeric entity */
359 unsigned int value = 0;
360 unsigned char *digits = p + 1;
361 if (*digits == 'x' || *digits == 'X') {
362 /* Hexadecimal */
363 digits++;
364 if (digits == terminator) {
365 goto bad_entity;
366 }
367 while (digits < terminator) {
368 unsigned char digit = *digits++;
369 if (digit >= '0' && digit <= '9') {
370 value = (value * 16) + (digit - '0');
371 } else if (digit >= 'A' && digit <= 'F') {
372 value = (value * 16) + (digit - 'A' + 10);
373 } else if (digit >= 'a' && digit <= 'f') {
374 value = (value * 16) + (digit - 'a' + 10);
375 } else {
376 goto bad_entity;
377 }
378 }
379 } else {
380 /* Decimal */
381 if (digits == terminator) {
382 goto bad_entity;
383 }
384 while (digits < terminator) {
385 unsigned char digit = *digits++;
386 if (digit >= '0' && digit <= '9') {
387 value = (value * 10) + (digit - '0');
388 } else {
389 goto bad_entity;
390 }
391 }
392 }
393 if (value > 0x10FFFF) {
394 goto bad_entity;
395 }
396 *out++ = value;
397 p = terminator + 1;
398 goto next_iteration;
399 } else if (terminator > p && terminator < e) {
400 /* Named entity */
401 mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
402 while (entity->name) {
403 if (!strncmp((char*)p, entity->name, terminator - p) && strlen(entity->name) == terminator - p) {
404 *out++ = entity->code;
405 p = terminator + 1;
406 goto next_iteration;
407 }
408 entity++;
409 }
410 }
411 }
412 /* Either we didn't find ;, or the name of the entity was not recognized */
413 bad_entity:
414 *out++ = '&';
415 while (p < terminator && out < limit) {
416 *out++ = *p++;
417 }
418 if (terminator < e && *terminator == ';' && out < limit) {
419 *out++ = *p++;
420 }
421 } else {
422 *out++ = c;
423 }
424
425 next_iteration: ;
426 }
427
428 *in_len = e - p;
429 *in = p;
430 return out - buf;
431 }
432
/*
 * wchar => HTML-ENTITIES, buffer at a time.
 *
 * Appends the encoded form of the `len` code points at `in` to `buf`.  A
 * code point below 256 whose htmlentitifieds[] entry is not 1 is written as
 * a single byte; every other code point becomes "&name;" when
 * mbfl_html_entity_list has an entry with that code, and a decimal "&#NNN;"
 * reference otherwise.  `end` is not used.
 */
static void mb_wchar_to_htmlent(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	/* One byte per remaining code point is reserved up front; entities ask for more below */
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);

	while (len--) {
		uint32_t w = *in++;

		if (w < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) && htmlentitifieds[w] != 1) {
			/* Fast path for most ASCII characters */
			out = mb_convert_buf_add(out, w);
			continue;
		}

		out = mb_convert_buf_add(out, '&');

		/* Look for a named entity with this code */
		mbfl_html_entity_entry *match = (mbfl_html_entity_entry*)mbfl_html_entity_list;
		while (match->name && w != match->code) {
			match++;
		}

		if (match->name) {
			/* Name plus ';', on top of one byte for each code point still to come */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 1 + strlen(match->name));
			for (char *name = match->name; *name; name++) {
				out = mb_convert_buf_add(out, *name);
			}
			out = mb_convert_buf_add(out, ';');
			continue;
		}

		/* No named entity matches; emit a numeric entity instead */
		MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 12);
		out = mb_convert_buf_add(out, '#');

		if (!w) {
			out = mb_convert_buf_add(out, '0');
		} else {
			/* Build the decimal digits back to front, then copy them out */
			unsigned char digits[12];
			unsigned char *first = digits + sizeof(digits);
			while (w) {
				*(--first) = "0123456789"[w % 10];
				w /= 10;
			}
			while (first < digits + sizeof(digits)) {
				out = mb_convert_buf_add(out, *first++);
			}
		}

		out = mb_convert_buf_add(out, ';');
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}
488