1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this part: Marcus Boerger <helly@php.net>
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #ifdef HAVE_STRING_H
35 #include <string.h>
36 #endif
37
38 #ifdef HAVE_STRINGS_H
39 #include <strings.h>
40 #endif
41
42 #include "mbfilter.h"
43 #include "mbfilter_htmlent.h"
44 #include "html_entities.h"
45
/*
 * Per-code-point classification for values 0x00..0xFF, consulted by the
 * encoder in this file.
 *
 *   1 - the encoder writes the value as an entity (everything >= 0x80)
 *   2 - marks '&', '<' and '>'; the encoder only tests for "!= 1", so
 *       these are passed through unchanged, exactly like 0
 *   0 - passed through unchanged
 */
static const int htmlentitifieds[256] = {
	/* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
	/* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 - 0x8F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 - 0x9F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 - 0xAF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 - 0xBF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 - 0xEF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 - 0xFF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
64
/* NULL-terminated list of alternative names for the HTML-ENTITIES encoding. */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
66
67 const mbfl_encoding mbfl_encoding_html_ent = {
68 mbfl_no_encoding_html_ent,
69 "HTML-ENTITIES",
70 "HTML-ENTITIES",
71 (const char *(*)[])&mbfl_encoding_html_ent_aliases,
72 NULL,
73 MBFL_ENCTYPE_ENC_STRM | MBFL_ENCTYPE_GL_UNSAFE,
74 &vtbl_html_wchar,
75 &vtbl_wchar_html
76 };
77
78 const struct mbfl_convert_vtbl vtbl_wchar_html = {
79 mbfl_no_encoding_wchar,
80 mbfl_no_encoding_html_ent,
81 mbfl_filt_conv_common_ctor,
82 mbfl_filt_conv_common_dtor,
83 mbfl_filt_conv_html_enc,
84 mbfl_filt_conv_html_enc_flush
85 };
86
87 const struct mbfl_convert_vtbl vtbl_html_wchar = {
88 mbfl_no_encoding_html_ent,
89 mbfl_no_encoding_wchar,
90 mbfl_filt_conv_html_dec_ctor,
91 mbfl_filt_conv_html_dec_dtor,
92 mbfl_filt_conv_html_dec,
93 mbfl_filt_conv_html_dec_flush,
94 mbfl_filt_conv_html_dec_copy };
95
96
/*
 * Evaluate `statement` once; if its value is negative, make the enclosing
 * function return -1.  Wrapped in do/while(0) so it behaves as a single
 * statement.
 */
#define CK(statement)            \
	do {                         \
		if ((statement) < 0) {   \
			return (-1);         \
		}                        \
	} while (0)
98
99 /*
100 * any => HTML
101 */
/*
 * Encoder step: write one character `c` to the next stage as HTML text.
 *
 * Values below 256 whose htmlentitifieds[] entry is not 1 are forwarded
 * unchanged.  Everything else is written as "&name;" when
 * mbfl_html_entity_list has an entry with that code, and as a decimal
 * reference "&#NNN;" otherwise.
 *
 * Returns c, or -1 as soon as the output function reports a negative value.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const mbfl_html_entity_entry *entry;

	/* A negative c converts to a huge unsigned value and so is never
	 * treated as a table index. */
	if ((unsigned int)c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0])
			&& htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return c;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Find the first table entry carrying this code; the table ends at the
	 * entry whose name is NULL. */
	entry = &mbfl_html_entity_list[0];
	while (entry->name != NULL && entry->code != c) {
		entry++;
	}

	if (entry->name != NULL) {
		const char *s;

		for (s = entry->name; *s != '\0'; s++) {
			CK((*filter->output_function)((int)*s, filter->data));
		}
	} else {
		/* No name known: emit "#" followed by the unsigned decimal value.
		 * Digits are produced least significant first, so stack them and
		 * then unwind. */
		char digits[64];
		int count = 0;
		unsigned int rest = (unsigned int)c;

		CK((*filter->output_function)('#', filter->data));

		do {
			digits[count++] = "0123456789"[rest % 10];
			rest /= 10;
		} while (rest != 0);

		while (count > 0) {
			CK((*filter->output_function)(digits[--count], filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return c;
}
147
/*
 * Flush for the encoder: clear the filter's status and opaque fields, then
 * invoke the downstream flush function when one is installed.
 *
 * Always returns 0; the downstream flush result is not inspected.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = NULL;

	if (filter->flush_function == NULL) {
		return 0;
	}

	(*filter->flush_function)(filter->data);
	return 0;
}
159
160 /*
161 * HTML => any
162 */
/*
 * Capacity figure for the decoder's scratch buffer; the buffer itself is
 * allocated with one extra byte (html_enc_buffer_size + 1).
 */
#define html_enc_buffer_size 16

/* Characters the decoder accepts between '&' and ';': '#', digits, letters. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
165
/*
 * Constructor for the decoder: allocate the scratch buffer that collects a
 * pending entity and start with nothing buffered (status 0).
 *
 * The allocation result is stored as-is and is not checked here.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	void *scratch = mbfl_malloc(html_enc_buffer_size + 1);

	filter->opaque = scratch;
	filter->status = 0;
}
171
/*
 * Destructor for the decoder: release the scratch buffer (if any) and leave
 * the filter with status 0 and a NULL opaque pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *scratch = (void *)filter->opaque;

	filter->status = 0;
	filter->opaque = NULL;

	if (scratch != NULL) {
		mbfl_free(scratch);
	}
}
181
/*
 * Parse the digit run of a numeric character reference.
 *
 * digits - first character after "&#" (base 10) or "&#x" (base 16)
 * len    - number of characters to parse
 * base   - 10 or 16
 *
 * Returns the code point, or -1 when there are no digits, when a character
 * is not a digit of the requested base, or when the value reaches 0x110000.
 * The range check runs after every digit, so the accumulator never exceeds
 * 0x10FFFF * 16 + 15 and cannot overflow regardless of how many digits were
 * buffered.
 */
static int mbfl_html_dec_parse_number(const char *digits, int len, int base)
{
	int value = 0;
	int i;

	if (len <= 0) {
		return -1;
	}

	for (i = 0; i < len; i++) {
		int d = digits[i];

		if (d >= '0' && d <= '9') {
			d -= '0';
		} else if (base == 16 && d >= 'A' && d <= 'F') {
			d = d - 'A' + 10;
		} else if (base == 16 && d >= 'a' && d <= 'f') {
			d = d - 'a' + 10;
		} else {
			return -1;
		}

		value = value * base + d;
		if (value >= 0x110000) {
			return -1;
		}
	}

	return value;
}

/*
 * Decoder step: consume one input character `c`.
 *
 * filter->status is the number of characters currently held in the scratch
 * buffer (filter->opaque); 0 means no entity is pending.
 *
 *  - Outside an entity, '&' starts buffering and anything else is forwarded.
 *  - Inside an entity, ';' terminates it: "&#NNN;" / "&#xHHH;" are decoded
 *    numerically, anything else is looked up by name in
 *    mbfl_html_entity_list.  A reference that cannot be decoded is written
 *    out literally.
 *  - Any other character is appended; a character that cannot be part of an
 *    entity, a misplaced '#', or a full buffer causes the buffered text to
 *    be written out literally.  A '&' in that position starts a new entity.
 *
 * Returns c, or -1 when a checked output call reports a negative value.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos, ent = 0;
	const mbfl_html_entity_entry *entity;
	char *buffer = (char *)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return c;
	}

	if (c == ';') {
		/* Only look at buffer[1] when it was written for this entity;
		 * for the input "&;" it would otherwise be stale or
		 * uninitialized memory. */
		if (filter->status > 1 && buffer[1] == '#') {
			/* numeric entity */
			if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
				ent = mbfl_html_dec_parse_number(buffer + 3, filter->status - 3, 16);
			} else {
				ent = mbfl_html_dec_parse_number(buffer + 2, filter->status - 2, 10);
			}

			if (ent >= 0 && ent < 0x110000) {
				CK((*filter->output_function)(ent, filter->data));
			} else {
				/* not decodable: pass the reference through unchanged */
				for (pos = 0; pos < filter->status; pos++) {
					CK((*filter->output_function)(buffer[pos], filter->data));
				}
				CK((*filter->output_function)(c, filter->data));
			}
			filter->status = 0;
		} else {
			/* named entity */
			buffer[filter->status] = 0;
			for (entity = &mbfl_html_entity_list[0]; entity->name; entity++) {
				if (!strcmp(buffer + 1, entity->name)) {
					ent = entity->code;
					break;
				}
			}

			if (ent) {
				/* decoded */
				CK((*filter->output_function)(ent, filter->data));
				filter->status = 0;
			} else {
				/* failure: emit "&name;" literally */
				buffer[filter->status++] = ';';
				buffer[filter->status] = 0;
				mbfl_filt_conv_html_dec_flush(filter);
			}
		}
		return c;
	}

	/* add character and check */
	buffer[filter->status++] = (char)c;

	/* strchr() also matches the terminating NUL of html_entity_chars, so a
	 * NUL byte has to be rejected explicitly; otherwise it would be buffered
	 * and would truncate the name seen by strcmp() above. */
	if (c == '\0'
			|| !strchr(html_entity_chars, c)
			|| filter->status + 1 == html_enc_buffer_size
			|| (c == '#' && filter->status > 2)) {
		/* illegal character or end of buffer */
		if (c == '&') {
			/* the '&' is not part of the text being flushed ... */
			filter->status--;
		}
		buffer[filter->status] = 0;
		mbfl_filt_conv_html_dec_flush(filter);
		if (c == '&') {
			/* ... it opens the next entity instead */
			buffer[filter->status++] = '&';
		}
	}

	return c;
}
290
/*
 * Flush for the decoder: write every character still held in the scratch
 * buffer to the output function unchanged, reset status to 0, then invoke
 * the downstream flush function when one is installed.
 *
 * Returns 0 on success, or the last negative value reported by the output
 * function.  Only negative results count as failures, matching the CK()
 * convention used by the converters in this file, which return the
 * character itself on success.  (Assumes the installed output function
 * follows that same convention.)  The downstream flush result is not
 * inspected.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *buffer = (const unsigned char *)filter->opaque;
	int pending = filter->status;
	int pos;
	int err = 0;

	/* Reset first so the filter is idle even if an output call fails. */
	filter->status = 0;

	/* flush fragments; keep going after a failure so nothing is dropped */
	for (pos = 0; pos < pending; pos++) {
		int e = (*filter->output_function)(buffer[pos], filter->data);

		if (e < 0) {
			err = e;
		}
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return err;
}
314
/*
 * Copy routine for the decoder: duplicate every field of `src` into `dest`,
 * then give `dest` its own scratch buffer holding a byte-for-byte copy of
 * the source buffer so the two filters do not share pending entity text.
 *
 * Neither the allocation result nor src->opaque is checked here.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	void *clone;

	*dest = *src;

	clone = mbfl_malloc(html_enc_buffer_size + 1);
	dest->opaque = clone;
	memcpy(dest->opaque, src->opaque, html_enc_buffer_size + 1);
}
321