1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this part: Marcus Boerger <helly@php.net>
22 *
23 */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #ifdef HAVE_STRING_H
35 #include <string.h>
36 #endif
37
38 #ifdef HAVE_STRINGS_H
39 #include <strings.h>
40 #endif
41
42 #include "mbfilter.h"
43 #include "mbfilter_htmlent.h"
44 #include "html_entities.h"
45
/*
 * Per-value class for character values 0..255, consulted by
 * mbfl_filt_conv_html_enc():
 *   0 - ordinary value
 *   1 - value that the encoder writes as an entity reference
 *       (every value in 0x80..0xff)
 *   2 - marks '&', '<' and '>'
 * The encoder only tests for "== 1", so classes 0 and 2 are both passed
 * through unchanged.
 */
static const int htmlentitifieds[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* '&' */
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,	/* '<' '>' */
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
64
/* Alias strings referenced by mbfl_encoding_html_ent; NULL ends the list. */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
66
67 const mbfl_encoding mbfl_encoding_html_ent = {
68 mbfl_no_encoding_html_ent,
69 "HTML-ENTITIES",
70 "HTML-ENTITIES",
71 (const char *(*)[])&mbfl_encoding_html_ent_aliases,
72 NULL,
73 MBFL_ENCTYPE_ENC_STRM | MBFL_ENCTYPE_GL_UNSAFE
74 };
75
76 const struct mbfl_convert_vtbl vtbl_wchar_html = {
77 mbfl_no_encoding_wchar,
78 mbfl_no_encoding_html_ent,
79 mbfl_filt_conv_common_ctor,
80 mbfl_filt_conv_common_dtor,
81 mbfl_filt_conv_html_enc,
82 mbfl_filt_conv_html_enc_flush
83 };
84
85 const struct mbfl_convert_vtbl vtbl_html_wchar = {
86 mbfl_no_encoding_html_ent,
87 mbfl_no_encoding_wchar,
88 mbfl_filt_conv_html_dec_ctor,
89 mbfl_filt_conv_html_dec_dtor,
90 mbfl_filt_conv_html_dec,
91 mbfl_filt_conv_html_dec_flush };
92
93
/*
 * Evaluate expr once; if the result is negative, make the enclosing
 * function return -1. Non-negative results fall through.
 */
#define CK(expr)			\
	do {				\
		if ((expr) < 0)		\
			return (-1);	\
	} while (0)
95
96 /*
97 * any => HTML
98 */
mbfl_filt_conv_html_enc(int c,mbfl_convert_filter * filter)99 int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
100 {
101 int tmp[64];
102 int i;
103 unsigned int uc;
104 const mbfl_html_entity_entry *e;
105
106 if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
107 htmlentitifieds[c] != 1) {
108 CK((*filter->output_function)(c, filter->data));
109 } else {
110 CK((*filter->output_function)('&', filter->data));
111 for (i = 0; (e = &mbfl_html_entity_list[i])->name != NULL; i++) {
112 if (c == e->code) {
113 char *p;
114
115 for (p = e->name; *p != '\0'; p++) {
116 CK((*filter->output_function)((int)*p, filter->data));
117 }
118 goto last;
119 }
120 }
121
122 {
123 int *p = tmp + sizeof(tmp) / sizeof(tmp[0]);
124
125 CK((*filter->output_function)('#', filter->data));
126
127 uc = (unsigned int)c;
128
129 *(--p) = '\0';
130 do {
131 *(--p) = "0123456789"[uc % 10];
132 uc /= 10;
133 } while (uc);
134
135 for (; *p != '\0'; p++) {
136 CK((*filter->output_function)(*p, filter->data));
137 }
138 }
139 last:
140 CK((*filter->output_function)(';', filter->data));
141 }
142 return c;
143 }
144
/*
 * Flush hook for the encoder.
 *
 * The encoder keeps nothing buffered, so this only resets filter->status and
 * filter->opaque and then calls the downstream flush_function when one is
 * set. The downstream result is not inspected; the function always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
156
/*
 * HTML => any
 */

/*
 * Size limit for the decoder's entity buffer: the constructor allocates
 * html_enc_buffer_size+1 bytes, and the decoder abandons an entity once
 * status+1 reaches this value.
 */
#define html_enc_buffer_size 16

/* Characters the decoder accepts after '&' while collecting an entity. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
162
/*
 * Decoder constructor: allocates the entity buffer
 * (html_enc_buffer_size+1 bytes) into filter->opaque and clears
 * filter->status. The allocation result is stored without being checked.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->opaque = mbfl_malloc(html_enc_buffer_size + 1);
	filter->status = 0;
}
168
/*
 * Decoder destructor: releases the entity buffer held in filter->opaque
 * (when there is one) and leaves the filter with status 0 and a NULL opaque
 * pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *entity_buffer = (void*)filter->opaque;

	filter->opaque = NULL;
	filter->status = 0;

	if (entity_buffer != NULL) {
		mbfl_free(entity_buffer);
	}
}
178
/*
 * Parse a buffered numeric character reference ("&#NNN" or "&#xHHH").
 *
 * buffer - the characters collected so far, starting with "&#", without the
 *          terminating ';'
 * len    - number of characters stored in buffer
 *
 * Returns the referenced code point (0..0x10FFFF), or -1 when there are no
 * digits, a character is not a digit of the selected base, or the value
 * exceeds 0x10FFFF. The range is checked after every digit so the
 * accumulator can never overflow a signed int.
 */
static int mbfl_filt_conv_html_dec_numeric(const char *buffer, int len)
{
	int pos, base, value = 0;

	if (len > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
		base = 16;
		pos = 3;
	} else {
		base = 10;
		pos = 2;
	}

	if (pos >= len) {
		/* "&#" or "&#x" with nothing after it */
		return -1;
	}

	for (; pos < len; pos++) {
		int digit = buffer[pos];

		if (digit >= '0' && digit <= '9') {
			digit -= '0';
		} else if (base == 16 && digit >= 'A' && digit <= 'F') {
			digit = digit - 'A' + 10;
		} else if (base == 16 && digit >= 'a' && digit <= 'f') {
			digit = digit - 'a' + 10;
		} else {
			return -1;
		}

		/* value <= 0x10FFFF here, so value * 16 + 15 still fits in an int */
		value = value * base + digit;
		if (value > 0x10FFFF) {
			return -1;
		}
	}

	return value;
}

/*
 * HTML => any: decode one input character.
 *
 * filter->status is the number of characters of a pending entity held in
 * the buffer at filter->opaque (0 means "not inside an entity"):
 *   - outside an entity, '&' starts collecting and anything else is written
 *     straight to filter->output_function;
 *   - inside an entity, ';' ends it: "&#..." is decoded as a numeric
 *     reference, anything else is looked up by name in
 *     mbfl_html_entity_list. A reference that cannot be decoded is written
 *     out as the raw text that was collected;
 *   - any other character is appended. If it is not an entity character,
 *     the buffer limit is reached, or '#' appears anywhere but right after
 *     '&', the collected text is flushed as-is ('&' then starts a new
 *     entity).
 *
 * Note that mbfl_filt_conv_html_dec_flush() also invokes the downstream
 * flush_function, and that its return value is not checked here.
 *
 * Returns c, or -1 when a direct output call reports a negative result.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos, ent = 0;
	const mbfl_html_entity_entry *entity;
	char *buffer = (char*)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return c;
	}

	if (c == ';') {
		/*
		 * buffer[1] only holds data of the current entity when at least
		 * two characters were collected; for "&;" it must not be read.
		 */
		if (filter->status > 1 && buffer[1] == '#') {
			/* numeric entity */
			ent = mbfl_filt_conv_html_dec_numeric(buffer, filter->status);
			if (ent >= 0) {
				CK((*filter->output_function)(ent, filter->data));
			} else {
				/* not decodable: pass the raw text through, ';' included */
				for (pos = 0; pos < filter->status; pos++) {
					CK((*filter->output_function)(buffer[pos], filter->data));
				}
				CK((*filter->output_function)(c, filter->data));
			}
			filter->status = 0;
		} else {
			/* named entity */
			buffer[filter->status] = 0;
			for (entity = &mbfl_html_entity_list[0]; entity->name != NULL; entity++) {
				if (!strcmp(buffer + 1, entity->name)) {
					ent = entity->code;
					break;
				}
			}
			if (ent) {
				/* decoded */
				CK((*filter->output_function)(ent, filter->data));
				filter->status = 0;
			} else {
				/* failure: emit the collected text followed by ';' */
				buffer[filter->status++] = ';';
				buffer[filter->status] = 0;
				mbfl_filt_conv_html_dec_flush(filter);
			}
		}
		return c;
	}

	/* add character */
	buffer[filter->status++] = (char)c;

	/*
	 * strchr() converts its argument to char and also matches the string
	 * terminator, so restrict the lookup to 1..0x7f first; otherwise NUL
	 * would count as an entity character.
	 */
	if (c <= 0 || c > 0x7f || !strchr(html_entity_chars, c)
			|| filter->status + 1 == html_enc_buffer_size
			|| (c == '#' && filter->status > 2)) {
		/* illegal character or end of buffer */
		if (c == '&') {
			/* keep the '&' out of the flushed text ... */
			filter->status--;
		}
		buffer[filter->status] = 0;
		mbfl_filt_conv_html_dec_flush(filter);
		if (c == '&') {
			/* ... and let it start a new entity instead */
			buffer[filter->status++] = '&';
		}
	}

	return c;
}
287
/*
 * Flush hook for the decoder.
 *
 * Writes the filter->status characters of a pending, undecoded entity held
 * in filter->opaque to filter->output_function unchanged, resets
 * filter->status to 0, and then calls the downstream flush_function when
 * one is set (its result is not inspected).
 *
 * Every pending character is attempted even after a failure. As elsewhere
 * in this file (see CK), only a negative result from output_function counts
 * as an error; non-negative results are not errors.
 *
 * Returns 0 when every output call succeeded, otherwise the last negative
 * value reported by output_function.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *pending = (const unsigned char*)filter->opaque;
	int remaining = filter->status;
	int err = 0;

	/* Clear the state first so the buffer counts as empty from here on. */
	filter->status = 0;

	/* flush fragments */
	while (remaining-- > 0) {
		int e = (*filter->output_function)(*pending++, filter->data);

		if (e < 0) {
			err = e;
		}
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return err;
}
311
312
313