1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this part: Marcus Boerger <helly@php.net>
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #include <string.h>
35 #include "mbfilter.h"
36 #include "mbfilter_htmlent.h"
37 #include "html_entities.h"
38
39 static const int htmlentitifieds[256] = {
40 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
41 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
42 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
43 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
44 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
56 };
57
58 static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
59
60 const mbfl_encoding mbfl_encoding_html_ent = {
61 mbfl_no_encoding_html_ent,
62 "HTML-ENTITIES",
63 "HTML-ENTITIES",
64 (const char *(*)[])&mbfl_encoding_html_ent_aliases,
65 NULL,
66 MBFL_ENCTYPE_ENC_STRM | MBFL_ENCTYPE_GL_UNSAFE,
67 &vtbl_html_wchar,
68 &vtbl_wchar_html
69 };
70
71 const struct mbfl_convert_vtbl vtbl_wchar_html = {
72 mbfl_no_encoding_wchar,
73 mbfl_no_encoding_html_ent,
74 mbfl_filt_conv_common_ctor,
75 mbfl_filt_conv_common_dtor,
76 mbfl_filt_conv_html_enc,
77 mbfl_filt_conv_html_enc_flush
78 };
79
80 const struct mbfl_convert_vtbl vtbl_html_wchar = {
81 mbfl_no_encoding_html_ent,
82 mbfl_no_encoding_wchar,
83 mbfl_filt_conv_html_dec_ctor,
84 mbfl_filt_conv_html_dec_dtor,
85 mbfl_filt_conv_html_dec,
86 mbfl_filt_conv_html_dec_flush,
87 mbfl_filt_conv_html_dec_copy };
88
89
90 #define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
91
92 /*
93 * any => HTML
94 */
mbfl_filt_conv_html_enc(int c,mbfl_convert_filter * filter)95 int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
96 {
97 int tmp[64];
98 int i;
99 unsigned int uc;
100 const mbfl_html_entity_entry *e;
101
102 if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
103 htmlentitifieds[c] != 1) {
104 CK((*filter->output_function)(c, filter->data));
105 } else {
106 CK((*filter->output_function)('&', filter->data));
107 for (i = 0; (e = &mbfl_html_entity_list[i])->name != NULL; i++) {
108 if (c == e->code) {
109 char *p;
110
111 for (p = e->name; *p != '\0'; p++) {
112 CK((*filter->output_function)((int)*p, filter->data));
113 }
114 goto last;
115 }
116 }
117
118 {
119 int *p = tmp + sizeof(tmp) / sizeof(tmp[0]);
120
121 CK((*filter->output_function)('#', filter->data));
122
123 uc = (unsigned int)c;
124
125 *(--p) = '\0';
126 do {
127 *(--p) = "0123456789"[uc % 10];
128 uc /= 10;
129 } while (uc);
130
131 for (; *p != '\0'; p++) {
132 CK((*filter->output_function)(*p, filter->data));
133 }
134 }
135 last:
136 CK((*filter->output_function)(';', filter->data));
137 }
138 return c;
139 }
140
mbfl_filt_conv_html_enc_flush(mbfl_convert_filter * filter)141 int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
142 {
143 filter->status = 0;
144 filter->opaque = NULL;
145
146 if (filter->flush_function != NULL) {
147 (*filter->flush_function)(filter->data);
148 }
149
150 return 0;
151 }
152
153 /*
154 * HTML => any
155 */
156 #define html_enc_buffer_size 16
157 static const char html_entity_chars[] = "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
158
mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter * filter)159 void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
160 {
161 filter->status = 0;
162 filter->opaque = mbfl_malloc(html_enc_buffer_size+1);
163 }
164
mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter * filter)165 void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
166 {
167 filter->status = 0;
168 if (filter->opaque)
169 {
170 mbfl_free((void*)filter->opaque);
171 }
172 filter->opaque = NULL;
173 }
174
mbfl_filt_conv_html_dec(int c,mbfl_convert_filter * filter)175 int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
176 {
177 int pos, ent = 0;
178 mbfl_html_entity_entry *entity;
179 char *buffer = (char*)filter->opaque;
180
181 if (!filter->status) {
182 if (c == '&' ) {
183 filter->status = 1;
184 buffer[0] = '&';
185 } else {
186 CK((*filter->output_function)(c, filter->data));
187 }
188 } else {
189 if (c == ';') {
190 if (buffer[1]=='#') {
191 if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
192 if (filter->status > 3) {
193 /* numeric entity */
194 for (pos=3; pos<filter->status; pos++) {
195 int v = buffer[pos];
196 if (v >= '0' && v <= '9') {
197 v = v - '0';
198 } else if (v >= 'A' && v <= 'F') {
199 v = v - 'A' + 10;
200 } else if (v >= 'a' && v <= 'f') {
201 v = v - 'a' + 10;
202 } else {
203 ent = -1;
204 break;
205 }
206 ent = ent * 16 + v;
207 }
208 } else {
209 ent = -1;
210 }
211 } else {
212 /* numeric entity */
213 if (filter->status > 2) {
214 for (pos=2; pos<filter->status; pos++) {
215 int v = buffer[pos];
216 if (v >= '0' && v <= '9') {
217 v = v - '0';
218 } else {
219 ent = -1;
220 break;
221 }
222 ent = ent*10 + v;
223 }
224 } else {
225 ent = -1;
226 }
227 }
228 if (ent >= 0 && ent < 0x110000) {
229 CK((*filter->output_function)(ent, filter->data));
230 } else {
231 for (pos = 0; pos < filter->status; pos++) {
232 CK((*filter->output_function)(buffer[pos], filter->data));
233 }
234 CK((*filter->output_function)(c, filter->data));
235 }
236 filter->status = 0;
237 /*php_error_docref("ref.mbstring", E_NOTICE, "mbstring decoded '%s'=%d", buffer, ent);*/
238 } else {
239 /* named entity */
240 buffer[filter->status] = 0;
241 entity = (mbfl_html_entity_entry *)mbfl_html_entity_list;
242 while (entity->name) {
243 if (!strcmp(buffer+1, entity->name)) {
244 ent = entity->code;
245 break;
246 }
247 entity++;
248 }
249 if (ent) {
250 /* decoded */
251 CK((*filter->output_function)(ent, filter->data));
252 filter->status = 0;
253 /*php_error_docref("ref.mbstring", E_NOTICE,"mbstring decoded '%s'=%d", buffer, ent);*/
254 } else {
255 /* failure */
256 buffer[filter->status++] = ';';
257 buffer[filter->status] = 0;
258 /* php_error_docref("ref.mbstring", E_WARNING, "mbstring cannot decode '%s'", buffer); */
259 mbfl_filt_conv_html_dec_flush(filter);
260 }
261 }
262 } else {
263 /* add character */
264 buffer[filter->status++] = c;
265 /* add character and check */
266 if (!strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
267 {
268 /* illegal character or end of buffer */
269 if (c=='&')
270 filter->status--;
271 buffer[filter->status] = 0;
272 /* php_error_docref("ref.mbstring", E_WARNING, "mbstring cannot decode '%s'", buffer)l */
273 mbfl_filt_conv_html_dec_flush(filter);
274 if (c=='&')
275 {
276 buffer[filter->status++] = '&';
277 }
278 }
279 }
280 }
281 return c;
282 }
283
mbfl_filt_conv_html_dec_flush(mbfl_convert_filter * filter)284 int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
285 {
286 int status, pos = 0;
287 unsigned char *buffer;
288 int err = 0;
289
290 buffer = (unsigned char*)filter->opaque;
291 status = filter->status;
292 filter->status = 0;
293
294 /* flush fragments */
295 while (status--) {
296 int e = (*filter->output_function)(buffer[pos++], filter->data);
297 if (e != 0)
298 err = e;
299 }
300
301 if (filter->flush_function != NULL) {
302 (*filter->flush_function)(filter->data);
303 }
304
305 return err;
306 }
307
mbfl_filt_conv_html_dec_copy(mbfl_convert_filter * src,mbfl_convert_filter * dest)308 void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
309 {
310 *dest = *src;
311 dest->opaque = mbfl_malloc(html_enc_buffer_size+1);
312 memcpy(dest->opaque, src->opaque, html_enc_buffer_size+1);
313 }
314