1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this part: Marcus Boerger <helly@php.net>
22 *
23 */
24 /*
25 * The source code included in this file was separated from mbfilter.c
26 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27 *
28 */
29
30 #include <string.h>
31 #include "mbfilter.h"
32 #include "mbfilter_htmlent.h"
33 #include "html_entities.h"
34
/*
 * Per-character class table, indexed by code point 0..255.
 *
 *   0 - no special class
 *   1 - class tested by the encoder in this file: such characters are
 *       written as an entity / numeric reference (all of 0x80..0xFF)
 *   2 - set only for '&' (0x26), '<' (0x3C) and '>' (0x3E); the encoder in
 *       this file treats it like 0 and passes the character through
 */
static const int htmlentitifieds[256] = {
	/* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
	/* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 - 0x8F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 - 0x9F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 - 0xAF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 - 0xBF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 - 0xEF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 - 0xFF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
53
/* NULL-terminated list of alternative names for the HTML-ENTITIES encoding. */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
55
56 const mbfl_encoding mbfl_encoding_html_ent = {
57 mbfl_no_encoding_html_ent,
58 "HTML-ENTITIES",
59 "HTML-ENTITIES",
60 (const char *(*)[])&mbfl_encoding_html_ent_aliases,
61 NULL,
62 MBFL_ENCTYPE_GL_UNSAFE,
63 &vtbl_html_wchar,
64 &vtbl_wchar_html
65 };
66
67 const struct mbfl_convert_vtbl vtbl_wchar_html = {
68 mbfl_no_encoding_wchar,
69 mbfl_no_encoding_html_ent,
70 mbfl_filt_conv_common_ctor,
71 NULL,
72 mbfl_filt_conv_html_enc,
73 mbfl_filt_conv_html_enc_flush,
74 NULL,
75 };
76
77 const struct mbfl_convert_vtbl vtbl_html_wchar = {
78 mbfl_no_encoding_html_ent,
79 mbfl_no_encoding_wchar,
80 mbfl_filt_conv_html_dec_ctor,
81 mbfl_filt_conv_html_dec_dtor,
82 mbfl_filt_conv_html_dec,
83 mbfl_filt_conv_html_dec_flush,
84 mbfl_filt_conv_html_dec_copy,
85 };
86
87
/*
 * Error propagation helper: evaluates `statement` once and, when its result
 * is negative, makes the enclosing function return -1.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
89
90 /*
91 * any => HTML
92 */
/*
 * Encode one character as HTML-ENTITIES output.
 *
 * A character in the range 0..255 whose htmlentitifieds[] class is not 1 is
 * forwarded unchanged.  Every other value is written as "&name;" when its
 * code is listed in mbfl_html_entity_list, or as the decimal reference
 * "&#NNN;" (c taken as unsigned) when it is not.
 *
 * Returns c on success, or -1 as soon as the output function reports a
 * negative result.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const int table_len = (int)(sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]));
	const mbfl_html_entity_entry *entry;

	/* Plain characters go straight through. */
	if (c >= 0 && c < table_len && htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return c;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Find the first named entity carrying this code; the list ends with a NULL name. */
	for (entry = &mbfl_html_entity_list[0]; entry->name != NULL; entry++) {
		if (entry->code == c) {
			break;
		}
	}

	if (entry->name != NULL) {
		const char *s;

		for (s = entry->name; *s != '\0'; s++) {
			CK((*filter->output_function)((int)*s, filter->data));
		}
	} else {
		/* No name known: emit '#' followed by the decimal digits. */
		char digits[16];
		int count = 0;
		unsigned int rest = (unsigned int)c;

		CK((*filter->output_function)('#', filter->data));

		/* Digits are produced least-significant first ... */
		do {
			digits[count++] = (char)('0' + rest % 10);
			rest /= 10;
		} while (rest != 0);

		/* ... and written out most-significant first. */
		while (count > 0) {
			CK((*filter->output_function)(digits[--count], filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return c;
}
138
/*
 * Flush the encoder.
 *
 * Resets status and opaque, then invokes the filter's flush_function (when
 * one is set) with filter->data.  Always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}
	return 0;
}
150
151 /*
152 * HTML => any
153 */
/*
 * Size limit used for the decoder's pending-entity buffer; the buffer itself
 * is allocated with one extra byte.
 */
#define html_enc_buffer_size 16

/* Characters that may follow '&' inside an entity: '#', digits and ASCII letters. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
156
/*
 * Decoder constructor: clears the status and allocates the buffer (stored in
 * filter->opaque) in which a partially received entity is collected.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	void *scratch;

	filter->status = 0;
	scratch = emalloc(html_enc_buffer_size + 1);
	filter->opaque = scratch;
}
162
/*
 * Decoder destructor: releases the entity buffer (if any) and leaves the
 * filter with status 0 and a NULL opaque pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *scratch = (void *)filter->opaque;

	filter->status = 0;
	filter->opaque = NULL;

	if (scratch != NULL) {
		efree(scratch);
	}
}
172
/*
 * Parse the digits of a numeric character reference.
 *
 * digits/len describe the characters after "&#" (base 10) or "&#x" (base 16).
 * Returns the code point, or -1 when there are no digits, a character is not
 * a digit of the given base, or the value exceeds 0x10FFFF.  Bailing out at
 * the first value above 0x10FFFF keeps the accumulator far away from signed
 * overflow, so an over-long reference can never wrap into the valid range.
 */
static int html_dec_parse_number(const char *digits, int len, int base)
{
	int value = 0;
	int i;

	if (len <= 0) {
		return -1;
	}

	for (i = 0; i < len; i++) {
		int d = digits[i];

		if (d >= '0' && d <= '9') {
			d -= '0';
		} else if (base == 16 && d >= 'A' && d <= 'F') {
			d = d - 'A' + 10;
		} else if (base == 16 && d >= 'a' && d <= 'f') {
			d = d - 'a' + 10;
		} else {
			return -1;
		}

		value = value * base + d;
		if (value > 0x10FFFF) {
			return -1;
		}
	}

	return value;
}

/*
 * Decode HTML-ENTITIES input one character at a time.
 *
 * filter->status is the number of characters of a pending entity currently
 * held in the buffer at filter->opaque (0 means "not inside an entity").
 *
 *  - Outside an entity, '&' starts collecting; anything else is forwarded.
 *  - ';' ends the entity: "&#NNN;" / "&#xHHH;" are decoded numerically,
 *    anything else is looked up by name in mbfl_html_entity_list.  A
 *    reference that cannot be decoded is written out verbatim.
 *  - A character that cannot be part of an entity, a second '#', or a full
 *    buffer causes the collected text to be written out verbatim; a new '&'
 *    immediately starts the next entity.
 *
 * Returns c on success, or -1 when writing to the output fails.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos, ent = 0;
	const mbfl_html_entity_entry *entity;
	char *buffer = (char *)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return c;
	}

	if (c == ';') {
		/* status > 1 guards against reading buffer[1] before it was ever written ("&;"). */
		if (filter->status > 1 && buffer[1] == '#') {
			/* numeric entity */
			if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
				ent = html_dec_parse_number(buffer + 3, filter->status - 3, 16);
			} else {
				ent = html_dec_parse_number(buffer + 2, filter->status - 2, 10);
			}

			if (ent >= 0) {
				CK((*filter->output_function)(ent, filter->data));
			} else {
				/* not a valid reference: reproduce the input unchanged */
				for (pos = 0; pos < filter->status; pos++) {
					CK((*filter->output_function)(buffer[pos], filter->data));
				}
				CK((*filter->output_function)(c, filter->data));
			}
			filter->status = 0;
		} else {
			/* named entity */
			buffer[filter->status] = 0;
			for (entity = &mbfl_html_entity_list[0]; entity->name; entity++) {
				if (!strcmp(buffer + 1, entity->name)) {
					ent = entity->code;
					break;
				}
			}

			if (ent) {
				/* decoded */
				CK((*filter->output_function)(ent, filter->data));
				filter->status = 0;
			} else {
				/* unknown name: write "&name;" back out */
				buffer[filter->status++] = ';';
				buffer[filter->status] = 0;
				CK(mbfl_filt_conv_html_dec_flush(filter));
			}
		}
		return c;
	}

	/* add character and check */
	buffer[filter->status++] = (char)c;

	/*
	 * strchr() also matches the terminating NUL of its first argument, so a
	 * NUL input byte has to be rejected explicitly; otherwise it would be
	 * stored as an entity character and could be dropped from the output.
	 */
	if (c == '\0' || strchr(html_entity_chars, c) == NULL
		|| filter->status + 1 == html_enc_buffer_size
		|| (c == '#' && filter->status > 2)) {
		/* illegal character or end of buffer */
		if (c == '&') {
			/* the '&' belongs to the next entity, not to the flushed text */
			filter->status--;
		}
		buffer[filter->status] = 0;
		CK(mbfl_filt_conv_html_dec_flush(filter));
		if (c == '&') {
			buffer[filter->status++] = '&';
		}
	}

	return c;
}
281
/*
 * Write out whatever part of an entity is still pending, unchanged, then
 * invoke the filter's flush_function (when one is set) with filter->data.
 *
 * The pending count is cleared before anything is written.  All pending
 * characters are attempted even if one of them fails.
 *
 * Returns 0 on success, or the negative result of the last failing output
 * call.  Only negative results count as failures, matching the CK()
 * convention used throughout this file: filter functions here return the
 * character on success, so a non-zero positive result is not an error.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *buffer = (const unsigned char *)filter->opaque;
	int pending = filter->status;
	int pos;
	int err = 0;

	filter->status = 0;

	/* flush fragments */
	for (pos = 0; pos < pending; pos++) {
		int e = (*filter->output_function)(buffer[pos], filter->data);

		if (e < 0) {
			err = e;
		}
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return err;
}
305
/*
 * Copy a decoder filter: dest receives a member-wise copy of src and then a
 * freshly allocated entity buffer holding the same bytes as src's buffer, so
 * the two filters do not share the buffer.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const size_t buffer_bytes = html_enc_buffer_size + 1;

	*dest = *src;
	dest->opaque = emalloc(buffer_bytes);
	memcpy(dest->opaque, src->opaque, buffer_bytes);
}
312