1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this part: Marcus Boerger <helly@php.net>
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include <string.h>
35 #include "mbfilter.h"
36 #include "mbfilter_htmlent.h"
37 #include "html_entities.h"
38 
/*
 * Per-value classification table consulted by mbfl_filt_conv_html_enc().
 * Entries equal to 1 (indexes 0x80-0xFF) make the encoder emit an entity
 * reference; any other entry is copied through unchanged.  '&' (0x26),
 * '<' (0x3C) and '>' (0x3E) carry the value 2, which the encoder in this
 * file handles exactly like 0.
 */
static const int htmlentitifieds[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
57 
58 static const char *mbfl_encoding_html_ent_aliases[] = {"HTML", "html", NULL};
59 
60 const mbfl_encoding mbfl_encoding_html_ent = {
61 	mbfl_no_encoding_html_ent,
62 	"HTML-ENTITIES",
63 	"HTML-ENTITIES",
64 	(const char *(*)[])&mbfl_encoding_html_ent_aliases,
65 	NULL,
66 	MBFL_ENCTYPE_ENC_STRM | MBFL_ENCTYPE_GL_UNSAFE,
67 	&vtbl_html_wchar,
68 	&vtbl_wchar_html
69 };
70 
71 const struct mbfl_convert_vtbl vtbl_wchar_html = {
72 	mbfl_no_encoding_wchar,
73 	mbfl_no_encoding_html_ent,
74 	mbfl_filt_conv_common_ctor,
75 	mbfl_filt_conv_common_dtor,
76 	mbfl_filt_conv_html_enc,
77 	mbfl_filt_conv_html_enc_flush
78 };
79 
80 const struct mbfl_convert_vtbl vtbl_html_wchar = {
81 	mbfl_no_encoding_html_ent,
82 	mbfl_no_encoding_wchar,
83 	mbfl_filt_conv_html_dec_ctor,
84 	mbfl_filt_conv_html_dec_dtor,
85 	mbfl_filt_conv_html_dec,
86 	mbfl_filt_conv_html_dec_flush,
87 	mbfl_filt_conv_html_dec_copy };
88 
89 
/*
 * Evaluate `statement` once and make the enclosing function return -1 when
 * the result is negative.  Only usable inside functions returning int.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
91 
92 /*
93  * any => HTML
94  */
/*
 * Encoder filter: writes one character as HTML-ENTITIES text.
 *
 * A value below 256 whose htmlentitifieds[] entry is not 1 is forwarded
 * unchanged.  Anything else is written as "&name;" when
 * mbfl_html_entity_list holds an entry with the same code, and otherwise
 * as the decimal reference "&#NNN;".
 *
 * Returns c, or -1 as soon as the output function returns a negative value.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const size_t table_len = sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]);
	const mbfl_html_entity_entry *entry;

	/* As size_t a negative c compares as a huge value and takes the entity path. */
	if ((size_t)c < table_len && htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return c;
	}

	CK((*filter->output_function)('&', filter->data));

	/* Walk the NULL-name-terminated list until an entry with this code shows up. */
	entry = mbfl_html_entity_list;
	while (entry->name != NULL && entry->code != c) {
		entry++;
	}

	if (entry->name != NULL) {
		const char *ch;

		for (ch = entry->name; *ch != '\0'; ch++) {
			CK((*filter->output_function)((int)*ch, filter->data));
		}
	} else {
		/* Three decimal digits per byte is always enough room. */
		char digits[3 * sizeof(unsigned int)];
		size_t head = sizeof(digits);
		unsigned int rest = (unsigned int)c;

		CK((*filter->output_function)('#', filter->data));

		/* Produce the digits back to front, then emit them front to back. */
		do {
			digits[--head] = "0123456789"[rest % 10];
			rest /= 10;
		} while (rest != 0);

		while (head < sizeof(digits)) {
			CK((*filter->output_function)(digits[head++], filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return c;
}
140 
/*
 * Flush routine of the encoder: clears the filter's status and opaque
 * fields and, when a flush function is installed, calls it with the
 * filter's data.  Always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
152 
153 /*
154  * HTML => any
155  */
/*
 * Size used for the decoder's scratch buffer; the buffer is allocated with
 * one extra byte (html_enc_buffer_size+1).
 */
#define html_enc_buffer_size	16

/* Characters the decoder accepts between '&' and ';'. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
158 
/*
 * Constructor of the HTML-ENTITIES decoder: resets the status and
 * allocates the scratch buffer (html_enc_buffer_size+1 bytes) in which a
 * pending "&..." sequence is collected.
 *
 * The buffer is zero-filled so that the decoder never inspects
 * indeterminate bytes (it looks at buffer[1] when a ';' arrives right
 * after '&').  If the allocation fails, filter->opaque stays NULL; the
 * destructor in this file already tolerates that.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->status = 0;
	filter->opaque = mbfl_malloc(html_enc_buffer_size + 1);
	if (filter->opaque != NULL) {
		memset(filter->opaque, 0, html_enc_buffer_size + 1);
	}
}
164 
/*
 * Destructor of the HTML-ENTITIES decoder: releases the scratch buffer
 * held in filter->opaque (if any) and clears both opaque and status.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *scratch = (void*)filter->opaque;

	filter->status = 0;
	filter->opaque = NULL;

	if (scratch != NULL) {
		mbfl_free(scratch);
	}
}
174 
/*
 * Parse the digits of a numeric character reference.
 *
 * digits/len describe the characters after "&#" (base 10) or "&#x"
 * (base 16).  Returns the value, or -1 when there are no digits, a
 * character is not a digit of the given base, or the value exceeds
 * 0x10FFFF.  Stopping at the first value above 0x10FFFF keeps the
 * accumulator far away from signed overflow, however many digits follow.
 */
static int mbfl_filt_conv_html_dec_number(const char *digits, int len, int base)
{
	int value = 0;
	int i;

	if (len <= 0) {
		return -1;
	}

	for (i = 0; i < len; i++) {
		int d = digits[i];

		if (d >= '0' && d <= '9') {
			d -= '0';
		} else if (base == 16 && d >= 'A' && d <= 'F') {
			d = d - 'A' + 10;
		} else if (base == 16 && d >= 'a' && d <= 'f') {
			d = d - 'a' + 10;
		} else {
			return -1;
		}

		value = value * base + d;
		if (value > 0x10FFFF) {
			return -1;
		}
	}

	return value;
}

/*
 * Decoder filter: consumes one character of HTML-ENTITIES text.
 *
 * filter->status is the number of characters of a pending "&..." sequence
 * held in the scratch buffer (filter->opaque); 0 means no entity is open.
 * On ';' the pending sequence is decoded as a numeric ("&#NNN;",
 * "&#xHHH;") or named reference and the resulting code is written to the
 * output function.  Sequences that cannot be decoded are written out
 * unchanged.
 *
 * Returns c, or -1 when a directly checked output call returns a negative
 * value.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos, ent = 0;
	const mbfl_html_entity_entry *entity;
	char *buffer = (char*)filter->opaque;

	if (buffer == NULL) {
		/* No scratch buffer (allocation failed): nothing can be collected,
		 * so hand the input through undecoded instead of dereferencing NULL. */
		CK((*filter->output_function)(c, filter->data));
		return c;
	}

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return c;
	}

	if (c != ';') {
		/* add character and check */
		buffer[filter->status++] = (char)c;
		/*
		 * strchr() converts its argument to char and also matches the
		 * terminating NUL, so values outside 1..0x7F are rejected up front;
		 * otherwise a NUL byte would count as a legal entity character.
		 */
		if (c <= 0 || c > 0x7f || !strchr(html_entity_chars, c)
				|| filter->status + 1 == html_enc_buffer_size
				|| (c == '#' && filter->status > 2)) {
			/* illegal character or end of buffer */
			if (c == '&') {
				/* keep the '&' back: it opens the next entity */
				filter->status--;
			}
			buffer[filter->status] = 0;
			/* NOTE(review): this also triggers the downstream flush function
			 * in mid-stream, as the original code did - confirm intended. */
			mbfl_filt_conv_html_dec_flush(filter);
			if (c == '&') {
				buffer[filter->status++] = '&';
			}
		}
		return c;
	}

	/* c == ';': try to decode what has been collected so far */
	if (filter->status > 1 && buffer[1] == '#') {
		/* numeric entity; buffer[1] is only looked at once it was stored */
		if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
			ent = mbfl_filt_conv_html_dec_number(buffer + 3, filter->status - 3, 16);
		} else {
			ent = mbfl_filt_conv_html_dec_number(buffer + 2, filter->status - 2, 10);
		}

		if (ent >= 0) {
			CK((*filter->output_function)(ent, filter->data));
		} else {
			/* not decodable: emit the collected text and the ';' as they came */
			for (pos = 0; pos < filter->status; pos++) {
				CK((*filter->output_function)(buffer[pos], filter->data));
			}
			CK((*filter->output_function)(c, filter->data));
		}
		filter->status = 0;
		return c;
	}

	/* named entity */
	buffer[filter->status] = 0;
	for (entity = mbfl_html_entity_list; entity->name != NULL; entity++) {
		if (!strcmp(buffer + 1, entity->name)) {
			ent = entity->code;
			break;
		}
	}

	if (ent) {
		/* decoded */
		CK((*filter->output_function)(ent, filter->data));
		filter->status = 0;
	} else {
		/* failure: append the ';' and write the whole sequence out unchanged */
		buffer[filter->status++] = ';';
		buffer[filter->status] = 0;
		mbfl_filt_conv_html_dec_flush(filter);
	}

	return c;
}
283 
/*
 * Flush routine of the HTML-ENTITIES decoder.
 *
 * Writes the characters of a pending, still undecoded "&..." sequence to
 * the output function unchanged, resets the status to 0 and then calls the
 * flush function when one is installed.
 *
 * Returns 0, or the last negative value returned by the output function.
 * Non-negative results are not errors: the filter functions in this file
 * return the character itself on success and CK() only treats values
 * below zero as failures.  All pending characters are attempted even
 * after a failure.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *buffer = (const unsigned char*)filter->opaque;
	int pending = filter->status;
	int err = 0;
	int pos;

	filter->status = 0;

	/* flush fragments */
	if (buffer != NULL) {
		for (pos = 0; pos < pending; pos++) {
			int e = (*filter->output_function)(buffer[pos], filter->data);

			if (e < 0) {
				err = e;
			}
		}
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return err;
}
307 
/*
 * Copy routine of the HTML-ENTITIES decoder.
 *
 * Copies every field of src into dest and then gives dest a scratch buffer
 * of its own holding the same pending characters, so the two filters do
 * not share filter->opaque.
 *
 * If the new buffer cannot be allocated, or src has no buffer, nothing is
 * copied and dest->status is reset to 0 so that dest does not claim
 * pending characters it does not hold.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	*dest = *src;
	dest->opaque = mbfl_malloc(html_enc_buffer_size + 1);

	if (dest->opaque != NULL && src->opaque != NULL) {
		memcpy(dest->opaque, src->opaque, html_enc_buffer_size + 1);
	} else {
		dest->status = 0;
	}
}
314