/* xref: /PHP-5.6/ext/zip/lib/zip_utf-8.c (revision 5dc37b35) */
/*
  zip_utf-8.c -- UTF-8 support functions for libzip
  Copyright (C) 2011-2012 Dieter Baron and Thomas Klausner

  This file is part of libzip, a library to manipulate ZIP archives.
  The authors can be contacted at <libzip@nih.at>

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in
     the documentation and/or other materials provided with the
     distribution.
  3. The names of the authors may not be used to endorse or promote
     products derived from this software without specific prior
     written permission.

  THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
33 
34 
35 
#include "zipint.h"

#include <stdlib.h>
39 
40 
41 
42 static const zip_uint16_t _cp437_to_unicode[256] = {
43     /* 0x00 - 0x0F */
44     0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
45     0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
46 
47     /* 0x10 - 0x1F */
48     0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
49     0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
50 
51     /* 0x20 - 0x2F */
52     0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
53     0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
54 
55     /* 0x30 - 0x3F */
56     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
57     0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
58 
59     /* 0x40 - 0x4F */
60     0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
61     0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
62 
63     /* 0x50 - 0x5F */
64     0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
65     0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
66 
67     /* 0x60 - 0x6F */
68     0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
69     0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
70 
71     /* 0x70 - 0x7F */
72     0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
73     0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
74 
75     /* 0x80 - 0x8F */
76     0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
77     0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
78 
79     /* 0x90 - 0x9F */
80     0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
81     0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
82 
83     /* 0xA0 - 0xAF */
84     0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
85     0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
86 
87     /* 0xB0 - 0xBF */
88     0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
89     0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
90 
91     /* 0xC0 - 0xCF */
92     0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
93     0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
94 
95     /* 0xD0 - 0xDF */
96     0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
97     0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
98 
99     /* 0xE0 - 0xEF */
100     0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
101     0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
102 
103     /* 0xF0 - 0xFF */
104     0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
105     0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
106 };
107 
/*
 * Bit patterns used to classify UTF-8 bytes.  A byte b belongs to a class
 * when (b & <CLASS>_MASK) == <CLASS>_MATCH.
 */
#define UTF_8_LEN_2_MASK     0xe0	/* 110xxxxx: lead byte, 2-byte sequence */
#define UTF_8_LEN_2_MATCH    0xc0
#define UTF_8_LEN_3_MASK     0xf0	/* 1110xxxx: lead byte, 3-byte sequence */
#define UTF_8_LEN_3_MATCH    0xe0
#define UTF_8_LEN_4_MASK     0xf8	/* 11110xxx: lead byte, 4-byte sequence */
#define UTF_8_LEN_4_MATCH    0xf0
#define UTF_8_CONTINUE_MASK  0xc0	/* 10xxxxxx: continuation byte */
#define UTF_8_CONTINUE_MATCH 0x80
116 
117 
118 
119 enum zip_encoding_type
_zip_guess_encoding(struct zip_string * str,enum zip_encoding_type expected_encoding)120 _zip_guess_encoding(struct zip_string *str, enum zip_encoding_type expected_encoding)
121 {
122     enum zip_encoding_type enc;
123     const zip_uint8_t *name;
124     zip_uint32_t i, j, ulen;
125 
126     if (str == NULL)
127 	return ZIP_ENCODING_ASCII;
128 
129     name = str->raw;
130 
131     if (str->encoding != ZIP_ENCODING_UNKNOWN)
132 	enc = str->encoding;
133     else {
134 	enc = ZIP_ENCODING_ASCII;
135 	for (i=0; i<str->length; i++) {
136 	    if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
137 		continue;
138 
139 	    enc = ZIP_ENCODING_UTF8_GUESSED;
140 	    if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
141 		ulen = 1;
142 	    else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
143 		ulen = 2;
144 	    else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
145 		ulen = 3;
146 	    else {
147 		enc = ZIP_ENCODING_CP437;
148 		break;
149 	    }
150 
151 	    if (i + ulen >= str->length) {
152 		enc = ZIP_ENCODING_CP437;
153 		break;
154 	    }
155 
156 	    for (j=1; j<=ulen; j++) {
157 		if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
158 		    enc = ZIP_ENCODING_CP437;
159 		    goto done;
160 		}
161 	    }
162 	    i += ulen;
163 	}
164     }
165 
166 done:
167     str->encoding = enc;
168 
169     if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
170 	if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
171 	    str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
172 
173 	if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
174 	    return ZIP_ENCODING_ERROR;
175     }
176 
177     return enc;
178 }
179 
180 
181 
182 static zip_uint32_t
_zip_unicode_to_utf8_len(zip_uint32_t codepoint)183 _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
184 {
185     if (codepoint < 0x0080)
186 	return 1;
187     if (codepoint < 0x0800)
188 	return 2;
189     if (codepoint < 0x10000)
190 	return 3;
191     return 4;
192 }
193 
194 
195 
196 static zip_uint32_t
_zip_unicode_to_utf8(zip_uint32_t codepoint,zip_uint8_t * buf)197 _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
198 {
199     if (codepoint < 0x0080) {
200 	buf[0] = codepoint & 0xff;
201 	return 1;
202     }
203     if (codepoint < 0x0800) {
204 	buf[0] = UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f);
205 	buf[1] = UTF_8_CONTINUE_MATCH | (codepoint & 0x3f);
206 	return 2;
207     }
208     if (codepoint < 0x10000) {
209 	buf[0] = UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f);
210 	buf[1] = UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f);
211 	buf[2] = UTF_8_CONTINUE_MATCH | (codepoint & 0x3f);
212 	return 3;
213     }
214     buf[0] = UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07);
215     buf[1] = UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f);
216     buf[2] = UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f);
217     buf[3] = UTF_8_CONTINUE_MATCH | (codepoint & 0x3f);
218     return 4;
219 }
220 
221 
222 
223 zip_uint8_t *
_zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf,zip_uint32_t len,zip_uint32_t * utf8_lenp,struct zip_error * error)224 _zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
225 		   zip_uint32_t *utf8_lenp, struct zip_error *error)
226 {
227     zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
228     zip_uint8_t *utf8buf;
229     zip_uint32_t buflen, i, offset;
230 
231     if (len == 0) {
232 	if (utf8_lenp)
233 	    *utf8_lenp = 0;
234 	return NULL;
235     }
236 
237     buflen = 1;
238     for (i=0; i<len; i++)
239 	buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
240 
241     if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
242 	_zip_error_set(error, ZIP_ER_MEMORY, 0);
243 	return NULL;
244     }
245 
246     offset = 0;
247     for (i=0; i<len; i++)
248 	offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
249 				       utf8buf+offset);
250 
251     utf8buf[buflen-1] = 0;
252     if (utf8_lenp)
253 	*utf8_lenp = buflen-1;
254     return utf8buf;
255 }
256