xref: /PHP-7.0/ext/zip/lib/zip_utf-8.c (revision 9e0cc7a1)
1 /*
2   zip_utf-8.c -- UTF-8 support functions for libzip
3   Copyright (C) 2011-2014 Dieter Baron and Thomas Klausner
4 
5   This file is part of libzip, a library to manipulate ZIP archives.
6   The authors can be contacted at <libzip@nih.at>
7 
8   Redistribution and use in source and binary forms, with or without
9   modification, are permitted provided that the following conditions
10   are met:
11   1. Redistributions of source code must retain the above copyright
12      notice, this list of conditions and the following disclaimer.
13   2. Redistributions in binary form must reproduce the above copyright
14      notice, this list of conditions and the following disclaimer in
15      the documentation and/or other materials provided with the
16      distribution.
17   3. The names of the authors may not be used to endorse or promote
18      products derived from this software without specific prior
19      written permission.
20 
21   THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 
35 #include "zipint.h"
36 
37 #include <stdlib.h>
38 
39 
/*
 * Mapping from code page 437 byte values to Unicode code points, indexed
 * by the CP437 byte.  Bytes 0x20-0x7E map to the identical code point;
 * every other byte maps to the code point listed in its row.  All entries
 * fit in 16 bits, so each encodes to at most three UTF-8 bytes.
 *
 * NOTE(review): byte 0x00 maps to U+2007 rather than U+0000 -- presumably
 * so that converted strings never contain an embedded NUL; confirm before
 * changing.
 */
static const zip_uint16_t _cp437_to_unicode[256] = {
    /* 0x00 - 0x0F */
    0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
    0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,

    /* 0x10 - 0x1F */
    0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
    0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,

    /* 0x20 - 0x2F */
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,

    /* 0x30 - 0x3F */
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

    /* 0x40 - 0x4F */
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,

    /* 0x50 - 0x5F */
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,

    /* 0x60 - 0x6F */
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,

    /* 0x70 - 0x7F */
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,

    /* 0x80 - 0x8F */
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,

    /* 0x90 - 0x9F */
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,

    /* 0xA0 - 0xAF */
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,

    /* 0xB0 - 0xBF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
    0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,

    /* 0xC0 - 0xCF */
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,

    /* 0xD0 - 0xDF */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
    0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,

    /* 0xE0 - 0xEF */
    0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
    0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,

    /* 0xF0 - 0xFF */
    0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
    0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
};
105 
/*
 * Bit patterns for classifying and building UTF-8 bytes.
 *
 * A byte b is the lead byte of an N-byte sequence when
 * (b & UTF_8_LEN_N_MASK) == UTF_8_LEN_N_MATCH, and a continuation byte
 * when (b & UTF_8_CONTINUE_MASK) == UTF_8_CONTINUE_MATCH.  The *_MATCH
 * values double as the fixed high bits OR-ed into bytes when encoding.
 */
#define UTF_8_LEN_2_MASK     0xe0
#define UTF_8_LEN_2_MATCH    0xc0
#define UTF_8_LEN_3_MASK     0xf0
#define UTF_8_LEN_3_MATCH    0xe0
#define UTF_8_LEN_4_MASK     0xf8
#define UTF_8_LEN_4_MATCH    0xf0
#define UTF_8_CONTINUE_MASK  0xc0
#define UTF_8_CONTINUE_MATCH 0x80
114 
115 
116 zip_encoding_type_t
_zip_guess_encoding(zip_string_t * str,zip_encoding_type_t expected_encoding)117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
118 {
119     zip_encoding_type_t enc;
120     const zip_uint8_t *name;
121     zip_uint32_t i, j, ulen;
122 
123     if (str == NULL)
124 	return ZIP_ENCODING_ASCII;
125 
126     name = str->raw;
127 
128     if (str->encoding != ZIP_ENCODING_UNKNOWN)
129 	enc = str->encoding;
130     else {
131 	enc = ZIP_ENCODING_ASCII;
132 	for (i=0; i<str->length; i++) {
133 	    if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
134 		continue;
135 
136 	    enc = ZIP_ENCODING_UTF8_GUESSED;
137 	    if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
138 		ulen = 1;
139 	    else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
140 		ulen = 2;
141 	    else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
142 		ulen = 3;
143 	    else {
144 		enc = ZIP_ENCODING_CP437;
145 		break;
146 	    }
147 
148 	    if (i + ulen >= str->length) {
149 		enc = ZIP_ENCODING_CP437;
150 		break;
151 	    }
152 
153 	    for (j=1; j<=ulen; j++) {
154 		if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
155 		    enc = ZIP_ENCODING_CP437;
156 		    goto done;
157 		}
158 	    }
159 	    i += ulen;
160 	}
161     }
162 
163 done:
164     str->encoding = enc;
165 
166     if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
167 	if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
168 	    str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
169 
170 	if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
171 	    return ZIP_ENCODING_ERROR;
172     }
173 
174     return enc;
175 }
176 
177 
178 static zip_uint32_t
_zip_unicode_to_utf8_len(zip_uint32_t codepoint)179 _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
180 {
181     if (codepoint < 0x0080)
182 	return 1;
183     if (codepoint < 0x0800)
184 	return 2;
185     if (codepoint < 0x10000)
186 	return 3;
187     return 4;
188 }
189 
190 
191 static zip_uint32_t
_zip_unicode_to_utf8(zip_uint32_t codepoint,zip_uint8_t * buf)192 _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
193 {
194     if (codepoint < 0x0080) {
195 	buf[0] = codepoint & 0xff;
196 	return 1;
197     }
198     if (codepoint < 0x0800) {
199 	buf[0] = (zip_uint8_t)(UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f));
200 	buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
201 	return 2;
202     }
203     if (codepoint < 0x10000) {
204 	buf[0] = (zip_uint8_t)(UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f));
205 	buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
206 	buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
207 	return 3;
208     }
209     buf[0] = (zip_uint8_t)(UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07));
210     buf[1] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f));
211     buf[2] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f));
212     buf[3] = (zip_uint8_t)(UTF_8_CONTINUE_MATCH | (codepoint & 0x3f));
213     return 4;
214 }
215 
216 
217 zip_uint8_t *
_zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf,zip_uint32_t len,zip_uint32_t * utf8_lenp,zip_error_t * error)218 _zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
219 		   zip_uint32_t *utf8_lenp, zip_error_t *error)
220 {
221     zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
222     zip_uint8_t *utf8buf;
223     zip_uint32_t buflen, i, offset;
224 
225     if (len == 0) {
226 	if (utf8_lenp)
227 	    *utf8_lenp = 0;
228 	return NULL;
229     }
230 
231     buflen = 1;
232     for (i=0; i<len; i++)
233 	buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
234 
235     if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
236 	zip_error_set(error, ZIP_ER_MEMORY, 0);
237 	return NULL;
238     }
239 
240     offset = 0;
241     for (i=0; i<len; i++)
242 	offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
243 				       utf8buf+offset);
244 
245     utf8buf[buflen-1] = 0;
246     if (utf8_lenp)
247 	*utf8_lenp = buflen-1;
248     return utf8buf;
249 }
250