xref: /PHP-7.4/ext/standard/cyr_convert.c (revision 92ac598a)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) The PHP Group                                          |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Kirill Maximov <kir@rus.net>                                 |
16    +----------------------------------------------------------------------+
17  */
18 
19 #include <stdlib.h>
20 
21 #ifdef HAVE_UNISTD_H
22 #include <unistd.h>
23 #endif
24 #include <string.h>
25 #include <errno.h>
26 
27 #include "php.h"
28 #include "cyr_convert.h"
29 
30 #include <stdio.h>
31 
32 /*****************************************************************************
33 * These are the code tables for different Cyrillic charsets (relative to koi8-r).
34 * Each half of a table has 256 entries; only entries 128-255 differ from identity.
35 * First 256 symbols are for conversion from the corresponding charset to koi8-r,
36 * second 256 symbols are for reverse conversion, from koi8-r to the charset.
37 *
38 * Here we have the following tables:
39 * _cyr_win1251   - for windows-1251 charset
40 * _cyr_iso88595  - for iso8859-5 charset
41 * _cyr_cp866     - for x-cp866 charset
42 * _cyr_mac       - for x-mac-cyrillic charset
43 *
44 *****************************************************************************/
45 
46 typedef unsigned char _cyr_charset_table[512]; /* two 256-entry byte maps stored back to back */
47 
48 /* {{{ static const _cyr_charset_table _cyr_win1251, _cyr_cp866, _cyr_iso88595, _cyr_mac
 * Byte lookup tables, one per supported charset other than koi8-r.
 *
 * php_convert_cyr_string() indexes them as follows:
 *   - entries   0-255 (table[b])       : byte b of the table's charset -> koi8-r byte
 *   - entries 256-511 (table[b + 256]) : koi8-r byte b -> byte of the table's charset
 *
 * In every table the first 128 entries of each half map a value to itself;
 * only the upper 128 entries of each half carry charset-specific data.
49  */
50 static const _cyr_charset_table _cyr_win1251 = {
51 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 0-255: windows-1251 -> koi8-r */
52 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
53 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
54 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
55 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
56 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
57 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
58 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
59 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
60 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
61 154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
62 46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
63 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
64 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
65 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
66 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
67 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 256-511: koi8-r -> windows-1251 */
68 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
69 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
70 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
71 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
72 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
73 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
75 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
76 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
77 32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
78 32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
79 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
80 239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
81 222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
82 207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
83 },
84 _cyr_cp866 = { /* selected by labels 'a' and 'd' */
85 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 0-255: x-cp866 -> koi8-r */
86 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
87 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
88 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
89 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
90 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
91 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
92 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
93 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
94 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
95 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
96 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
97 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
98 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
99 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
100 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
101 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 256-511: koi8-r -> x-cp866 */
102 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
103 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
104 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
105 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
106 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
107 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
108 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
109 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
110 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
111 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
112 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
113 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
114 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
115 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
116 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
117 },
118 _cyr_iso88595 = { /* selected by label 'i' */
119 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 0-255: iso8859-5 -> koi8-r */
120 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
121 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
122 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
123 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
124 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
125 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
126 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
127 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
128 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
129 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
130 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
131 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
132 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
133 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
134 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
135 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 256-511: koi8-r -> iso8859-5 */
136 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
137 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
138 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
139 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
140 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
141 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
142 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
143 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
144 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
145 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
146 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
147 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
148 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
149 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
150 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
151 },
152 _cyr_mac = { /* selected by label 'm' */
153 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 0-255: x-mac-cyrillic -> koi8-r */
154 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
155 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
156 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
157 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
158 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
159 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
160 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
161 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
162 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
163 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
164 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
165 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
166 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
167 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
168 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
169 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* entries 256-511: koi8-r -> x-mac-cyrillic */
170 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
171 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
172 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
173 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
174 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
175 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
176 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
177 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
178 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
179 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
180 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
181 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
182 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
183 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
184 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
185 };
186 /* }}} */
187 
188 /* {{{ static char * php_convert_cyr_string(unsigned char *str, int length, char from, char to)
189 * This is the function that performs real in-place conversion of the string
190 * between charsets.
191 * Parameters:
192 *    str - string to be converted
193 *    from,to - one-symbol label of source and destination charset
194 * The following symbols are used as labels:
195 *    k - koi8-r
196 *    w - windows-1251
197 *    i - iso8859-5
198 *    a - x-cp866
199 *    d - x-cp866
200 *    m - x-mac-cyrillic
201 *****************************************************************************/
php_convert_cyr_string(unsigned char * str,size_t length,char from,char to)202 static char * php_convert_cyr_string(unsigned char *str, size_t length, char from, char to)
203 {
204 	const unsigned char *from_table, *to_table;
205 	unsigned char tmp;
206 	size_t i;
207 
208 	from_table = NULL;
209 	to_table   = NULL;
210 
211 	switch (toupper((int)(unsigned char)from))
212 	{
213 		case 'W':
214 			from_table = _cyr_win1251;
215 			break;
216 		case 'A':
217 		case 'D':
218 			from_table = _cyr_cp866;
219 			break;
220 		case 'I':
221 			from_table = _cyr_iso88595;
222 			break;
223 		case 'M':
224 			from_table = _cyr_mac;
225 			break;
226 		case 'K':
227 			break;
228 		default:
229 			php_error_docref(NULL, E_WARNING, "Unknown source charset: %c", from);
230 			break;
231 	}
232 
233 	switch (toupper((int)(unsigned char)to))
234 	{
235 		case 'W':
236 			to_table = _cyr_win1251;
237 			break;
238 		case 'A':
239 		case 'D':
240 			to_table = _cyr_cp866;
241 			break;
242 		case 'I':
243 			to_table = _cyr_iso88595;
244 			break;
245 		case 'M':
246 			to_table = _cyr_mac;
247 			break;
248 		case 'K':
249 			break;
250 		default:
251 			php_error_docref(NULL, E_WARNING, "Unknown destination charset: %c", to);
252 			break;
253 	}
254 
255 
256 	if (!str)
257 		return (char *)str;
258 
259 	for (i = 0; i < length; i++) {
260 		tmp = (from_table == NULL)? str[i] : from_table[ str[i] ];
261 		str[i] = (to_table == NULL) ? tmp : to_table[tmp + 256];
262 	}
263 	return (char *)str;
264 }
265 /* }}} */
266 
267 /* {{{ proto string convert_cyr_string(string str, string from, string to)
268    Convert from one Cyrillic character set to another */
PHP_FUNCTION(convert_cyr_string)269 PHP_FUNCTION(convert_cyr_string)
270 {
271 	char *input, *fr_cs, *to_cs;
272 	size_t input_len, fr_cs_len, to_cs_len;
273 	zend_string *str;
274 
275 	ZEND_PARSE_PARAMETERS_START(3, 3)
276 		Z_PARAM_STRING(input, input_len)
277 		Z_PARAM_STRING(fr_cs, fr_cs_len)
278 		Z_PARAM_STRING(to_cs, to_cs_len)
279 	ZEND_PARSE_PARAMETERS_END();
280 
281 	str = zend_string_init(input, input_len, 0);
282 
283 	php_convert_cyr_string((unsigned char *) ZSTR_VAL(str), ZSTR_LEN(str), fr_cs[0], to_cs[0]);
284 	RETVAL_NEW_STR(str);
285 }
286 /* }}} */
287