/* Source: PHP-7.4, win32/cp_enc_map_gen.c (revision 92ac598a) */
1 #include <stdio.h>
2 
3 #include <windows.h>
4 
5 struct cp {
6 	DWORD id;
7 	char *name;
8 	char *enc;
9 	char *desc;
10 };
11 
/*
 * Table of code pages the generator probes, in ascending id order.
 *
 * Each row is { id, name, enc, desc }:
 *   - id:   numeric Windows code page identifier.
 *   - name: primary name written to the generated table (may be "").
 *   - enc:  '|'-separated alias names; rows with an empty string here are
 *           emitted with NULL in the alias column by main().
 *   - desc: description kept for reference. main() prints the name reported
 *           by the system (CPINFOEX.CodePageName) instead of this text and
 *           only uses desc to detect the end of the table.
 *
 * NOTE(review): the alias lists look like iconv-style encoding names --
 * confirm against the consumer of the generated php_win32_cp_map.
 */
static const struct cp cp_map[] = {
	{ 37,	"IBM037", "", "IBM EBCDIC US-Canada" },
	{ 437,	"IBM437", "", "OEM United States" },
	{ 500,	"IBM500", "", "IBM EBCDIC International" },
	{ 708,	"ASMO-708", "", "Arabic (ASMO 708)" },
	{ 709,	"", "", "Arabic (ASMO-449+, BCON V4)" },
	{ 710,	"", "", "Arabic - Transparent Arabic" },
	{ 720,	"DOS-720", "", "Arabic (Transparent ASMO); Arabic (DOS)" },
	{ 737,	"ibm737", "", "OEM Greek (formerly 437G); Greek (DOS)" },
	{ 775,	"ibm775", "", "OEM Baltic; Baltic (DOS)" },
	{ 850,	"ibm850", "850|CP850|IBM850|CSPC850MULTILINGUAL", "OEM Multilingual Latin 1; Western European (DOS)" },
	{ 852,	"ibm852", "", "OEM Latin 2; Central European (DOS)" },
	{ 855,	"IBM855", "", "OEM Cyrillic (primarily Russian)" },
	{ 857,	"ibm857", "", "OEM Turkish; Turkish (DOS)" },
	{ 858,	"IBM00858", "", "OEM Multilingual Latin 1 + Euro symbol" },
	{ 860,	"IBM860", "", "OEM Portuguese; Portuguese (DOS)" },
	{ 861,	"ibm861", "", "OEM Icelandic; Icelandic (DOS)" },
	{ 862,	"DOS-862", "862|CP862|IBM862|CSPC862LATINHEBREW", "OEM Hebrew; Hebrew (DOS)" },
	{ 863,	"IBM863", "", "OEM French Canadian; French Canadian (DOS)" },
	{ 864,	"IBM864", "", "OEM Arabic; Arabic (864)" },
	{ 865,	"IBM865", "", "OEM Nordic; Nordic (DOS)" },
	{ 866,	"cp866", "866|CP866|IBM866|CSIBM866", "OEM Russian; Cyrillic (DOS)" },
	{ 869,	"ibm869", "", "OEM Modern Greek; Greek, Modern (DOS)" },
	{ 870,	"IBM870", "", "IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2" },
	{ 874,	"windows-874", "CP874", "ANSI/OEM Thai (ISO 8859-11); Thai (Windows)" },
	{ 875,	"cp875", "", "IBM EBCDIC Greek Modern" },
	{ 932,	"shift_jis", "CP932|SHIFT_JIS|MS_KANJI|CSSHIFTJIS", "ANSI/OEM Japanese; Japanese (Shift-JIS)" },
	{ 936,	"gb2312", "GB2312|GBK|CP936|MS936|WINDOWS-936", "ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)" },
	{ 949,	"ks_c_5601-1987", "CP949|UHC", "ANSI/OEM Korean (Unified Hangul Code)" },
	{ 950,	"big5", "CP950|BIG-5", "ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)" },
	{ 1026,	"IBM1026", "", "IBM EBCDIC Turkish (Latin 5)" },
	{ 1047,	"IBM01047", "", "IBM EBCDIC Latin 1/Open System" },
	{ 1140,	"IBM01140", "", "IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)" },
	{ 1141,	"IBM01141", "", "IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)" },
	{ 1142,	"IBM01142", "", "IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)" },
	{ 1143,	"IBM01143", "", "IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)" },
	{ 1144,	"IBM01144", "", "IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)" },
	{ 1145,	"IBM01145", "", "IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)" },
	{ 1146,	"IBM01146", "", "IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)" },
	{ 1147,	"IBM01147", "", "IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)" },
	{ 1148,	"IBM01148", "", "IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)" },
	{ 1149,	"IBM01149", "", "IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)" },
	{ 1200,	"utf-16", "", "Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications" },
	{ 1201,	"unicodeFFFE", "", "Unicode UTF-16, big endian byte order; available only to managed applications" },
	{ 1250,	"windows-1250", "CP1250|MS-EE|WINDOWS-1250", "ANSI Central European; Central European (Windows)" },
	{ 1251,	"windows-1251", "CP1251|MS-CYRL|WINDOWS-1251", "ANSI Cyrillic; Cyrillic (Windows)" },
	{ 1252,	"windows-1252", "CP1252|MS-ANSI|WINDOWS-1252", "ANSI Latin 1; Western European (Windows)" },
	{ 1253,	"windows-1253", "CP1253|MS-GREEK|WINDOWS-1253", "ANSI Greek; Greek (Windows)" },
	{ 1254,	"windows-1254", "CP1254|MS-TURK|WINDOWS-1254", "ANSI Turkish; Turkish (Windows)" },
	{ 1255,	"windows-1255", "CP1255|MS-HEBR|WINDOWS-1255", "ANSI Hebrew; Hebrew (Windows)" },
	{ 1256,	"windows-1256", "CP1256|MS-ARAB|WINDOWS-1256", "ANSI Arabic; Arabic (Windows)" },
	{ 1257,	"windows-1257", "CP1257|WINBALTRIM|WINDOWS-1257", "ANSI Baltic; Baltic (Windows)" },
	{ 1258,	"windows-1258", "CP1258|WINDOWS-1258", "ANSI/OEM Vietnamese; Vietnamese (Windows)" },
	{ 1361,	"Johab", "CP1361|JOHAB", "Korean (Johab)" },
	{ 10000,	"macintosh", "MAC|MACINTOSH|MACROMAN|CSMACINTOSH", "MAC Roman; Western European (Mac)" },
	{ 10001,	"x-mac-japanese", "", "Japanese (Mac)" },
	{ 10002,	"x-mac-chinesetrad", "", "MAC Traditional Chinese (Big5); Chinese Traditional (Mac)" },
	{ 10003,	"x-mac-korean", "", "Korean (Mac)" },
	{ 10004,	"x-mac-arabic", "MACARABIC", "Arabic (Mac)" },
	{ 10005,	"x-mac-hebrew", "MACHEBREW", "Hebrew (Mac)" },
	{ 10006,	"x-mac-greek", "MACGREEK", "Greek (Mac)" },
	{ 10007,	"x-mac-cyrillic", "MACCYRILLIC", "Cyrillic (Mac)" },
	{ 10008,	"x-mac-chinesesimp", "", "MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)" },
	{ 10010,	"x-mac-romanian", "MACROMANIA", "Romanian (Mac)" },
	{ 10017,	"x-mac-ukrainian", "MACUKRAINE", "Ukrainian (Mac)" },
	{ 10021,	"x-mac-thai", "MACTHAI", "Thai (Mac)" },
	{ 10029,	"x-mac-ce", "MACCENTRALEUROPE", "MAC Latin 2; Central European (Mac)" },
	{ 10079,	"x-mac-icelandic", "MACICELAND", "Icelandic (Mac)" },
	{ 10081,	"x-mac-turkish", "MACTURKISH", "Turkish (Mac)" },
	{ 10082,	"x-mac-croatian", "MACCROATIAN", "Croatian (Mac)" },
	{ 12000,	"utf-32", "", "Unicode UTF-32, little endian byte order; available only to managed applications" },
	{ 12001,	"utf-32BE", "", "Unicode UTF-32, big endian byte order; available only to managed applications" },
	{ 20000,	"x-Chinese_CNS", "", "CNS Taiwan; Chinese Traditional (CNS)" },
	{ 20001,	"x-cp20001", "", "TCA Taiwan" },
	{ 20002,	"x_Chinese-Eten", "", "Eten Taiwan; Chinese Traditional (Eten)" },
	{ 20003,	"x-cp20003", "", "IBM5550 Taiwan" },
	{ 20004,	"x-cp20004", "", "TeleText Taiwan" },
	{ 20005,	"x-cp20005", "", "Wang Taiwan" },
	{ 20105,	"x-IA5", "", "IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)" },
	{ 20106,	"x-IA5-German", "", "IA5 German (7-bit)" },
	{ 20107,	"x-IA5-Swedish", "", "IA5 Swedish (7-bit)" },
	{ 20108,	"x-IA5-Norwegian", "", "IA5 Norwegian (7-bit)" },
	{ 20127,	"us-ascii", "", "US-ASCII (7-bit)" },
	{ 20261,	"x-cp20261", "", "T.61" },
	{ 20269,	"x-cp20269", "", "ISO 6937 Non-Spacing Accent" },
	{ 20273,	"IBM273", "", "IBM EBCDIC Germany" },
	{ 20277,	"IBM277", "", "IBM EBCDIC Denmark-Norway" },
	{ 20278,	"IBM278", "", "IBM EBCDIC Finland-Sweden" },
	{ 20280,	"IBM280", "", "IBM EBCDIC Italy" },
	{ 20284,	"IBM284", "", "IBM EBCDIC Latin America-Spain" },
	{ 20285,	"IBM285", "", "IBM EBCDIC United Kingdom" },
	{ 20290,	"IBM290", "", "IBM EBCDIC Japanese Katakana Extended" },
	{ 20297,	"IBM297", "", "IBM EBCDIC France" },
	{ 20420,	"IBM420", "", "IBM EBCDIC Arabic" },
	{ 20423,	"IBM423", "", "IBM EBCDIC Greek" },
	{ 20424,	"IBM424", "", "IBM EBCDIC Hebrew" },
	{ 20833,	"x-EBCDIC-KoreanExtended", "", "IBM EBCDIC Korean Extended" },
	{ 20838,	"IBM-Thai", "", "IBM EBCDIC Thai" },
	{ 20866,	"koi8-r", "KOI8-R|CSKOI8R", "Russian (KOI8-R); Cyrillic (KOI8-R)" },
	{ 20871,	"IBM871", "", "IBM EBCDIC Icelandic" },
	{ 20880,	"IBM880", "", "IBM EBCDIC Cyrillic Russian" },
	{ 20905,	"IBM905", "", "IBM EBCDIC Turkish" },
	{ 20924,	"IBM00924", "", "IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)" },
	{ 20932,	"EUC-JP", "EUC-JP|EUCJP|EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE|CSEUCPKDFMTJAPANESE", "Japanese (JIS 0208-1990 and 0212-1990)" },
	{ 20936,	"x-cp20936", "", "Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)" },
	{ 20949,	"x-cp20949", "", "Korean Wansung" },
	{ 21025,	"cp1025", "", "IBM EBCDIC Cyrillic Serbian-Bulgarian" },
	/*{ 21027,	"", "", "(deprecated)" },*/
	{ 21866,	"koi8-u", "KOI8-U", "Ukrainian (KOI8-U); Cyrillic (KOI8-U)" },
	{ 28591,	"iso-8859-1", "CP819|IBM819|ISO-8859-1|ISO-IR-100|ISO8859-1|ISO_8859-1|ISO_8859-1:1987|L1|LATIN1|CSISOLATIN1", "ISO 8859-1 Latin 1; Western European (ISO)" },
	{ 28592,	"iso-8859-2", "ISO-8859-2|ISO-IR-101|ISO8859-2|ISO_8859-2|ISO_8859-2:1987|L2|LATIN2|CSISOLATIN2", "ISO 8859-2 Central European; Central European (ISO)" },
	{ 28593,	"iso-8859-3", "ISO-8859-3|ISO-IR-109|ISO8859-3|ISO_8859-3|ISO_8859-3:1988|L3|LATIN3|CSISOLATIN3", "ISO 8859-3 Latin 3" },
	{ 28594,	"iso-8859-4", "ISO-8859-4|ISO-IR-110|ISO8859-4|ISO_8859-4|ISO_8859-4:1988|L4|LATIN4|CSISOLATIN4", "ISO 8859-4 Baltic" },
	{ 28595,	"iso-8859-5", "CYRILLIC|ISO-8859-5|ISO-IR-144|ISO8859-5|ISO_8859-5|ISO_8859-5:1988|CSISOLATINCYRILLIC", "ISO 8859-5 Cyrillic" },
	{ 28596,	"iso-8859-6", "ARABIC|ASMO-708|ECMA-114|ISO-8859-6|ISO-IR-127|ISO8859-6|ISO_8859-6|ISO_8859-6:1987|CSISOLATINARABIC", "ISO 8859-6 Arabic" },
	{ 28597,	"iso-8859-7", "ECMA-118|ELOT_928|GREEK|GREEK8|ISO-8859-7|ISO-IR-126|ISO8859-7|ISO_8859-7|ISO_8859-7:1987|ISO_8859-7:2003|CSISOLATINGREEK", "ISO 8859-7 Greek" },
	{ 28598,	"iso-8859-8", "HEBREW|ISO-8859-8|ISO-IR-138|ISO8859-8|ISO_8859-8|ISO_8859-8:1988|CSISOLATINHEBREW", "ISO 8859-8 Hebrew; Hebrew (ISO-Visual)" },
	{ 28599,	"iso-8859-9", "ISO-8859-9|ISO-IR-148|ISO8859-9|ISO_8859-9|ISO_8859-9:1989|L5|LATIN5|CSISOLATIN5", "ISO 8859-9 Turkish" },
	{ 28603,	"iso-8859-13", "ISO-8859-13|ISO-IR-179|ISO8859-13|ISO_8859-13|L7|LATIN7", "ISO 8859-13 Estonian" },
	{ 28605,	"iso-8859-15", "ISO-8859-15|ISO-IR-203|ISO8859-15|ISO_8859-15|ISO_8859-15:1998|LATIN-9", "ISO 8859-15 Latin 9" },
	{ 29001,	"x-Europa", "", "Europa 3" },
	{ 38598,	"iso-8859-8-i", "", "ISO 8859-8 Hebrew; Hebrew (ISO-Logical)" },
	{ 50220,	"iso-2022-jp", "CP50220", "ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)" },
	{ 50221,	"csISO2022JP", "CP50221", "ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)" },
	{ 50222,	"iso-2022-jp", "ISO-2022-JP|CP50222", "ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)" },
	{ 50225,	"iso-2022-kr", "ISO-2022-KR|CSISO2022KR", "ISO 2022 Korean" },
	{ 50227,	"x-cp50227", "", "ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)" },
	{ 50229,	"x-cp50229", "", "ISO 2022 Traditional Chinese" },
	{ 50930,	"", "", "EBCDIC Japanese (Katakana) Extended" },
	{ 50931,	"", "", "EBCDIC US-Canada and Japanese" },
	{ 50933,	"", "", "EBCDIC Korean Extended and Korean" },
	{ 50935,	"", "", "EBCDIC Simplified Chinese Extended and Simplified Chinese" },
	{ 50936,	"", "", "EBCDIC Simplified Chinese" },
	{ 50937,	"", "", "EBCDIC US-Canada and Traditional Chinese" },
	{ 50939,	"", "", "EBCDIC Japanese (Latin) Extended and Japanese" },
	{ 51932,	"euc-jp", "", "EUC Japanese" },
	{ 51936,	"EUC-CN", "", "EUC Simplified Chinese; Chinese Simplified (EUC)" },
	{ 51949,	"euc-kr", "EUC-KR|EUCKR|CSEUCKR", "EUC Korean" },
	{ 51950,	"", "", "EUC Traditional Chinese" },
	{ 52936,	"hz-gb-2312", "HZ|HZ-GB-2312", "HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)" },
	{ 54936,	"GB18030", "GB18030|CSGB18030", "Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)" },
	{ 57002,	"x-iscii-de", "", "ISCII Devanagari" },
	{ 57003,	"x-iscii-be", "", "ISCII Bangla" },
	{ 57004,	"x-iscii-ta", "", "ISCII Tamil" },
	{ 57005,	"x-iscii-te", "", "ISCII Telugu" },
	{ 57006,	"x-iscii-as", "", "ISCII Assamese" },
	{ 57007,	"x-iscii-or", "", "ISCII Odia" },
	{ 57008,	"x-iscii-ka", "", "ISCII Kannada" },
	{ 57009,	"x-iscii-ma", "", "ISCII Malayalam" },
	{ 57010,	"x-iscii-gu", "", "ISCII Gujarati" },
	{ 57011,	"x-iscii-pa", "", "ISCII Punjabi" },
	{ 65000,	"utf-7", "UTF-7", "Unicode (UTF-7)" },
	{ 65001,	"utf-8", "UTF-8", "Unicode (UTF-8)" },
	/* Sentinel: desc is left out and therefore zero-initialized to NULL,
	 * which is the condition the table walk in main() stops on. */
	{ 0,	NULL, NULL },
};
167 
168 
169 
170 
171 int
main(int argc,char ** argv)172 main(int argc, char **argv)
173 {
174 	DWORD cp;
175 	CPINFOEX info;
176 	struct cp *cur;
177 	int rnd = 0;
178 
179 	/*if (argc < 2) {
180 		printf("Usage: cpinfoex cp_id\n");
181 		return 0;
182 	}
183 
184 	cp = atoi(argv[1]);*/
185 #if 0
186 /* Ref:
187     http://www.iana.org/assignments/character-sets/character-sets.xhtml
188     https://msdn.microsoft.com/en-us/goglobal/bb964653
189     http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
190  */
191 #endif
192 	/*
193 	struct php_win32_cp {
194 		DWORD id;
195 		DWORD to_w_fl;
196 		DWORD from_w_fl;
197 		DWORD char_size;
198 		char *name;
199 		char *enc;
200 		char *desc;
201 	};
202 	*/
203 	/*printf("struct php_win32_cp {\n\tDWORD id;\n\tDWORD to_w_fl;\n\tDWORD from_w_fl;\n\tDWORD char_size;\n\tchar *name;\n\tchar *enc;\n\tchar *desc;\n};\n\n"); */
204 	printf("/* Autogenerated file. Update cp_enc_map_gen.c and regen like \n"
205 	" cp_enc_map_gen.exe > cp_enc_map.c \n*/\n\n");
206 	printf("static const struct php_win32_cp php_win32_cp_map[] = {");
207 
208 	cur = &cp_map[0];
209 
210 #ifdef ORDER_IT
211 	while (rnd <= 2 && ++rnd && (cur = &cp_map[0]))
212 #endif
213 	while (cur->desc != NULL) {
214 		if (!IsValidCodePage(cur->id)) {
215 #ifdef ORDER_IT
216 			if (2 == rnd)
217 #endif
218 			printf("\t/* %u is invalid */\n", cur->id);
219 			//printf("#if 0\n\t{ %u, 0, \"%s\", \"%s\" },\n#endif\n", cur->id, cur->name, cur->desc);
220 		} else if (GetCPInfoEx(cur->id, 0, &info)) {
221 			DWORD to_w_fl = 0, from_w_fl = 0;
222 
223 			if (65001U == cur->id || 54936U == cur->id) {
224 				from_w_fl = WC_ERR_INVALID_CHARS;
225 				to_w_fl = MB_ERR_INVALID_CHARS;
226 			}
227 
228 			//printf("\t{ %u, %u, \"%s\", \"%s\" },\n", cur->id, info.MaxCharSize, cur->name, cur->desc);
229 			if (!cur->enc[0]) {
230 #ifdef ORDER_IT
231 				if (2 == rnd)
232 #endif
233 				//printf("\t/* { %u, %u, \"%s\", NULL, \"%s\" }, */\n", info.CodePage, info.MaxCharSize, cur->name, info.CodePageName);
234 				printf("\t{ %u, %u, %u, %u, \"%s\", NULL, \"%s\" },\n", info.CodePage, to_w_fl, from_w_fl, info.MaxCharSize, cur->name, info.CodePageName);
235 			} else {
236 #ifdef ORDER_IT
237 				if (1 == rnd)
238 #endif
239 				printf("\t{ %u, %u, %u, %u, \"%s\", \"%s\", \"%s\" },\n", info.CodePage, to_w_fl, from_w_fl, info.MaxCharSize, cur->name, cur->enc, info.CodePageName);
240 			}
241 		}
242 		cur++;
243 	}
244 
245 	printf("};\n\n");
246 
247 	return 0;
248 }
249