1 | /* Determine a canonical name for the current locale's character encoding. |
2 | |
3 | Copyright (C) 2000-2006, 2008-2019 Free Software Foundation, Inc. |
4 | |
5 | This program is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published by |
7 | the Free Software Foundation; either version 3, or (at your option) |
8 | any later version. |
9 | |
10 | This program is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License along |
16 | with this program; if not, see <https://www.gnu.org/licenses/>. */ |
17 | |
18 | /* Written by Bruno Haible <bruno@clisp.org>. */ |
19 | |
20 | #include <config.h> |
21 | |
22 | /* Specification. */ |
23 | #include "localcharset.h" |
24 | |
25 | #include <stddef.h> |
26 | #include <stdio.h> |
27 | #include <string.h> |
28 | #include <stdlib.h> |
29 | |
30 | #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET |
31 | # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ |
32 | #endif |
33 | |
34 | #if defined _WIN32 && !defined __CYGWIN__ |
35 | # define WINDOWS_NATIVE |
36 | # include <locale.h> |
37 | #endif |
38 | |
39 | #if defined __EMX__ |
40 | /* Assume EMX program runs on OS/2, even if compiled under DOS. */ |
41 | # ifndef OS2 |
42 | # define OS2 |
43 | # endif |
44 | #endif |
45 | |
46 | #if !defined WINDOWS_NATIVE |
47 | # if HAVE_LANGINFO_CODESET |
48 | # include <langinfo.h> |
49 | # else |
50 | # if 0 /* see comment regarding use of setlocale(), below */ |
51 | # include <locale.h> |
52 | # endif |
53 | # endif |
54 | # ifdef __CYGWIN__ |
55 | # define WIN32_LEAN_AND_MEAN |
56 | # include <windows.h> |
57 | # endif |
58 | #elif defined WINDOWS_NATIVE |
59 | # define WIN32_LEAN_AND_MEAN |
60 | # include <windows.h> |
61 | #endif |
62 | #if defined OS2 |
63 | # define INCL_DOS |
64 | # include <os2.h> |
65 | #endif |
66 | |
67 | /* For MB_CUR_MAX_L */ |
68 | #if defined DARWIN7 |
69 | # include <xlocale.h> |
70 | #endif |
71 | |
72 | |
73 | #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2 |
74 | |
75 | /* On these platforms, we use a mapping from non-canonical encoding name |
76 | to GNU canonical encoding name. */ |
77 | |
78 | /* With glibc-2.1 or newer, we don't need any canonicalization, |
79 | because glibc has iconv and both glibc and libiconv support all |
80 | GNU canonical names directly. */ |
81 | # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__) |
82 | |
/* One row of the alias table: a platform-specific encoding name together
   with the GNU canonical name it maps to.  Both strings are stored inline;
   each array holds up to 11 characters plus the terminating NUL.  */
struct table_entry
{
  const char alias[sizeof "TIS620.2533"];      /* 11 characters + NUL */
  const char canonical[sizeof "ISO-8859-13"];  /* 11 characters + NUL */
};
88 | |
89 | /* Table of platform-dependent mappings, sorted in ascending order. */ |
90 | static const struct table_entry alias_table[] = |
91 | { |
92 | # if defined __FreeBSD__ /* FreeBSD */ |
93 | /*{ "ARMSCII-8", "ARMSCII-8" },*/ |
94 | { "Big5" , "BIG5" }, |
95 | { "C" , "ASCII" }, |
96 | /*{ "CP1131", "CP1131" },*/ |
97 | /*{ "CP1251", "CP1251" },*/ |
98 | /*{ "CP866", "CP866" },*/ |
99 | /*{ "GB18030", "GB18030" },*/ |
100 | /*{ "GB2312", "GB2312" },*/ |
101 | /*{ "GBK", "GBK" },*/ |
102 | /*{ "ISCII-DEV", "?" },*/ |
103 | { "ISO8859-1" , "ISO-8859-1" }, |
104 | { "ISO8859-13" , "ISO-8859-13" }, |
105 | { "ISO8859-15" , "ISO-8859-15" }, |
106 | { "ISO8859-2" , "ISO-8859-2" }, |
107 | { "ISO8859-5" , "ISO-8859-5" }, |
108 | { "ISO8859-7" , "ISO-8859-7" }, |
109 | { "ISO8859-9" , "ISO-8859-9" }, |
110 | /*{ "KOI8-R", "KOI8-R" },*/ |
111 | /*{ "KOI8-U", "KOI8-U" },*/ |
112 | { "SJIS" , "SHIFT_JIS" }, |
113 | { "US-ASCII" , "ASCII" }, |
114 | { "eucCN" , "GB2312" }, |
115 | { "eucJP" , "EUC-JP" }, |
116 | { "eucKR" , "EUC-KR" } |
117 | # define alias_table_defined |
118 | # endif |
119 | # if defined __NetBSD__ /* NetBSD */ |
120 | { "646" , "ASCII" }, |
121 | /*{ "ARMSCII-8", "ARMSCII-8" },*/ |
122 | /*{ "BIG5", "BIG5" },*/ |
123 | { "Big5-HKSCS" , "BIG5-HKSCS" }, |
124 | /*{ "CP1251", "CP1251" },*/ |
125 | /*{ "CP866", "CP866" },*/ |
126 | /*{ "GB18030", "GB18030" },*/ |
127 | /*{ "GB2312", "GB2312" },*/ |
128 | { "ISO8859-1" , "ISO-8859-1" }, |
129 | { "ISO8859-13" , "ISO-8859-13" }, |
130 | { "ISO8859-15" , "ISO-8859-15" }, |
131 | { "ISO8859-2" , "ISO-8859-2" }, |
132 | { "ISO8859-4" , "ISO-8859-4" }, |
133 | { "ISO8859-5" , "ISO-8859-5" }, |
134 | { "ISO8859-7" , "ISO-8859-7" }, |
135 | /*{ "KOI8-R", "KOI8-R" },*/ |
136 | /*{ "KOI8-U", "KOI8-U" },*/ |
137 | /*{ "PT154", "PT154" },*/ |
138 | { "SJIS" , "SHIFT_JIS" }, |
139 | { "eucCN" , "GB2312" }, |
140 | { "eucJP" , "EUC-JP" }, |
141 | { "eucKR" , "EUC-KR" }, |
142 | { "eucTW" , "EUC-TW" } |
143 | # define alias_table_defined |
144 | # endif |
145 | # if defined __OpenBSD__ /* OpenBSD */ |
146 | { "646" , "ASCII" }, |
147 | { "ISO8859-1" , "ISO-8859-1" }, |
148 | { "ISO8859-13" , "ISO-8859-13" }, |
149 | { "ISO8859-15" , "ISO-8859-15" }, |
150 | { "ISO8859-2" , "ISO-8859-2" }, |
151 | { "ISO8859-4" , "ISO-8859-4" }, |
152 | { "ISO8859-5" , "ISO-8859-5" }, |
153 | { "ISO8859-7" , "ISO-8859-7" } |
154 | # define alias_table_defined |
155 | # endif |
156 | # if defined __APPLE__ && defined __MACH__ /* Mac OS X */ |
157 | /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is |
158 | useless: |
159 | - It returns the empty string when LANG is set to a locale of the |
160 | form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8 |
161 | LC_CTYPE file. |
162 | - The environment variables LANG, LC_CTYPE, LC_ALL are not set by |
163 | the system; nl_langinfo(CODESET) returns "US-ASCII" in this case. |
164 | - The documentation says: |
165 | "... all code that calls BSD system routines should ensure |
166 | that the const *char parameters of these routines are in UTF-8 |
167 | encoding. All BSD system functions expect their string |
168 | parameters to be in UTF-8 encoding and nothing else." |
169 | It also says |
170 | "An additional caveat is that string parameters for files, |
171 | paths, and other file-system entities must be in canonical |
172 | UTF-8. In a canonical UTF-8 Unicode string, all decomposable |
173 | characters are decomposed ..." |
174 | but this is not true: You can pass non-decomposed UTF-8 strings |
175 | to file system functions, and it is the OS which will convert |
176 | them to decomposed UTF-8 before accessing the file system. |
177 | - The Apple Terminal application displays UTF-8 by default. |
178 | - However, other applications are free to use different encodings: |
179 | - xterm uses ISO-8859-1 by default. |
180 | - TextEdit uses MacRoman by default. |
181 | We prefer UTF-8 over decomposed UTF-8-MAC because one should |
182 | minimize the use of decomposed Unicode. Unfortunately, through the |
183 | Darwin file system, decomposed UTF-8 strings are leaked into user |
184 | space nevertheless. |
185 | Then there are also the locales with encodings other than US-ASCII |
186 | and UTF-8. These locales can be occasionally useful to users (e.g. |
187 | when grepping through ISO-8859-1 encoded text files), when all their |
188 | file names are in US-ASCII. |
189 | */ |
190 | { "ARMSCII-8" , "ARMSCII-8" }, |
191 | { "Big5" , "BIG5" }, |
192 | { "Big5HKSCS" , "BIG5-HKSCS" }, |
193 | { "CP1131" , "CP1131" }, |
194 | { "CP1251" , "CP1251" }, |
195 | { "CP866" , "CP866" }, |
196 | { "CP949" , "CP949" }, |
197 | { "GB18030" , "GB18030" }, |
198 | { "GB2312" , "GB2312" }, |
199 | { "GBK" , "GBK" }, |
200 | /*{ "ISCII-DEV", "?" },*/ |
201 | { "ISO8859-1" , "ISO-8859-1" }, |
202 | { "ISO8859-13" , "ISO-8859-13" }, |
203 | { "ISO8859-15" , "ISO-8859-15" }, |
204 | { "ISO8859-2" , "ISO-8859-2" }, |
205 | { "ISO8859-4" , "ISO-8859-4" }, |
206 | { "ISO8859-5" , "ISO-8859-5" }, |
207 | { "ISO8859-7" , "ISO-8859-7" }, |
208 | { "ISO8859-9" , "ISO-8859-9" }, |
209 | { "KOI8-R" , "KOI8-R" }, |
210 | { "KOI8-U" , "KOI8-U" }, |
211 | { "PT154" , "PT154" }, |
212 | { "SJIS" , "SHIFT_JIS" }, |
213 | { "eucCN" , "GB2312" }, |
214 | { "eucJP" , "EUC-JP" }, |
215 | { "eucKR" , "EUC-KR" } |
216 | # define alias_table_defined |
217 | # endif |
218 | # if defined _AIX /* AIX */ |
219 | /*{ "GBK", "GBK" },*/ |
220 | { "IBM-1046" , "CP1046" }, |
221 | { "IBM-1124" , "CP1124" }, |
222 | { "IBM-1129" , "CP1129" }, |
223 | { "IBM-1252" , "CP1252" }, |
224 | { "IBM-850" , "CP850" }, |
225 | { "IBM-856" , "CP856" }, |
226 | { "IBM-921" , "ISO-8859-13" }, |
227 | { "IBM-922" , "CP922" }, |
228 | { "IBM-932" , "CP932" }, |
229 | { "IBM-943" , "CP943" }, |
230 | { "IBM-eucCN" , "GB2312" }, |
231 | { "IBM-eucJP" , "EUC-JP" }, |
232 | { "IBM-eucKR" , "EUC-KR" }, |
233 | { "IBM-eucTW" , "EUC-TW" }, |
234 | { "ISO8859-1" , "ISO-8859-1" }, |
235 | { "ISO8859-15" , "ISO-8859-15" }, |
236 | { "ISO8859-2" , "ISO-8859-2" }, |
237 | { "ISO8859-5" , "ISO-8859-5" }, |
238 | { "ISO8859-6" , "ISO-8859-6" }, |
239 | { "ISO8859-7" , "ISO-8859-7" }, |
240 | { "ISO8859-8" , "ISO-8859-8" }, |
241 | { "ISO8859-9" , "ISO-8859-9" }, |
242 | { "TIS-620" , "TIS-620" }, |
243 | /*{ "UTF-8", "UTF-8" },*/ |
244 | { "big5" , "BIG5" } |
245 | # define alias_table_defined |
246 | # endif |
247 | # if defined __hpux /* HP-UX */ |
248 | { "SJIS" , "SHIFT_JIS" }, |
249 | { "arabic8" , "HP-ARABIC8" }, |
250 | { "big5" , "BIG5" }, |
251 | { "cp1251" , "CP1251" }, |
252 | { "eucJP" , "EUC-JP" }, |
253 | { "eucKR" , "EUC-KR" }, |
254 | { "eucTW" , "EUC-TW" }, |
255 | { "gb18030" , "GB18030" }, |
256 | { "greek8" , "HP-GREEK8" }, |
257 | { "hebrew8" , "HP-HEBREW8" }, |
258 | { "hkbig5" , "BIG5-HKSCS" }, |
259 | { "hp15CN" , "GB2312" }, |
260 | { "iso88591" , "ISO-8859-1" }, |
261 | { "iso885913" , "ISO-8859-13" }, |
262 | { "iso885915" , "ISO-8859-15" }, |
263 | { "iso88592" , "ISO-8859-2" }, |
264 | { "iso88594" , "ISO-8859-4" }, |
265 | { "iso88595" , "ISO-8859-5" }, |
266 | { "iso88596" , "ISO-8859-6" }, |
267 | { "iso88597" , "ISO-8859-7" }, |
268 | { "iso88598" , "ISO-8859-8" }, |
269 | { "iso88599" , "ISO-8859-9" }, |
270 | { "kana8" , "HP-KANA8" }, |
271 | { "koi8r" , "KOI8-R" }, |
272 | { "roman8" , "HP-ROMAN8" }, |
273 | { "tis620" , "TIS-620" }, |
274 | { "turkish8" , "HP-TURKISH8" }, |
275 | { "utf8" , "UTF-8" } |
276 | # define alias_table_defined |
277 | # endif |
278 | # if defined __sgi /* IRIX */ |
279 | { "ISO8859-1" , "ISO-8859-1" }, |
280 | { "ISO8859-15" , "ISO-8859-15" }, |
281 | { "ISO8859-2" , "ISO-8859-2" }, |
282 | { "ISO8859-5" , "ISO-8859-5" }, |
283 | { "ISO8859-7" , "ISO-8859-7" }, |
284 | { "ISO8859-9" , "ISO-8859-9" }, |
285 | { "eucCN" , "GB2312" }, |
286 | { "eucJP" , "EUC-JP" }, |
287 | { "eucKR" , "EUC-KR" }, |
288 | { "eucTW" , "EUC-TW" } |
289 | # define alias_table_defined |
290 | # endif |
291 | # if defined __osf__ /* OSF/1 */ |
292 | /*{ "GBK", "GBK" },*/ |
293 | { "ISO8859-1" , "ISO-8859-1" }, |
294 | { "ISO8859-15" , "ISO-8859-15" }, |
295 | { "ISO8859-2" , "ISO-8859-2" }, |
296 | { "ISO8859-4" , "ISO-8859-4" }, |
297 | { "ISO8859-5" , "ISO-8859-5" }, |
298 | { "ISO8859-7" , "ISO-8859-7" }, |
299 | { "ISO8859-8" , "ISO-8859-8" }, |
300 | { "ISO8859-9" , "ISO-8859-9" }, |
301 | { "KSC5601" , "CP949" }, |
302 | { "SJIS" , "SHIFT_JIS" }, |
303 | { "TACTIS" , "TIS-620" }, |
304 | /*{ "UTF-8", "UTF-8" },*/ |
305 | { "big5" , "BIG5" }, |
306 | { "cp850" , "CP850" }, |
307 | { "dechanyu" , "DEC-HANYU" }, |
308 | { "dechanzi" , "GB2312" }, |
309 | { "deckanji" , "DEC-KANJI" }, |
310 | { "deckorean" , "EUC-KR" }, |
311 | { "eucJP" , "EUC-JP" }, |
312 | { "eucKR" , "EUC-KR" }, |
313 | { "eucTW" , "EUC-TW" }, |
314 | { "sdeckanji" , "EUC-JP" } |
315 | # define alias_table_defined |
316 | # endif |
317 | # if defined __sun /* Solaris */ |
318 | { "5601" , "EUC-KR" }, |
319 | { "646" , "ASCII" }, |
320 | /*{ "BIG5", "BIG5" },*/ |
321 | { "Big5-HKSCS" , "BIG5-HKSCS" }, |
322 | { "GB18030" , "GB18030" }, |
323 | /*{ "GBK", "GBK" },*/ |
324 | { "ISO8859-1" , "ISO-8859-1" }, |
325 | { "ISO8859-11" , "TIS-620" }, |
326 | { "ISO8859-13" , "ISO-8859-13" }, |
327 | { "ISO8859-15" , "ISO-8859-15" }, |
328 | { "ISO8859-2" , "ISO-8859-2" }, |
329 | { "ISO8859-3" , "ISO-8859-3" }, |
330 | { "ISO8859-4" , "ISO-8859-4" }, |
331 | { "ISO8859-5" , "ISO-8859-5" }, |
332 | { "ISO8859-6" , "ISO-8859-6" }, |
333 | { "ISO8859-7" , "ISO-8859-7" }, |
334 | { "ISO8859-8" , "ISO-8859-8" }, |
335 | { "ISO8859-9" , "ISO-8859-9" }, |
336 | { "PCK" , "SHIFT_JIS" }, |
337 | { "TIS620.2533" , "TIS-620" }, |
338 | /*{ "UTF-8", "UTF-8" },*/ |
339 | { "ansi-1251" , "CP1251" }, |
340 | { "cns11643" , "EUC-TW" }, |
341 | { "eucJP" , "EUC-JP" }, |
342 | { "gb2312" , "GB2312" }, |
343 | { "koi8-r" , "KOI8-R" } |
344 | # define alias_table_defined |
345 | # endif |
346 | # if defined __minix /* Minix */ |
347 | { "646" , "ASCII" } |
348 | # define alias_table_defined |
349 | # endif |
350 | # if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */ |
351 | { "CP1361" , "JOHAB" }, |
352 | { "CP20127" , "ASCII" }, |
353 | { "CP20866" , "KOI8-R" }, |
354 | { "CP20936" , "GB2312" }, |
355 | { "CP21866" , "KOI8-RU" }, |
356 | { "CP28591" , "ISO-8859-1" }, |
357 | { "CP28592" , "ISO-8859-2" }, |
358 | { "CP28593" , "ISO-8859-3" }, |
359 | { "CP28594" , "ISO-8859-4" }, |
360 | { "CP28595" , "ISO-8859-5" }, |
361 | { "CP28596" , "ISO-8859-6" }, |
362 | { "CP28597" , "ISO-8859-7" }, |
363 | { "CP28598" , "ISO-8859-8" }, |
364 | { "CP28599" , "ISO-8859-9" }, |
365 | { "CP28605" , "ISO-8859-15" }, |
366 | { "CP38598" , "ISO-8859-8" }, |
367 | { "CP51932" , "EUC-JP" }, |
368 | { "CP51936" , "GB2312" }, |
369 | { "CP51949" , "EUC-KR" }, |
370 | { "CP51950" , "EUC-TW" }, |
371 | { "CP54936" , "GB18030" }, |
372 | { "CP65001" , "UTF-8" }, |
373 | { "CP936" , "GBK" } |
374 | # define alias_table_defined |
375 | # endif |
376 | # if defined OS2 /* OS/2 */ |
377 | /* The list of encodings is taken from "List of OS/2 Codepages" |
378 | by Alex Taylor: |
379 | <http://altsan.org/os2/toolkits/uls/index.html#codepages>. |
380 | See also "__convcp() of kLIBC": |
381 | <http://trac.netlabs.org/libc/browser/branches/libc-0.6/src/emx/src/lib/locale/__convcp.c>, |
382 | or: |
383 | <https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>. */ |
384 | { "CP1004" , "CP1252" }, |
385 | /*{ "CP1041", "CP943" },*/ |
386 | /*{ "CP1088", "CP949" },*/ |
387 | { "CP1089" , "ISO-8859-6" }, |
388 | /*{ "CP1114", "CP950" },*/ |
389 | /*{ "CP1115", "GB2312" },*/ |
390 | { "CP1208" , "UTF-8" }, |
391 | /*{ "CP1380", "GB2312" },*/ |
392 | { "CP1381" , "GB2312" }, |
393 | { "CP1383" , "GB2312" }, |
394 | { "CP1386" , "GBK" }, |
395 | /*{ "CP301", "CP943" },*/ |
396 | { "CP3372" , "EUC-JP" }, |
397 | { "CP4946" , "CP850" }, |
398 | /*{ "CP5048", "JIS_X0208-1990" },*/ |
399 | /*{ "CP5049", "JIS_X0212-1990" },*/ |
400 | /*{ "CP5067", "KS_C_5601-1987" },*/ |
401 | { "CP813" , "ISO-8859-7" }, |
402 | { "CP819" , "ISO-8859-1" }, |
403 | { "CP878" , "KOI8-R" }, |
404 | /*{ "CP897", "CP943" },*/ |
405 | { "CP912" , "ISO-8859-2" }, |
406 | { "CP913" , "ISO-8859-3" }, |
407 | { "CP914" , "ISO-8859-4" }, |
408 | { "CP915" , "ISO-8859-5" }, |
409 | { "CP916" , "ISO-8859-8" }, |
410 | { "CP920" , "ISO-8859-9" }, |
411 | { "CP921" , "ISO-8859-13" }, |
412 | { "CP923" , "ISO-8859-15" }, |
413 | /*{ "CP941", "CP943" },*/ |
414 | /*{ "CP947", "CP950" },*/ |
415 | /*{ "CP951", "CP949" },*/ |
416 | /*{ "CP952", "JIS_X0208-1990" },*/ |
417 | /*{ "CP953", "JIS_X0212-1990" },*/ |
418 | { "CP954" , "EUC-JP" }, |
419 | { "CP964" , "EUC-TW" }, |
420 | { "CP970" , "EUC-KR" }, |
421 | /*{ "CP971", "KS_C_5601-1987" },*/ |
422 | { "IBM-1004" , "CP1252" }, |
423 | /*{ "IBM-1006", "?" },*/ |
424 | /*{ "IBM-1008", "?" },*/ |
425 | /*{ "IBM-1041", "CP943" },*/ |
426 | /*{ "IBM-1051", "?" },*/ |
427 | /*{ "IBM-1088", "CP949" },*/ |
428 | { "IBM-1089" , "ISO-8859-6" }, |
429 | /*{ "IBM-1098", "?" },*/ |
430 | /*{ "IBM-1114", "CP950" },*/ |
431 | /*{ "IBM-1115", "GB2312" },*/ |
432 | /*{ "IBM-1116", "?" },*/ |
433 | /*{ "IBM-1117", "?" },*/ |
434 | /*{ "IBM-1118", "?" },*/ |
435 | /*{ "IBM-1119", "?" },*/ |
436 | { "IBM-1124" , "CP1124" }, |
437 | { "IBM-1125" , "CP1125" }, |
438 | { "IBM-1131" , "CP1131" }, |
439 | { "IBM-1208" , "UTF-8" }, |
440 | { "IBM-1250" , "CP1250" }, |
441 | { "IBM-1251" , "CP1251" }, |
442 | { "IBM-1252" , "CP1252" }, |
443 | { "IBM-1253" , "CP1253" }, |
444 | { "IBM-1254" , "CP1254" }, |
445 | { "IBM-1255" , "CP1255" }, |
446 | { "IBM-1256" , "CP1256" }, |
447 | { "IBM-1257" , "CP1257" }, |
448 | /*{ "IBM-1275", "?" },*/ |
449 | /*{ "IBM-1276", "?" },*/ |
450 | /*{ "IBM-1277", "?" },*/ |
451 | /*{ "IBM-1280", "?" },*/ |
452 | /*{ "IBM-1281", "?" },*/ |
453 | /*{ "IBM-1282", "?" },*/ |
454 | /*{ "IBM-1283", "?" },*/ |
455 | /*{ "IBM-1380", "GB2312" },*/ |
456 | { "IBM-1381" , "GB2312" }, |
457 | { "IBM-1383" , "GB2312" }, |
458 | { "IBM-1386" , "GBK" }, |
459 | /*{ "IBM-301", "CP943" },*/ |
460 | { "IBM-3372" , "EUC-JP" }, |
461 | { "IBM-367" , "ASCII" }, |
462 | { "IBM-437" , "CP437" }, |
463 | { "IBM-4946" , "CP850" }, |
464 | /*{ "IBM-5048", "JIS_X0208-1990" },*/ |
465 | /*{ "IBM-5049", "JIS_X0212-1990" },*/ |
466 | /*{ "IBM-5067", "KS_C_5601-1987" },*/ |
467 | { "IBM-813" , "ISO-8859-7" }, |
468 | { "IBM-819" , "ISO-8859-1" }, |
469 | { "IBM-850" , "CP850" }, |
470 | /*{ "IBM-851", "?" },*/ |
471 | { "IBM-852" , "CP852" }, |
472 | { "IBM-855" , "CP855" }, |
473 | { "IBM-856" , "CP856" }, |
474 | { "IBM-857" , "CP857" }, |
475 | /*{ "IBM-859", "?" },*/ |
476 | { "IBM-860" , "CP860" }, |
477 | { "IBM-861" , "CP861" }, |
478 | { "IBM-862" , "CP862" }, |
479 | { "IBM-863" , "CP863" }, |
480 | { "IBM-864" , "CP864" }, |
481 | { "IBM-865" , "CP865" }, |
482 | { "IBM-866" , "CP866" }, |
483 | /*{ "IBM-868", "?" },*/ |
484 | { "IBM-869" , "CP869" }, |
485 | { "IBM-874" , "CP874" }, |
486 | { "IBM-878" , "KOI8-R" }, |
487 | /*{ "IBM-895", "?" },*/ |
488 | /*{ "IBM-897", "CP943" },*/ |
489 | /*{ "IBM-907", "?" },*/ |
490 | /*{ "IBM-909", "?" },*/ |
491 | { "IBM-912" , "ISO-8859-2" }, |
492 | { "IBM-913" , "ISO-8859-3" }, |
493 | { "IBM-914" , "ISO-8859-4" }, |
494 | { "IBM-915" , "ISO-8859-5" }, |
495 | { "IBM-916" , "ISO-8859-8" }, |
496 | { "IBM-920" , "ISO-8859-9" }, |
497 | { "IBM-921" , "ISO-8859-13" }, |
498 | { "IBM-922" , "CP922" }, |
499 | { "IBM-923" , "ISO-8859-15" }, |
500 | { "IBM-932" , "CP932" }, |
501 | /*{ "IBM-941", "CP943" },*/ |
502 | /*{ "IBM-942", "?" },*/ |
503 | { "IBM-943" , "CP943" }, |
504 | /*{ "IBM-947", "CP950" },*/ |
505 | { "IBM-949" , "CP949" }, |
506 | { "IBM-950" , "CP950" }, |
507 | /*{ "IBM-951", "CP949" },*/ |
508 | /*{ "IBM-952", "JIS_X0208-1990" },*/ |
509 | /*{ "IBM-953", "JIS_X0212-1990" },*/ |
510 | { "IBM-954" , "EUC-JP" }, |
511 | /*{ "IBM-955", "?" },*/ |
512 | { "IBM-964" , "EUC-TW" }, |
513 | { "IBM-970" , "EUC-KR" }, |
514 | /*{ "IBM-971", "KS_C_5601-1987" },*/ |
515 | { "IBM-eucCN" , "GB2312" }, |
516 | { "IBM-eucJP" , "EUC-JP" }, |
517 | { "IBM-eucKR" , "EUC-KR" }, |
518 | { "IBM-eucTW" , "EUC-TW" }, |
519 | { "IBM33722" , "EUC-JP" }, |
520 | { "ISO8859-1" , "ISO-8859-1" }, |
521 | { "ISO8859-2" , "ISO-8859-2" }, |
522 | { "ISO8859-3" , "ISO-8859-3" }, |
523 | { "ISO8859-4" , "ISO-8859-4" }, |
524 | { "ISO8859-5" , "ISO-8859-5" }, |
525 | { "ISO8859-6" , "ISO-8859-6" }, |
526 | { "ISO8859-7" , "ISO-8859-7" }, |
527 | { "ISO8859-8" , "ISO-8859-8" }, |
528 | { "ISO8859-9" , "ISO-8859-9" }, |
529 | /*{ "JISX0201-1976", "JISX0201-1976" },*/ |
530 | /*{ "JISX0208-1978", "?" },*/ |
531 | /*{ "JISX0208-1983", "JIS_X0208-1983" },*/ |
532 | /*{ "JISX0208-1990", "JIS_X0208-1990" },*/ |
533 | /*{ "JISX0212-1990", "JIS_X0212-1990" },*/ |
534 | /*{ "KSC5601-1987", "KS_C_5601-1987" },*/ |
535 | { "SJIS-1" , "CP943" }, |
536 | { "SJIS-2" , "CP943" }, |
537 | { "eucJP" , "EUC-JP" }, |
538 | { "eucKR" , "EUC-KR" }, |
539 | { "eucTW-1993" , "EUC-TW" } |
540 | # define alias_table_defined |
541 | # endif |
542 | # if defined VMS /* OpenVMS */ |
543 | /* The list of encodings is taken from the OpenVMS 7.3-1 documentation |
544 | "Compaq C Run-Time Library Reference Manual for OpenVMS systems" |
545 | section 10.7 "Handling Different Character Sets". */ |
546 | { "DECHANYU" , "DEC-HANYU" }, |
547 | { "DECHANZI" , "GB2312" }, |
548 | { "DECKANJI" , "DEC-KANJI" }, |
549 | { "DECKOREAN" , "EUC-KR" }, |
550 | { "ISO8859-1" , "ISO-8859-1" }, |
551 | { "ISO8859-2" , "ISO-8859-2" }, |
552 | { "ISO8859-5" , "ISO-8859-5" }, |
553 | { "ISO8859-7" , "ISO-8859-7" }, |
554 | { "ISO8859-8" , "ISO-8859-8" }, |
555 | { "ISO8859-9" , "ISO-8859-9" }, |
556 | { "SDECKANJI" , "EUC-JP" }, |
557 | { "SJIS" , "SHIFT_JIS" }, |
558 | { "eucJP" , "EUC-JP" }, |
559 | { "eucTW" , "EUC-TW" } |
560 | # define alias_table_defined |
561 | # endif |
562 | # ifndef alias_table_defined |
563 | /* Just a dummy entry, to avoid a C syntax error. */ |
564 | { "" , "" } |
565 | # endif |
566 | }; |
567 | |
568 | # endif |
569 | |
570 | #else |
571 | |
572 | /* On these platforms, we use a mapping from locale name to GNU canonical |
573 | encoding name. */ |
574 | |
/* One row of the locale table: a locale name together with the GNU
   canonical encoding name used for it.  Both strings are stored inline;
   the locale array holds up to 17 characters plus NUL, the canonical
   array up to 11 characters plus NUL.  */
struct table_entry
{
  const char locale[sizeof "ll_CC.DIS_8859-15"];  /* 17 characters + NUL */
  const char canonical[sizeof "ISO-8859-15"];     /* 11 characters + NUL */
};
580 | |
581 | /* Table of platform-dependent mappings, sorted in ascending order. */ |
582 | static const struct table_entry locale_table[] = |
583 | { |
584 | # if defined __FreeBSD__ /* FreeBSD 4.2 */ |
585 | { "cs_CZ.ISO_8859-2" , "ISO-8859-2" }, |
586 | { "da_DK.DIS_8859-15" , "ISO-8859-15" }, |
587 | { "da_DK.ISO_8859-1" , "ISO-8859-1" }, |
588 | { "de_AT.DIS_8859-15" , "ISO-8859-15" }, |
589 | { "de_AT.ISO_8859-1" , "ISO-8859-1" }, |
590 | { "de_CH.DIS_8859-15" , "ISO-8859-15" }, |
591 | { "de_CH.ISO_8859-1" , "ISO-8859-1" }, |
592 | { "de_DE.DIS_8859-15" , "ISO-8859-15" }, |
593 | { "de_DE.ISO_8859-1" , "ISO-8859-1" }, |
594 | { "en_AU.DIS_8859-15" , "ISO-8859-15" }, |
595 | { "en_AU.ISO_8859-1" , "ISO-8859-1" }, |
596 | { "en_CA.DIS_8859-15" , "ISO-8859-15" }, |
597 | { "en_CA.ISO_8859-1" , "ISO-8859-1" }, |
598 | { "en_GB.DIS_8859-15" , "ISO-8859-15" }, |
599 | { "en_GB.ISO_8859-1" , "ISO-8859-1" }, |
600 | { "en_US.DIS_8859-15" , "ISO-8859-15" }, |
601 | { "en_US.ISO_8859-1" , "ISO-8859-1" }, |
602 | { "es_ES.DIS_8859-15" , "ISO-8859-15" }, |
603 | { "es_ES.ISO_8859-1" , "ISO-8859-1" }, |
604 | { "fi_FI.DIS_8859-15" , "ISO-8859-15" }, |
605 | { "fi_FI.ISO_8859-1" , "ISO-8859-1" }, |
606 | { "fr_BE.DIS_8859-15" , "ISO-8859-15" }, |
607 | { "fr_BE.ISO_8859-1" , "ISO-8859-1" }, |
608 | { "fr_CA.DIS_8859-15" , "ISO-8859-15" }, |
609 | { "fr_CA.ISO_8859-1" , "ISO-8859-1" }, |
610 | { "fr_CH.DIS_8859-15" , "ISO-8859-15" }, |
611 | { "fr_CH.ISO_8859-1" , "ISO-8859-1" }, |
612 | { "fr_FR.DIS_8859-15" , "ISO-8859-15" }, |
613 | { "fr_FR.ISO_8859-1" , "ISO-8859-1" }, |
614 | { "hr_HR.ISO_8859-2" , "ISO-8859-2" }, |
615 | { "hu_HU.ISO_8859-2" , "ISO-8859-2" }, |
616 | { "is_IS.DIS_8859-15" , "ISO-8859-15" }, |
617 | { "is_IS.ISO_8859-1" , "ISO-8859-1" }, |
618 | { "it_CH.DIS_8859-15" , "ISO-8859-15" }, |
619 | { "it_CH.ISO_8859-1" , "ISO-8859-1" }, |
620 | { "it_IT.DIS_8859-15" , "ISO-8859-15" }, |
621 | { "it_IT.ISO_8859-1" , "ISO-8859-1" }, |
622 | { "ja_JP.EUC" , "EUC-JP" }, |
623 | { "ja_JP.SJIS" , "SHIFT_JIS" }, |
624 | { "ja_JP.Shift_JIS" , "SHIFT_JIS" }, |
625 | { "ko_KR.EUC" , "EUC-KR" }, |
626 | { "la_LN.ASCII" , "ASCII" }, |
627 | { "la_LN.DIS_8859-15" , "ISO-8859-15" }, |
628 | { "la_LN.ISO_8859-1" , "ISO-8859-1" }, |
629 | { "la_LN.ISO_8859-2" , "ISO-8859-2" }, |
630 | { "la_LN.ISO_8859-4" , "ISO-8859-4" }, |
631 | { "lt_LN.ASCII" , "ASCII" }, |
632 | { "lt_LN.DIS_8859-15" , "ISO-8859-15" }, |
633 | { "lt_LN.ISO_8859-1" , "ISO-8859-1" }, |
634 | { "lt_LN.ISO_8859-2" , "ISO-8859-2" }, |
635 | { "lt_LT.ISO_8859-4" , "ISO-8859-4" }, |
636 | { "nl_BE.DIS_8859-15" , "ISO-8859-15" }, |
637 | { "nl_BE.ISO_8859-1" , "ISO-8859-1" }, |
638 | { "nl_NL.DIS_8859-15" , "ISO-8859-15" }, |
639 | { "nl_NL.ISO_8859-1" , "ISO-8859-1" }, |
640 | { "no_NO.DIS_8859-15" , "ISO-8859-15" }, |
641 | { "no_NO.ISO_8859-1" , "ISO-8859-1" }, |
642 | { "pl_PL.ISO_8859-2" , "ISO-8859-2" }, |
643 | { "pt_PT.DIS_8859-15" , "ISO-8859-15" }, |
644 | { "pt_PT.ISO_8859-1" , "ISO-8859-1" }, |
645 | { "ru_RU.CP866" , "CP866" }, |
646 | { "ru_RU.ISO_8859-5" , "ISO-8859-5" }, |
647 | { "ru_RU.KOI8-R" , "KOI8-R" }, |
648 | { "ru_SU.CP866" , "CP866" }, |
649 | { "ru_SU.ISO_8859-5" , "ISO-8859-5" }, |
650 | { "ru_SU.KOI8-R" , "KOI8-R" }, |
651 | { "sl_SI.ISO_8859-2" , "ISO-8859-2" }, |
652 | { "sv_SE.DIS_8859-15" , "ISO-8859-15" }, |
653 | { "sv_SE.ISO_8859-1" , "ISO-8859-1" }, |
654 | { "uk_UA.KOI8-U" , "KOI8-U" }, |
655 | { "zh_CN.EUC" , "GB2312" }, |
656 | { "zh_TW.BIG5" , "BIG5" }, |
657 | { "zh_TW.Big5" , "BIG5" } |
658 | # define locale_table_defined |
659 | # endif |
660 | # if defined __DJGPP__ /* DOS / DJGPP 2.03 */ |
661 | /* The encodings given here may not all be correct. |
662 | If you find that the encoding given for your language and |
663 | country is not the one your DOS machine actually uses, just |
664 | correct it in this file, and send a mail to |
665 | Juan Manuel Guerrero <juan.guerrero@gmx.de> |
666 | and <bug-gnulib@gnu.org>. */ |
667 | { "C" , "ASCII" }, |
668 | { "ar" , "CP864" }, |
669 | { "ar_AE" , "CP864" }, |
670 | { "ar_DZ" , "CP864" }, |
671 | { "ar_EG" , "CP864" }, |
672 | { "ar_IQ" , "CP864" }, |
673 | { "ar_IR" , "CP864" }, |
674 | { "ar_JO" , "CP864" }, |
675 | { "ar_KW" , "CP864" }, |
676 | { "ar_MA" , "CP864" }, |
677 | { "ar_OM" , "CP864" }, |
678 | { "ar_QA" , "CP864" }, |
679 | { "ar_SA" , "CP864" }, |
680 | { "ar_SY" , "CP864" }, |
681 | { "be" , "CP866" }, |
682 | { "be_BE" , "CP866" }, |
683 | { "bg" , "CP866" }, /* not CP855 ?? */ |
684 | { "bg_BG" , "CP866" }, /* not CP855 ?? */ |
685 | { "ca" , "CP850" }, |
686 | { "ca_ES" , "CP850" }, |
687 | { "cs" , "CP852" }, |
688 | { "cs_CZ" , "CP852" }, |
689 | { "da" , "CP865" }, /* not CP850 ?? */ |
690 | { "da_DK" , "CP865" }, /* not CP850 ?? */ |
691 | { "de" , "CP850" }, |
692 | { "de_AT" , "CP850" }, |
693 | { "de_CH" , "CP850" }, |
694 | { "de_DE" , "CP850" }, |
695 | { "el" , "CP869" }, |
696 | { "el_GR" , "CP869" }, |
697 | { "en" , "CP850" }, |
698 | { "en_AU" , "CP850" }, /* not CP437 ?? */ |
699 | { "en_CA" , "CP850" }, |
700 | { "en_GB" , "CP850" }, |
701 | { "en_NZ" , "CP437" }, |
702 | { "en_US" , "CP437" }, |
703 | { "en_ZA" , "CP850" }, /* not CP437 ?? */ |
704 | { "eo" , "CP850" }, |
705 | { "eo_EO" , "CP850" }, |
706 | { "es" , "CP850" }, |
707 | { "es_AR" , "CP850" }, |
708 | { "es_BO" , "CP850" }, |
709 | { "es_CL" , "CP850" }, |
710 | { "es_CO" , "CP850" }, |
711 | { "es_CR" , "CP850" }, |
712 | { "es_CU" , "CP850" }, |
713 | { "es_DO" , "CP850" }, |
714 | { "es_EC" , "CP850" }, |
715 | { "es_ES" , "CP850" }, |
716 | { "es_GT" , "CP850" }, |
717 | { "es_HN" , "CP850" }, |
718 | { "es_MX" , "CP850" }, |
719 | { "es_NI" , "CP850" }, |
720 | { "es_PA" , "CP850" }, |
721 | { "es_PE" , "CP850" }, |
722 | { "es_PY" , "CP850" }, |
723 | { "es_SV" , "CP850" }, |
724 | { "es_UY" , "CP850" }, |
725 | { "es_VE" , "CP850" }, |
726 | { "et" , "CP850" }, |
727 | { "et_EE" , "CP850" }, |
728 | { "eu" , "CP850" }, |
729 | { "eu_ES" , "CP850" }, |
730 | { "fi" , "CP850" }, |
731 | { "fi_FI" , "CP850" }, |
732 | { "fr" , "CP850" }, |
733 | { "fr_BE" , "CP850" }, |
734 | { "fr_CA" , "CP850" }, |
735 | { "fr_CH" , "CP850" }, |
736 | { "fr_FR" , "CP850" }, |
737 | { "ga" , "CP850" }, |
738 | { "ga_IE" , "CP850" }, |
739 | { "gd" , "CP850" }, |
740 | { "gd_GB" , "CP850" }, |
741 | { "gl" , "CP850" }, |
742 | { "gl_ES" , "CP850" }, |
743 | { "he" , "CP862" }, |
744 | { "he_IL" , "CP862" }, |
745 | { "hr" , "CP852" }, |
746 | { "hr_HR" , "CP852" }, |
747 | { "hu" , "CP852" }, |
748 | { "hu_HU" , "CP852" }, |
749 | { "id" , "CP850" }, /* not CP437 ?? */ |
750 | { "id_ID" , "CP850" }, /* not CP437 ?? */ |
751 | { "is" , "CP861" }, /* not CP850 ?? */ |
752 | { "is_IS" , "CP861" }, /* not CP850 ?? */ |
753 | { "it" , "CP850" }, |
754 | { "it_CH" , "CP850" }, |
755 | { "it_IT" , "CP850" }, |
756 | { "ja" , "CP932" }, |
757 | { "ja_JP" , "CP932" }, |
758 | { "kr" , "CP949" }, /* not CP934 ?? */ |
759 | { "kr_KR" , "CP949" }, /* not CP934 ?? */ |
760 | { "lt" , "CP775" }, |
761 | { "lt_LT" , "CP775" }, |
762 | { "lv" , "CP775" }, |
763 | { "lv_LV" , "CP775" }, |
764 | { "mk" , "CP866" }, /* not CP855 ?? */ |
765 | { "mk_MK" , "CP866" }, /* not CP855 ?? */ |
766 | { "mt" , "CP850" }, |
767 | { "mt_MT" , "CP850" }, |
768 | { "nb" , "CP865" }, /* not CP850 ?? */ |
769 | { "nb_NO" , "CP865" }, /* not CP850 ?? */ |
770 | { "nl" , "CP850" }, |
771 | { "nl_BE" , "CP850" }, |
772 | { "nl_NL" , "CP850" }, |
773 | { "nn" , "CP865" }, /* not CP850 ?? */ |
774 | { "nn_NO" , "CP865" }, /* not CP850 ?? */ |
775 | { "no" , "CP865" }, /* not CP850 ?? */ |
776 | { "no_NO" , "CP865" }, /* not CP850 ?? */ |
777 | { "pl" , "CP852" }, |
778 | { "pl_PL" , "CP852" }, |
779 | { "pt" , "CP850" }, |
780 | { "pt_BR" , "CP850" }, |
781 | { "pt_PT" , "CP850" }, |
782 | { "ro" , "CP852" }, |
783 | { "ro_RO" , "CP852" }, |
784 | { "ru" , "CP866" }, |
785 | { "ru_RU" , "CP866" }, |
786 | { "sk" , "CP852" }, |
787 | { "sk_SK" , "CP852" }, |
788 | { "sl" , "CP852" }, |
789 | { "sl_SI" , "CP852" }, |
790 | { "sq" , "CP852" }, |
791 | { "sq_AL" , "CP852" }, |
792 | { "sr" , "CP852" }, /* CP852 or CP866 or CP855 ?? */ |
793 | { "sr_CS" , "CP852" }, /* CP852 or CP866 or CP855 ?? */ |
794 | { "sr_YU" , "CP852" }, /* CP852 or CP866 or CP855 ?? */ |
795 | { "sv" , "CP850" }, |
796 | { "sv_SE" , "CP850" }, |
797 | { "th" , "CP874" }, |
798 | { "th_TH" , "CP874" }, |
799 | { "tr" , "CP857" }, |
800 | { "tr_TR" , "CP857" }, |
801 | { "uk" , "CP1125" }, |
802 | { "uk_UA" , "CP1125" }, |
803 | { "zh_CN" , "GBK" }, |
804 | { "zh_TW" , "CP950" } /* not CP938 ?? */ |
805 | # define locale_table_defined |
806 | # endif |
807 | # ifndef locale_table_defined |
808 | /* Just a dummy entry, to avoid a C syntax error. */ |
809 | { "" , "" } |
810 | # endif |
811 | }; |
812 | |
813 | #endif |
814 | |
815 | |
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in localcharset.h.
   The result must not be freed.  It is normally a string constant or a
   statically allocated buffer; on Cygwin and OS/2, when the encoding is
   taken from the LC_ALL / LC_CTYPE / LANG environment variables and has no
   '@' modifier, the result points into the environment string itself and
   is returned without alias resolution.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL.

   Overview of the per-platform strategies:
     - HAVE_LANGINFO_CODESET: ask nl_langinfo (CODESET), with a workaround
       for Cygwin versions that always answer "US-ASCII".
     - Native Windows: parse the code page suffix of the setlocale() name,
       falling back to GetACP().
     - OS/2: honour the locale environment variables, falling back to
       DosQueryCp().
     - Otherwise: look up the locale name from the environment in
       locale_table (if one is defined for the platform).
   The platform-specific buffers are 'static', so the returned pointer stays
   valid after return but is overwritten by the next call.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;

#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      /* Usual precedence: LC_ALL, then LC_CTYPE, then LANG.  */
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative.
                 If the encoding does not fit into buf, fall through to the
                 codepage-number fallback below.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

# elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_ALL, NULL);
  char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (current_locale != NULL && strchr (current_locale, ';'))
    current_locale = setlocale (LC_CTYPE, NULL);

  /* Be defensive: if setlocale gave us no name at all, behave as if the
     name carried no codepage suffix instead of dereferencing NULL.  */
  pdot = (current_locale != NULL ? strrchr (current_locale, '.') : NULL);
  if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }
  /* For a locale name such as "French_France.65001", in Windows 10,
     setlocale now returns "French_France.utf8" instead.  The runtime
     accepts the UTF-8 code page name case-insensitively and with an
     optional hyphen, so the suffix may also be spelled "UTF-8", "UTF8" or
     "utf-8"; map all of these to the canonical name rather than returning
     a bogus "CPUTF-8"-like string.  */
  if (strcmp (buf + 2, "65001") == 0
      || strcmp (buf + 2, "utf8") == 0
      || strcmp (buf + 2, "UTF8") == 0
      || strcmp (buf + 2, "utf-8") == 0
      || strcmp (buf + 2, "UTF-8") == 0)
    codeset = "UTF-8";
  else
    codeset = buf;

# elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* NULL means "not decided yet"; see the DosQueryCp fallback below.  */
  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.
         A nonzero return value is treated as failure.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* Convert explicitly so that the argument type matches the
             conversion specifier whatever ULONG is defined as.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

# else

#  error "Add code for other platforms here."

# endif

  /* Resolve alias.  */
  {
# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      goto done_table_lookup;
    else
#  endif
      {
        const struct table_entry * const table = alias_table;
        size_t const table_size =
          sizeof (alias_table) / sizeof (struct table_entry);
        /* The table is sorted.  Perform a binary search.  */
        size_t hi = table_size;
        size_t lo = 0;
        while (lo < hi)
          {
            /* Invariant:
               for i < lo, strcmp (table[i].alias, codeset) < 0,
               for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
            size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
            int cmp = strcmp (table[mid].alias, codeset);
            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                /* Found an i with
                   strcmp (table[i].alias, codeset) == 0.  */
                codeset = table[mid].canonical;
                goto done_table_lookup;
              }
          }
      }
    /* The label sits in the never-taken branch of 'if (0)', so that a jump
       to it skips the "not found" block below, while normal control flow
       (lookup failed) enters that block through the 'else'.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

#else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like DJGPP) have only the C locale.  Therefore we don't use setlocale
     here; it would return "C" when it doesn't support the locale name the
     user has set.  */
# if 0
  locale = setlocale (LC_CTYPE, NULL);
# endif
  if (locale == NULL || locale[0] == '\0')
    {
      /* Usual precedence: LC_ALL, then LC_CTYPE, then LANG.  */
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
          /* Make sure the table lookup below gets a valid string.  */
          if (locale == NULL)
            locale = "";
        }
    }

  /* Map locale name to canonical encoding name.  */
  {
# ifdef locale_table_defined
    const struct table_entry * const table = locale_table;
    size_t const table_size =
      sizeof (locale_table) / sizeof (struct table_entry);
    /* The table is sorted.  Perform a binary search.  */
    size_t hi = table_size;
    size_t lo = 0;
    while (lo < hi)
      {
        /* Invariant:
           for i < lo, strcmp (table[i].locale, locale) < 0,
           for i >= hi, strcmp (table[i].locale, locale) > 0.  */
        size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
        int cmp = strcmp (table[mid].locale, locale);
        if (cmp < 0)
          lo = mid + 1;
        else if (cmp > 0)
          hi = mid;
        else
          {
            /* Found an i with
               strcmp (table[i].locale, locale) == 0.  */
            codeset = table[mid].canonical;
            goto done_table_lookup;
          }
      }
    /* The label sits in the never-taken branch of 'if (0)', so that a jump
       to it skips the "not found" block below, while normal control flow
       (lookup failed) enters that block through the 'else'.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* The canonical name cannot be determined.  */
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        codeset = "ASCII";
# endif
      }
  }

#endif

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
1141 | |