localcharset.c source code [bison/lib/localcharset.c]

/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2006, 2008-2019 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, see <https://www.gnu.org/licenses/>.  */
17
/* Written by Bruno Haible <bruno@clisp.org>.  */
19
20	#include <config.h>
21
/* Specification.  */
23	#include "localcharset.h"
24
25	#include <stddef.h>
26	#include <stdio.h>
27	#include <string.h>
28	#include <stdlib.h>
29
30	#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
31	# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
32	#endif
33
34	#if defined _WIN32 && !defined __CYGWIN__
35	# define WINDOWS_NATIVE
36	# include <locale.h>
37	#endif
38
39	#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
41	# ifndef OS2
42	# define OS2
43	# endif
44	#endif
45
46	#if !defined WINDOWS_NATIVE
47	# if HAVE_LANGINFO_CODESET
48	# include <langinfo.h>
49	# else
50	# if 0 /* see comment regarding use of setlocale(), below */
51	# include <locale.h>
52	# endif
53	# endif
54	# ifdef __CYGWIN__
55	# define WIN32_LEAN_AND_MEAN
56	# include <windows.h>
57	# endif
58	#elif defined WINDOWS_NATIVE
59	# define WIN32_LEAN_AND_MEAN
60	# include <windows.h>
61	#endif
62	#if defined OS2
63	# define INCL_DOS
64	# include <os2.h>
65	#endif
66
/* For MB_CUR_MAX_L */
68	#if defined DARWIN7
69	# include <xlocale.h>
70	#endif
71
72
73	#if HAVE_LANGINFO_CODESET \|\| defined WINDOWS_NATIVE \|\| defined OS2
74
/* On these platforms, we use a mapping from non-canonical encoding name
   to GNU canonical encoding name.  */
77
/* With glibc-2.1 or newer, we don't need any canonicalization,
   because glibc has iconv and both glibc and libiconv support all
   GNU canonical names directly.  */
81	# if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) \|\| defined __UCLIBC__)
82
/* One row of the encoding alias table: maps a platform-specific
   (non-canonical) encoding name to the GNU canonical encoding name.
   Both fields are fixed-size character arrays holding at most 11
   characters plus the terminating NUL, so a table of these rows
   contains no pointers.  */
struct table_entry
{
  const char alias[11+1];      /* name as reported by the platform */
  const char canonical[11+1];  /* GNU canonical encoding name */
};
88
89	/ Table of platform-dependent mappings, sorted in ascending order. /
90	static const struct table_entry alias_table[] =
91	{
92	# if defined __FreeBSD__ /* FreeBSD */
93	/{ "ARMSCII-8", "ARMSCII-8" },/
94	{ "Big5", "BIG5" },
95	{ "C", "ASCII" },
96	/{ "CP1131", "CP1131" },/
97	/{ "CP1251", "CP1251" },/
98	/{ "CP866", "CP866" },/
99	/{ "GB18030", "GB18030" },/
100	/{ "GB2312", "GB2312" },/
101	/{ "GBK", "GBK" },/
102	/{ "ISCII-DEV", "?" },/
103	{ "ISO8859-1", "ISO-8859-1" },
104	{ "ISO8859-13", "ISO-8859-13" },
105	{ "ISO8859-15", "ISO-8859-15" },
106	{ "ISO8859-2", "ISO-8859-2" },
107	{ "ISO8859-5", "ISO-8859-5" },
108	{ "ISO8859-7", "ISO-8859-7" },
109	{ "ISO8859-9", "ISO-8859-9" },
110	/{ "KOI8-R", "KOI8-R" },/
111	/{ "KOI8-U", "KOI8-U" },/
112	{ "SJIS", "SHIFT_JIS" },
113	{ "US-ASCII", "ASCII" },
114	{ "eucCN", "GB2312" },
115	{ "eucJP", "EUC-JP" },
116	{ "eucKR", "EUC-KR" }
117	# define alias_table_defined
118	# endif
119	# if defined __NetBSD__ /* NetBSD */
120	{ "646", "ASCII" },
121	/{ "ARMSCII-8", "ARMSCII-8" },/
122	/{ "BIG5", "BIG5" },/
123	{ "Big5-HKSCS", "BIG5-HKSCS" },
124	/{ "CP1251", "CP1251" },/
125	/{ "CP866", "CP866" },/
126	/{ "GB18030", "GB18030" },/
127	/{ "GB2312", "GB2312" },/
128	{ "ISO8859-1", "ISO-8859-1" },
129	{ "ISO8859-13", "ISO-8859-13" },
130	{ "ISO8859-15", "ISO-8859-15" },
131	{ "ISO8859-2", "ISO-8859-2" },
132	{ "ISO8859-4", "ISO-8859-4" },
133	{ "ISO8859-5", "ISO-8859-5" },
134	{ "ISO8859-7", "ISO-8859-7" },
135	/{ "KOI8-R", "KOI8-R" },/
136	/{ "KOI8-U", "KOI8-U" },/
137	/{ "PT154", "PT154" },/
138	{ "SJIS", "SHIFT_JIS" },
139	{ "eucCN", "GB2312" },
140	{ "eucJP", "EUC-JP" },
141	{ "eucKR", "EUC-KR" },
142	{ "eucTW", "EUC-TW" }
143	# define alias_table_defined
144	# endif
145	# if defined __OpenBSD__ /* OpenBSD */
146	{ "646", "ASCII" },
147	{ "ISO8859-1", "ISO-8859-1" },
148	{ "ISO8859-13", "ISO-8859-13" },
149	{ "ISO8859-15", "ISO-8859-15" },
150	{ "ISO8859-2", "ISO-8859-2" },
151	{ "ISO8859-4", "ISO-8859-4" },
152	{ "ISO8859-5", "ISO-8859-5" },
153	{ "ISO8859-7", "ISO-8859-7" }
154	# define alias_table_defined
155	# endif
156	# if defined __APPLE__ && defined __MACH__ /* Mac OS X */
157	/ Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is*
158	useless:
159	- It returns the empty string when LANG is set to a locale of the
160	form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
161	LC_CTYPE file.
162	- The environment variables LANG, LC_CTYPE, LC_ALL are not set by
163	the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
164	- The documentation says:
165	"... all code that calls BSD system routines should ensure
166	that the const char parameters of these routines are in UTF-8*
167	encoding. All BSD system functions expect their string
168	parameters to be in UTF-8 encoding and nothing else."
169	It also says
170	"An additional caveat is that string parameters for files,
171	paths, and other file-system entities must be in canonical
172	UTF-8. In a canonical UTF-8 Unicode string, all decomposable
173	characters are decomposed ..."
174	but this is not true: You can pass non-decomposed UTF-8 strings
175	to file system functions, and it is the OS which will convert
176	them to decomposed UTF-8 before accessing the file system.
177	- The Apple Terminal application displays UTF-8 by default.
178	- However, other applications are free to use different encodings:
179	- xterm uses ISO-8859-1 by default.
180	- TextEdit uses MacRoman by default.
181	We prefer UTF-8 over decomposed UTF-8-MAC because one should
182	minimize the use of decomposed Unicode. Unfortunately, through the
183	Darwin file system, decomposed UTF-8 strings are leaked into user
184	space nevertheless.
185	Then there are also the locales with encodings other than US-ASCII
186	and UTF-8. These locales can be occasionally useful to users (e.g.
187	when grepping through ISO-8859-1 encoded text files), when all their
188	file names are in US-ASCII.
189	*/
190	{ "ARMSCII-8", "ARMSCII-8" },
191	{ "Big5", "BIG5" },
192	{ "Big5HKSCS", "BIG5-HKSCS" },
193	{ "CP1131", "CP1131" },
194	{ "CP1251", "CP1251" },
195	{ "CP866", "CP866" },
196	{ "CP949", "CP949" },
197	{ "GB18030", "GB18030" },
198	{ "GB2312", "GB2312" },
199	{ "GBK", "GBK" },
200	/{ "ISCII-DEV", "?" },/
201	{ "ISO8859-1", "ISO-8859-1" },
202	{ "ISO8859-13", "ISO-8859-13" },
203	{ "ISO8859-15", "ISO-8859-15" },
204	{ "ISO8859-2", "ISO-8859-2" },
205	{ "ISO8859-4", "ISO-8859-4" },
206	{ "ISO8859-5", "ISO-8859-5" },
207	{ "ISO8859-7", "ISO-8859-7" },
208	{ "ISO8859-9", "ISO-8859-9" },
209	{ "KOI8-R", "KOI8-R" },
210	{ "KOI8-U", "KOI8-U" },
211	{ "PT154", "PT154" },
212	{ "SJIS", "SHIFT_JIS" },
213	{ "eucCN", "GB2312" },
214	{ "eucJP", "EUC-JP" },
215	{ "eucKR", "EUC-KR" }
216	# define alias_table_defined
217	# endif
218	# if defined _AIX /* AIX */
219	/{ "GBK", "GBK" },/
220	{ "IBM-1046", "CP1046" },
221	{ "IBM-1124", "CP1124" },
222	{ "IBM-1129", "CP1129" },
223	{ "IBM-1252", "CP1252" },
224	{ "IBM-850", "CP850" },
225	{ "IBM-856", "CP856" },
226	{ "IBM-921", "ISO-8859-13" },
227	{ "IBM-922", "CP922" },
228	{ "IBM-932", "CP932" },
229	{ "IBM-943", "CP943" },
230	{ "IBM-eucCN", "GB2312" },
231	{ "IBM-eucJP", "EUC-JP" },
232	{ "IBM-eucKR", "EUC-KR" },
233	{ "IBM-eucTW", "EUC-TW" },
234	{ "ISO8859-1", "ISO-8859-1" },
235	{ "ISO8859-15", "ISO-8859-15" },
236	{ "ISO8859-2", "ISO-8859-2" },
237	{ "ISO8859-5", "ISO-8859-5" },
238	{ "ISO8859-6", "ISO-8859-6" },
239	{ "ISO8859-7", "ISO-8859-7" },
240	{ "ISO8859-8", "ISO-8859-8" },
241	{ "ISO8859-9", "ISO-8859-9" },
242	{ "TIS-620", "TIS-620" },
243	/{ "UTF-8", "UTF-8" },/
244	{ "big5", "BIG5" }
245	# define alias_table_defined
246	# endif
247	# if defined __hpux /* HP-UX */
248	{ "SJIS", "SHIFT_JIS" },
249	{ "arabic8", "HP-ARABIC8" },
250	{ "big5", "BIG5" },
251	{ "cp1251", "CP1251" },
252	{ "eucJP", "EUC-JP" },
253	{ "eucKR", "EUC-KR" },
254	{ "eucTW", "EUC-TW" },
255	{ "gb18030", "GB18030" },
256	{ "greek8", "HP-GREEK8" },
257	{ "hebrew8", "HP-HEBREW8" },
258	{ "hkbig5", "BIG5-HKSCS" },
259	{ "hp15CN", "GB2312" },
260	{ "iso88591", "ISO-8859-1" },
261	{ "iso885913", "ISO-8859-13" },
262	{ "iso885915", "ISO-8859-15" },
263	{ "iso88592", "ISO-8859-2" },
264	{ "iso88594", "ISO-8859-4" },
265	{ "iso88595", "ISO-8859-5" },
266	{ "iso88596", "ISO-8859-6" },
267	{ "iso88597", "ISO-8859-7" },
268	{ "iso88598", "ISO-8859-8" },
269	{ "iso88599", "ISO-8859-9" },
270	{ "kana8", "HP-KANA8" },
271	{ "koi8r", "KOI8-R" },
272	{ "roman8", "HP-ROMAN8" },
273	{ "tis620", "TIS-620" },
274	{ "turkish8", "HP-TURKISH8" },
275	{ "utf8", "UTF-8" }
276	# define alias_table_defined
277	# endif
278	# if defined __sgi /* IRIX */
279	{ "ISO8859-1", "ISO-8859-1" },
280	{ "ISO8859-15", "ISO-8859-15" },
281	{ "ISO8859-2", "ISO-8859-2" },
282	{ "ISO8859-5", "ISO-8859-5" },
283	{ "ISO8859-7", "ISO-8859-7" },
284	{ "ISO8859-9", "ISO-8859-9" },
285	{ "eucCN", "GB2312" },
286	{ "eucJP", "EUC-JP" },
287	{ "eucKR", "EUC-KR" },
288	{ "eucTW", "EUC-TW" }
289	# define alias_table_defined
290	# endif
291	# if defined __osf__ /* OSF/1 */
292	/{ "GBK", "GBK" },/
293	{ "ISO8859-1", "ISO-8859-1" },
294	{ "ISO8859-15", "ISO-8859-15" },
295	{ "ISO8859-2", "ISO-8859-2" },
296	{ "ISO8859-4", "ISO-8859-4" },
297	{ "ISO8859-5", "ISO-8859-5" },
298	{ "ISO8859-7", "ISO-8859-7" },
299	{ "ISO8859-8", "ISO-8859-8" },
300	{ "ISO8859-9", "ISO-8859-9" },
301	{ "KSC5601", "CP949" },
302	{ "SJIS", "SHIFT_JIS" },
303	{ "TACTIS", "TIS-620" },
304	/{ "UTF-8", "UTF-8" },/
305	{ "big5", "BIG5" },
306	{ "cp850", "CP850" },
307	{ "dechanyu", "DEC-HANYU" },
308	{ "dechanzi", "GB2312" },
309	{ "deckanji", "DEC-KANJI" },
310	{ "deckorean", "EUC-KR" },
311	{ "eucJP", "EUC-JP" },
312	{ "eucKR", "EUC-KR" },
313	{ "eucTW", "EUC-TW" },
314	{ "sdeckanji", "EUC-JP" }
315	# define alias_table_defined
316	# endif
317	# if defined __sun /* Solaris */
318	{ "5601", "EUC-KR" },
319	{ "646", "ASCII" },
320	/{ "BIG5", "BIG5" },/
321	{ "Big5-HKSCS", "BIG5-HKSCS" },
322	{ "GB18030", "GB18030" },
323	/{ "GBK", "GBK" },/
324	{ "ISO8859-1", "ISO-8859-1" },
325	{ "ISO8859-11", "TIS-620" },
326	{ "ISO8859-13", "ISO-8859-13" },
327	{ "ISO8859-15", "ISO-8859-15" },
328	{ "ISO8859-2", "ISO-8859-2" },
329	{ "ISO8859-3", "ISO-8859-3" },
330	{ "ISO8859-4", "ISO-8859-4" },
331	{ "ISO8859-5", "ISO-8859-5" },
332	{ "ISO8859-6", "ISO-8859-6" },
333	{ "ISO8859-7", "ISO-8859-7" },
334	{ "ISO8859-8", "ISO-8859-8" },
335	{ "ISO8859-9", "ISO-8859-9" },
336	{ "PCK", "SHIFT_JIS" },
337	{ "TIS620.2533", "TIS-620" },
338	/{ "UTF-8", "UTF-8" },/
339	{ "ansi-1251", "CP1251" },
340	{ "cns11643", "EUC-TW" },
341	{ "eucJP", "EUC-JP" },
342	{ "gb2312", "GB2312" },
343	{ "koi8-r", "KOI8-R" }
344	# define alias_table_defined
345	# endif
346	# if defined __minix /* Minix */
347	{ "646", "ASCII" }
348	# define alias_table_defined
349	# endif
350	# if defined WINDOWS_NATIVE \|\| defined __CYGWIN__ /* Windows */
351	{ "CP1361", "JOHAB" },
352	{ "CP20127", "ASCII" },
353	{ "CP20866", "KOI8-R" },
354	{ "CP20936", "GB2312" },
355	{ "CP21866", "KOI8-RU" },
356	{ "CP28591", "ISO-8859-1" },
357	{ "CP28592", "ISO-8859-2" },
358	{ "CP28593", "ISO-8859-3" },
359	{ "CP28594", "ISO-8859-4" },
360	{ "CP28595", "ISO-8859-5" },
361	{ "CP28596", "ISO-8859-6" },
362	{ "CP28597", "ISO-8859-7" },
363	{ "CP28598", "ISO-8859-8" },
364	{ "CP28599", "ISO-8859-9" },
365	{ "CP28605", "ISO-8859-15" },
366	{ "CP38598", "ISO-8859-8" },
367	{ "CP51932", "EUC-JP" },
368	{ "CP51936", "GB2312" },
369	{ "CP51949", "EUC-KR" },
370	{ "CP51950", "EUC-TW" },
371	{ "CP54936", "GB18030" },
372	{ "CP65001", "UTF-8" },
373	{ "CP936", "GBK" }
374	# define alias_table_defined
375	# endif
376	# if defined OS2 /* OS/2 */
377	/ The list of encodings is taken from "List of OS/2 Codepages"*
378	by Alex Taylor:
379	<http://altsan.org/os2/toolkits/uls/index.html#codepages>.
380	See also "__convcp() of kLIBC":
381	<http://trac.netlabs.org/libc/browser/branches/libc-0.6/src/emx/src/lib/locale/__convcp.c>,
382	or:
383	<https://github.com/bitwiseworks/libc/blob/master/src/emx/src/lib/locale/__convcp.c>. /*
384	{ "CP1004", "CP1252" },
385	/{ "CP1041", "CP943" },/
386	/{ "CP1088", "CP949" },/
387	{ "CP1089", "ISO-8859-6" },
388	/{ "CP1114", "CP950" },/
389	/{ "CP1115", "GB2312" },/
390	{ "CP1208", "UTF-8" },
391	/{ "CP1380", "GB2312" },/
392	{ "CP1381", "GB2312" },
393	{ "CP1383", "GB2312" },
394	{ "CP1386", "GBK" },
395	/{ "CP301", "CP943" },/
396	{ "CP3372", "EUC-JP" },
397	{ "CP4946", "CP850" },
398	/{ "CP5048", "JIS_X0208-1990" },/
399	/{ "CP5049", "JIS_X0212-1990" },/
400	/{ "CP5067", "KS_C_5601-1987" },/
401	{ "CP813", "ISO-8859-7" },
402	{ "CP819", "ISO-8859-1" },
403	{ "CP878", "KOI8-R" },
404	/{ "CP897", "CP943" },/
405	{ "CP912", "ISO-8859-2" },
406	{ "CP913", "ISO-8859-3" },
407	{ "CP914", "ISO-8859-4" },
408	{ "CP915", "ISO-8859-5" },
409	{ "CP916", "ISO-8859-8" },
410	{ "CP920", "ISO-8859-9" },
411	{ "CP921", "ISO-8859-13" },
412	{ "CP923", "ISO-8859-15" },
413	/{ "CP941", "CP943" },/
414	/{ "CP947", "CP950" },/
415	/{ "CP951", "CP949" },/
416	/{ "CP952", "JIS_X0208-1990" },/
417	/{ "CP953", "JIS_X0212-1990" },/
418	{ "CP954", "EUC-JP" },
419	{ "CP964", "EUC-TW" },
420	{ "CP970", "EUC-KR" },
421	/{ "CP971", "KS_C_5601-1987" },/
422	{ "IBM-1004", "CP1252" },
423	/{ "IBM-1006", "?" },/
424	/{ "IBM-1008", "?" },/
425	/{ "IBM-1041", "CP943" },/
426	/{ "IBM-1051", "?" },/
427	/{ "IBM-1088", "CP949" },/
428	{ "IBM-1089", "ISO-8859-6" },
429	/{ "IBM-1098", "?" },/
430	/{ "IBM-1114", "CP950" },/
431	/{ "IBM-1115", "GB2312" },/
432	/{ "IBM-1116", "?" },/
433	/{ "IBM-1117", "?" },/
434	/{ "IBM-1118", "?" },/
435	/{ "IBM-1119", "?" },/
436	{ "IBM-1124", "CP1124" },
437	{ "IBM-1125", "CP1125" },
438	{ "IBM-1131", "CP1131" },
439	{ "IBM-1208", "UTF-8" },
440	{ "IBM-1250", "CP1250" },
441	{ "IBM-1251", "CP1251" },
442	{ "IBM-1252", "CP1252" },
443	{ "IBM-1253", "CP1253" },
444	{ "IBM-1254", "CP1254" },
445	{ "IBM-1255", "CP1255" },
446	{ "IBM-1256", "CP1256" },
447	{ "IBM-1257", "CP1257" },
448	/{ "IBM-1275", "?" },/
449	/{ "IBM-1276", "?" },/
450	/{ "IBM-1277", "?" },/
451	/{ "IBM-1280", "?" },/
452	/{ "IBM-1281", "?" },/
453	/{ "IBM-1282", "?" },/
454	/{ "IBM-1283", "?" },/
455	/{ "IBM-1380", "GB2312" },/
456	{ "IBM-1381", "GB2312" },
457	{ "IBM-1383", "GB2312" },
458	{ "IBM-1386", "GBK" },
459	/{ "IBM-301", "CP943" },/
460	{ "IBM-3372", "EUC-JP" },
461	{ "IBM-367", "ASCII" },
462	{ "IBM-437", "CP437" },
463	{ "IBM-4946", "CP850" },
464	/{ "IBM-5048", "JIS_X0208-1990" },/
465	/{ "IBM-5049", "JIS_X0212-1990" },/
466	/{ "IBM-5067", "KS_C_5601-1987" },/
467	{ "IBM-813", "ISO-8859-7" },
468	{ "IBM-819", "ISO-8859-1" },
469	{ "IBM-850", "CP850" },
470	/{ "IBM-851", "?" },/
471	{ "IBM-852", "CP852" },
472	{ "IBM-855", "CP855" },
473	{ "IBM-856", "CP856" },
474	{ "IBM-857", "CP857" },
475	/{ "IBM-859", "?" },/
476	{ "IBM-860", "CP860" },
477	{ "IBM-861", "CP861" },
478	{ "IBM-862", "CP862" },
479	{ "IBM-863", "CP863" },
480	{ "IBM-864", "CP864" },
481	{ "IBM-865", "CP865" },
482	{ "IBM-866", "CP866" },
483	/{ "IBM-868", "?" },/
484	{ "IBM-869", "CP869" },
485	{ "IBM-874", "CP874" },
486	{ "IBM-878", "KOI8-R" },
487	/{ "IBM-895", "?" },/
488	/{ "IBM-897", "CP943" },/
489	/{ "IBM-907", "?" },/
490	/{ "IBM-909", "?" },/
491	{ "IBM-912", "ISO-8859-2" },
492	{ "IBM-913", "ISO-8859-3" },
493	{ "IBM-914", "ISO-8859-4" },
494	{ "IBM-915", "ISO-8859-5" },
495	{ "IBM-916", "ISO-8859-8" },
496	{ "IBM-920", "ISO-8859-9" },
497	{ "IBM-921", "ISO-8859-13" },
498	{ "IBM-922", "CP922" },
499	{ "IBM-923", "ISO-8859-15" },
500	{ "IBM-932", "CP932" },
501	/{ "IBM-941", "CP943" },/
502	/{ "IBM-942", "?" },/
503	{ "IBM-943", "CP943" },
504	/{ "IBM-947", "CP950" },/
505	{ "IBM-949", "CP949" },
506	{ "IBM-950", "CP950" },
507	/{ "IBM-951", "CP949" },/
508	/{ "IBM-952", "JIS_X0208-1990" },/
509	/{ "IBM-953", "JIS_X0212-1990" },/
510	{ "IBM-954", "EUC-JP" },
511	/{ "IBM-955", "?" },/
512	{ "IBM-964", "EUC-TW" },
513	{ "IBM-970", "EUC-KR" },
514	/{ "IBM-971", "KS_C_5601-1987" },/
515	{ "IBM-eucCN", "GB2312" },
516	{ "IBM-eucJP", "EUC-JP" },
517	{ "IBM-eucKR", "EUC-KR" },
518	{ "IBM-eucTW", "EUC-TW" },
519	{ "IBM33722", "EUC-JP" },
520	{ "ISO8859-1", "ISO-8859-1" },
521	{ "ISO8859-2", "ISO-8859-2" },
522	{ "ISO8859-3", "ISO-8859-3" },
523	{ "ISO8859-4", "ISO-8859-4" },
524	{ "ISO8859-5", "ISO-8859-5" },
525	{ "ISO8859-6", "ISO-8859-6" },
526	{ "ISO8859-7", "ISO-8859-7" },
527	{ "ISO8859-8", "ISO-8859-8" },
528	{ "ISO8859-9", "ISO-8859-9" },
529	/{ "JISX0201-1976", "JISX0201-1976" },/
530	/{ "JISX0208-1978", "?" },/
531	/{ "JISX0208-1983", "JIS_X0208-1983" },/
532	/{ "JISX0208-1990", "JIS_X0208-1990" },/
533	/{ "JISX0212-1990", "JIS_X0212-1990" },/
534	/{ "KSC5601-1987", "KS_C_5601-1987" },/
535	{ "SJIS-1", "CP943" },
536	{ "SJIS-2", "CP943" },
537	{ "eucJP", "EUC-JP" },
538	{ "eucKR", "EUC-KR" },
539	{ "eucTW-1993", "EUC-TW" }
540	# define alias_table_defined
541	# endif
542	# if defined VMS /* OpenVMS */
543	/ The list of encodings is taken from the OpenVMS 7.3-1 documentation*
544	"Compaq C Run-Time Library Reference Manual for OpenVMS systems"
545	section 10.7 "Handling Different Character Sets". /*
546	{ "DECHANYU", "DEC-HANYU" },
547	{ "DECHANZI", "GB2312" },
548	{ "DECKANJI", "DEC-KANJI" },
549	{ "DECKOREAN", "EUC-KR" },
550	{ "ISO8859-1", "ISO-8859-1" },
551	{ "ISO8859-2", "ISO-8859-2" },
552	{ "ISO8859-5", "ISO-8859-5" },
553	{ "ISO8859-7", "ISO-8859-7" },
554	{ "ISO8859-8", "ISO-8859-8" },
555	{ "ISO8859-9", "ISO-8859-9" },
556	{ "SDECKANJI", "EUC-JP" },
557	{ "SJIS", "SHIFT_JIS" },
558	{ "eucJP", "EUC-JP" },
559	{ "eucTW", "EUC-TW" }
560	# define alias_table_defined
561	# endif
562	# ifndef alias_table_defined
563	/ Just a dummy entry, to avoid a C syntax error. /
564	{ "", "" }
565	# endif
566	};
567
568	# endif
569
570	#else
571
/* On these platforms, we use a mapping from locale name to GNU canonical
   encoding name.  */
574
/* One row of the locale table: maps a locale name to the GNU canonical
   encoding name used by that locale.  Both fields are fixed-size,
   NUL-terminated character arrays: at most 17 characters for the locale
   name and at most 11 for the encoding name.  */
struct table_entry
{
  const char locale[17+1];     /* locale name, e.g. "de_DE.ISO_8859-1" */
  const char canonical[11+1];  /* GNU canonical encoding name */
};
580
581	/ Table of platform-dependent mappings, sorted in ascending order. /
582	static const struct table_entry locale_table[] =
583	{
584	# if defined __FreeBSD__ /* FreeBSD 4.2 */
585	{ "cs_CZ.ISO_8859-2", "ISO-8859-2" },
586	{ "da_DK.DIS_8859-15", "ISO-8859-15" },
587	{ "da_DK.ISO_8859-1", "ISO-8859-1" },
588	{ "de_AT.DIS_8859-15", "ISO-8859-15" },
589	{ "de_AT.ISO_8859-1", "ISO-8859-1" },
590	{ "de_CH.DIS_8859-15", "ISO-8859-15" },
591	{ "de_CH.ISO_8859-1", "ISO-8859-1" },
592	{ "de_DE.DIS_8859-15", "ISO-8859-15" },
593	{ "de_DE.ISO_8859-1", "ISO-8859-1" },
594	{ "en_AU.DIS_8859-15", "ISO-8859-15" },
595	{ "en_AU.ISO_8859-1", "ISO-8859-1" },
596	{ "en_CA.DIS_8859-15", "ISO-8859-15" },
597	{ "en_CA.ISO_8859-1", "ISO-8859-1" },
598	{ "en_GB.DIS_8859-15", "ISO-8859-15" },
599	{ "en_GB.ISO_8859-1", "ISO-8859-1" },
600	{ "en_US.DIS_8859-15", "ISO-8859-15" },
601	{ "en_US.ISO_8859-1", "ISO-8859-1" },
602	{ "es_ES.DIS_8859-15", "ISO-8859-15" },
603	{ "es_ES.ISO_8859-1", "ISO-8859-1" },
604	{ "fi_FI.DIS_8859-15", "ISO-8859-15" },
605	{ "fi_FI.ISO_8859-1", "ISO-8859-1" },
606	{ "fr_BE.DIS_8859-15", "ISO-8859-15" },
607	{ "fr_BE.ISO_8859-1", "ISO-8859-1" },
608	{ "fr_CA.DIS_8859-15", "ISO-8859-15" },
609	{ "fr_CA.ISO_8859-1", "ISO-8859-1" },
610	{ "fr_CH.DIS_8859-15", "ISO-8859-15" },
611	{ "fr_CH.ISO_8859-1", "ISO-8859-1" },
612	{ "fr_FR.DIS_8859-15", "ISO-8859-15" },
613	{ "fr_FR.ISO_8859-1", "ISO-8859-1" },
614	{ "hr_HR.ISO_8859-2", "ISO-8859-2" },
615	{ "hu_HU.ISO_8859-2", "ISO-8859-2" },
616	{ "is_IS.DIS_8859-15", "ISO-8859-15" },
617	{ "is_IS.ISO_8859-1", "ISO-8859-1" },
618	{ "it_CH.DIS_8859-15", "ISO-8859-15" },
619	{ "it_CH.ISO_8859-1", "ISO-8859-1" },
620	{ "it_IT.DIS_8859-15", "ISO-8859-15" },
621	{ "it_IT.ISO_8859-1", "ISO-8859-1" },
622	{ "ja_JP.EUC", "EUC-JP" },
623	{ "ja_JP.SJIS", "SHIFT_JIS" },
624	{ "ja_JP.Shift_JIS", "SHIFT_JIS" },
625	{ "ko_KR.EUC", "EUC-KR" },
626	{ "la_LN.ASCII", "ASCII" },
627	{ "la_LN.DIS_8859-15", "ISO-8859-15" },
628	{ "la_LN.ISO_8859-1", "ISO-8859-1" },
629	{ "la_LN.ISO_8859-2", "ISO-8859-2" },
630	{ "la_LN.ISO_8859-4", "ISO-8859-4" },
631	{ "lt_LN.ASCII", "ASCII" },
632	{ "lt_LN.DIS_8859-15", "ISO-8859-15" },
633	{ "lt_LN.ISO_8859-1", "ISO-8859-1" },
634	{ "lt_LN.ISO_8859-2", "ISO-8859-2" },
635	{ "lt_LT.ISO_8859-4", "ISO-8859-4" },
636	{ "nl_BE.DIS_8859-15", "ISO-8859-15" },
637	{ "nl_BE.ISO_8859-1", "ISO-8859-1" },
638	{ "nl_NL.DIS_8859-15", "ISO-8859-15" },
639	{ "nl_NL.ISO_8859-1", "ISO-8859-1" },
640	{ "no_NO.DIS_8859-15", "ISO-8859-15" },
641	{ "no_NO.ISO_8859-1", "ISO-8859-1" },
642	{ "pl_PL.ISO_8859-2", "ISO-8859-2" },
643	{ "pt_PT.DIS_8859-15", "ISO-8859-15" },
644	{ "pt_PT.ISO_8859-1", "ISO-8859-1" },
645	{ "ru_RU.CP866", "CP866" },
646	{ "ru_RU.ISO_8859-5", "ISO-8859-5" },
647	{ "ru_RU.KOI8-R", "KOI8-R" },
648	{ "ru_SU.CP866", "CP866" },
649	{ "ru_SU.ISO_8859-5", "ISO-8859-5" },
650	{ "ru_SU.KOI8-R", "KOI8-R" },
651	{ "sl_SI.ISO_8859-2", "ISO-8859-2" },
652	{ "sv_SE.DIS_8859-15", "ISO-8859-15" },
653	{ "sv_SE.ISO_8859-1", "ISO-8859-1" },
654	{ "uk_UA.KOI8-U", "KOI8-U" },
655	{ "zh_CN.EUC", "GB2312" },
656	{ "zh_TW.BIG5", "BIG5" },
657	{ "zh_TW.Big5", "BIG5" }
658	# define locale_table_defined
659	# endif
660	# if defined __DJGPP__ /* DOS / DJGPP 2.03 */
661	/ The encodings given here may not all be correct.*
662	If you find that the encoding given for your language and
663	country is not the one your DOS machine actually uses, just
664	correct it in this file, and send a mail to
665	Juan Manuel Guerrero <juan.guerrero@gmx.de>
666	and <bug-gnulib@gnu.org>. /*
667	{ "C", "ASCII" },
668	{ "ar", "CP864" },
669	{ "ar_AE", "CP864" },
670	{ "ar_DZ", "CP864" },
671	{ "ar_EG", "CP864" },
672	{ "ar_IQ", "CP864" },
673	{ "ar_IR", "CP864" },
674	{ "ar_JO", "CP864" },
675	{ "ar_KW", "CP864" },
676	{ "ar_MA", "CP864" },
677	{ "ar_OM", "CP864" },
678	{ "ar_QA", "CP864" },
679	{ "ar_SA", "CP864" },
680	{ "ar_SY", "CP864" },
681	{ "be", "CP866" },
682	{ "be_BE", "CP866" },
683	{ "bg", "CP866" }, / not CP855 ?? /
684	{ "bg_BG", "CP866" }, / not CP855 ?? /
685	{ "ca", "CP850" },
686	{ "ca_ES", "CP850" },
687	{ "cs", "CP852" },
688	{ "cs_CZ", "CP852" },
689	{ "da", "CP865" }, / not CP850 ?? /
690	{ "da_DK", "CP865" }, / not CP850 ?? /
691	{ "de", "CP850" },
692	{ "de_AT", "CP850" },
693	{ "de_CH", "CP850" },
694	{ "de_DE", "CP850" },
695	{ "el", "CP869" },
696	{ "el_GR", "CP869" },
697	{ "en", "CP850" },
698	{ "en_AU", "CP850" }, / not CP437 ?? /
699	{ "en_CA", "CP850" },
700	{ "en_GB", "CP850" },
701	{ "en_NZ", "CP437" },
702	{ "en_US", "CP437" },
703	{ "en_ZA", "CP850" }, / not CP437 ?? /
704	{ "eo", "CP850" },
705	{ "eo_EO", "CP850" },
706	{ "es", "CP850" },
707	{ "es_AR", "CP850" },
708	{ "es_BO", "CP850" },
709	{ "es_CL", "CP850" },
710	{ "es_CO", "CP850" },
711	{ "es_CR", "CP850" },
712	{ "es_CU", "CP850" },
713	{ "es_DO", "CP850" },
714	{ "es_EC", "CP850" },
715	{ "es_ES", "CP850" },
716	{ "es_GT", "CP850" },
717	{ "es_HN", "CP850" },
718	{ "es_MX", "CP850" },
719	{ "es_NI", "CP850" },
720	{ "es_PA", "CP850" },
721	{ "es_PE", "CP850" },
722	{ "es_PY", "CP850" },
723	{ "es_SV", "CP850" },
724	{ "es_UY", "CP850" },
725	{ "es_VE", "CP850" },
726	{ "et", "CP850" },
727	{ "et_EE", "CP850" },
728	{ "eu", "CP850" },
729	{ "eu_ES", "CP850" },
730	{ "fi", "CP850" },
731	{ "fi_FI", "CP850" },
732	{ "fr", "CP850" },
733	{ "fr_BE", "CP850" },
734	{ "fr_CA", "CP850" },
735	{ "fr_CH", "CP850" },
736	{ "fr_FR", "CP850" },
737	{ "ga", "CP850" },
738	{ "ga_IE", "CP850" },
739	{ "gd", "CP850" },
740	{ "gd_GB", "CP850" },
741	{ "gl", "CP850" },
742	{ "gl_ES", "CP850" },
743	{ "he", "CP862" },
744	{ "he_IL", "CP862" },
745	{ "hr", "CP852" },
746	{ "hr_HR", "CP852" },
747	{ "hu", "CP852" },
748	{ "hu_HU", "CP852" },
749	{ "id", "CP850" }, / not CP437 ?? /
750	{ "id_ID", "CP850" }, / not CP437 ?? /
751	{ "is", "CP861" }, / not CP850 ?? /
752	{ "is_IS", "CP861" }, / not CP850 ?? /
753	{ "it", "CP850" },
754	{ "it_CH", "CP850" },
755	{ "it_IT", "CP850" },
756	{ "ja", "CP932" },
757	{ "ja_JP", "CP932" },
758	{ "kr", "CP949" }, / not CP934 ?? /
759	{ "kr_KR", "CP949" }, / not CP934 ?? /
760	{ "lt", "CP775" },
761	{ "lt_LT", "CP775" },
762	{ "lv", "CP775" },
763	{ "lv_LV", "CP775" },
764	{ "mk", "CP866" }, / not CP855 ?? /
765	{ "mk_MK", "CP866" }, / not CP855 ?? /
766	{ "mt", "CP850" },
767	{ "mt_MT", "CP850" },
768	{ "nb", "CP865" }, / not CP850 ?? /
769	{ "nb_NO", "CP865" }, / not CP850 ?? /
770	{ "nl", "CP850" },
771	{ "nl_BE", "CP850" },
772	{ "nl_NL", "CP850" },
773	{ "nn", "CP865" }, / not CP850 ?? /
774	{ "nn_NO", "CP865" }, / not CP850 ?? /
775	{ "no", "CP865" }, / not CP850 ?? /
776	{ "no_NO", "CP865" }, / not CP850 ?? /
777	{ "pl", "CP852" },
778	{ "pl_PL", "CP852" },
779	{ "pt", "CP850" },
780	{ "pt_BR", "CP850" },
781	{ "pt_PT", "CP850" },
782	{ "ro", "CP852" },
783	{ "ro_RO", "CP852" },
784	{ "ru", "CP866" },
785	{ "ru_RU", "CP866" },
786	{ "sk", "CP852" },
787	{ "sk_SK", "CP852" },
788	{ "sl", "CP852" },
789	{ "sl_SI", "CP852" },
790	{ "sq", "CP852" },
791	{ "sq_AL", "CP852" },
792	{ "sr", "CP852" }, / CP852 or CP866 or CP855 ?? /
793	{ "sr_CS", "CP852" }, / CP852 or CP866 or CP855 ?? /
794	{ "sr_YU", "CP852" }, / CP852 or CP866 or CP855 ?? /
795	{ "sv", "CP850" },
796	{ "sv_SE", "CP850" },
797	{ "th", "CP874" },
798	{ "th_TH", "CP874" },
799	{ "tr", "CP857" },
800	{ "tr_TR", "CP857" },
801	{ "uk", "CP1125" },
802	{ "uk_UA", "CP1125" },
803	{ "zh_CN", "GBK" },
804	{ "zh_TW", "CP950" } / not CP938 ?? /
805	# define locale_table_defined
806	# endif
807	# ifndef locale_table_defined
808	/ Just a dummy entry, to avoid a C syntax error. /
809	{ "", "" }
810	# endif
811	};
812
813	#endif
814
815
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in localcharset.h.
   The result must not be freed; it is statically allocated.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  */
821
822	#ifdef STATIC
823	STATIC
824	#endif
825	const char *
826	locale_charset (void)
827	{
828	const char *codeset;
829
830	#if HAVE_LANGINFO_CODESET \|\| defined WINDOWS_NATIVE \|\| defined OS2
831
832	# if HAVE_LANGINFO_CODESET
833
834	/ Most systems support nl_langinfo (CODESET) nowadays. /
835	codeset = nl_langinfo (CODESET);
836
837	# ifdef __CYGWIN__
838	/ Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always*
839	returns "US-ASCII". Return the suffix of the locale name from the
840	environment variables (if present) or the codepage as a number. /*
841	if (codeset != NULL && strcmp (codeset, "US-ASCII") == `0`)
842	{
843	const char *locale;
844	static char buf[`2` + `10` + `1`];
845
846	locale = getenv ("LC_ALL");
847	if (locale == NULL \|\| locale[`0`] == `'\0'`)
848	{
849	locale = getenv ("LC_CTYPE");
850	if (locale == NULL \|\| locale[`0`] == `'\0'`)
851	locale = getenv ("LANG");
852	}
853	if (locale != NULL && locale[`0`] != `'\0'`)
854	{
855	/ If the locale name contains an encoding after the dot, return*
856	it. /*
857	const char *dot = strchr (locale, `'.'`);
858
859	if (dot != NULL)
860	{
861	const char *modifier;
862
863	dot++;
864	/ Look for the possible @... trailer and remove it, if any. /
865	modifier = strchr (dot, `'@'`);
866	if (modifier == NULL)
867	return dot;
868	if (modifier - dot < sizeof (buf))
869	{
870	memcpy (buf, dot, modifier - dot);
871	buf [modifier - dot] = `'\0'`;
872	return buf;
873	}
874	}
875	}
876
877	/ The Windows API has a function returning the locale's codepage as a*
878	number: GetACP(). This encoding is used by Cygwin, unless the user
879	has set the environment variable CYGWIN=codepage:oem (which very few
880	people do).
881	Output directed to console windows needs to be converted (to
882	GetOEMCP() if the console is using a raster font, or to
883	GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
884	this conversion transparently (see winsup/cygwin/fhandler_console.cc),
885	converting to GetConsoleOutputCP(). This leads to correct results,
886	except when SetConsoleOutputCP has been called and a raster font is
887	in use. /*
888	sprintf (buf, "CP%u", GetACP ());
889	codeset = buf;
890	}
891	# endif
892
893	if (codeset == NULL)
894	/ The canonical name cannot be determined. /
895	codeset = "";
896
897	# elif defined WINDOWS_NATIVE
898
899	static char buf[`2` + `10` + `1`];
900
901	/ The Windows API has a function returning the locale's codepage as*
902	a number, but the value doesn't change according to what the
903	'setlocale' call specified. So we use it as a last resort, in
904	case the string returned by 'setlocale' doesn't specify the
905	codepage. /*
906	char *current_locale = setlocale (LC_ALL, NULL);
907	char *pdot;
908
909	/ If they set different locales for different categories,*
910	'setlocale' will return a semi-colon separated list of locale
911	values. To make sure we use the correct one, we choose LC_CTYPE. /*
912	if (strchr (current_locale, `';'`))
913	current_locale = setlocale (LC_CTYPE, NULL);
914
915	pdot = strrchr (current_locale, `'.'`);
916	if (pdot && `2` + strlen (pdot + `1`) + `1` <= sizeof (buf))
917	sprintf (buf, "CP%s", pdot + `1`);
918	else
919	{
920	/ The Windows API has a function returning the locale's codepage as a*
921	number: GetACP().
922	When the output goes to a console window, it needs to be provided in
923	GetOEMCP() encoding if the console is using a raster font, or in
924	GetConsoleOutputCP() encoding if it is using a TrueType font.
925	But in GUI programs and for output sent to files and pipes, GetACP()
926	encoding is the best bet. /*
927	sprintf (buf, "CP%u", GetACP ());
928	}
929	/ For a locale name such as "French_France.65001", in Windows 10,*
930	setlocale now returns "French_France.utf8" instead. /*
931	if (strcmp (buf + `2`, "65001") == `0` \|\| strcmp (buf + `2`, "utf8") == `0`)
932	codeset = "UTF-8";
933	else
934	codeset = buf;
935
936	# elif defined OS2
937
938	const char *locale;
939	static char buf[`2` + `10` + `1`];
940	ULONG cp[`3`];
941	ULONG cplen;
942
943	codeset = NULL;
944
945	/ Allow user to override the codeset, as set in the operating system,*
946	with standard language environment variables. /*
947	locale = getenv ("LC_ALL");
948	if (locale == NULL \|\| locale[`0`] == `'\0'`)
949	{
950	locale = getenv ("LC_CTYPE");
951	if (locale == NULL \|\| locale[`0`] == `'\0'`)
952	locale = getenv ("LANG");
953	}
954	if (locale != NULL && locale[`0`] != `'\0'`)
955	{
956	/ If the locale name contains an encoding after the dot, return it. /
957	const char *dot = strchr (locale, `'.'`);
958
959	if (dot != NULL)
960	{
961	const char *modifier;
962
963	dot++;
964	/ Look for the possible @... trailer and remove it, if any. /
965	modifier = strchr (dot, `'@'`);
966	if (modifier == NULL)
967	return dot;
968	if (modifier - dot < sizeof (buf))
969	{
970	memcpy (buf, dot, modifier - dot);
971	buf [modifier - dot] = `'\0'`;
972	return buf;
973	}
974	}
975
976	/ For the POSIX locale, don't use the system's codepage. /
977	if (strcmp (locale, "C") == `0` \|\| strcmp (locale, "POSIX") == `0`)
978	codeset = "";
979	}
980
981	if (codeset == NULL)
982	{
983	/ OS/2 has a function returning the locale's codepage as a number. /
984	if (DosQueryCp (sizeof (cp), cp, &cplen))
985	codeset = "";
986	else
987	{
988	sprintf (buf, "CP%u", cp[`0`]);
989	codeset = buf;
990	}
991	}
992
993	# else
994
995	# error "Add code for other platforms here."
996
997	# endif
998
999	/ Resolve alias. /
1000	{
1001	# ifdef alias_table_defined
1002	/* On some platforms, UTF-8 locales are the most frequently used ones.
1003	   Speed up the common case and slow down the less common cases by
1004	   testing for this case first. */
1005	# if defined __OpenBSD__ \|\| (defined __APPLE__ && defined __MACH__) \|\| defined __sun \|\| defined __CYGWIN__
1006	if (strcmp (codeset, "UTF-8") == `0`)
1007	goto done_table_lookup;
1008	else
1009	# endif
1010	{
1011	const struct table_entry * const table = alias_table;
1012	size_t const table_size =
1013	sizeof (alias_table) / sizeof (struct table_entry);
1014	/* The table is sorted. Perform a binary search. */
1015	size_t hi = table_size;
1016	size_t lo = `0`;
1017	while (lo < hi)
1018	{
1019	/* Invariant:
1020	   for i < lo, strcmp (table[i].alias, codeset) < 0,
1021	   for i >= hi, strcmp (table[i].alias, codeset) > 0. */
1022	size_t mid = (hi + lo) >> `1`; / >= lo, < hi /
1023	int cmp = strcmp (table[mid].alias, codeset);
1024	if (cmp < `0`)
1025	lo = mid + `1`;
1026	else if (cmp > `0`)
1027	hi = mid;
1028	else
1029	{
1030	/* Found an i with
1031	   strcmp (table[i].alias, codeset) == 0. */
1032	codeset = table[mid].canonical;
1033	goto done_table_lookup;
1034	}
1035	}
1036	}
1037	if (`0`)
1038	done_table_lookup: ;
1039	else
1040	# endif
1041	{
1042	/* Did not find it in the table. */
1043	/* On Mac OS X, all modern locales use the UTF-8 encoding.
1044	   BeOS and Haiku have a single locale, and it has UTF-8 encoding. */
1045	# if (defined __APPLE__ && defined __MACH__) \|\| defined __BEOS__ \|\| defined __HAIKU__
1046	codeset = "UTF-8";
1047	# else
1048	/* Don't return an empty string. GNU libc and GNU libiconv interpret
1049	   the empty string as denoting "the locale's character encoding",
1050	   thus GNU libiconv would call this function a second time. */
1051	if (codeset[`0`] == `'\0'`)
1052	codeset = "ASCII";
1053	# endif
1054	}
1055	}
1056
1057	#else
1058
1059	/* On old systems which lack it, use setlocale or getenv. */
1060	const char *locale = NULL;
1061
1062	/* But most old systems don't have a complete set of locales. Some
1063	   (like DJGPP) have only the C locale. Therefore we don't use setlocale
1064	   here; it would return "C" when it doesn't support the locale name the
1065	   user has set. */
1066	# if 0
1067	locale = setlocale (LC_CTYPE, NULL);
1068	# endif
1069	if (locale == NULL \|\| locale[`0`] == `'\0'`)
1070	{
1071	locale = getenv ("LC_ALL");
1072	if (locale == NULL \|\| locale[`0`] == `'\0'`)
1073	{
1074	locale = getenv ("LC_CTYPE");
1075	if (locale == NULL \|\| locale[`0`] == `'\0'`)
1076	locale = getenv ("LANG");
1077	if (locale == NULL)
1078	locale = "";
1079	}
1080	}
1081
1082	/* Map locale name to canonical encoding name. */
1083	{
1084	# ifdef locale_table_defined
1085	const struct table_entry * const table = locale_table;
1086	size_t const table_size =
1087	sizeof (locale_table) / sizeof (struct table_entry);
1088	/* The table is sorted. Perform a binary search. */
1089	size_t hi = table_size;
1090	size_t lo = `0`;
1091	while (lo < hi)
1092	{
1093	/* Invariant:
1094	   for i < lo, strcmp (table[i].locale, locale) < 0,
1095	   for i >= hi, strcmp (table[i].locale, locale) > 0. */
1096	size_t mid = (hi + lo) >> `1`; / >= lo, < hi /
1097	int cmp = strcmp (table[mid].locale, locale);
1098	if (cmp < `0`)
1099	lo = mid + `1`;
1100	else if (cmp > `0`)
1101	hi = mid;
1102	else
1103	{
1104	/* Found an i with
1105	   strcmp (table[i].locale, locale) == 0. */
1106	codeset = table[mid].canonical;
1107	goto done_table_lookup;
1108	}
1109	}
1110	if (`0`)
1111	done_table_lookup: ;
1112	else
1113	# endif
1114	{
1115	/* Did not find it in the table. */
1116	/* On Mac OS X, all modern locales use the UTF-8 encoding.
1117	   BeOS and Haiku have a single locale, and it has UTF-8 encoding. */
1118	# if (defined __APPLE__ && defined __MACH__) \|\| defined __BEOS__ \|\| defined __HAIKU__
1119	codeset = "UTF-8";
1120	# else
1121	/* The canonical name cannot be determined. */
1122	/* Don't return an empty string. GNU libc and GNU libiconv interpret
1123	   the empty string as denoting "the locale's character encoding",
1124	   thus GNU libiconv would call this function a second time. */
1125	codeset = "ASCII";
1126	# endif
1127	}
1128	}
1129
1130	#endif
1131
1132	#ifdef DARWIN7
1133	/* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
1134	   (the default codeset) does not work when MB_CUR_MAX is 1. */
1135	if (strcmp (codeset, "UTF-8") == `0` && MB_CUR_MAX_L (uselocale (NULL)) <= `1`)
1136	codeset = "ASCII";
1137	#endif
1138
1139	return codeset;
1140	}
1141

Browse the source code of bison/lib/localcharset.c