1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * chklocale.c |
4 | * Functions for handling locale-related info |
5 | * |
6 | * |
7 | * Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/port/chklocale.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | |
16 | #ifndef FRONTEND |
17 | #include "postgres.h" |
18 | #else |
19 | #include "postgres_fe.h" |
20 | #endif |
21 | |
22 | #ifdef HAVE_LANGINFO_H |
23 | #include <langinfo.h> |
24 | #endif |
25 | |
26 | #include "mb/pg_wchar.h" |
27 | |
28 | |
29 | /* |
30 | * This table needs to recognize all the CODESET spellings for supported |
31 | * backend encodings, as well as frontend-only encodings where possible |
32 | * (the latter case is currently only needed for initdb to recognize |
33 | * error situations). On Windows, we rely on entries for codepage |
34 | * numbers (CPnnn). |
35 | * |
36 | * Note that we search the table with pg_strcasecmp(), so variant |
37 | * capitalizations don't need their own entries. |
38 | */ |
39 | struct encoding_match |
40 | { |
41 | enum pg_enc pg_enc_code; |
42 | const char *system_enc_name; |
43 | }; |
44 | |
45 | static const struct encoding_match encoding_match_list[] = { |
46 | {PG_EUC_JP, "EUC-JP" }, |
47 | {PG_EUC_JP, "eucJP" }, |
48 | {PG_EUC_JP, "IBM-eucJP" }, |
49 | {PG_EUC_JP, "sdeckanji" }, |
50 | {PG_EUC_JP, "CP20932" }, |
51 | |
52 | {PG_EUC_CN, "EUC-CN" }, |
53 | {PG_EUC_CN, "eucCN" }, |
54 | {PG_EUC_CN, "IBM-eucCN" }, |
55 | {PG_EUC_CN, "GB2312" }, |
56 | {PG_EUC_CN, "dechanzi" }, |
57 | {PG_EUC_CN, "CP20936" }, |
58 | |
59 | {PG_EUC_KR, "EUC-KR" }, |
60 | {PG_EUC_KR, "eucKR" }, |
61 | {PG_EUC_KR, "IBM-eucKR" }, |
62 | {PG_EUC_KR, "deckorean" }, |
63 | {PG_EUC_KR, "5601" }, |
64 | {PG_EUC_KR, "CP51949" }, |
65 | |
66 | {PG_EUC_TW, "EUC-TW" }, |
67 | {PG_EUC_TW, "eucTW" }, |
68 | {PG_EUC_TW, "IBM-eucTW" }, |
69 | {PG_EUC_TW, "cns11643" }, |
70 | /* No codepage for EUC-TW ? */ |
71 | |
72 | {PG_UTF8, "UTF-8" }, |
73 | {PG_UTF8, "utf8" }, |
74 | {PG_UTF8, "CP65001" }, |
75 | |
76 | {PG_LATIN1, "ISO-8859-1" }, |
77 | {PG_LATIN1, "ISO8859-1" }, |
78 | {PG_LATIN1, "iso88591" }, |
79 | {PG_LATIN1, "CP28591" }, |
80 | |
81 | {PG_LATIN2, "ISO-8859-2" }, |
82 | {PG_LATIN2, "ISO8859-2" }, |
83 | {PG_LATIN2, "iso88592" }, |
84 | {PG_LATIN2, "CP28592" }, |
85 | |
86 | {PG_LATIN3, "ISO-8859-3" }, |
87 | {PG_LATIN3, "ISO8859-3" }, |
88 | {PG_LATIN3, "iso88593" }, |
89 | {PG_LATIN3, "CP28593" }, |
90 | |
91 | {PG_LATIN4, "ISO-8859-4" }, |
92 | {PG_LATIN4, "ISO8859-4" }, |
93 | {PG_LATIN4, "iso88594" }, |
94 | {PG_LATIN4, "CP28594" }, |
95 | |
96 | {PG_LATIN5, "ISO-8859-9" }, |
97 | {PG_LATIN5, "ISO8859-9" }, |
98 | {PG_LATIN5, "iso88599" }, |
99 | {PG_LATIN5, "CP28599" }, |
100 | |
101 | {PG_LATIN6, "ISO-8859-10" }, |
102 | {PG_LATIN6, "ISO8859-10" }, |
103 | {PG_LATIN6, "iso885910" }, |
104 | |
105 | {PG_LATIN7, "ISO-8859-13" }, |
106 | {PG_LATIN7, "ISO8859-13" }, |
107 | {PG_LATIN7, "iso885913" }, |
108 | |
109 | {PG_LATIN8, "ISO-8859-14" }, |
110 | {PG_LATIN8, "ISO8859-14" }, |
111 | {PG_LATIN8, "iso885914" }, |
112 | |
113 | {PG_LATIN9, "ISO-8859-15" }, |
114 | {PG_LATIN9, "ISO8859-15" }, |
115 | {PG_LATIN9, "iso885915" }, |
116 | {PG_LATIN9, "CP28605" }, |
117 | |
118 | {PG_LATIN10, "ISO-8859-16" }, |
119 | {PG_LATIN10, "ISO8859-16" }, |
120 | {PG_LATIN10, "iso885916" }, |
121 | |
122 | {PG_KOI8R, "KOI8-R" }, |
123 | {PG_KOI8R, "CP20866" }, |
124 | |
125 | {PG_KOI8U, "KOI8-U" }, |
126 | {PG_KOI8U, "CP21866" }, |
127 | |
128 | {PG_WIN866, "CP866" }, |
129 | {PG_WIN874, "CP874" }, |
130 | {PG_WIN1250, "CP1250" }, |
131 | {PG_WIN1251, "CP1251" }, |
132 | {PG_WIN1251, "ansi-1251" }, |
133 | {PG_WIN1252, "CP1252" }, |
134 | {PG_WIN1253, "CP1253" }, |
135 | {PG_WIN1254, "CP1254" }, |
136 | {PG_WIN1255, "CP1255" }, |
137 | {PG_WIN1256, "CP1256" }, |
138 | {PG_WIN1257, "CP1257" }, |
139 | {PG_WIN1258, "CP1258" }, |
140 | |
141 | {PG_ISO_8859_5, "ISO-8859-5" }, |
142 | {PG_ISO_8859_5, "ISO8859-5" }, |
143 | {PG_ISO_8859_5, "iso88595" }, |
144 | {PG_ISO_8859_5, "CP28595" }, |
145 | |
146 | {PG_ISO_8859_6, "ISO-8859-6" }, |
147 | {PG_ISO_8859_6, "ISO8859-6" }, |
148 | {PG_ISO_8859_6, "iso88596" }, |
149 | {PG_ISO_8859_6, "CP28596" }, |
150 | |
151 | {PG_ISO_8859_7, "ISO-8859-7" }, |
152 | {PG_ISO_8859_7, "ISO8859-7" }, |
153 | {PG_ISO_8859_7, "iso88597" }, |
154 | {PG_ISO_8859_7, "CP28597" }, |
155 | |
156 | {PG_ISO_8859_8, "ISO-8859-8" }, |
157 | {PG_ISO_8859_8, "ISO8859-8" }, |
158 | {PG_ISO_8859_8, "iso88598" }, |
159 | {PG_ISO_8859_8, "CP28598" }, |
160 | |
161 | {PG_SJIS, "SJIS" }, |
162 | {PG_SJIS, "PCK" }, |
163 | {PG_SJIS, "CP932" }, |
164 | {PG_SJIS, "SHIFT_JIS" }, |
165 | |
166 | {PG_BIG5, "BIG5" }, |
167 | {PG_BIG5, "BIG5HKSCS" }, |
168 | {PG_BIG5, "Big5-HKSCS" }, |
169 | {PG_BIG5, "CP950" }, |
170 | |
171 | {PG_GBK, "GBK" }, |
172 | {PG_GBK, "CP936" }, |
173 | |
174 | {PG_UHC, "UHC" }, |
175 | {PG_UHC, "CP949" }, |
176 | |
177 | {PG_JOHAB, "JOHAB" }, |
178 | {PG_JOHAB, "CP1361" }, |
179 | |
180 | {PG_GB18030, "GB18030" }, |
181 | {PG_GB18030, "CP54936" }, |
182 | |
183 | {PG_SHIFT_JIS_2004, "SJIS_2004" }, |
184 | |
185 | {PG_SQL_ASCII, "US-ASCII" }, |
186 | |
187 | {PG_SQL_ASCII, NULL} /* end marker */ |
188 | }; |
189 | |
190 | #ifdef WIN32 |
/*
 * On Windows, use CP<code page number> instead of the nl_langinfo() result
 *
 * Visual Studio 2012 expanded the set of valid LC_CTYPE values, so have its
 * locale machinery determine the code page.  See comments at IsoLocaleName().
 * For other compilers, follow the locale's predictable format.
 *
 * Visual Studio 2015 should still be able to do the same, but the declaration
 * of lc_codepage is missing in _locale_t, causing this code compilation to
 * fail, hence this falls back instead on GetLocaleInfoEx.  VS 2015 may be an
 * exception and post-VS2015 versions should be able to handle properly the
 * codepage number using _create_locale().  So, instead of the same logic as
 * VS 2012 and VS 2013, this routine uses GetLocaleInfoEx to parse short
 * locale names like "de-DE", "fr-FR", etc.  If the name cannot be converted
 * or looked up that way, we fall back to the pre-VS-2010 approach of parsing
 * the <Language>_<Country>.<CodePage> format by hand.
 *
 * When GetLocaleInfoEx reports CP_ACP, the locale has no ANSI code page and
 * can only be used with Unicode, so "utf8" is returned.  In the manual
 * parsing path, the text after the last dot gets the "CP" prefix only if it
 * consists entirely of digits; otherwise it is returned unchanged, so that a
 * Unix-style name such as "en_US.UTF-8" yields "UTF-8".
 *
 * Returns a malloc()'d string for the caller to free, or NULL if no codeset
 * could be derived or memory ran out.
 */
static char *
win32_langinfo(const char *ctype)
{
	char	   *r = NULL;

#if (_MSC_VER >= 1700) && (_MSC_VER < 1900)
	_locale_t	loct = NULL;

	loct = _create_locale(LC_CTYPE, ctype);
	if (loct != NULL)
	{
		r = malloc(16);			/* excess */
		if (r != NULL)
			sprintf(r, "CP%u", loct->locinfo->lc_codepage);
		_free_locale(loct);
	}
#else
	const char *codepage;

#if (_MSC_VER >= 1900)
	uint32		cp;
	WCHAR		wctype[LOCALE_NAME_MAX_LENGTH];

	memset(wctype, 0, sizeof(wctype));

	/*
	 * Only consult GetLocaleInfoEx() if the name was converted completely;
	 * a failed conversion (e.g. name too long) can leave wctype truncated
	 * and unterminated.  In that case use the manual parsing below.
	 */
	if (MultiByteToWideChar(CP_ACP, 0, ctype, -1,
							wctype, LOCALE_NAME_MAX_LENGTH) > 0 &&
		GetLocaleInfoEx(wctype,
						LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
						(LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
	{
		r = malloc(16);			/* excess */
		if (r != NULL)
		{
			/*
			 * CP_ACP here means no ANSI code page is available, so only
			 * Unicode can be used for the locale.
			 */
			if (cp == CP_ACP)
				strcpy(r, "utf8");
			else
				sprintf(r, "CP%u", cp);
		}
	}
	else
#endif
	{
		/*
		 * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
		 * example, English_United States.1252.  If only digits follow the
		 * last dot, treat them as a code page number.  Otherwise this may be
		 * a Unix-style locale string, so return what follows the dot as-is
		 * and let the caller try to match it.
		 */
		codepage = strrchr(ctype, '.');
		if (codepage != NULL)
		{
			size_t		ln;

			codepage++;
			ln = strlen(codepage);
			r = malloc(ln + 3);	/* "CP" + suffix + terminator */
			if (r != NULL)
			{
				if (strspn(codepage, "0123456789") == ln)
					sprintf(r, "CP%s", codepage);
				else
					strcpy(r, codepage);
			}
		}
	}
#endif

	return r;
}
268 | |
269 | #ifndef FRONTEND |
270 | /* |
271 | * Given a Windows code page identifier, find the corresponding PostgreSQL |
272 | * encoding. Issue a warning and return -1 if none found. |
273 | */ |
274 | int |
275 | pg_codepage_to_encoding(UINT cp) |
276 | { |
277 | char sys[16]; |
278 | int i; |
279 | |
280 | sprintf(sys, "CP%u" , cp); |
281 | |
282 | /* Check the table */ |
283 | for (i = 0; encoding_match_list[i].system_enc_name; i++) |
284 | if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) |
285 | return encoding_match_list[i].pg_enc_code; |
286 | |
287 | ereport(WARNING, |
288 | (errmsg("could not determine encoding for codeset \"%s\"" , sys))); |
289 | |
290 | return -1; |
291 | } |
292 | #endif |
293 | #endif /* WIN32 */ |
294 | |
295 | #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32) |
296 | |
297 | /* |
298 | * Given a setting for LC_CTYPE, return the Postgres ID of the associated |
299 | * encoding, if we can determine it. Return -1 if we can't determine it. |
300 | * |
301 | * Pass in NULL to get the encoding for the current locale setting. |
302 | * Pass "" to get the encoding selected by the server's environment. |
303 | * |
304 | * If the result is PG_SQL_ASCII, callers should treat it as being compatible |
305 | * with any desired encoding. |
306 | * |
307 | * If running in the backend and write_message is false, this function must |
308 | * cope with the possibility that elog() and palloc() are not yet usable. |
309 | */ |
310 | int |
311 | pg_get_encoding_from_locale(const char *ctype, bool write_message) |
312 | { |
313 | char *sys; |
314 | int i; |
315 | |
316 | /* Get the CODESET property, and also LC_CTYPE if not passed in */ |
317 | if (ctype) |
318 | { |
319 | char *save; |
320 | char *name; |
321 | |
322 | /* If locale is C or POSIX, we can allow all encodings */ |
323 | if (pg_strcasecmp(ctype, "C" ) == 0 || |
324 | pg_strcasecmp(ctype, "POSIX" ) == 0) |
325 | return PG_SQL_ASCII; |
326 | |
327 | save = setlocale(LC_CTYPE, NULL); |
328 | if (!save) |
329 | return -1; /* setlocale() broken? */ |
330 | /* must copy result, or it might change after setlocale */ |
331 | save = strdup(save); |
332 | if (!save) |
333 | return -1; /* out of memory; unlikely */ |
334 | |
335 | name = setlocale(LC_CTYPE, ctype); |
336 | if (!name) |
337 | { |
338 | free(save); |
339 | return -1; /* bogus ctype passed in? */ |
340 | } |
341 | |
342 | #ifndef WIN32 |
343 | sys = nl_langinfo(CODESET); |
344 | if (sys) |
345 | sys = strdup(sys); |
346 | #else |
347 | sys = win32_langinfo(name); |
348 | #endif |
349 | |
350 | setlocale(LC_CTYPE, save); |
351 | free(save); |
352 | } |
353 | else |
354 | { |
355 | /* much easier... */ |
356 | ctype = setlocale(LC_CTYPE, NULL); |
357 | if (!ctype) |
358 | return -1; /* setlocale() broken? */ |
359 | |
360 | /* If locale is C or POSIX, we can allow all encodings */ |
361 | if (pg_strcasecmp(ctype, "C" ) == 0 || |
362 | pg_strcasecmp(ctype, "POSIX" ) == 0) |
363 | return PG_SQL_ASCII; |
364 | |
365 | #ifndef WIN32 |
366 | sys = nl_langinfo(CODESET); |
367 | if (sys) |
368 | sys = strdup(sys); |
369 | #else |
370 | sys = win32_langinfo(ctype); |
371 | #endif |
372 | } |
373 | |
374 | if (!sys) |
375 | return -1; /* out of memory; unlikely */ |
376 | |
377 | /* Check the table */ |
378 | for (i = 0; encoding_match_list[i].system_enc_name; i++) |
379 | { |
380 | if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) |
381 | { |
382 | free(sys); |
383 | return encoding_match_list[i].pg_enc_code; |
384 | } |
385 | } |
386 | |
387 | /* Special-case kluges for particular platforms go here */ |
388 | |
389 | #ifdef __darwin__ |
390 | |
391 | /* |
392 | * Current macOS has many locales that report an empty string for CODESET, |
393 | * but they all seem to actually use UTF-8. |
394 | */ |
395 | if (strlen(sys) == 0) |
396 | { |
397 | free(sys); |
398 | return PG_UTF8; |
399 | } |
400 | #endif |
401 | |
402 | /* |
403 | * We print a warning if we got a CODESET string but couldn't recognize |
404 | * it. This means we need another entry in the table. |
405 | */ |
406 | if (write_message) |
407 | { |
408 | #ifdef FRONTEND |
409 | fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\"" ), |
410 | ctype, sys); |
411 | /* keep newline separate so there's only one translatable string */ |
412 | fputc('\n', stderr); |
413 | #else |
414 | ereport(WARNING, |
415 | (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"" , |
416 | ctype, sys))); |
417 | #endif |
418 | } |
419 | |
420 | free(sys); |
421 | return -1; |
422 | } |
423 | #else /* (HAVE_LANGINFO_H && CODESET) || WIN32 */ |
424 | |
425 | /* |
426 | * stub if no multi-language platform support |
427 | * |
428 | * Note: we could return -1 here, but that would have the effect of |
429 | * forcing users to specify an encoding to initdb on such platforms. |
430 | * It seems better to silently default to SQL_ASCII. |
431 | */ |
432 | int |
433 | pg_get_encoding_from_locale(const char *ctype, bool write_message) |
434 | { |
435 | return PG_SQL_ASCII; |
436 | } |
437 | |
438 | #endif /* (HAVE_LANGINFO_H && CODESET) || WIN32 */ |
439 | |