| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * chklocale.c |
| 4 | * Functions for handling locale-related info |
| 5 | * |
| 6 | * |
| 7 | * Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 8 | * |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/port/chklocale.c |
| 12 | * |
| 13 | *------------------------------------------------------------------------- |
| 14 | */ |
| 15 | |
| 16 | #ifndef FRONTEND |
| 17 | #include "postgres.h" |
| 18 | #else |
| 19 | #include "postgres_fe.h" |
| 20 | #endif |
| 21 | |
| 22 | #ifdef HAVE_LANGINFO_H |
| 23 | #include <langinfo.h> |
| 24 | #endif |
| 25 | |
| 26 | #include "mb/pg_wchar.h" |
| 27 | |
| 28 | |
| 29 | /* |
| 30 | * This table needs to recognize all the CODESET spellings for supported |
| 31 | * backend encodings, as well as frontend-only encodings where possible |
| 32 | * (the latter case is currently only needed for initdb to recognize |
| 33 | * error situations). On Windows, we rely on entries for codepage |
| 34 | * numbers (CPnnn). |
| 35 | * |
| 36 | * Note that we search the table with pg_strcasecmp(), so variant |
| 37 | * capitalizations don't need their own entries. |
| 38 | */ |
| 39 | struct encoding_match |
| 40 | { |
| 41 | enum pg_enc pg_enc_code; |
| 42 | const char *system_enc_name; |
| 43 | }; |
| 44 | |
| 45 | static const struct encoding_match encoding_match_list[] = { |
| 46 | {PG_EUC_JP, "EUC-JP" }, |
| 47 | {PG_EUC_JP, "eucJP" }, |
| 48 | {PG_EUC_JP, "IBM-eucJP" }, |
| 49 | {PG_EUC_JP, "sdeckanji" }, |
| 50 | {PG_EUC_JP, "CP20932" }, |
| 51 | |
| 52 | {PG_EUC_CN, "EUC-CN" }, |
| 53 | {PG_EUC_CN, "eucCN" }, |
| 54 | {PG_EUC_CN, "IBM-eucCN" }, |
| 55 | {PG_EUC_CN, "GB2312" }, |
| 56 | {PG_EUC_CN, "dechanzi" }, |
| 57 | {PG_EUC_CN, "CP20936" }, |
| 58 | |
| 59 | {PG_EUC_KR, "EUC-KR" }, |
| 60 | {PG_EUC_KR, "eucKR" }, |
| 61 | {PG_EUC_KR, "IBM-eucKR" }, |
| 62 | {PG_EUC_KR, "deckorean" }, |
| 63 | {PG_EUC_KR, "5601" }, |
| 64 | {PG_EUC_KR, "CP51949" }, |
| 65 | |
| 66 | {PG_EUC_TW, "EUC-TW" }, |
| 67 | {PG_EUC_TW, "eucTW" }, |
| 68 | {PG_EUC_TW, "IBM-eucTW" }, |
| 69 | {PG_EUC_TW, "cns11643" }, |
| 70 | /* No codepage for EUC-TW ? */ |
| 71 | |
| 72 | {PG_UTF8, "UTF-8" }, |
| 73 | {PG_UTF8, "utf8" }, |
| 74 | {PG_UTF8, "CP65001" }, |
| 75 | |
| 76 | {PG_LATIN1, "ISO-8859-1" }, |
| 77 | {PG_LATIN1, "ISO8859-1" }, |
| 78 | {PG_LATIN1, "iso88591" }, |
| 79 | {PG_LATIN1, "CP28591" }, |
| 80 | |
| 81 | {PG_LATIN2, "ISO-8859-2" }, |
| 82 | {PG_LATIN2, "ISO8859-2" }, |
| 83 | {PG_LATIN2, "iso88592" }, |
| 84 | {PG_LATIN2, "CP28592" }, |
| 85 | |
| 86 | {PG_LATIN3, "ISO-8859-3" }, |
| 87 | {PG_LATIN3, "ISO8859-3" }, |
| 88 | {PG_LATIN3, "iso88593" }, |
| 89 | {PG_LATIN3, "CP28593" }, |
| 90 | |
| 91 | {PG_LATIN4, "ISO-8859-4" }, |
| 92 | {PG_LATIN4, "ISO8859-4" }, |
| 93 | {PG_LATIN4, "iso88594" }, |
| 94 | {PG_LATIN4, "CP28594" }, |
| 95 | |
| 96 | {PG_LATIN5, "ISO-8859-9" }, |
| 97 | {PG_LATIN5, "ISO8859-9" }, |
| 98 | {PG_LATIN5, "iso88599" }, |
| 99 | {PG_LATIN5, "CP28599" }, |
| 100 | |
| 101 | {PG_LATIN6, "ISO-8859-10" }, |
| 102 | {PG_LATIN6, "ISO8859-10" }, |
| 103 | {PG_LATIN6, "iso885910" }, |
| 104 | |
| 105 | {PG_LATIN7, "ISO-8859-13" }, |
| 106 | {PG_LATIN7, "ISO8859-13" }, |
| 107 | {PG_LATIN7, "iso885913" }, |
| 108 | |
| 109 | {PG_LATIN8, "ISO-8859-14" }, |
| 110 | {PG_LATIN8, "ISO8859-14" }, |
| 111 | {PG_LATIN8, "iso885914" }, |
| 112 | |
| 113 | {PG_LATIN9, "ISO-8859-15" }, |
| 114 | {PG_LATIN9, "ISO8859-15" }, |
| 115 | {PG_LATIN9, "iso885915" }, |
| 116 | {PG_LATIN9, "CP28605" }, |
| 117 | |
| 118 | {PG_LATIN10, "ISO-8859-16" }, |
| 119 | {PG_LATIN10, "ISO8859-16" }, |
| 120 | {PG_LATIN10, "iso885916" }, |
| 121 | |
| 122 | {PG_KOI8R, "KOI8-R" }, |
| 123 | {PG_KOI8R, "CP20866" }, |
| 124 | |
| 125 | {PG_KOI8U, "KOI8-U" }, |
| 126 | {PG_KOI8U, "CP21866" }, |
| 127 | |
| 128 | {PG_WIN866, "CP866" }, |
| 129 | {PG_WIN874, "CP874" }, |
| 130 | {PG_WIN1250, "CP1250" }, |
| 131 | {PG_WIN1251, "CP1251" }, |
| 132 | {PG_WIN1251, "ansi-1251" }, |
| 133 | {PG_WIN1252, "CP1252" }, |
| 134 | {PG_WIN1253, "CP1253" }, |
| 135 | {PG_WIN1254, "CP1254" }, |
| 136 | {PG_WIN1255, "CP1255" }, |
| 137 | {PG_WIN1256, "CP1256" }, |
| 138 | {PG_WIN1257, "CP1257" }, |
| 139 | {PG_WIN1258, "CP1258" }, |
| 140 | |
| 141 | {PG_ISO_8859_5, "ISO-8859-5" }, |
| 142 | {PG_ISO_8859_5, "ISO8859-5" }, |
| 143 | {PG_ISO_8859_5, "iso88595" }, |
| 144 | {PG_ISO_8859_5, "CP28595" }, |
| 145 | |
| 146 | {PG_ISO_8859_6, "ISO-8859-6" }, |
| 147 | {PG_ISO_8859_6, "ISO8859-6" }, |
| 148 | {PG_ISO_8859_6, "iso88596" }, |
| 149 | {PG_ISO_8859_6, "CP28596" }, |
| 150 | |
| 151 | {PG_ISO_8859_7, "ISO-8859-7" }, |
| 152 | {PG_ISO_8859_7, "ISO8859-7" }, |
| 153 | {PG_ISO_8859_7, "iso88597" }, |
| 154 | {PG_ISO_8859_7, "CP28597" }, |
| 155 | |
| 156 | {PG_ISO_8859_8, "ISO-8859-8" }, |
| 157 | {PG_ISO_8859_8, "ISO8859-8" }, |
| 158 | {PG_ISO_8859_8, "iso88598" }, |
| 159 | {PG_ISO_8859_8, "CP28598" }, |
| 160 | |
| 161 | {PG_SJIS, "SJIS" }, |
| 162 | {PG_SJIS, "PCK" }, |
| 163 | {PG_SJIS, "CP932" }, |
| 164 | {PG_SJIS, "SHIFT_JIS" }, |
| 165 | |
| 166 | {PG_BIG5, "BIG5" }, |
| 167 | {PG_BIG5, "BIG5HKSCS" }, |
| 168 | {PG_BIG5, "Big5-HKSCS" }, |
| 169 | {PG_BIG5, "CP950" }, |
| 170 | |
| 171 | {PG_GBK, "GBK" }, |
| 172 | {PG_GBK, "CP936" }, |
| 173 | |
| 174 | {PG_UHC, "UHC" }, |
| 175 | {PG_UHC, "CP949" }, |
| 176 | |
| 177 | {PG_JOHAB, "JOHAB" }, |
| 178 | {PG_JOHAB, "CP1361" }, |
| 179 | |
| 180 | {PG_GB18030, "GB18030" }, |
| 181 | {PG_GB18030, "CP54936" }, |
| 182 | |
| 183 | {PG_SHIFT_JIS_2004, "SJIS_2004" }, |
| 184 | |
| 185 | {PG_SQL_ASCII, "US-ASCII" }, |
| 186 | |
| 187 | {PG_SQL_ASCII, NULL} /* end marker */ |
| 188 | }; |
| 189 | |
| 190 | #ifdef WIN32 |
/*
 * On Windows, use CP<code page number> instead of the nl_langinfo() result
 *
 * Given the LC_CTYPE locale name "ctype", produce a codeset name that can be
 * looked up in encoding_match_list[].  Which strategy is compiled in depends
 * on the compiler:
 *
 * Visual Studio 2012 and 2013 (_MSC_VER 1700 to 1899): Visual Studio 2012
 * expanded the set of valid LC_CTYPE values, so have its locale machinery
 * determine the code page via _create_locale().  See comments at
 * IsoLocaleName().
 *
 * Visual Studio 2015 and later (_MSC_VER >= 1900): the declaration of
 * lc_codepage is missing from _locale_t, so the approach above does not
 * compile.  Instead GetLocaleInfoEx() is asked for the locale's default ANSI
 * code page, which lets short locale names like "de-DE" or "fr-FR" be
 * handled.  If the reported code page is CP_ACP, the locale has no ANSI code
 * page and only Unicode can be used, so "utf8" is returned.
 *
 * Any other compiler, or when GetLocaleInfoEx() fails: parse the name by
 * hand, using <Language>_<Country>.<CodePage> as a base.  If what follows
 * the last dot is not purely digits it cannot be a code page number (the
 * name might be a Unix-style one such as "en_US.UTF-8"); in that case the
 * suffix is returned unchanged in the hope that the table knows it.
 *
 * Returns a malloc()'d string for the caller to free, or NULL if memory
 * could not be allocated or no codeset could be derived from the name.
 */
static char *
win32_langinfo(const char *ctype)
{
	char	   *r = NULL;

#if (_MSC_VER >= 1700) && (_MSC_VER < 1900)
	_locale_t	loct = NULL;

	loct = _create_locale(LC_CTYPE, ctype);
	if (loct != NULL)
	{
		r = malloc(16);			/* excess */
		if (r != NULL)
			sprintf(r, "CP%u", loct->locinfo->lc_codepage);
		_free_locale(loct);
	}
#else
	const char *codepage;

#if (_MSC_VER >= 1900)
	uint32		cp;
	WCHAR		wctype[LOCALE_NAME_MAX_LENGTH];

	memset(wctype, 0, sizeof(wctype));
	MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);

	if (GetLocaleInfoEx(wctype,
						LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
						(LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
	{
		r = malloc(16);			/* excess */
		if (r != NULL)
		{
			/*
			 * CP_ACP here means that no ANSI code page is available for the
			 * locale, so only Unicode can be used with it.  "CP0" would
			 * never match the table; report UTF-8 instead.
			 */
			if (cp == CP_ACP)
				strcpy(r, "utf8");
			else
				sprintf(r, "CP%u", cp);
		}
	}
	else
#endif
	{
		/*
		 * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
		 * example, English_United States.1252.  Only an all-digit suffix is
		 * treated as a code page number; anything else is passed through
		 * as-is so that names like "UTF-8" can still be matched.
		 */
		codepage = strrchr(ctype, '.');
		if (codepage != NULL)
		{
			size_t		ln;

			codepage++;
			ln = strlen(codepage);
			r = malloc(ln + 3);	/* room for "CP" prefix and terminator */
			if (r != NULL)
			{
				if (strspn(codepage, "0123456789") == ln)
					sprintf(r, "CP%s", codepage);
				else
					strcpy(r, codepage);
			}
		}
	}
#endif

	return r;
}
| 268 | |
| 269 | #ifndef FRONTEND |
| 270 | /* |
| 271 | * Given a Windows code page identifier, find the corresponding PostgreSQL |
| 272 | * encoding. Issue a warning and return -1 if none found. |
| 273 | */ |
| 274 | int |
| 275 | pg_codepage_to_encoding(UINT cp) |
| 276 | { |
| 277 | char sys[16]; |
| 278 | int i; |
| 279 | |
| 280 | sprintf(sys, "CP%u" , cp); |
| 281 | |
| 282 | /* Check the table */ |
| 283 | for (i = 0; encoding_match_list[i].system_enc_name; i++) |
| 284 | if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) |
| 285 | return encoding_match_list[i].pg_enc_code; |
| 286 | |
| 287 | ereport(WARNING, |
| 288 | (errmsg("could not determine encoding for codeset \"%s\"" , sys))); |
| 289 | |
| 290 | return -1; |
| 291 | } |
| 292 | #endif |
| 293 | #endif /* WIN32 */ |
| 294 | |
| 295 | #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32) |
| 296 | |
| 297 | /* |
| 298 | * Given a setting for LC_CTYPE, return the Postgres ID of the associated |
| 299 | * encoding, if we can determine it. Return -1 if we can't determine it. |
| 300 | * |
| 301 | * Pass in NULL to get the encoding for the current locale setting. |
| 302 | * Pass "" to get the encoding selected by the server's environment. |
| 303 | * |
| 304 | * If the result is PG_SQL_ASCII, callers should treat it as being compatible |
| 305 | * with any desired encoding. |
| 306 | * |
| 307 | * If running in the backend and write_message is false, this function must |
| 308 | * cope with the possibility that elog() and palloc() are not yet usable. |
| 309 | */ |
| 310 | int |
| 311 | pg_get_encoding_from_locale(const char *ctype, bool write_message) |
| 312 | { |
| 313 | char *sys; |
| 314 | int i; |
| 315 | |
| 316 | /* Get the CODESET property, and also LC_CTYPE if not passed in */ |
| 317 | if (ctype) |
| 318 | { |
| 319 | char *save; |
| 320 | char *name; |
| 321 | |
| 322 | /* If locale is C or POSIX, we can allow all encodings */ |
| 323 | if (pg_strcasecmp(ctype, "C" ) == 0 || |
| 324 | pg_strcasecmp(ctype, "POSIX" ) == 0) |
| 325 | return PG_SQL_ASCII; |
| 326 | |
| 327 | save = setlocale(LC_CTYPE, NULL); |
| 328 | if (!save) |
| 329 | return -1; /* setlocale() broken? */ |
| 330 | /* must copy result, or it might change after setlocale */ |
| 331 | save = strdup(save); |
| 332 | if (!save) |
| 333 | return -1; /* out of memory; unlikely */ |
| 334 | |
| 335 | name = setlocale(LC_CTYPE, ctype); |
| 336 | if (!name) |
| 337 | { |
| 338 | free(save); |
| 339 | return -1; /* bogus ctype passed in? */ |
| 340 | } |
| 341 | |
| 342 | #ifndef WIN32 |
| 343 | sys = nl_langinfo(CODESET); |
| 344 | if (sys) |
| 345 | sys = strdup(sys); |
| 346 | #else |
| 347 | sys = win32_langinfo(name); |
| 348 | #endif |
| 349 | |
| 350 | setlocale(LC_CTYPE, save); |
| 351 | free(save); |
| 352 | } |
| 353 | else |
| 354 | { |
| 355 | /* much easier... */ |
| 356 | ctype = setlocale(LC_CTYPE, NULL); |
| 357 | if (!ctype) |
| 358 | return -1; /* setlocale() broken? */ |
| 359 | |
| 360 | /* If locale is C or POSIX, we can allow all encodings */ |
| 361 | if (pg_strcasecmp(ctype, "C" ) == 0 || |
| 362 | pg_strcasecmp(ctype, "POSIX" ) == 0) |
| 363 | return PG_SQL_ASCII; |
| 364 | |
| 365 | #ifndef WIN32 |
| 366 | sys = nl_langinfo(CODESET); |
| 367 | if (sys) |
| 368 | sys = strdup(sys); |
| 369 | #else |
| 370 | sys = win32_langinfo(ctype); |
| 371 | #endif |
| 372 | } |
| 373 | |
| 374 | if (!sys) |
| 375 | return -1; /* out of memory; unlikely */ |
| 376 | |
| 377 | /* Check the table */ |
| 378 | for (i = 0; encoding_match_list[i].system_enc_name; i++) |
| 379 | { |
| 380 | if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0) |
| 381 | { |
| 382 | free(sys); |
| 383 | return encoding_match_list[i].pg_enc_code; |
| 384 | } |
| 385 | } |
| 386 | |
| 387 | /* Special-case kluges for particular platforms go here */ |
| 388 | |
| 389 | #ifdef __darwin__ |
| 390 | |
| 391 | /* |
| 392 | * Current macOS has many locales that report an empty string for CODESET, |
| 393 | * but they all seem to actually use UTF-8. |
| 394 | */ |
| 395 | if (strlen(sys) == 0) |
| 396 | { |
| 397 | free(sys); |
| 398 | return PG_UTF8; |
| 399 | } |
| 400 | #endif |
| 401 | |
| 402 | /* |
| 403 | * We print a warning if we got a CODESET string but couldn't recognize |
| 404 | * it. This means we need another entry in the table. |
| 405 | */ |
| 406 | if (write_message) |
| 407 | { |
| 408 | #ifdef FRONTEND |
| 409 | fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\"" ), |
| 410 | ctype, sys); |
| 411 | /* keep newline separate so there's only one translatable string */ |
| 412 | fputc('\n', stderr); |
| 413 | #else |
| 414 | ereport(WARNING, |
| 415 | (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"" , |
| 416 | ctype, sys))); |
| 417 | #endif |
| 418 | } |
| 419 | |
| 420 | free(sys); |
| 421 | return -1; |
| 422 | } |
| 423 | #else /* (HAVE_LANGINFO_H && CODESET) || WIN32 */ |
| 424 | |
| 425 | /* |
| 426 | * stub if no multi-language platform support |
| 427 | * |
| 428 | * Note: we could return -1 here, but that would have the effect of |
| 429 | * forcing users to specify an encoding to initdb on such platforms. |
| 430 | * It seems better to silently default to SQL_ASCII. |
| 431 | */ |
| 432 | int |
| 433 | pg_get_encoding_from_locale(const char *ctype, bool write_message) |
| 434 | { |
| 435 | return PG_SQL_ASCII; |
| 436 | } |
| 437 | |
| 438 | #endif /* (HAVE_LANGINFO_H && CODESET) || WIN32 */ |
| 439 | |