/*
 * Encoding names and routines for working with them. Everything
 * in this file is shared between the frontend (FE) and the backend (BE).
 *
 * src/backend/utils/mb/encnames.c
 */
7#ifdef FRONTEND
8#include "postgres_fe.h"
9#else
10#include "postgres.h"
11#include "utils/builtins.h"
12#endif
13
14#include <ctype.h>
15#include <unistd.h>
16
17#include "mb/pg_wchar.h"
18
19
/* ----------
 * All encoding names, sorted: *** A L P H A B E T I C ***
 *
 * Names must not contain irrelevant characters: the search routines keep
 * only isalnum() characters, so ISO-8859-1, iso_8859-1 and Iso8859_1
 * are all converted to 'iso88591'. All names must be lower case.
 *
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
 *
 * Karel Zak, Aug 2001
 * ----------
 */
32typedef struct pg_encname
33{
34 const char *name;
35 pg_enc encoding;
36} pg_encname;
37
38static const pg_encname pg_encname_tbl[] =
39{
40 {
41 "abc", PG_WIN1258
42 }, /* alias for WIN1258 */
43 {
44 "alt", PG_WIN866
45 }, /* IBM866 */
46 {
47 "big5", PG_BIG5
48 }, /* Big5; Chinese for Taiwan multibyte set */
49 {
50 "euccn", PG_EUC_CN
51 }, /* EUC-CN; Extended Unix Code for simplified
52 * Chinese */
53 {
54 "eucjis2004", PG_EUC_JIS_2004
55 }, /* EUC-JIS-2004; Extended UNIX Code fixed
56 * Width for Japanese, standard JIS X 0213 */
57 {
58 "eucjp", PG_EUC_JP
59 }, /* EUC-JP; Extended UNIX Code fixed Width for
60 * Japanese, standard OSF */
61 {
62 "euckr", PG_EUC_KR
63 }, /* EUC-KR; Extended Unix Code for Korean , KS
64 * X 1001 standard */
65 {
66 "euctw", PG_EUC_TW
67 }, /* EUC-TW; Extended Unix Code for
68 *
69 * traditional Chinese */
70 {
71 "gb18030", PG_GB18030
72 }, /* GB18030;GB18030 */
73 {
74 "gbk", PG_GBK
75 }, /* GBK; Chinese Windows CodePage 936
76 * simplified Chinese */
77 {
78 "iso88591", PG_LATIN1
79 }, /* ISO-8859-1; RFC1345,KXS2 */
80 {
81 "iso885910", PG_LATIN6
82 }, /* ISO-8859-10; RFC1345,KXS2 */
83 {
84 "iso885913", PG_LATIN7
85 }, /* ISO-8859-13; RFC1345,KXS2 */
86 {
87 "iso885914", PG_LATIN8
88 }, /* ISO-8859-14; RFC1345,KXS2 */
89 {
90 "iso885915", PG_LATIN9
91 }, /* ISO-8859-15; RFC1345,KXS2 */
92 {
93 "iso885916", PG_LATIN10
94 }, /* ISO-8859-16; RFC1345,KXS2 */
95 {
96 "iso88592", PG_LATIN2
97 }, /* ISO-8859-2; RFC1345,KXS2 */
98 {
99 "iso88593", PG_LATIN3
100 }, /* ISO-8859-3; RFC1345,KXS2 */
101 {
102 "iso88594", PG_LATIN4
103 }, /* ISO-8859-4; RFC1345,KXS2 */
104 {
105 "iso88595", PG_ISO_8859_5
106 }, /* ISO-8859-5; RFC1345,KXS2 */
107 {
108 "iso88596", PG_ISO_8859_6
109 }, /* ISO-8859-6; RFC1345,KXS2 */
110 {
111 "iso88597", PG_ISO_8859_7
112 }, /* ISO-8859-7; RFC1345,KXS2 */
113 {
114 "iso88598", PG_ISO_8859_8
115 }, /* ISO-8859-8; RFC1345,KXS2 */
116 {
117 "iso88599", PG_LATIN5
118 }, /* ISO-8859-9; RFC1345,KXS2 */
119 {
120 "johab", PG_JOHAB
121 }, /* JOHAB; Extended Unix Code for simplified
122 * Chinese */
123 {
124 "koi8", PG_KOI8R
125 }, /* _dirty_ alias for KOI8-R (backward
126 * compatibility) */
127 {
128 "koi8r", PG_KOI8R
129 }, /* KOI8-R; RFC1489 */
130 {
131 "koi8u", PG_KOI8U
132 }, /* KOI8-U; RFC2319 */
133 {
134 "latin1", PG_LATIN1
135 }, /* alias for ISO-8859-1 */
136 {
137 "latin10", PG_LATIN10
138 }, /* alias for ISO-8859-16 */
139 {
140 "latin2", PG_LATIN2
141 }, /* alias for ISO-8859-2 */
142 {
143 "latin3", PG_LATIN3
144 }, /* alias for ISO-8859-3 */
145 {
146 "latin4", PG_LATIN4
147 }, /* alias for ISO-8859-4 */
148 {
149 "latin5", PG_LATIN5
150 }, /* alias for ISO-8859-9 */
151 {
152 "latin6", PG_LATIN6
153 }, /* alias for ISO-8859-10 */
154 {
155 "latin7", PG_LATIN7
156 }, /* alias for ISO-8859-13 */
157 {
158 "latin8", PG_LATIN8
159 }, /* alias for ISO-8859-14 */
160 {
161 "latin9", PG_LATIN9
162 }, /* alias for ISO-8859-15 */
163 {
164 "mskanji", PG_SJIS
165 }, /* alias for Shift_JIS */
166 {
167 "muleinternal", PG_MULE_INTERNAL
168 },
169 {
170 "shiftjis", PG_SJIS
171 }, /* Shift_JIS; JIS X 0202-1991 */
172
173 {
174 "shiftjis2004", PG_SHIFT_JIS_2004
175 }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
176 * standard JIS X 0213 */
177 {
178 "sjis", PG_SJIS
179 }, /* alias for Shift_JIS */
180 {
181 "sqlascii", PG_SQL_ASCII
182 },
183 {
184 "tcvn", PG_WIN1258
185 }, /* alias for WIN1258 */
186 {
187 "tcvn5712", PG_WIN1258
188 }, /* alias for WIN1258 */
189 {
190 "uhc", PG_UHC
191 }, /* UHC; Korean Windows CodePage 949 */
192 {
193 "unicode", PG_UTF8
194 }, /* alias for UTF8 */
195 {
196 "utf8", PG_UTF8
197 }, /* alias for UTF8 */
198 {
199 "vscii", PG_WIN1258
200 }, /* alias for WIN1258 */
201 {
202 "win", PG_WIN1251
203 }, /* _dirty_ alias for windows-1251 (backward
204 * compatibility) */
205 {
206 "win1250", PG_WIN1250
207 }, /* alias for Windows-1250 */
208 {
209 "win1251", PG_WIN1251
210 }, /* alias for Windows-1251 */
211 {
212 "win1252", PG_WIN1252
213 }, /* alias for Windows-1252 */
214 {
215 "win1253", PG_WIN1253
216 }, /* alias for Windows-1253 */
217 {
218 "win1254", PG_WIN1254
219 }, /* alias for Windows-1254 */
220 {
221 "win1255", PG_WIN1255
222 }, /* alias for Windows-1255 */
223 {
224 "win1256", PG_WIN1256
225 }, /* alias for Windows-1256 */
226 {
227 "win1257", PG_WIN1257
228 }, /* alias for Windows-1257 */
229 {
230 "win1258", PG_WIN1258
231 }, /* alias for Windows-1258 */
232 {
233 "win866", PG_WIN866
234 }, /* IBM866 */
235 {
236 "win874", PG_WIN874
237 }, /* alias for Windows-874 */
238 {
239 "win932", PG_SJIS
240 }, /* alias for Shift_JIS */
241 {
242 "win936", PG_GBK
243 }, /* alias for GBK */
244 {
245 "win949", PG_UHC
246 }, /* alias for UHC */
247 {
248 "win950", PG_BIG5
249 }, /* alias for BIG5 */
250 {
251 "windows1250", PG_WIN1250
252 }, /* Windows-1251; Microsoft */
253 {
254 "windows1251", PG_WIN1251
255 }, /* Windows-1251; Microsoft */
256 {
257 "windows1252", PG_WIN1252
258 }, /* Windows-1252; Microsoft */
259 {
260 "windows1253", PG_WIN1253
261 }, /* Windows-1253; Microsoft */
262 {
263 "windows1254", PG_WIN1254
264 }, /* Windows-1254; Microsoft */
265 {
266 "windows1255", PG_WIN1255
267 }, /* Windows-1255; Microsoft */
268 {
269 "windows1256", PG_WIN1256
270 }, /* Windows-1256; Microsoft */
271 {
272 "windows1257", PG_WIN1257
273 }, /* Windows-1257; Microsoft */
274 {
275 "windows1258", PG_WIN1258
276 }, /* Windows-1258; Microsoft */
277 {
278 "windows866", PG_WIN866
279 }, /* IBM866 */
280 {
281 "windows874", PG_WIN874
282 }, /* Windows-874; Microsoft */
283 {
284 "windows932", PG_SJIS
285 }, /* alias for Shift_JIS */
286 {
287 "windows936", PG_GBK
288 }, /* alias for GBK */
289 {
290 "windows949", PG_UHC
291 }, /* alias for UHC */
292 {
293 "windows950", PG_BIG5
294 } /* alias for BIG5 */
295};
296
/* ----------
 * These are the "official" encoding names.
 * XXX entries must be in the same order as enum pg_enc (in mb/pg_wchar.h)
 * ----------
 */
302#ifndef WIN32
303#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
304#else
305#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
306#endif
307const pg_enc2name pg_enc2name_tbl[] =
308{
309 DEF_ENC2NAME(SQL_ASCII, 0),
310 DEF_ENC2NAME(EUC_JP, 20932),
311 DEF_ENC2NAME(EUC_CN, 20936),
312 DEF_ENC2NAME(EUC_KR, 51949),
313 DEF_ENC2NAME(EUC_TW, 0),
314 DEF_ENC2NAME(EUC_JIS_2004, 20932),
315 DEF_ENC2NAME(UTF8, 65001),
316 DEF_ENC2NAME(MULE_INTERNAL, 0),
317 DEF_ENC2NAME(LATIN1, 28591),
318 DEF_ENC2NAME(LATIN2, 28592),
319 DEF_ENC2NAME(LATIN3, 28593),
320 DEF_ENC2NAME(LATIN4, 28594),
321 DEF_ENC2NAME(LATIN5, 28599),
322 DEF_ENC2NAME(LATIN6, 0),
323 DEF_ENC2NAME(LATIN7, 0),
324 DEF_ENC2NAME(LATIN8, 0),
325 DEF_ENC2NAME(LATIN9, 28605),
326 DEF_ENC2NAME(LATIN10, 0),
327 DEF_ENC2NAME(WIN1256, 1256),
328 DEF_ENC2NAME(WIN1258, 1258),
329 DEF_ENC2NAME(WIN866, 866),
330 DEF_ENC2NAME(WIN874, 874),
331 DEF_ENC2NAME(KOI8R, 20866),
332 DEF_ENC2NAME(WIN1251, 1251),
333 DEF_ENC2NAME(WIN1252, 1252),
334 DEF_ENC2NAME(ISO_8859_5, 28595),
335 DEF_ENC2NAME(ISO_8859_6, 28596),
336 DEF_ENC2NAME(ISO_8859_7, 28597),
337 DEF_ENC2NAME(ISO_8859_8, 28598),
338 DEF_ENC2NAME(WIN1250, 1250),
339 DEF_ENC2NAME(WIN1253, 1253),
340 DEF_ENC2NAME(WIN1254, 1254),
341 DEF_ENC2NAME(WIN1255, 1255),
342 DEF_ENC2NAME(WIN1257, 1257),
343 DEF_ENC2NAME(KOI8U, 21866),
344 DEF_ENC2NAME(SJIS, 932),
345 DEF_ENC2NAME(BIG5, 950),
346 DEF_ENC2NAME(GBK, 936),
347 DEF_ENC2NAME(UHC, 949),
348 DEF_ENC2NAME(GB18030, 54936),
349 DEF_ENC2NAME(JOHAB, 0),
350 DEF_ENC2NAME(SHIFT_JIS_2004, 932)
351};
352
353/* ----------
354 * These are encoding names for gettext.
355 *
356 * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
357 * ----------
358 */
359const pg_enc2gettext pg_enc2gettext_tbl[] =
360{
361 {PG_SQL_ASCII, "US-ASCII"},
362 {PG_UTF8, "UTF-8"},
363 {PG_LATIN1, "LATIN1"},
364 {PG_LATIN2, "LATIN2"},
365 {PG_LATIN3, "LATIN3"},
366 {PG_LATIN4, "LATIN4"},
367 {PG_ISO_8859_5, "ISO-8859-5"},
368 {PG_ISO_8859_6, "ISO_8859-6"},
369 {PG_ISO_8859_7, "ISO-8859-7"},
370 {PG_ISO_8859_8, "ISO-8859-8"},
371 {PG_LATIN5, "LATIN5"},
372 {PG_LATIN6, "LATIN6"},
373 {PG_LATIN7, "LATIN7"},
374 {PG_LATIN8, "LATIN8"},
375 {PG_LATIN9, "LATIN-9"},
376 {PG_LATIN10, "LATIN10"},
377 {PG_KOI8R, "KOI8-R"},
378 {PG_KOI8U, "KOI8-U"},
379 {PG_WIN1250, "CP1250"},
380 {PG_WIN1251, "CP1251"},
381 {PG_WIN1252, "CP1252"},
382 {PG_WIN1253, "CP1253"},
383 {PG_WIN1254, "CP1254"},
384 {PG_WIN1255, "CP1255"},
385 {PG_WIN1256, "CP1256"},
386 {PG_WIN1257, "CP1257"},
387 {PG_WIN1258, "CP1258"},
388 {PG_WIN866, "CP866"},
389 {PG_WIN874, "CP874"},
390 {PG_EUC_CN, "EUC-CN"},
391 {PG_EUC_JP, "EUC-JP"},
392 {PG_EUC_KR, "EUC-KR"},
393 {PG_EUC_TW, "EUC-TW"},
394 {PG_EUC_JIS_2004, "EUC-JP"},
395 {PG_SJIS, "SHIFT-JIS"},
396 {PG_BIG5, "BIG5"},
397 {PG_GBK, "GBK"},
398 {PG_UHC, "UHC"},
399 {PG_GB18030, "GB18030"},
400 {PG_JOHAB, "JOHAB"},
401 {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
402 {0, NULL}
403};
404
405
406#ifndef FRONTEND
407
408/*
409 * Table of encoding names for ICU
410 *
411 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412 *
413 * NULL entries are not supported by ICU, or their mapping is unclear.
414 */
/*
 * Indexed by encoding ID; each slot is labelled with the ID it belongs to.
 * The last slot is PG_KOI8U, and get_encoding_name_for_icu() statically
 * asserts that the array has PG_ENCODING_BE_LAST + 1 elements.
 */
static const char *const pg_enc2icu_tbl[] =
{
	/* PG_SQL_ASCII */		NULL,
	/* PG_EUC_JP */			"EUC-JP",
	/* PG_EUC_CN */			"EUC-CN",
	/* PG_EUC_KR */			"EUC-KR",
	/* PG_EUC_TW */			"EUC-TW",
	/* PG_EUC_JIS_2004 */	NULL,
	/* PG_UTF8 */			"UTF-8",
	/* PG_MULE_INTERNAL */	NULL,
	/* PG_LATIN1 */			"ISO-8859-1",
	/* PG_LATIN2 */			"ISO-8859-2",
	/* PG_LATIN3 */			"ISO-8859-3",
	/* PG_LATIN4 */			"ISO-8859-4",
	/* PG_LATIN5 */			"ISO-8859-9",
	/* PG_LATIN6 */			"ISO-8859-10",
	/* PG_LATIN7 */			"ISO-8859-13",
	/* PG_LATIN8 */			"ISO-8859-14",
	/* PG_LATIN9 */			"ISO-8859-15",
	/* PG_LATIN10 */		NULL,
	/* PG_WIN1256 */		"CP1256",
	/* PG_WIN1258 */		"CP1258",
	/* PG_WIN866 */			"CP866",
	/* PG_WIN874 */			NULL,
	/* PG_KOI8R */			"KOI8-R",
	/* PG_WIN1251 */		"CP1251",
	/* PG_WIN1252 */		"CP1252",
	/* PG_ISO_8859_5 */		"ISO-8859-5",
	/* PG_ISO_8859_6 */		"ISO-8859-6",
	/* PG_ISO_8859_7 */		"ISO-8859-7",
	/* PG_ISO_8859_8 */		"ISO-8859-8",
	/* PG_WIN1250 */		"CP1250",
	/* PG_WIN1253 */		"CP1253",
	/* PG_WIN1254 */		"CP1254",
	/* PG_WIN1255 */		"CP1255",
	/* PG_WIN1257 */		"CP1257",
	/* PG_KOI8U */			"KOI8-U",
};
453
454bool
455is_encoding_supported_by_icu(int encoding)
456{
457 return (pg_enc2icu_tbl[encoding] != NULL);
458}
459
460const char *
461get_encoding_name_for_icu(int encoding)
462{
463 const char *icu_encoding_name;
464
465 StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
466 "pg_enc2icu_tbl incomplete");
467
468 icu_encoding_name = pg_enc2icu_tbl[encoding];
469
470 if (!icu_encoding_name)
471 ereport(ERROR,
472 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
473 errmsg("encoding \"%s\" not supported by ICU",
474 pg_encoding_to_char(encoding))));
475
476 return icu_encoding_name;
477}
478
479#endif /* not FRONTEND */
480
481
/* ----------
 * Encoding checks: return the encoding id, or -1 on error
 * ----------
 */
int
pg_valid_client_encoding(const char *name)
{
	int			encoding_id = pg_char_to_encoding(name);

	/* accept only names that are known and usable as a client encoding */
	if (encoding_id >= 0 && PG_VALID_FE_ENCODING(encoding_id))
		return encoding_id;

	return -1;
}
499
int
pg_valid_server_encoding(const char *name)
{
	int			encoding_id = pg_char_to_encoding(name);

	/* accept only names that are known and usable as a server encoding */
	if (encoding_id >= 0 && PG_VALID_BE_ENCODING(encoding_id))
		return encoding_id;

	return -1;
}
513
int
pg_valid_server_encoding_id(int encoding)
{
	/* hand back the result of the range-check macro unchanged */
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
519
/* ----------
 * Remove irrelevant chars from encoding name
 *
 * Copies the isalnum() characters of 'key' into 'newkey', converting the
 * ASCII letters 'A'..'Z' to lower case and dropping everything else, then
 * NUL-terminates the result.  The caller must supply a 'newkey' buffer with
 * room for strlen(key) + 1 bytes.  Returns 'newkey'.
 * ----------
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *src = key;
	char	   *dst = newkey;

	while (*src != '\0')
	{
		char		ch = *src++;

		/* skip punctuation, whitespace and anything else non-alphanumeric */
		if (!isalnum((unsigned char) ch))
			continue;

		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';

		*dst++ = ch;
	}
	*dst = '\0';

	return newkey;
}
543
544/* ----------
545 * Search encoding by encoding name
546 *
547 * Returns encoding ID, or -1 for error
548 * ----------
549 */
550int
551pg_char_to_encoding(const char *name)
552{
553 unsigned int nel = lengthof(pg_encname_tbl);
554 const pg_encname *base = pg_encname_tbl,
555 *last = base + nel - 1,
556 *position;
557 int result;
558 char buff[NAMEDATALEN],
559 *key;
560
561 if (name == NULL || *name == '\0')
562 return -1;
563
564 if (strlen(name) >= NAMEDATALEN)
565 {
566#ifdef FRONTEND
567 fprintf(stderr, "encoding name too long\n");
568 return -1;
569#else
570 ereport(ERROR,
571 (errcode(ERRCODE_NAME_TOO_LONG),
572 errmsg("encoding name too long")));
573#endif
574 }
575 key = clean_encoding_name(name, buff);
576
577 while (last >= base)
578 {
579 position = base + ((last - base) >> 1);
580 result = key[0] - position->name[0];
581
582 if (result == 0)
583 {
584 result = strcmp(key, position->name);
585 if (result == 0)
586 return position->encoding;
587 }
588 if (result < 0)
589 last = position - 1;
590 else
591 base = position + 1;
592 }
593 return -1;
594}
595
596#ifndef FRONTEND
597Datum
598PG_char_to_encoding(PG_FUNCTION_ARGS)
599{
600 Name s = PG_GETARG_NAME(0);
601
602 PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
603}
604#endif
605
606const char *
607pg_encoding_to_char(int encoding)
608{
609 if (PG_VALID_ENCODING(encoding))
610 {
611 const pg_enc2name *p = &pg_enc2name_tbl[encoding];
612
613 Assert(encoding == p->encoding);
614 return p->name;
615 }
616 return "";
617}
618
619#ifndef FRONTEND
620Datum
621PG_encoding_to_char(PG_FUNCTION_ARGS)
622{
623 int32 encoding = PG_GETARG_INT32(0);
624 const char *encoding_name = pg_encoding_to_char(encoding);
625
626 return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
627}
628
629#endif
630