1/*
2 * conversion functions between pg_wchar and multibyte streams.
3 * Tatsuo Ishii
4 * src/backend/utils/mb/wchar.c
5 *
6 */
7/* can be used in either frontend or backend */
8#ifdef FRONTEND
9#include "postgres_fe.h"
10#else
11#include "postgres.h"
12#endif
13
14#include "mb/pg_wchar.h"
15
16
17/*
18 * Operations on multi-byte encodings are driven by a table of helper
19 * functions.
20 *
 * To add support for an encoding, define mblen(), dsplen() and verifier() for
 * the encoding. For server encodings, also define mb2wchar() and wchar2mb()
 * conversion functions.
24 *
25 * These functions generally assume that their input is validly formed.
26 * The "verifier" functions, further down in the file, have to be more
27 * paranoid.
28 *
29 * We expect that mblen() does not need to examine more than the first byte
30 * of the character to discover the correct length. GB18030 is an exception
31 * to that rule, though, as it also looks at second byte. But even that
32 * behaves in a predictable way, if you only pass the first byte: it will
33 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
34 * good enough for all current uses.
35 *
36 * Note: for the display output of psql to work properly, the return values
37 * of the dsplen functions must conform to the Unicode standard. In particular
38 * the NUL character is zero width and control characters are generally
39 * width -1. It is recommended that non-ASCII encodings refer their ASCII
40 * subset to the ASCII routines to ensure consistency.
41 */
42
43/*
44 * SQL/ASCII
45 */
46static int
47pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
48{
49 int cnt = 0;
50
51 while (len > 0 && *from)
52 {
53 *to++ = *from++;
54 len--;
55 cnt++;
56 }
57 *to = 0;
58 return cnt;
59}
60
/*
 * Byte length of the SQL_ASCII character at "s": always one byte.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	(void) s;					/* the length never depends on the byte */
	return 1;
}
66
/*
 * Display width of the single-byte character at "s".
 *
 * Returns 0 for NUL, -1 for bytes below 0x20 and for 0x7f (DEL), and 1 for
 * every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
77
78/*
79 * EUC
80 */
81static int
82pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
83{
84 int cnt = 0;
85
86 while (len > 0 && *from)
87 {
88 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
89 * KANA") */
90 {
91 from++;
92 *to = (SS2 << 8) | *from++;
93 len -= 2;
94 }
95 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
96 {
97 from++;
98 *to = (SS3 << 16) | (*from++ << 8);
99 *to |= *from++;
100 len -= 3;
101 }
102 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
103 {
104 *to = *from++ << 8;
105 *to |= *from++;
106 len -= 2;
107 }
108 else /* must be ASCII */
109 {
110 *to = *from++;
111 len--;
112 }
113 to++;
114 cnt++;
115 }
116 *to = 0;
117 return cnt;
118}
119
120static inline int
121pg_euc_mblen(const unsigned char *s)
122{
123 int len;
124
125 if (*s == SS2)
126 len = 2;
127 else if (*s == SS3)
128 len = 3;
129 else if (IS_HIGHBIT_SET(*s))
130 len = 2;
131 else
132 len = 1;
133 return len;
134}
135
136static inline int
137pg_euc_dsplen(const unsigned char *s)
138{
139 int len;
140
141 if (*s == SS2)
142 len = 2;
143 else if (*s == SS3)
144 len = 2;
145 else if (IS_HIGHBIT_SET(*s))
146 len = 2;
147 else
148 len = pg_ascii_dsplen(s);
149 return len;
150}
151
152/*
153 * EUC_JP
154 */
155static int
156pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
157{
158 return pg_euc2wchar_with_len(from, to, len);
159}
160
/*
 * Byte length of an EUC_JP character; same rules as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
166
167static int
168pg_eucjp_dsplen(const unsigned char *s)
169{
170 int len;
171
172 if (*s == SS2)
173 len = 1;
174 else if (*s == SS3)
175 len = 2;
176 else if (IS_HIGHBIT_SET(*s))
177 len = 2;
178 else
179 len = pg_ascii_dsplen(s);
180 return len;
181}
182
183/*
184 * EUC_KR
185 */
186static int
187pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
188{
189 return pg_euc2wchar_with_len(from, to, len);
190}
191
/*
 * Byte length of an EUC_KR character; same rules as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
197
/*
 * Display width of an EUC_KR character; same rules as generic EUC.
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
203
204/*
205 * EUC_CN
206 *
207 */
208static int
209pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
210{
211 int cnt = 0;
212
213 while (len > 0 && *from)
214 {
215 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
216 {
217 from++;
218 *to = (SS2 << 16) | (*from++ << 8);
219 *to |= *from++;
220 len -= 3;
221 }
222 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
223 {
224 from++;
225 *to = (SS3 << 16) | (*from++ << 8);
226 *to |= *from++;
227 len -= 3;
228 }
229 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
230 {
231 *to = *from++ << 8;
232 *to |= *from++;
233 len -= 2;
234 }
235 else
236 {
237 *to = *from++;
238 len--;
239 }
240 to++;
241 cnt++;
242 }
243 *to = 0;
244 return cnt;
245}
246
/*
 * Byte length of an EUC_CN character: 2 when the first byte has its high
 * bit set, otherwise 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
258
/*
 * Display width of an EUC_CN character: two columns for a high-bit lead
 * byte, otherwise whatever the ASCII routine reports.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
270
271/*
272 * EUC_TW
273 *
274 */
275static int
276pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
277{
278 int cnt = 0;
279
280 while (len > 0 && *from)
281 {
282 if (*from == SS2 && len >= 4) /* code set 2 */
283 {
284 from++;
285 *to = (((uint32) SS2) << 24) | (*from++ << 16);
286 *to |= *from++ << 8;
287 *to |= *from++;
288 len -= 4;
289 }
290 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
291 {
292 from++;
293 *to = (SS3 << 16) | (*from++ << 8);
294 *to |= *from++;
295 len -= 3;
296 }
297 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
298 {
299 *to = *from++ << 8;
300 *to |= *from++;
301 len -= 2;
302 }
303 else
304 {
305 *to = *from++;
306 len--;
307 }
308 to++;
309 cnt++;
310 }
311 *to = 0;
312 return cnt;
313}
314
315static int
316pg_euctw_mblen(const unsigned char *s)
317{
318 int len;
319
320 if (*s == SS2)
321 len = 4;
322 else if (*s == SS3)
323 len = 3;
324 else if (IS_HIGHBIT_SET(*s))
325 len = 2;
326 else
327 len = 1;
328 return len;
329}
330
331static int
332pg_euctw_dsplen(const unsigned char *s)
333{
334 int len;
335
336 if (*s == SS2)
337 len = 2;
338 else if (*s == SS3)
339 len = 2;
340 else if (IS_HIGHBIT_SET(*s))
341 len = 2;
342 else
343 len = pg_ascii_dsplen(s);
344 return len;
345}
346
347/*
348 * Convert pg_wchar to EUC_* encoding.
349 * caller must allocate enough space for "to", including a trailing zero!
350 * len: length of from.
351 * "from" not necessarily null terminated.
352 */
353static int
354pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
355{
356 int cnt = 0;
357
358 while (len > 0 && *from)
359 {
360 unsigned char c;
361
362 if ((c = (*from >> 24)))
363 {
364 *to++ = c;
365 *to++ = (*from >> 16) & 0xff;
366 *to++ = (*from >> 8) & 0xff;
367 *to++ = *from & 0xff;
368 cnt += 4;
369 }
370 else if ((c = (*from >> 16)))
371 {
372 *to++ = c;
373 *to++ = (*from >> 8) & 0xff;
374 *to++ = *from & 0xff;
375 cnt += 3;
376 }
377 else if ((c = (*from >> 8)))
378 {
379 *to++ = c;
380 *to++ = *from & 0xff;
381 cnt += 2;
382 }
383 else
384 {
385 *to++ = *from;
386 cnt++;
387 }
388 from++;
389 len--;
390 }
391 *to = 0;
392 return cnt;
393}
394
395
396/*
397 * JOHAB
398 */
/*
 * Byte length of a JOHAB character; computed with the generic EUC rules.
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	int			nbytes = pg_euc_mblen(s);

	return nbytes;
}
404
/*
 * Display width of a JOHAB character; computed with the generic EUC rules.
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	int			width = pg_euc_dsplen(s);

	return width;
}
410
411/*
412 * convert UTF8 string to pg_wchar (UCS-4)
413 * caller must allocate enough space for "to", including a trailing zero!
414 * len: length of from.
415 * "from" not necessarily null terminated.
416 */
417static int
418pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
419{
420 int cnt = 0;
421 uint32 c1,
422 c2,
423 c3,
424 c4;
425
426 while (len > 0 && *from)
427 {
428 if ((*from & 0x80) == 0)
429 {
430 *to = *from++;
431 len--;
432 }
433 else if ((*from & 0xe0) == 0xc0)
434 {
435 if (len < 2)
436 break; /* drop trailing incomplete char */
437 c1 = *from++ & 0x1f;
438 c2 = *from++ & 0x3f;
439 *to = (c1 << 6) | c2;
440 len -= 2;
441 }
442 else if ((*from & 0xf0) == 0xe0)
443 {
444 if (len < 3)
445 break; /* drop trailing incomplete char */
446 c1 = *from++ & 0x0f;
447 c2 = *from++ & 0x3f;
448 c3 = *from++ & 0x3f;
449 *to = (c1 << 12) | (c2 << 6) | c3;
450 len -= 3;
451 }
452 else if ((*from & 0xf8) == 0xf0)
453 {
454 if (len < 4)
455 break; /* drop trailing incomplete char */
456 c1 = *from++ & 0x07;
457 c2 = *from++ & 0x3f;
458 c3 = *from++ & 0x3f;
459 c4 = *from++ & 0x3f;
460 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
461 len -= 4;
462 }
463 else
464 {
465 /* treat a bogus char as length 1; not ours to raise error */
466 *to = *from++;
467 len--;
468 }
469 to++;
470 cnt++;
471 }
472 *to = 0;
473 return cnt;
474}
475
476
477/*
478 * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
479 * space allocated.
480 */
481unsigned char *
482unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
483{
484 if (c <= 0x7F)
485 {
486 utf8string[0] = c;
487 }
488 else if (c <= 0x7FF)
489 {
490 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
491 utf8string[1] = 0x80 | (c & 0x3F);
492 }
493 else if (c <= 0xFFFF)
494 {
495 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
496 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
497 utf8string[2] = 0x80 | (c & 0x3F);
498 }
499 else
500 {
501 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
502 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
503 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
504 utf8string[3] = 0x80 | (c & 0x3F);
505 }
506
507 return utf8string;
508}
509
510/*
511 * Trivial conversion from pg_wchar to UTF-8.
512 * caller should allocate enough space for "to"
513 * len: length of from.
514 * "from" not necessarily null terminated.
515 */
516static int
517pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
518{
519 int cnt = 0;
520
521 while (len > 0 && *from)
522 {
523 int char_len;
524
525 unicode_to_utf8(*from, to);
526 char_len = pg_utf_mblen(to);
527 cnt += char_len;
528 to += char_len;
529 from++;
530 len--;
531 }
532 *to = 0;
533 return cnt;
534}
535
/*
 * Return the byte length of the UTF8 character pointed to by s, judged from
 * its first byte only.
 *
 * Note: the current implementation does not support UTF8 sequences of more
 * than 4 bytes, so a value larger than 4 is never returned.  A leading byte
 * that is flat-out illegal, or that indicates a length larger than we
 * support, yields 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif
	return 1;
}
570
571/*
572 * This is an implementation of wcwidth() and wcswidth() as defined in
573 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
574 * <http://www.UNIX-systems.org/online.html>
575 *
576 * Markus Kuhn -- 2001-09-08 -- public domain
577 *
578 * customised for PostgreSQL
579 *
580 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
581 */
582
/*
 * One entry of a sorted interval table searched by mbbisearch().  Both ends
 * are inclusive: a code point matches when first <= ucs <= last.
 */
struct mbinterval
{
	unsigned short first;		/* lowest code point in the interval */
	unsigned short last;		/* highest code point in the interval */
};
588
589/* auxiliary function for binary search in interval table */
590static int
591mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
592{
593 int min = 0;
594 int mid;
595
596 if (ucs < table[0].first || ucs > table[max].last)
597 return 0;
598 while (max >= min)
599 {
600 mid = (min + max) / 2;
601 if (ucs > table[mid].last)
602 min = mid + 1;
603 else if (ucs < table[mid].first)
604 max = mid - 1;
605 else
606 return 1;
607 }
608
609 return 0;
610}
611
612
613/* The following functions define the column width of an ISO 10646
614 * character as follows:
615 *
616 * - The null character (U+0000) has a column width of 0.
617 *
618 * - Other C0/C1 control characters and DEL will lead to a return
619 * value of -1.
620 *
621 * - Non-spacing and enclosing combining characters (general
622 * category code Mn or Me in the Unicode database) have a
623 * column width of 0.
624 *
625 * - Other format characters (general category code Cf in the Unicode
626 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
627 *
628 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
629 * have a column width of 0.
630 *
631 * - Spacing characters in the East Asian Wide (W) or East Asian
632 * FullWidth (F) category as defined in Unicode Technical
633 * Report #11 have a column width of 2.
634 *
635 * - All remaining characters (including all printable
636 * ISO 8859-1 and WGL4 characters, Unicode control characters,
637 * etc.) have a column width of 1.
638 *
639 * This implementation assumes that wchar_t characters are encoded
640 * in ISO 10646.
641 */
642
643static int
644ucs_wcwidth(pg_wchar ucs)
645{
646 /* sorted list of non-overlapping intervals of non-spacing characters */
647 static const struct mbinterval combining[] = {
648 {0x0300, 0x036F}, {0x0483, 0x0489}, {0x0591, 0x05BD},
649 {0x05BF, 0x05BF}, {0x05C1, 0x05C2}, {0x05C4, 0x05C5},
650 {0x05C7, 0x05C7}, {0x0610, 0x061A}, {0x064B, 0x065F},
651 {0x0670, 0x0670}, {0x06D6, 0x06DC}, {0x06DF, 0x06E4},
652 {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, {0x0711, 0x0711},
653 {0x0730, 0x074A}, {0x07A6, 0x07B0}, {0x07EB, 0x07F3},
654 {0x07FD, 0x07FD}, {0x0816, 0x0819}, {0x081B, 0x0823},
655 {0x0825, 0x0827}, {0x0829, 0x082D}, {0x0859, 0x085B},
656 {0x08D3, 0x08E1}, {0x08E3, 0x0902}, {0x093A, 0x093A},
657 {0x093C, 0x093C}, {0x0941, 0x0948}, {0x094D, 0x094D},
658 {0x0951, 0x0957}, {0x0962, 0x0963}, {0x0981, 0x0981},
659 {0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
660 {0x09E2, 0x09E3}, {0x09FE, 0x0A02}, {0x0A3C, 0x0A3C},
661 {0x0A41, 0x0A51}, {0x0A70, 0x0A71}, {0x0A75, 0x0A75},
662 {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC8},
663 {0x0ACD, 0x0ACD}, {0x0AE2, 0x0AE3}, {0x0AFA, 0x0B01},
664 {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B44},
665 {0x0B4D, 0x0B56}, {0x0B62, 0x0B63}, {0x0B82, 0x0B82},
666 {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C00, 0x0C00},
667 {0x0C04, 0x0C04}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C56},
668 {0x0C62, 0x0C63}, {0x0C81, 0x0C81}, {0x0CBC, 0x0CBC},
669 {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
670 {0x0CE2, 0x0CE3}, {0x0D00, 0x0D01}, {0x0D3B, 0x0D3C},
671 {0x0D41, 0x0D44}, {0x0D4D, 0x0D4D}, {0x0D62, 0x0D63},
672 {0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD6}, {0x0E31, 0x0E31},
673 {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
674 {0x0EB4, 0x0EBC}, {0x0EC8, 0x0ECD}, {0x0F18, 0x0F19},
675 {0x0F35, 0x0F35}, {0x0F37, 0x0F37}, {0x0F39, 0x0F39},
676 {0x0F71, 0x0F7E}, {0x0F80, 0x0F84}, {0x0F86, 0x0F87},
677 {0x0F8D, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030},
678 {0x1032, 0x1037}, {0x1039, 0x103A}, {0x103D, 0x103E},
679 {0x1058, 0x1059}, {0x105E, 0x1060}, {0x1071, 0x1074},
680 {0x1082, 0x1082}, {0x1085, 0x1086}, {0x108D, 0x108D},
681 {0x109D, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714},
682 {0x1732, 0x1734}, {0x1752, 0x1753}, {0x1772, 0x1773},
683 {0x17B4, 0x17B5}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
684 {0x17C9, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D},
685 {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x1922},
686 {0x1927, 0x1928}, {0x1932, 0x1932}, {0x1939, 0x193B},
687 {0x1A17, 0x1A18}, {0x1A1B, 0x1A1B}, {0x1A56, 0x1A56},
688 {0x1A58, 0x1A60}, {0x1A62, 0x1A62}, {0x1A65, 0x1A6C},
689 {0x1A73, 0x1A7F}, {0x1AB0, 0x1B03}, {0x1B34, 0x1B34},
690 {0x1B36, 0x1B3A}, {0x1B3C, 0x1B3C}, {0x1B42, 0x1B42},
691 {0x1B6B, 0x1B73}, {0x1B80, 0x1B81}, {0x1BA2, 0x1BA5},
692 {0x1BA8, 0x1BA9}, {0x1BAB, 0x1BAD}, {0x1BE6, 0x1BE6},
693 {0x1BE8, 0x1BE9}, {0x1BED, 0x1BED}, {0x1BEF, 0x1BF1},
694 {0x1C2C, 0x1C33}, {0x1C36, 0x1C37}, {0x1CD0, 0x1CD2},
695 {0x1CD4, 0x1CE0}, {0x1CE2, 0x1CE8}, {0x1CED, 0x1CED},
696 {0x1CF4, 0x1CF4}, {0x1CF8, 0x1CF9}, {0x1DC0, 0x1DFF},
697 {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F},
698 {0x2DE0, 0x2DFF}, {0x302A, 0x302D}, {0x3099, 0x309A},
699 {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F},
700 {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806},
701 {0xA80B, 0xA80B}, {0xA825, 0xA826}, {0xA8C4, 0xA8C5},
702 {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D},
703 {0xA947, 0xA951}, {0xA980, 0xA982}, {0xA9B3, 0xA9B3},
704 {0xA9B6, 0xA9B9}, {0xA9BC, 0xA9BD}, {0xA9E5, 0xA9E5},
705 {0xAA29, 0xAA2E}, {0xAA31, 0xAA32}, {0xAA35, 0xAA36},
706 {0xAA43, 0xAA43}, {0xAA4C, 0xAA4C}, {0xAA7C, 0xAA7C},
707 {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8},
708 {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEC, 0xAAED},
709 {0xAAF6, 0xAAF6}, {0xABE5, 0xABE5}, {0xABE8, 0xABE8},
710 {0xABED, 0xABED}, {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F},
711 {0xFE20, 0xFE2F},
712 };
713
714 /* test for 8-bit control characters */
715 if (ucs == 0)
716 return 0;
717
718 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
719 return -1;
720
721 /* binary search in table of non-spacing characters */
722 if (mbbisearch(ucs, combining,
723 sizeof(combining) / sizeof(struct mbinterval) - 1))
724 return 0;
725
726 /*
727 * if we arrive here, ucs is not a combining or C0/C1 control character
728 */
729
730 return 1 +
731 (ucs >= 0x1100 &&
732 (ucs <= 0x115f || /* Hangul Jamo init. consonants */
733 (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
734 ucs != 0x303f) || /* CJK ... Yi */
735 (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
736 (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
737 * Ideographs */
738 (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
739 (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
740 (ucs >= 0xffe0 && ucs <= 0xffe6) ||
741 (ucs >= 0x20000 && ucs <= 0x2ffff)));
742}
743
744/*
745 * Convert a UTF-8 character to a Unicode code point.
746 * This is a one-character version of pg_utf2wchar_with_len.
747 *
748 * No error checks here, c must point to a long-enough string.
749 */
750pg_wchar
751utf8_to_unicode(const unsigned char *c)
752{
753 if ((*c & 0x80) == 0)
754 return (pg_wchar) c[0];
755 else if ((*c & 0xe0) == 0xc0)
756 return (pg_wchar) (((c[0] & 0x1f) << 6) |
757 (c[1] & 0x3f));
758 else if ((*c & 0xf0) == 0xe0)
759 return (pg_wchar) (((c[0] & 0x0f) << 12) |
760 ((c[1] & 0x3f) << 6) |
761 (c[2] & 0x3f));
762 else if ((*c & 0xf8) == 0xf0)
763 return (pg_wchar) (((c[0] & 0x07) << 18) |
764 ((c[1] & 0x3f) << 12) |
765 ((c[2] & 0x3f) << 6) |
766 (c[3] & 0x3f));
767 else
768 /* that is an invalid code on purpose */
769 return 0xffffffff;
770}
771
772static int
773pg_utf_dsplen(const unsigned char *s)
774{
775 return ucs_wcwidth(utf8_to_unicode(s));
776}
777
778/*
779 * convert mule internal code to pg_wchar
780 * caller should allocate enough space for "to"
781 * len: length of from.
782 * "from" not necessarily null terminated.
783 */
784static int
785pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
786{
787 int cnt = 0;
788
789 while (len > 0 && *from)
790 {
791 if (IS_LC1(*from) && len >= 2)
792 {
793 *to = *from++ << 16;
794 *to |= *from++;
795 len -= 2;
796 }
797 else if (IS_LCPRV1(*from) && len >= 3)
798 {
799 from++;
800 *to = *from++ << 16;
801 *to |= *from++;
802 len -= 3;
803 }
804 else if (IS_LC2(*from) && len >= 3)
805 {
806 *to = *from++ << 16;
807 *to |= *from++ << 8;
808 *to |= *from++;
809 len -= 3;
810 }
811 else if (IS_LCPRV2(*from) && len >= 4)
812 {
813 from++;
814 *to = *from++ << 16;
815 *to |= *from++ << 8;
816 *to |= *from++;
817 len -= 4;
818 }
819 else
820 { /* assume ASCII */
821 *to = (unsigned char) *from++;
822 len--;
823 }
824 to++;
825 cnt++;
826 }
827 *to = 0;
828 return cnt;
829}
830
831/*
832 * convert pg_wchar to mule internal code
833 * caller should allocate enough space for "to"
834 * len: length of from.
835 * "from" not necessarily null terminated.
836 */
837static int
838pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
839{
840 int cnt = 0;
841
842 while (len > 0 && *from)
843 {
844 unsigned char lb;
845
846 lb = (*from >> 16) & 0xff;
847 if (IS_LC1(lb))
848 {
849 *to++ = lb;
850 *to++ = *from & 0xff;
851 cnt += 2;
852 }
853 else if (IS_LC2(lb))
854 {
855 *to++ = lb;
856 *to++ = (*from >> 8) & 0xff;
857 *to++ = *from & 0xff;
858 cnt += 3;
859 }
860 else if (IS_LCPRV1_A_RANGE(lb))
861 {
862 *to++ = LCPRV1_A;
863 *to++ = lb;
864 *to++ = *from & 0xff;
865 cnt += 3;
866 }
867 else if (IS_LCPRV1_B_RANGE(lb))
868 {
869 *to++ = LCPRV1_B;
870 *to++ = lb;
871 *to++ = *from & 0xff;
872 cnt += 3;
873 }
874 else if (IS_LCPRV2_A_RANGE(lb))
875 {
876 *to++ = LCPRV2_A;
877 *to++ = lb;
878 *to++ = (*from >> 8) & 0xff;
879 *to++ = *from & 0xff;
880 cnt += 4;
881 }
882 else if (IS_LCPRV2_B_RANGE(lb))
883 {
884 *to++ = LCPRV2_B;
885 *to++ = lb;
886 *to++ = (*from >> 8) & 0xff;
887 *to++ = *from & 0xff;
888 cnt += 4;
889 }
890 else
891 {
892 *to++ = *from & 0xff;
893 cnt += 1;
894 }
895 from++;
896 len--;
897 }
898 *to = 0;
899 return cnt;
900}
901
/*
 * Byte length of a mule internal code character, judged from its first byte
 * only: 2 for LC1, 3 for LCPRV1 and LC2, 4 for LCPRV2, and 1 otherwise.
 */
int
pg_mule_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (IS_LC1(lead))
		return 2;
	if (IS_LCPRV1(lead))
		return 3;
	if (IS_LC2(lead))
		return 3;
	if (IS_LCPRV2(lead))
		return 4;

	return 1;					/* assume ASCII */
}
919
/*
 * Display width of a mule internal code character: one column for the LC1
 * and LCPRV1 forms, two for LC2 and LCPRV2, and one for anything else.
 *
 * Note: it's not really appropriate to derive the on-screen width from the
 * byte form alone.  But this seems an okay approximation for the MULE
 * charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (IS_LC1(lead) || IS_LCPRV1(lead))
		return 1;
	if (IS_LC2(lead) || IS_LCPRV2(lead))
		return 2;

	return 1;					/* assume ASCII */
}
944
945/*
946 * ISO8859-1
947 */
948static int
949pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
950{
951 int cnt = 0;
952
953 while (len > 0 && *from)
954 {
955 *to++ = *from++;
956 len--;
957 cnt++;
958 }
959 *to = 0;
960 return cnt;
961}
962
963/*
964 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
965 * high bits.
966 * caller should allocate enough space for "to"
967 * len: length of from.
968 * "from" not necessarily null terminated.
969 */
970static int
971pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
972{
973 int cnt = 0;
974
975 while (len > 0 && *from)
976 {
977 *to++ = *from++;
978 len--;
979 cnt++;
980 }
981 *to = 0;
982 return cnt;
983}
984
/*
 * Byte length of an ISO8859-1 character: always one byte.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	(void) s;					/* the length never depends on the byte */
	return 1;
}
990
/*
 * Display width of an ISO8859-1 character; delegated to the ASCII routine.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
996
997/*
998 * SJIS
999 */
/*
 * Byte length of an SJIS character, judged from its first byte only.
 *
 * Bytes 0xa1..0xdf are one-byte characters (1 byte kana?).  Any other byte
 * with the high bit set starts a two-byte character (kanji?).  Everything
 * else should be ASCII and is one byte.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;
	const int	is_kana = (lead >= 0xa1 && lead <= 0xdf);

	if (!is_kana && IS_HIGHBIT_SET(lead))
		return 2;

	return 1;
}
1013
/*
 * Display width of an SJIS character.
 *
 * Bytes 0xa1..0xdf (1 byte kana?) take one column.  Any other byte with the
 * high bit set (kanji?) takes two.  Everything else should be ASCII and is
 * measured by the ASCII routine.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xa1 && lead <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(lead))
		return 2;

	return pg_ascii_dsplen(s);
}
1027
1028/*
1029 * Big5
1030 */
/*
 * Byte length of a Big5 character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1042
/*
 * Display width of a Big5 character: two columns for a high-bit lead byte,
 * otherwise whatever the ASCII routine reports.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1054
1055/*
1056 * GBK
1057 */
/*
 * Byte length of a GBK character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1069
/*
 * Display width of a GBK character: two columns for a high-bit lead byte,
 * otherwise whatever the ASCII routine reports.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1081
1082/*
1083 * UHC
1084 */
/*
 * Byte length of a UHC character: 2 when the first byte has its high bit
 * set, otherwise 1 (should be ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1096
/*
 * Display width of a UHC character: two columns for a high-bit lead byte,
 * otherwise whatever the ASCII routine reports.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);
}
1108
1109/*
1110 * GB18030
1111 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1112 */
1113
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input: a high-bit first byte followed by a byte in 0x30..0x39 means a
 * 4-byte character, any other second byte means a 2-byte character.
 *
 * However, if you only pass the first byte of a multi-byte string, and \0
 * as the second byte, this still works in a predictable way: a 4-byte
 * character will be reported as two 2-byte characters.  That's enough for
 * all current uses, as a client-only encoding.  It works that way because
 * in any valid 4-byte GB18030-encoded character, the third and fourth byte
 * look like a 2-byte encoded character, when looked at separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;

	return 2;
}
1137
/*
 * Display width of a GB18030 character: two columns for a high-bit lead
 * byte, otherwise whatever the ASCII routine reports.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1149
1150/*
1151 *-------------------------------------------------------------------
1152 * multibyte sequence validators
1153 *
1154 * These functions accept "s", a pointer to the first byte of a string,
1155 * and "len", the remaining length of the string. If there is a validly
1156 * encoded character beginning at *s, return its length in bytes; else
1157 * return -1.
1158 *
1159 * The functions can assume that len > 0 and that *s != '\0', but they must
1160 * test for and reject zeroes in any additional bytes of a multibyte character.
1161 *
1162 * Note that this definition allows the function for a single-byte
1163 * encoding to be just "return 1".
1164 *-------------------------------------------------------------------
1165 */
1166
/*
 * SQL_ASCII verifier: every byte is accepted as a one-byte character, so
 * the inputs are never examined.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1172
/* True when byte value c lies in 0xa1..0xfe; note c may be evaluated twice. */
#define IS_EUC_RANGE_VALID(c)	(0xa1 <= (c) && (c) <= 0xfe)
1174
1175static int
1176pg_eucjp_verifier(const unsigned char *s, int len)
1177{
1178 int l;
1179 unsigned char c1,
1180 c2;
1181
1182 c1 = *s++;
1183
1184 switch (c1)
1185 {
1186 case SS2: /* JIS X 0201 */
1187 l = 2;
1188 if (l > len)
1189 return -1;
1190 c2 = *s++;
1191 if (c2 < 0xa1 || c2 > 0xdf)
1192 return -1;
1193 break;
1194
1195 case SS3: /* JIS X 0212 */
1196 l = 3;
1197 if (l > len)
1198 return -1;
1199 c2 = *s++;
1200 if (!IS_EUC_RANGE_VALID(c2))
1201 return -1;
1202 c2 = *s++;
1203 if (!IS_EUC_RANGE_VALID(c2))
1204 return -1;
1205 break;
1206
1207 default:
1208 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1209 {
1210 l = 2;
1211 if (l > len)
1212 return -1;
1213 if (!IS_EUC_RANGE_VALID(c1))
1214 return -1;
1215 c2 = *s++;
1216 if (!IS_EUC_RANGE_VALID(c2))
1217 return -1;
1218 }
1219 else
1220 /* must be ASCII */
1221 {
1222 l = 1;
1223 }
1224 break;
1225 }
1226
1227 return l;
1228}
1229
/*
 * Verify one EUC_KR character starting at s, with len bytes available.
 *
 * A high-bit lead byte starts a two-byte character whose bytes must both
 * lie in the EUC range; anything else is taken as one-byte ASCII.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
	const unsigned char lead = s[0];

	if (!IS_HIGHBIT_SET(lead))
		return 1;				/* must be ASCII */

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(lead))
		return -1;
	if (!IS_EUC_RANGE_VALID(s[1]))
		return -1;

	return 2;
}
1258
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC_KR
 * verifier is reused under the EUC_CN name.
 */
#define pg_euccn_verifier pg_euckr_verifier
1261
1262static int
1263pg_euctw_verifier(const unsigned char *s, int len)
1264{
1265 int l;
1266 unsigned char c1,
1267 c2;
1268
1269 c1 = *s++;
1270
1271 switch (c1)
1272 {
1273 case SS2: /* CNS 11643 Plane 1-7 */
1274 l = 4;
1275 if (l > len)
1276 return -1;
1277 c2 = *s++;
1278 if (c2 < 0xa1 || c2 > 0xa7)
1279 return -1;
1280 c2 = *s++;
1281 if (!IS_EUC_RANGE_VALID(c2))
1282 return -1;
1283 c2 = *s++;
1284 if (!IS_EUC_RANGE_VALID(c2))
1285 return -1;
1286 break;
1287
1288 case SS3: /* unused */
1289 return -1;
1290
1291 default:
1292 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1293 {
1294 l = 2;
1295 if (l > len)
1296 return -1;
1297 /* no further range check on c1? */
1298 c2 = *s++;
1299 if (!IS_EUC_RANGE_VALID(c2))
1300 return -1;
1301 }
1302 else
1303 /* must be ASCII */
1304 {
1305 l = 1;
1306 }
1307 break;
1308 }
1309 return l;
1310}
1311
/*
 * Verify one JOHAB character starting at s, with len bytes available.
 *
 * The expected length comes from pg_johab_mblen().  For a high-bit lead
 * byte, every trailing byte must lie in the EUC range.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1335
/*
 * Verify one mule internal code character starting at s, with len bytes
 * available.
 *
 * The expected length comes from pg_mule_mblen(); every trailing byte must
 * have its high bit set.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1356
/*
 * ISO8859-1 verifier: every byte is accepted as a one-byte character, so
 * the inputs are never examined.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	(void) s;
	(void) len;
	return 1;
}
1362
/*
 * Verify one SJIS character starting at s, with len bytes available.
 *
 * The expected length comes from pg_sjis_mblen().  One-byte characters need
 * no further checking; for two-byte characters the head and tail bytes are
 * tested with ISSJISHEAD / ISSJISTAIL.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head,
				tail;

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	head = s[0];
	tail = s[1];

	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1385
/*
 * Verify one Big5 character starting at s, with len bytes available.
 *
 * The expected length comes from pg_big5_mblen(); the only extra check is
 * that no trailing byte is zero.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1405
/*
 * Verify one GBK character starting at s, with len bytes available.
 *
 * The expected length comes from pg_gbk_mblen(); the only extra check is
 * that no trailing byte is zero.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1425
/*
 * Verify one UHC character starting at s, with len bytes available.
 *
 * The expected length comes from pg_uhc_mblen(); the only extra check is
 * that no trailing byte is zero.
 *
 * Returns the character's length in bytes, or -1 if it is invalid.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1445
/*
 * Verify one GB18030 character starting at s, with len bytes available.
 *
 * Returns 1 for an ASCII byte, 4 or 2 for a well-formed multibyte
 * character, and -1 otherwise.  The second byte is only read when len says
 * it is present.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	const unsigned char b0 = s[0];

	if (!IS_HIGHBIT_SET(b0))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (b0 >= 0x81 && b0 <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1476
/*
 * Verify the first character of a UTF-8 string.
 *
 * Returns the byte length reported by pg_utf_mblen(), or -1 if that many
 * bytes are not available or pg_utf8_islegal() rejects the sequence.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	int			nbytes = pg_utf_mblen(s);

	/* length check must come first: islegal reads nbytes bytes */
	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1490
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * Implements the RFC3629 rules.  Besides requiring continuation bytes to lie
 * in 0x80..0xBF, the second byte is restricted further for certain lead
 * bytes so that each code point has exactly one encoding: overlong forms
 * (lead 0xE0 or 0xF0 with a too-small second byte) are rejected, as are the
 * ranges following lead bytes 0xED and 0xF4 that the rules exclude.
 * Accepting overlong forms would be a security hazard, since a sequence that
 * looks non-ASCII could decode to plain ASCII.
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Any length outside 1..4 is rejected without looking at the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* reject lengths 5 and 6 for now (and anything else outside 1..4) */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, when present, are plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		unsigned char second = source[1];
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		/* narrow the allowed range of the second byte for special leads */
		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (second < lo || second > hi)
			return false;
	}

	/* finally the lead byte itself: no 0x80..0xC1, nothing above 0xF4 */
	return !((lead >= 0x80 && lead < 0xC2) || lead > 0xF4);
}
1561
1562#ifndef FRONTEND
1563
1564/*
1565 * Generic character incrementer function.
1566 *
1567 * Not knowing anything about the properties of the encoding in use, we just
1568 * keep incrementing the last byte until we get a validly-encoded result,
1569 * or we run out of values to try. We don't bother to try incrementing
1570 * higher-order bytes, so there's no growth in runtime for wider characters.
1571 * (If we did try to do that, we'd need to consider the likelihood that 255
1572 * is not a valid final byte in the encoding.)
1573 */
1574static bool
1575pg_generic_charinc(unsigned char *charptr, int len)
1576{
1577 unsigned char *lastbyte = charptr + len - 1;
1578 mbverifier mbverify;
1579
1580 /* We can just invoke the character verifier directly. */
1581 mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1582
1583 while (*lastbyte < (unsigned char) 255)
1584 {
1585 (*lastbyte)++;
1586 if ((*mbverify) (charptr, len) == len)
1587 return true;
1588 }
1589
1590 return false;
1591}
1592
/*
 * UTF-8 character incrementer function.
 *
 * charptr points at one UTF-8 character of "length" bytes, which is modified
 * in place.  Returns true if the character was changed to a larger one,
 * false if no larger character could be produced by the rules below (the
 * buffer contents are then unchanged).
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF (with a tighter upper bound on the second byte after lead bytes
 * 0xED and 0xF4).  We increment the last byte that's not already at its
 * maximum value.  If we can't find a byte that's less than the maximum
 * allowable value, we simply fail.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 * Because of that, bumping the lead byte needs care: the second byte keeps
 * its old (maximal) value, so stepping 0xEC -> 0xED would land in the
 * surrogate range and stepping 0xF3 -> 0xF4 would exceed U+10FFFF.  We skip
 * over 0xED to 0xEE in the first case and give up in the second.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char a;
	unsigned char limit;

	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now */
			return false;
		case 4:
			a = charptr[3];
			if (a < 0xBF)
			{
				charptr[3]++;
				break;
			}
			/* FALL THRU */
		case 3:
			a = charptr[2];
			if (a < 0xBF)
			{
				charptr[2]++;
				break;
			}
			/* FALL THRU */
		case 2:
			a = charptr[1];
			/* upper bound of the second byte depends on the lead byte */
			switch (*charptr)
			{
				case 0xED:
					limit = 0x9F;
					break;
				case 0xF4:
					limit = 0x8F;
					break;
				default:
					limit = 0xBF;
					break;
			}
			if (a < limit)
			{
				charptr[1]++;
				break;
			}
			/* FALL THRU */
		case 1:
			a = *charptr;
			/* last lead byte of each length class: nothing larger exists */
			if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
				return false;
			if (length > 1)
			{
				/* ED with second byte above 0x9F is a surrogate: skip it */
				if (a == 0xEC && charptr[1] > 0x9F)
				{
					charptr[0] = 0xEE;
					break;
				}
				/* F4 with second byte above 0x8F is beyond U+10FFFF */
				if (a == 0xF3 && charptr[1] > 0x8F)
					return false;
			}
			charptr[0]++;
			break;
	}

	return true;
}
1665
1666/*
1667 * EUC-JP character incrementer function.
1668 *
1669 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1670 * representing JIS X 0201 characters with the second byte ranging between
1671 * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1672 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1673 *
1674 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1675 * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1676 * is incremented if possible, otherwise the second-to-last byte.
1677 *
1678 * If the sequence starts with a value other than the above and its MSB
1679 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1680 * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1681 * incremented if possible, otherwise the second-to-last byte.
1682 *
1683 * Otherwise, the sequence is a single-byte ASCII character. It is
1684 * incremented up to 0x7f.
1685 */
1686static bool
1687pg_eucjp_increment(unsigned char *charptr, int length)
1688{
1689 unsigned char c1,
1690 c2;
1691 int i;
1692
1693 c1 = *charptr;
1694
1695 switch (c1)
1696 {
1697 case SS2: /* JIS X 0201 */
1698 if (length != 2)
1699 return false;
1700
1701 c2 = charptr[1];
1702
1703 if (c2 >= 0xdf)
1704 charptr[0] = charptr[1] = 0xa1;
1705 else if (c2 < 0xa1)
1706 charptr[1] = 0xa1;
1707 else
1708 charptr[1]++;
1709 break;
1710
1711 case SS3: /* JIS X 0212 */
1712 if (length != 3)
1713 return false;
1714
1715 for (i = 2; i > 0; i--)
1716 {
1717 c2 = charptr[i];
1718 if (c2 < 0xa1)
1719 {
1720 charptr[i] = 0xa1;
1721 return true;
1722 }
1723 else if (c2 < 0xfe)
1724 {
1725 charptr[i]++;
1726 return true;
1727 }
1728 }
1729
1730 /* Out of 3-byte code region */
1731 return false;
1732
1733 default:
1734 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1735 {
1736 if (length != 2)
1737 return false;
1738
1739 for (i = 1; i >= 0; i--)
1740 {
1741 c2 = charptr[i];
1742 if (c2 < 0xa1)
1743 {
1744 charptr[i] = 0xa1;
1745 return true;
1746 }
1747 else if (c2 < 0xfe)
1748 {
1749 charptr[i]++;
1750 return true;
1751 }
1752 }
1753
1754 /* Out of 2 byte code region */
1755 return false;
1756 }
1757 else
1758 { /* ASCII, single byte */
1759 if (c1 > 0x7e)
1760 return false;
1761 (*charptr)++;
1762 }
1763 break;
1764 }
1765
1766 return true;
1767}
1768#endif /* !FRONTEND */
1769
1770
/*
 *-------------------------------------------------------------------
 * encoding info table
 *
 * One entry per encoding.  Code in this file indexes the array directly
 * with an encoding ID (pg_wchar_table[encoding]), hence:
 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
 *
 * Each entry lists, in order: the multibyte-to-pg_wchar converter, the
 * pg_wchar-to-multibyte converter, then the mblen, dsplen and mbverify
 * functions, and finally maxmblen, the maximum byte length of a character.
 * The two converters are 0 for the entries at the end of the table
 * (presumably the encodings not supported as server encodings -- confirm
 * against mb/pg_wchar.h).
 *-------------------------------------------------------------------
 */
const pg_wchar_tbl pg_wchar_table[] = {
	{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
	{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
	{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */
	{pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */
	{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */
	{pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
	{pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
	/* everything from here through PG_KOI8U shares the latin1 helper routines */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
	{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
	/* the remaining entries provide no pg_wchar conversion functions */
	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
	{0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
	{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
	{0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
	{0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
	{0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
	{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
};
1821
/*
 * Returns the byte length of the first character of a mule internal code
 * string.  This is just a public alias for pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	int			nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
1828
1829/*
1830 * Returns the byte length of a multibyte character.
1831 */
1832int
1833pg_encoding_mblen(int encoding, const char *mbstr)
1834{
1835 return (PG_VALID_ENCODING(encoding) ?
1836 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1837 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1838}
1839
1840/*
1841 * Returns the display length of a multibyte character.
1842 */
1843int
1844pg_encoding_dsplen(int encoding, const char *mbstr)
1845{
1846 return (PG_VALID_ENCODING(encoding) ?
1847 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1848 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1849}
1850
1851/*
1852 * Verify the first multibyte character of the given string.
1853 * Return its byte length if good, -1 if bad. (See comments above for
1854 * full details of the mbverify API.)
1855 */
1856int
1857pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1858{
1859 return (PG_VALID_ENCODING(encoding) ?
1860 pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1861 pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1862}
1863
1864/*
1865 * fetch maximum length of a given encoding
1866 */
1867int
1868pg_encoding_max_length(int encoding)
1869{
1870 Assert(PG_VALID_ENCODING(encoding));
1871
1872 return pg_wchar_table[encoding].maxmblen;
1873}
1874
1875#ifndef FRONTEND
1876
1877/*
1878 * fetch maximum length of the encoding for the current database
1879 */
1880int
1881pg_database_encoding_max_length(void)
1882{
1883 return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1884}
1885
1886/*
1887 * get the character incrementer for the encoding for the current database
1888 */
1889mbcharacter_incrementer
1890pg_database_encoding_character_incrementer(void)
1891{
1892 /*
1893 * Eventually it might be best to add a field to pg_wchar_table[], but for
1894 * now we just use a switch.
1895 */
1896 switch (GetDatabaseEncoding())
1897 {
1898 case PG_UTF8:
1899 return pg_utf8_increment;
1900
1901 case PG_EUC_JP:
1902 return pg_eucjp_increment;
1903
1904 default:
1905 return pg_generic_charinc;
1906 }
1907}
1908
/*
 * Verify that mbstr (len bytes) is validly encoded in the current database
 * encoding.  Returns true if so.  On invalid input, pg_verify_mbstr_len()
 * either reports an error (noError false) or makes us return false.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			nchars;

	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError);

	return nchars >= 0;
}
1919
/*
 * Verify that mbstr (len bytes) is validly encoded in the specified
 * encoding.  Returns true if so.  On invalid input, pg_verify_mbstr_len()
 * either reports an error (noError false) or makes us return false.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	int			nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
1929
1930/*
1931 * Verify mbstr to make sure that it is validly encoded in the specified
1932 * encoding.
1933 *
1934 * mbstr is not necessarily zero terminated; length of mbstr is
1935 * specified by len.
1936 *
1937 * If OK, return length of string in the encoding.
1938 * If a problem is found, return -1 when noError is
1939 * true; when noError is false, ereport() a descriptive message.
1940 */
1941int
1942pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1943{
1944 mbverifier mbverify;
1945 int mb_len;
1946
1947 Assert(PG_VALID_ENCODING(encoding));
1948
1949 /*
1950 * In single-byte encodings, we need only reject nulls (\0).
1951 */
1952 if (pg_encoding_max_length(encoding) <= 1)
1953 {
1954 const char *nullpos = memchr(mbstr, 0, len);
1955
1956 if (nullpos == NULL)
1957 return len;
1958 if (noError)
1959 return -1;
1960 report_invalid_encoding(encoding, nullpos, 1);
1961 }
1962
1963 /* fetch function pointer just once */
1964 mbverify = pg_wchar_table[encoding].mbverify;
1965
1966 mb_len = 0;
1967
1968 while (len > 0)
1969 {
1970 int l;
1971
1972 /* fast path for ASCII-subset characters */
1973 if (!IS_HIGHBIT_SET(*mbstr))
1974 {
1975 if (*mbstr != '\0')
1976 {
1977 mb_len++;
1978 mbstr++;
1979 len--;
1980 continue;
1981 }
1982 if (noError)
1983 return -1;
1984 report_invalid_encoding(encoding, mbstr, len);
1985 }
1986
1987 l = (*mbverify) ((const unsigned char *) mbstr, len);
1988
1989 if (l < 0)
1990 {
1991 if (noError)
1992 return -1;
1993 report_invalid_encoding(encoding, mbstr, len);
1994 }
1995
1996 mbstr += l;
1997 len -= l;
1998 mb_len++;
1999 }
2000 return mb_len;
2001}
2002
2003/*
2004 * check_encoding_conversion_args: check arguments of a conversion function
2005 *
2006 * "expected" arguments can be either an encoding ID or -1 to indicate that
2007 * the caller will check whether it accepts the ID.
2008 *
2009 * Note: the errors here are not really user-facing, so elog instead of
2010 * ereport seems sufficient. Also, we trust that the "expected" encoding
2011 * arguments are valid encoding IDs, but we don't trust the actuals.
2012 */
2013void
2014check_encoding_conversion_args(int src_encoding,
2015 int dest_encoding,
2016 int len,
2017 int expected_src_encoding,
2018 int expected_dest_encoding)
2019{
2020 if (!PG_VALID_ENCODING(src_encoding))
2021 elog(ERROR, "invalid source encoding ID: %d", src_encoding);
2022 if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
2023 elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
2024 pg_enc2name_tbl[expected_src_encoding].name,
2025 pg_enc2name_tbl[src_encoding].name);
2026 if (!PG_VALID_ENCODING(dest_encoding))
2027 elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
2028 if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
2029 elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
2030 pg_enc2name_tbl[expected_dest_encoding].name,
2031 pg_enc2name_tbl[dest_encoding].name);
2032 if (len < 0)
2033 elog(ERROR, "encoding conversion length must not be negative");
2034}
2035
2036/*
2037 * report_invalid_encoding: complain about invalid multibyte character
2038 *
2039 * note: len is remaining length of string, not length of character;
2040 * len must be greater than zero, as we always examine the first byte.
2041 */
2042void
2043report_invalid_encoding(int encoding, const char *mbstr, int len)
2044{
2045 int l = pg_encoding_mblen(encoding, mbstr);
2046 char buf[8 * 5 + 1];
2047 char *p = buf;
2048 int j,
2049 jlimit;
2050
2051 jlimit = Min(l, len);
2052 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
2053
2054 for (j = 0; j < jlimit; j++)
2055 {
2056 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2057 if (j < jlimit - 1)
2058 p += sprintf(p, " ");
2059 }
2060
2061 ereport(ERROR,
2062 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
2063 errmsg("invalid byte sequence for encoding \"%s\": %s",
2064 pg_enc2name_tbl[encoding].name,
2065 buf)));
2066}
2067
2068/*
2069 * report_untranslatable_char: complain about untranslatable character
2070 *
2071 * note: len is remaining length of string, not length of character;
2072 * len must be greater than zero, as we always examine the first byte.
2073 */
2074void
2075report_untranslatable_char(int src_encoding, int dest_encoding,
2076 const char *mbstr, int len)
2077{
2078 int l = pg_encoding_mblen(src_encoding, mbstr);
2079 char buf[8 * 5 + 1];
2080 char *p = buf;
2081 int j,
2082 jlimit;
2083
2084 jlimit = Min(l, len);
2085 jlimit = Min(jlimit, 8); /* prevent buffer overrun */
2086
2087 for (j = 0; j < jlimit; j++)
2088 {
2089 p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2090 if (j < jlimit - 1)
2091 p += sprintf(p, " ");
2092 }
2093
2094 ereport(ERROR,
2095 (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
2096 errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
2097 buf,
2098 pg_enc2name_tbl[src_encoding].name,
2099 pg_enc2name_tbl[dest_encoding].name)));
2100}
2101
2102#endif /* !FRONTEND */
2103