1/*-------------------------------------------------------------------------
2 *
3 * Utility functions for conversion procs.
4 *
5 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
6 * Portions Copyright (c) 1994, Regents of the University of California
7 *
8 * IDENTIFICATION
9 * src/backend/utils/mb/conv.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "postgres.h"
14#include "mb/pg_wchar.h"
15
16
17/*
18 * local2local: a generic single byte charset encoding
19 * conversion between two ASCII-superset encodings.
20 *
21 * l points to the source string of length len
22 * p is the output area (must be large enough!)
23 * src_encoding is the PG identifier for the source encoding
24 * dest_encoding is the PG identifier for the target encoding
25 * tab holds conversion entries for the source charset
26 * starting from 128 (0x80). each entry in the table holds the corresponding
27 * code point for the target charset, or 0 if there is no equivalent code.
28 */
29void
30local2local(const unsigned char *l,
31 unsigned char *p,
32 int len,
33 int src_encoding,
34 int dest_encoding,
35 const unsigned char *tab)
36{
37 unsigned char c1,
38 c2;
39
40 while (len > 0)
41 {
42 c1 = *l;
43 if (c1 == 0)
44 report_invalid_encoding(src_encoding, (const char *) l, len);
45 if (!IS_HIGHBIT_SET(c1))
46 *p++ = c1;
47 else
48 {
49 c2 = tab[c1 - HIGHBIT];
50 if (c2)
51 *p++ = c2;
52 else
53 report_untranslatable_char(src_encoding, dest_encoding,
54 (const char *) l, len);
55 }
56 l++;
57 len--;
58 }
59 *p = '\0';
60}
61
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l		source string, len bytes long
 * p		output area (must be large enough!); a terminating zero byte
 *			is appended after the converted data
 * lc		mule character set id for the local encoding
 * encoding PG identifier for the local encoding
 *
 * ASCII bytes are copied as-is; every high-bit byte is emitted as the
 * two-byte sequence (lc, byte).  A zero byte in the input is reported
 * through report_invalid_encoding().
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id as a leading byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';
}
89
90/*
91 * MIC ---> LATINn when the charset's local codes map directly to MIC
92 *
93 * mic points to the source string of length len
94 * p is the output area (must be large enough!)
95 * lc is the mule character set id for the local encoding
96 * encoding is the PG identifier for the local encoding
97 */
98void
99mic2latin(const unsigned char *mic, unsigned char *p, int len,
100 int lc, int encoding)
101{
102 int c1;
103
104 while (len > 0)
105 {
106 c1 = *mic;
107 if (c1 == 0)
108 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109 if (!IS_HIGHBIT_SET(c1))
110 {
111 /* easy for ASCII */
112 *p++ = c1;
113 mic++;
114 len--;
115 }
116 else
117 {
118 int l = pg_mic_mblen(mic);
119
120 if (len < l)
121 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122 len);
123 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
124 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125 (const char *) mic, len);
126 *p++ = mic[1];
127 mic += 2;
128 len -= 2;
129 }
130 }
131 *p = '\0';
132}
133
134
135/*
136 * ASCII ---> MIC
137 *
138 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139 * characters, here we must take a hard line because we don't know
140 * the appropriate MIC equivalent.
141 */
142void
143pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
144{
145 int c1;
146
147 while (len > 0)
148 {
149 c1 = *l;
150 if (c1 == 0 || IS_HIGHBIT_SET(c1))
151 report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
152 *p++ = c1;
153 l++;
154 len--;
155 }
156 *p = '\0';
157}
158
159/*
160 * MIC ---> ASCII
161 */
162void
163pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
164{
165 int c1;
166
167 while (len > 0)
168 {
169 c1 = *mic;
170 if (c1 == 0 || IS_HIGHBIT_SET(c1))
171 report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
172 (const char *) mic, len);
173 *p++ = c1;
174 mic++;
175 len--;
176 }
177 *p = '\0';
178}
179
180/*
181 * latin2mic_with_table: a generic single byte charset encoding
182 * conversion from a local charset to the mule internal code.
183 *
184 * l points to the source string of length len
185 * p is the output area (must be large enough!)
186 * lc is the mule character set id for the local encoding
187 * encoding is the PG identifier for the local encoding
188 * tab holds conversion entries for the local charset
189 * starting from 128 (0x80). each entry in the table holds the corresponding
190 * code point for the mule encoding, or 0 if there is no equivalent code.
191 */
192void
193latin2mic_with_table(const unsigned char *l,
194 unsigned char *p,
195 int len,
196 int lc,
197 int encoding,
198 const unsigned char *tab)
199{
200 unsigned char c1,
201 c2;
202
203 while (len > 0)
204 {
205 c1 = *l;
206 if (c1 == 0)
207 report_invalid_encoding(encoding, (const char *) l, len);
208 if (!IS_HIGHBIT_SET(c1))
209 *p++ = c1;
210 else
211 {
212 c2 = tab[c1 - HIGHBIT];
213 if (c2)
214 {
215 *p++ = lc;
216 *p++ = c2;
217 }
218 else
219 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
220 (const char *) l, len);
221 }
222 l++;
223 len--;
224 }
225 *p = '\0';
226}
227
228/*
229 * mic2latin_with_table: a generic single byte charset encoding
230 * conversion from the mule internal code to a local charset.
231 *
232 * mic points to the source string of length len
233 * p is the output area (must be large enough!)
234 * lc is the mule character set id for the local encoding
235 * encoding is the PG identifier for the local encoding
236 * tab holds conversion entries for the mule internal code's second byte,
237 * starting from 128 (0x80). each entry in the table holds the corresponding
238 * code point for the local charset, or 0 if there is no equivalent code.
239 */
240void
241mic2latin_with_table(const unsigned char *mic,
242 unsigned char *p,
243 int len,
244 int lc,
245 int encoding,
246 const unsigned char *tab)
247{
248 unsigned char c1,
249 c2;
250
251 while (len > 0)
252 {
253 c1 = *mic;
254 if (c1 == 0)
255 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256 if (!IS_HIGHBIT_SET(c1))
257 {
258 /* easy for ASCII */
259 *p++ = c1;
260 mic++;
261 len--;
262 }
263 else
264 {
265 int l = pg_mic_mblen(mic);
266
267 if (len < l)
268 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
269 len);
270 if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
271 (c2 = tab[mic[1] - HIGHBIT]) == 0)
272 {
273 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
274 (const char *) mic, len);
275 break; /* keep compiler quiet */
276 }
277 *p++ = c2;
278 mic += 2;
279 len -= 2;
280 }
281 }
282 *p = '\0';
283}
284
285/*
286 * comparison routine for bsearch()
287 * this routine is intended for combined UTF8 -> local code
288 */
289static int
290compare3(const void *p1, const void *p2)
291{
292 uint32 s1,
293 s2,
294 d1,
295 d2;
296
297 s1 = *(const uint32 *) p1;
298 s2 = *((const uint32 *) p1 + 1);
299 d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
300 d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
301 return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
302}
303
304/*
305 * comparison routine for bsearch()
306 * this routine is intended for local code -> combined UTF8
307 */
308static int
309compare4(const void *p1, const void *p2)
310{
311 uint32 v1,
312 v2;
313
314 v1 = *(const uint32 *) p1;
315 v2 = ((const pg_local_to_utf_combined *) p2)->code;
316 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
317}
318
319/*
320 * store 32bit character representation into multibyte stream
321 */
322static inline unsigned char *
323store_coded_char(unsigned char *dest, uint32 code)
324{
325 if (code & 0xff000000)
326 *dest++ = code >> 24;
327 if (code & 0x00ff0000)
328 *dest++ = code >> 16;
329 if (code & 0x0000ff00)
330 *dest++ = code >> 8;
331 if (code & 0x000000ff)
332 *dest++ = code;
333 return dest;
334}
335
336/*
337 * Convert a character using a conversion radix tree.
338 *
339 * 'l' is the length of the input character in bytes, and b1-b4 are
340 * the input character's bytes.
341 */
342static inline uint32
343pg_mb_radix_conv(const pg_mb_radix_tree *rt,
344 int l,
345 unsigned char b1,
346 unsigned char b2,
347 unsigned char b3,
348 unsigned char b4)
349{
350 if (l == 4)
351 {
352 /* 4-byte code */
353
354 /* check code validity */
355 if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
356 b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
357 b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
358 b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
359 return 0;
360
361 /* perform lookup */
362 if (rt->chars32)
363 {
364 uint32 idx = rt->b4root;
365
366 idx = rt->chars32[b1 + idx - rt->b4_1_lower];
367 idx = rt->chars32[b2 + idx - rt->b4_2_lower];
368 idx = rt->chars32[b3 + idx - rt->b4_3_lower];
369 return rt->chars32[b4 + idx - rt->b4_4_lower];
370 }
371 else
372 {
373 uint16 idx = rt->b4root;
374
375 idx = rt->chars16[b1 + idx - rt->b4_1_lower];
376 idx = rt->chars16[b2 + idx - rt->b4_2_lower];
377 idx = rt->chars16[b3 + idx - rt->b4_3_lower];
378 return rt->chars16[b4 + idx - rt->b4_4_lower];
379 }
380 }
381 else if (l == 3)
382 {
383 /* 3-byte code */
384
385 /* check code validity */
386 if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
387 b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
388 b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
389 return 0;
390
391 /* perform lookup */
392 if (rt->chars32)
393 {
394 uint32 idx = rt->b3root;
395
396 idx = rt->chars32[b2 + idx - rt->b3_1_lower];
397 idx = rt->chars32[b3 + idx - rt->b3_2_lower];
398 return rt->chars32[b4 + idx - rt->b3_3_lower];
399 }
400 else
401 {
402 uint16 idx = rt->b3root;
403
404 idx = rt->chars16[b2 + idx - rt->b3_1_lower];
405 idx = rt->chars16[b3 + idx - rt->b3_2_lower];
406 return rt->chars16[b4 + idx - rt->b3_3_lower];
407 }
408 }
409 else if (l == 2)
410 {
411 /* 2-byte code */
412
413 /* check code validity - first byte */
414 if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
415 b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
416 return 0;
417
418 /* perform lookup */
419 if (rt->chars32)
420 {
421 uint32 idx = rt->b2root;
422
423 idx = rt->chars32[b3 + idx - rt->b2_1_lower];
424 return rt->chars32[b4 + idx - rt->b2_2_lower];
425 }
426 else
427 {
428 uint16 idx = rt->b2root;
429
430 idx = rt->chars16[b3 + idx - rt->b2_1_lower];
431 return rt->chars16[b4 + idx - rt->b2_2_lower];
432 }
433 }
434 else if (l == 1)
435 {
436 /* 1-byte code */
437
438 /* check code validity - first byte */
439 if (b4 < rt->b1_lower || b4 > rt->b1_upper)
440 return 0;
441
442 /* perform lookup */
443 if (rt->chars32)
444 return rt->chars32[b4 + rt->b1root - rt->b1_lower];
445 else
446 return rt->chars16[b4 + rt->b1root - rt->b1_lower];
447 }
448 return 0; /* shouldn't happen */
449}
450
451/*
452 * UTF8 ---> local code
453 *
454 * utf: input string in UTF8 encoding (need not be null-terminated)
455 * len: length of input string (in bytes)
456 * iso: pointer to the output area (must be large enough!)
457 (output string will be null-terminated)
458 * map: conversion map for single characters
459 * cmap: conversion map for combined characters
460 * (optional, pass NULL if none)
461 * cmapsize: number of entries in the conversion map for combined characters
462 * (optional, pass 0 if none)
463 * conv_func: algorithmic encoding conversion function
464 * (optional, pass NULL if none)
465 * encoding: PG identifier for the local encoding
466 *
467 * For each character, the cmap (if provided) is consulted first; if no match,
468 * the map is consulted next; if still no match, the conv_func (if provided)
469 * is applied. An error is raised if no match is found.
470 *
471 * See pg_wchar.h for more details about the data structures used here.
472 */
473void
474UtfToLocal(const unsigned char *utf, int len,
475 unsigned char *iso,
476 const pg_mb_radix_tree *map,
477 const pg_utf_to_local_combined *cmap, int cmapsize,
478 utf_local_conversion_func conv_func,
479 int encoding)
480{
481 uint32 iutf;
482 int l;
483 const pg_utf_to_local_combined *cp;
484
485 if (!PG_VALID_ENCODING(encoding))
486 ereport(ERROR,
487 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
488 errmsg("invalid encoding number: %d", encoding)));
489
490 for (; len > 0; len -= l)
491 {
492 unsigned char b1 = 0;
493 unsigned char b2 = 0;
494 unsigned char b3 = 0;
495 unsigned char b4 = 0;
496
497 /* "break" cases all represent errors */
498 if (*utf == '\0')
499 break;
500
501 l = pg_utf_mblen(utf);
502 if (len < l)
503 break;
504
505 if (!pg_utf8_islegal(utf, l))
506 break;
507
508 if (l == 1)
509 {
510 /* ASCII case is easy, assume it's one-to-one conversion */
511 *iso++ = *utf++;
512 continue;
513 }
514
515 /* collect coded char of length l */
516 if (l == 2)
517 {
518 b3 = *utf++;
519 b4 = *utf++;
520 }
521 else if (l == 3)
522 {
523 b2 = *utf++;
524 b3 = *utf++;
525 b4 = *utf++;
526 }
527 else if (l == 4)
528 {
529 b1 = *utf++;
530 b2 = *utf++;
531 b3 = *utf++;
532 b4 = *utf++;
533 }
534 else
535 {
536 elog(ERROR, "unsupported character length %d", l);
537 iutf = 0; /* keep compiler quiet */
538 }
539 iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
540
541 /* First, try with combined map if possible */
542 if (cmap && len > l)
543 {
544 const unsigned char *utf_save = utf;
545 int len_save = len;
546 int l_save = l;
547
548 /* collect next character, same as above */
549 len -= l;
550
551 l = pg_utf_mblen(utf);
552 if (len < l)
553 break;
554
555 if (!pg_utf8_islegal(utf, l))
556 break;
557
558 /* We assume ASCII character cannot be in combined map */
559 if (l > 1)
560 {
561 uint32 iutf2;
562 uint32 cutf[2];
563
564 if (l == 2)
565 {
566 iutf2 = *utf++ << 8;
567 iutf2 |= *utf++;
568 }
569 else if (l == 3)
570 {
571 iutf2 = *utf++ << 16;
572 iutf2 |= *utf++ << 8;
573 iutf2 |= *utf++;
574 }
575 else if (l == 4)
576 {
577 iutf2 = *utf++ << 24;
578 iutf2 |= *utf++ << 16;
579 iutf2 |= *utf++ << 8;
580 iutf2 |= *utf++;
581 }
582 else
583 {
584 elog(ERROR, "unsupported character length %d", l);
585 iutf2 = 0; /* keep compiler quiet */
586 }
587
588 cutf[0] = iutf;
589 cutf[1] = iutf2;
590
591 cp = bsearch(cutf, cmap, cmapsize,
592 sizeof(pg_utf_to_local_combined), compare3);
593
594 if (cp)
595 {
596 iso = store_coded_char(iso, cp->code);
597 continue;
598 }
599 }
600
601 /* fail, so back up to reprocess second character next time */
602 utf = utf_save;
603 len = len_save;
604 l = l_save;
605 }
606
607 /* Now check ordinary map */
608 if (map)
609 {
610 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
611
612 if (converted)
613 {
614 iso = store_coded_char(iso, converted);
615 continue;
616 }
617 }
618
619 /* if there's a conversion function, try that */
620 if (conv_func)
621 {
622 uint32 converted = (*conv_func) (iutf);
623
624 if (converted)
625 {
626 iso = store_coded_char(iso, converted);
627 continue;
628 }
629 }
630
631 /* failed to translate this character */
632 report_untranslatable_char(PG_UTF8, encoding,
633 (const char *) (utf - l), len);
634 }
635
636 /* if we broke out of loop early, must be invalid input */
637 if (len > 0)
638 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
639
640 *iso = '\0';
641}
642
643/*
644 * local code ---> UTF8
645 *
646 * iso: input string in local encoding (need not be null-terminated)
647 * len: length of input string (in bytes)
648 * utf: pointer to the output area (must be large enough!)
649 (output string will be null-terminated)
650 * map: conversion map for single characters
651 * cmap: conversion map for combined characters
652 * (optional, pass NULL if none)
653 * cmapsize: number of entries in the conversion map for combined characters
654 * (optional, pass 0 if none)
655 * conv_func: algorithmic encoding conversion function
656 * (optional, pass NULL if none)
657 * encoding: PG identifier for the local encoding
658 *
659 * For each character, the map is consulted first; if no match, the cmap
660 * (if provided) is consulted next; if still no match, the conv_func
661 * (if provided) is applied. An error is raised if no match is found.
662 *
663 * See pg_wchar.h for more details about the data structures used here.
664 */
665void
666LocalToUtf(const unsigned char *iso, int len,
667 unsigned char *utf,
668 const pg_mb_radix_tree *map,
669 const pg_local_to_utf_combined *cmap, int cmapsize,
670 utf_local_conversion_func conv_func,
671 int encoding)
672{
673 uint32 iiso;
674 int l;
675 const pg_local_to_utf_combined *cp;
676
677 if (!PG_VALID_ENCODING(encoding))
678 ereport(ERROR,
679 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
680 errmsg("invalid encoding number: %d", encoding)));
681
682 for (; len > 0; len -= l)
683 {
684 unsigned char b1 = 0;
685 unsigned char b2 = 0;
686 unsigned char b3 = 0;
687 unsigned char b4 = 0;
688
689 /* "break" cases all represent errors */
690 if (*iso == '\0')
691 break;
692
693 if (!IS_HIGHBIT_SET(*iso))
694 {
695 /* ASCII case is easy, assume it's one-to-one conversion */
696 *utf++ = *iso++;
697 l = 1;
698 continue;
699 }
700
701 l = pg_encoding_verifymb(encoding, (const char *) iso, len);
702 if (l < 0)
703 break;
704
705 /* collect coded char of length l */
706 if (l == 1)
707 b4 = *iso++;
708 else if (l == 2)
709 {
710 b3 = *iso++;
711 b4 = *iso++;
712 }
713 else if (l == 3)
714 {
715 b2 = *iso++;
716 b3 = *iso++;
717 b4 = *iso++;
718 }
719 else if (l == 4)
720 {
721 b1 = *iso++;
722 b2 = *iso++;
723 b3 = *iso++;
724 b4 = *iso++;
725 }
726 else
727 {
728 elog(ERROR, "unsupported character length %d", l);
729 iiso = 0; /* keep compiler quiet */
730 }
731 iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
732
733 if (map)
734 {
735 uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
736
737 if (converted)
738 {
739 utf = store_coded_char(utf, converted);
740 continue;
741 }
742
743 /* If there's a combined character map, try that */
744 if (cmap)
745 {
746 cp = bsearch(&iiso, cmap, cmapsize,
747 sizeof(pg_local_to_utf_combined), compare4);
748
749 if (cp)
750 {
751 utf = store_coded_char(utf, cp->utf1);
752 utf = store_coded_char(utf, cp->utf2);
753 continue;
754 }
755 }
756 }
757
758 /* if there's a conversion function, try that */
759 if (conv_func)
760 {
761 uint32 converted = (*conv_func) (iiso);
762
763 if (converted)
764 {
765 utf = store_coded_char(utf, converted);
766 continue;
767 }
768 }
769
770 /* failed to translate this character */
771 report_untranslatable_char(encoding, PG_UTF8,
772 (const char *) (iso - l), len);
773 }
774
775 /* if we broke out of loop early, must be invalid input */
776 if (len > 0)
777 report_invalid_encoding(encoding, (const char *) iso, len);
778
779 *utf = '\0';
780}
781