1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * Utility functions for conversion procs. |
4 | * |
5 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
6 | * Portions Copyright (c) 1994, Regents of the University of California |
7 | * |
8 | * IDENTIFICATION |
9 | * src/backend/utils/mb/conv.c |
10 | * |
11 | *------------------------------------------------------------------------- |
12 | */ |
13 | #include "postgres.h" |
14 | #include "mb/pg_wchar.h" |
15 | |
16 | |
17 | /* |
18 | * local2local: a generic single byte charset encoding |
19 | * conversion between two ASCII-superset encodings. |
20 | * |
21 | * l points to the source string of length len |
22 | * p is the output area (must be large enough!) |
23 | * src_encoding is the PG identifier for the source encoding |
24 | * dest_encoding is the PG identifier for the target encoding |
25 | * tab holds conversion entries for the source charset |
26 | * starting from 128 (0x80). each entry in the table holds the corresponding |
27 | * code point for the target charset, or 0 if there is no equivalent code. |
28 | */ |
29 | void |
30 | local2local(const unsigned char *l, |
31 | unsigned char *p, |
32 | int len, |
33 | int src_encoding, |
34 | int dest_encoding, |
35 | const unsigned char *tab) |
36 | { |
37 | unsigned char c1, |
38 | c2; |
39 | |
40 | while (len > 0) |
41 | { |
42 | c1 = *l; |
43 | if (c1 == 0) |
44 | report_invalid_encoding(src_encoding, (const char *) l, len); |
45 | if (!IS_HIGHBIT_SET(c1)) |
46 | *p++ = c1; |
47 | else |
48 | { |
49 | c2 = tab[c1 - HIGHBIT]; |
50 | if (c2) |
51 | *p++ = c2; |
52 | else |
53 | report_untranslatable_char(src_encoding, dest_encoding, |
54 | (const char *) l, len); |
55 | } |
56 | l++; |
57 | len--; |
58 | } |
59 | *p = '\0'; |
60 | } |
61 | |
/*
 * latin2mic
 *
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC.
 *
 *	l			source bytes; exactly len of them are consumed
 *	p			destination buffer (caller must make it large enough)
 *	len			number of source bytes
 *	lc			mule character set id for the local encoding
 *	encoding	PG identifier for the local encoding
 *
 * Every high-bit byte is emitted as the pair (lc, byte); other bytes are
 * copied unchanged.  A zero byte in the input is reported as invalid
 * encoding.  The output is terminated with a zero byte.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes are prefixed with the charset id */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';
}
89 | |
90 | /* |
91 | * MIC ---> LATINn when the charset's local codes map directly to MIC |
92 | * |
93 | * mic points to the source string of length len |
94 | * p is the output area (must be large enough!) |
95 | * lc is the mule character set id for the local encoding |
96 | * encoding is the PG identifier for the local encoding |
97 | */ |
98 | void |
99 | mic2latin(const unsigned char *mic, unsigned char *p, int len, |
100 | int lc, int encoding) |
101 | { |
102 | int c1; |
103 | |
104 | while (len > 0) |
105 | { |
106 | c1 = *mic; |
107 | if (c1 == 0) |
108 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
109 | if (!IS_HIGHBIT_SET(c1)) |
110 | { |
111 | /* easy for ASCII */ |
112 | *p++ = c1; |
113 | mic++; |
114 | len--; |
115 | } |
116 | else |
117 | { |
118 | int l = pg_mic_mblen(mic); |
119 | |
120 | if (len < l) |
121 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
122 | len); |
123 | if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) |
124 | report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
125 | (const char *) mic, len); |
126 | *p++ = mic[1]; |
127 | mic += 2; |
128 | len -= 2; |
129 | } |
130 | } |
131 | *p = '\0'; |
132 | } |
133 | |
134 | |
135 | /* |
136 | * ASCII ---> MIC |
137 | * |
138 | * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set |
139 | * characters, here we must take a hard line because we don't know |
140 | * the appropriate MIC equivalent. |
141 | */ |
142 | void |
143 | pg_ascii2mic(const unsigned char *l, unsigned char *p, int len) |
144 | { |
145 | int c1; |
146 | |
147 | while (len > 0) |
148 | { |
149 | c1 = *l; |
150 | if (c1 == 0 || IS_HIGHBIT_SET(c1)) |
151 | report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len); |
152 | *p++ = c1; |
153 | l++; |
154 | len--; |
155 | } |
156 | *p = '\0'; |
157 | } |
158 | |
159 | /* |
160 | * MIC ---> ASCII |
161 | */ |
162 | void |
163 | pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len) |
164 | { |
165 | int c1; |
166 | |
167 | while (len > 0) |
168 | { |
169 | c1 = *mic; |
170 | if (c1 == 0 || IS_HIGHBIT_SET(c1)) |
171 | report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII, |
172 | (const char *) mic, len); |
173 | *p++ = c1; |
174 | mic++; |
175 | len--; |
176 | } |
177 | *p = '\0'; |
178 | } |
179 | |
180 | /* |
181 | * latin2mic_with_table: a generic single byte charset encoding |
182 | * conversion from a local charset to the mule internal code. |
183 | * |
184 | * l points to the source string of length len |
185 | * p is the output area (must be large enough!) |
186 | * lc is the mule character set id for the local encoding |
187 | * encoding is the PG identifier for the local encoding |
188 | * tab holds conversion entries for the local charset |
189 | * starting from 128 (0x80). each entry in the table holds the corresponding |
190 | * code point for the mule encoding, or 0 if there is no equivalent code. |
191 | */ |
192 | void |
193 | latin2mic_with_table(const unsigned char *l, |
194 | unsigned char *p, |
195 | int len, |
196 | int lc, |
197 | int encoding, |
198 | const unsigned char *tab) |
199 | { |
200 | unsigned char c1, |
201 | c2; |
202 | |
203 | while (len > 0) |
204 | { |
205 | c1 = *l; |
206 | if (c1 == 0) |
207 | report_invalid_encoding(encoding, (const char *) l, len); |
208 | if (!IS_HIGHBIT_SET(c1)) |
209 | *p++ = c1; |
210 | else |
211 | { |
212 | c2 = tab[c1 - HIGHBIT]; |
213 | if (c2) |
214 | { |
215 | *p++ = lc; |
216 | *p++ = c2; |
217 | } |
218 | else |
219 | report_untranslatable_char(encoding, PG_MULE_INTERNAL, |
220 | (const char *) l, len); |
221 | } |
222 | l++; |
223 | len--; |
224 | } |
225 | *p = '\0'; |
226 | } |
227 | |
228 | /* |
229 | * mic2latin_with_table: a generic single byte charset encoding |
230 | * conversion from the mule internal code to a local charset. |
231 | * |
232 | * mic points to the source string of length len |
233 | * p is the output area (must be large enough!) |
234 | * lc is the mule character set id for the local encoding |
235 | * encoding is the PG identifier for the local encoding |
236 | * tab holds conversion entries for the mule internal code's second byte, |
237 | * starting from 128 (0x80). each entry in the table holds the corresponding |
238 | * code point for the local charset, or 0 if there is no equivalent code. |
239 | */ |
240 | void |
241 | mic2latin_with_table(const unsigned char *mic, |
242 | unsigned char *p, |
243 | int len, |
244 | int lc, |
245 | int encoding, |
246 | const unsigned char *tab) |
247 | { |
248 | unsigned char c1, |
249 | c2; |
250 | |
251 | while (len > 0) |
252 | { |
253 | c1 = *mic; |
254 | if (c1 == 0) |
255 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
256 | if (!IS_HIGHBIT_SET(c1)) |
257 | { |
258 | /* easy for ASCII */ |
259 | *p++ = c1; |
260 | mic++; |
261 | len--; |
262 | } |
263 | else |
264 | { |
265 | int l = pg_mic_mblen(mic); |
266 | |
267 | if (len < l) |
268 | report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
269 | len); |
270 | if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || |
271 | (c2 = tab[mic[1] - HIGHBIT]) == 0) |
272 | { |
273 | report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
274 | (const char *) mic, len); |
275 | break; /* keep compiler quiet */ |
276 | } |
277 | *p++ = c2; |
278 | mic += 2; |
279 | len -= 2; |
280 | } |
281 | } |
282 | *p = '\0'; |
283 | } |
284 | |
285 | /* |
286 | * comparison routine for bsearch() |
287 | * this routine is intended for combined UTF8 -> local code |
288 | */ |
289 | static int |
290 | compare3(const void *p1, const void *p2) |
291 | { |
292 | uint32 s1, |
293 | s2, |
294 | d1, |
295 | d2; |
296 | |
297 | s1 = *(const uint32 *) p1; |
298 | s2 = *((const uint32 *) p1 + 1); |
299 | d1 = ((const pg_utf_to_local_combined *) p2)->utf1; |
300 | d2 = ((const pg_utf_to_local_combined *) p2)->utf2; |
301 | return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1); |
302 | } |
303 | |
304 | /* |
305 | * comparison routine for bsearch() |
306 | * this routine is intended for local code -> combined UTF8 |
307 | */ |
308 | static int |
309 | compare4(const void *p1, const void *p2) |
310 | { |
311 | uint32 v1, |
312 | v2; |
313 | |
314 | v1 = *(const uint32 *) p1; |
315 | v2 = ((const pg_local_to_utf_combined *) p2)->code; |
316 | return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
317 | } |
318 | |
319 | /* |
320 | * store 32bit character representation into multibyte stream |
321 | */ |
322 | static inline unsigned char * |
323 | store_coded_char(unsigned char *dest, uint32 code) |
324 | { |
325 | if (code & 0xff000000) |
326 | *dest++ = code >> 24; |
327 | if (code & 0x00ff0000) |
328 | *dest++ = code >> 16; |
329 | if (code & 0x0000ff00) |
330 | *dest++ = code >> 8; |
331 | if (code & 0x000000ff) |
332 | *dest++ = code; |
333 | return dest; |
334 | } |
335 | |
336 | /* |
337 | * Convert a character using a conversion radix tree. |
338 | * |
339 | * 'l' is the length of the input character in bytes, and b1-b4 are |
340 | * the input character's bytes. |
341 | */ |
342 | static inline uint32 |
343 | pg_mb_radix_conv(const pg_mb_radix_tree *rt, |
344 | int l, |
345 | unsigned char b1, |
346 | unsigned char b2, |
347 | unsigned char b3, |
348 | unsigned char b4) |
349 | { |
350 | if (l == 4) |
351 | { |
352 | /* 4-byte code */ |
353 | |
354 | /* check code validity */ |
355 | if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || |
356 | b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || |
357 | b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || |
358 | b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) |
359 | return 0; |
360 | |
361 | /* perform lookup */ |
362 | if (rt->chars32) |
363 | { |
364 | uint32 idx = rt->b4root; |
365 | |
366 | idx = rt->chars32[b1 + idx - rt->b4_1_lower]; |
367 | idx = rt->chars32[b2 + idx - rt->b4_2_lower]; |
368 | idx = rt->chars32[b3 + idx - rt->b4_3_lower]; |
369 | return rt->chars32[b4 + idx - rt->b4_4_lower]; |
370 | } |
371 | else |
372 | { |
373 | uint16 idx = rt->b4root; |
374 | |
375 | idx = rt->chars16[b1 + idx - rt->b4_1_lower]; |
376 | idx = rt->chars16[b2 + idx - rt->b4_2_lower]; |
377 | idx = rt->chars16[b3 + idx - rt->b4_3_lower]; |
378 | return rt->chars16[b4 + idx - rt->b4_4_lower]; |
379 | } |
380 | } |
381 | else if (l == 3) |
382 | { |
383 | /* 3-byte code */ |
384 | |
385 | /* check code validity */ |
386 | if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || |
387 | b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || |
388 | b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) |
389 | return 0; |
390 | |
391 | /* perform lookup */ |
392 | if (rt->chars32) |
393 | { |
394 | uint32 idx = rt->b3root; |
395 | |
396 | idx = rt->chars32[b2 + idx - rt->b3_1_lower]; |
397 | idx = rt->chars32[b3 + idx - rt->b3_2_lower]; |
398 | return rt->chars32[b4 + idx - rt->b3_3_lower]; |
399 | } |
400 | else |
401 | { |
402 | uint16 idx = rt->b3root; |
403 | |
404 | idx = rt->chars16[b2 + idx - rt->b3_1_lower]; |
405 | idx = rt->chars16[b3 + idx - rt->b3_2_lower]; |
406 | return rt->chars16[b4 + idx - rt->b3_3_lower]; |
407 | } |
408 | } |
409 | else if (l == 2) |
410 | { |
411 | /* 2-byte code */ |
412 | |
413 | /* check code validity - first byte */ |
414 | if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || |
415 | b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) |
416 | return 0; |
417 | |
418 | /* perform lookup */ |
419 | if (rt->chars32) |
420 | { |
421 | uint32 idx = rt->b2root; |
422 | |
423 | idx = rt->chars32[b3 + idx - rt->b2_1_lower]; |
424 | return rt->chars32[b4 + idx - rt->b2_2_lower]; |
425 | } |
426 | else |
427 | { |
428 | uint16 idx = rt->b2root; |
429 | |
430 | idx = rt->chars16[b3 + idx - rt->b2_1_lower]; |
431 | return rt->chars16[b4 + idx - rt->b2_2_lower]; |
432 | } |
433 | } |
434 | else if (l == 1) |
435 | { |
436 | /* 1-byte code */ |
437 | |
438 | /* check code validity - first byte */ |
439 | if (b4 < rt->b1_lower || b4 > rt->b1_upper) |
440 | return 0; |
441 | |
442 | /* perform lookup */ |
443 | if (rt->chars32) |
444 | return rt->chars32[b4 + rt->b1root - rt->b1_lower]; |
445 | else |
446 | return rt->chars16[b4 + rt->b1root - rt->b1_lower]; |
447 | } |
448 | return 0; /* shouldn't happen */ |
449 | } |
450 | |
451 | /* |
452 | * UTF8 ---> local code |
453 | * |
454 | * utf: input string in UTF8 encoding (need not be null-terminated) |
455 | * len: length of input string (in bytes) |
456 | * iso: pointer to the output area (must be large enough!) |
457 | (output string will be null-terminated) |
458 | * map: conversion map for single characters |
459 | * cmap: conversion map for combined characters |
460 | * (optional, pass NULL if none) |
461 | * cmapsize: number of entries in the conversion map for combined characters |
462 | * (optional, pass 0 if none) |
463 | * conv_func: algorithmic encoding conversion function |
464 | * (optional, pass NULL if none) |
465 | * encoding: PG identifier for the local encoding |
466 | * |
467 | * For each character, the cmap (if provided) is consulted first; if no match, |
468 | * the map is consulted next; if still no match, the conv_func (if provided) |
469 | * is applied. An error is raised if no match is found. |
470 | * |
471 | * See pg_wchar.h for more details about the data structures used here. |
472 | */ |
473 | void |
474 | UtfToLocal(const unsigned char *utf, int len, |
475 | unsigned char *iso, |
476 | const pg_mb_radix_tree *map, |
477 | const pg_utf_to_local_combined *cmap, int cmapsize, |
478 | utf_local_conversion_func conv_func, |
479 | int encoding) |
480 | { |
481 | uint32 iutf; |
482 | int l; |
483 | const pg_utf_to_local_combined *cp; |
484 | |
485 | if (!PG_VALID_ENCODING(encoding)) |
486 | ereport(ERROR, |
487 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
488 | errmsg("invalid encoding number: %d" , encoding))); |
489 | |
490 | for (; len > 0; len -= l) |
491 | { |
492 | unsigned char b1 = 0; |
493 | unsigned char b2 = 0; |
494 | unsigned char b3 = 0; |
495 | unsigned char b4 = 0; |
496 | |
497 | /* "break" cases all represent errors */ |
498 | if (*utf == '\0') |
499 | break; |
500 | |
501 | l = pg_utf_mblen(utf); |
502 | if (len < l) |
503 | break; |
504 | |
505 | if (!pg_utf8_islegal(utf, l)) |
506 | break; |
507 | |
508 | if (l == 1) |
509 | { |
510 | /* ASCII case is easy, assume it's one-to-one conversion */ |
511 | *iso++ = *utf++; |
512 | continue; |
513 | } |
514 | |
515 | /* collect coded char of length l */ |
516 | if (l == 2) |
517 | { |
518 | b3 = *utf++; |
519 | b4 = *utf++; |
520 | } |
521 | else if (l == 3) |
522 | { |
523 | b2 = *utf++; |
524 | b3 = *utf++; |
525 | b4 = *utf++; |
526 | } |
527 | else if (l == 4) |
528 | { |
529 | b1 = *utf++; |
530 | b2 = *utf++; |
531 | b3 = *utf++; |
532 | b4 = *utf++; |
533 | } |
534 | else |
535 | { |
536 | elog(ERROR, "unsupported character length %d" , l); |
537 | iutf = 0; /* keep compiler quiet */ |
538 | } |
539 | iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); |
540 | |
541 | /* First, try with combined map if possible */ |
542 | if (cmap && len > l) |
543 | { |
544 | const unsigned char *utf_save = utf; |
545 | int len_save = len; |
546 | int l_save = l; |
547 | |
548 | /* collect next character, same as above */ |
549 | len -= l; |
550 | |
551 | l = pg_utf_mblen(utf); |
552 | if (len < l) |
553 | break; |
554 | |
555 | if (!pg_utf8_islegal(utf, l)) |
556 | break; |
557 | |
558 | /* We assume ASCII character cannot be in combined map */ |
559 | if (l > 1) |
560 | { |
561 | uint32 iutf2; |
562 | uint32 cutf[2]; |
563 | |
564 | if (l == 2) |
565 | { |
566 | iutf2 = *utf++ << 8; |
567 | iutf2 |= *utf++; |
568 | } |
569 | else if (l == 3) |
570 | { |
571 | iutf2 = *utf++ << 16; |
572 | iutf2 |= *utf++ << 8; |
573 | iutf2 |= *utf++; |
574 | } |
575 | else if (l == 4) |
576 | { |
577 | iutf2 = *utf++ << 24; |
578 | iutf2 |= *utf++ << 16; |
579 | iutf2 |= *utf++ << 8; |
580 | iutf2 |= *utf++; |
581 | } |
582 | else |
583 | { |
584 | elog(ERROR, "unsupported character length %d" , l); |
585 | iutf2 = 0; /* keep compiler quiet */ |
586 | } |
587 | |
588 | cutf[0] = iutf; |
589 | cutf[1] = iutf2; |
590 | |
591 | cp = bsearch(cutf, cmap, cmapsize, |
592 | sizeof(pg_utf_to_local_combined), compare3); |
593 | |
594 | if (cp) |
595 | { |
596 | iso = store_coded_char(iso, cp->code); |
597 | continue; |
598 | } |
599 | } |
600 | |
601 | /* fail, so back up to reprocess second character next time */ |
602 | utf = utf_save; |
603 | len = len_save; |
604 | l = l_save; |
605 | } |
606 | |
607 | /* Now check ordinary map */ |
608 | if (map) |
609 | { |
610 | uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); |
611 | |
612 | if (converted) |
613 | { |
614 | iso = store_coded_char(iso, converted); |
615 | continue; |
616 | } |
617 | } |
618 | |
619 | /* if there's a conversion function, try that */ |
620 | if (conv_func) |
621 | { |
622 | uint32 converted = (*conv_func) (iutf); |
623 | |
624 | if (converted) |
625 | { |
626 | iso = store_coded_char(iso, converted); |
627 | continue; |
628 | } |
629 | } |
630 | |
631 | /* failed to translate this character */ |
632 | report_untranslatable_char(PG_UTF8, encoding, |
633 | (const char *) (utf - l), len); |
634 | } |
635 | |
636 | /* if we broke out of loop early, must be invalid input */ |
637 | if (len > 0) |
638 | report_invalid_encoding(PG_UTF8, (const char *) utf, len); |
639 | |
640 | *iso = '\0'; |
641 | } |
642 | |
643 | /* |
644 | * local code ---> UTF8 |
645 | * |
646 | * iso: input string in local encoding (need not be null-terminated) |
647 | * len: length of input string (in bytes) |
648 | * utf: pointer to the output area (must be large enough!) |
649 | (output string will be null-terminated) |
650 | * map: conversion map for single characters |
651 | * cmap: conversion map for combined characters |
652 | * (optional, pass NULL if none) |
653 | * cmapsize: number of entries in the conversion map for combined characters |
654 | * (optional, pass 0 if none) |
655 | * conv_func: algorithmic encoding conversion function |
656 | * (optional, pass NULL if none) |
657 | * encoding: PG identifier for the local encoding |
658 | * |
659 | * For each character, the map is consulted first; if no match, the cmap |
660 | * (if provided) is consulted next; if still no match, the conv_func |
661 | * (if provided) is applied. An error is raised if no match is found. |
662 | * |
663 | * See pg_wchar.h for more details about the data structures used here. |
664 | */ |
665 | void |
666 | LocalToUtf(const unsigned char *iso, int len, |
667 | unsigned char *utf, |
668 | const pg_mb_radix_tree *map, |
669 | const pg_local_to_utf_combined *cmap, int cmapsize, |
670 | utf_local_conversion_func conv_func, |
671 | int encoding) |
672 | { |
673 | uint32 iiso; |
674 | int l; |
675 | const pg_local_to_utf_combined *cp; |
676 | |
677 | if (!PG_VALID_ENCODING(encoding)) |
678 | ereport(ERROR, |
679 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
680 | errmsg("invalid encoding number: %d" , encoding))); |
681 | |
682 | for (; len > 0; len -= l) |
683 | { |
684 | unsigned char b1 = 0; |
685 | unsigned char b2 = 0; |
686 | unsigned char b3 = 0; |
687 | unsigned char b4 = 0; |
688 | |
689 | /* "break" cases all represent errors */ |
690 | if (*iso == '\0') |
691 | break; |
692 | |
693 | if (!IS_HIGHBIT_SET(*iso)) |
694 | { |
695 | /* ASCII case is easy, assume it's one-to-one conversion */ |
696 | *utf++ = *iso++; |
697 | l = 1; |
698 | continue; |
699 | } |
700 | |
701 | l = pg_encoding_verifymb(encoding, (const char *) iso, len); |
702 | if (l < 0) |
703 | break; |
704 | |
705 | /* collect coded char of length l */ |
706 | if (l == 1) |
707 | b4 = *iso++; |
708 | else if (l == 2) |
709 | { |
710 | b3 = *iso++; |
711 | b4 = *iso++; |
712 | } |
713 | else if (l == 3) |
714 | { |
715 | b2 = *iso++; |
716 | b3 = *iso++; |
717 | b4 = *iso++; |
718 | } |
719 | else if (l == 4) |
720 | { |
721 | b1 = *iso++; |
722 | b2 = *iso++; |
723 | b3 = *iso++; |
724 | b4 = *iso++; |
725 | } |
726 | else |
727 | { |
728 | elog(ERROR, "unsupported character length %d" , l); |
729 | iiso = 0; /* keep compiler quiet */ |
730 | } |
731 | iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4); |
732 | |
733 | if (map) |
734 | { |
735 | uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); |
736 | |
737 | if (converted) |
738 | { |
739 | utf = store_coded_char(utf, converted); |
740 | continue; |
741 | } |
742 | |
743 | /* If there's a combined character map, try that */ |
744 | if (cmap) |
745 | { |
746 | cp = bsearch(&iiso, cmap, cmapsize, |
747 | sizeof(pg_local_to_utf_combined), compare4); |
748 | |
749 | if (cp) |
750 | { |
751 | utf = store_coded_char(utf, cp->utf1); |
752 | utf = store_coded_char(utf, cp->utf2); |
753 | continue; |
754 | } |
755 | } |
756 | } |
757 | |
758 | /* if there's a conversion function, try that */ |
759 | if (conv_func) |
760 | { |
761 | uint32 converted = (*conv_func) (iiso); |
762 | |
763 | if (converted) |
764 | { |
765 | utf = store_coded_char(utf, converted); |
766 | continue; |
767 | } |
768 | } |
769 | |
770 | /* failed to translate this character */ |
771 | report_untranslatable_char(encoding, PG_UTF8, |
772 | (const char *) (iso - l), len); |
773 | } |
774 | |
775 | /* if we broke out of loop early, must be invalid input */ |
776 | if (len > 0) |
777 | report_invalid_encoding(encoding, (const char *) iso, len); |
778 | |
779 | *utf = '\0'; |
780 | } |
781 | |