conv.c source code [PostgreSQL/src/backend/utils/mb/conv.c]

/*-------------------------------------------------------------------------
 *
 *	  Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mb/conv.c
 *
 *-------------------------------------------------------------------------
 */
13	#include "postgres.h"
14	#include "mb/pg_wchar.h"
15
16
17	/*
18	* local2local: a generic single byte charset encoding
19	* conversion between two ASCII-superset encodings.
20	*
21	* l points to the source string of length len
22	* p is the output area (must be large enough!)
23	* src_encoding is the PG identifier for the source encoding
24	* dest_encoding is the PG identifier for the target encoding
25	* tab holds conversion entries for the source charset
26	* starting from 128 (0x80). each entry in the table holds the corresponding
27	* code point for the target charset, or 0 if there is no equivalent code.
28	*/
29	void
30	local2local(const unsigned char *l,
31	unsigned char *p,
32	int len,
33	int src_encoding,
34	int dest_encoding,
35	const unsigned char *tab)
36	{
37	unsigned char c1,
38	c2;
39
40	while (len > `0`)
41	{
42	c1 = *l;
43	if (c1 == `0`)
44	report_invalid_encoding(src_encoding, (const char *) l, len);
45	if (!IS_HIGHBIT_SET(c1))
46	*p++ = c1;
47	else
48	{
49	c2 = tab[c1 - HIGHBIT];
50	if (c2)
51	*p++ = c2;
52	else
53	report_untranslatable_char(src_encoding, dest_encoding,
54	(const char *) l, len);
55	}
56	l++;
57	len--;
58	}
59	*p = `'\0'`;
60	}
61
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * Each high-bit-set input byte is emitted as the two bytes (lc, byte);
 * other bytes are copied as-is.  A zero byte in the input is passed to
 * report_invalid_encoding().  The output is null-terminated.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	int			c1;

	while (len > 0)
	{
		c1 = *l;
		if (c1 == 0)
			report_invalid_encoding(encoding, (const char *) l, len);
		/* non-ASCII bytes get the charset id as a leading byte */
		if (IS_HIGHBIT_SET(c1))
			*p++ = lc;
		*p++ = c1;
		l++;
		len--;
	}
	*p = '\0';
}
89
90	/*
91	* MIC ---> LATINn when the charset's local codes map directly to MIC
92	*
93	* mic points to the source string of length len
94	* p is the output area (must be large enough!)
95	* lc is the mule character set id for the local encoding
96	* encoding is the PG identifier for the local encoding
97	*/
98	void
99	mic2latin(const unsigned char mic, unsigned* char p, int* len,
100	int lc, int encoding)
101	{
102	int c1;
103
104	while (len > `0`)
105	{
106	c1 = *mic;
107	if (c1 == `0`)
108	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109	if (!IS_HIGHBIT_SET(c1))
110	{
111	/ easy for ASCII /
112	*p++ = c1;
113	mic++;
114	len--;
115	}
116	else
117	{
118	int l = pg_mic_mblen(mic);
119
120	if (len < l)
121	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122	len);
123	if (l != `2` \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[`1`]))
124	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125	(const char *) mic, len);
126	*p++ = mic[`1`];
127	mic += `2`;
128	len -= `2`;
129	}
130	}
131	*p = `'\0'`;
132	}
133
134
135	/*
136	* ASCII ---> MIC
137	*
138	* While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139	* characters, here we must take a hard line because we don't know
140	* the appropriate MIC equivalent.
141	*/
142	void
143	pg_ascii2mic(const unsigned char l, unsigned* char p, int* len)
144	{
145	int c1;
146
147	while (len > `0`)
148	{
149	c1 = *l;
150	if (c1 == `0` \|\| IS_HIGHBIT_SET(c1))
151	report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
152	*p++ = c1;
153	l++;
154	len--;
155	}
156	*p = `'\0'`;
157	}
158
159	/*
160	* MIC ---> ASCII
161	*/
162	void
163	pg_mic2ascii(const unsigned char mic, unsigned* char p, int* len)
164	{
165	int c1;
166
167	while (len > `0`)
168	{
169	c1 = *mic;
170	if (c1 == `0` \|\| IS_HIGHBIT_SET(c1))
171	report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
172	(const char *) mic, len);
173	*p++ = c1;
174	mic++;
175	len--;
176	}
177	*p = `'\0'`;
178	}
179
180	/*
181	* latin2mic_with_table: a generic single byte charset encoding
182	* conversion from a local charset to the mule internal code.
183	*
184	* l points to the source string of length len
185	* p is the output area (must be large enough!)
186	* lc is the mule character set id for the local encoding
187	* encoding is the PG identifier for the local encoding
188	* tab holds conversion entries for the local charset
189	* starting from 128 (0x80). each entry in the table holds the corresponding
190	* code point for the mule encoding, or 0 if there is no equivalent code.
191	*/
192	void
193	latin2mic_with_table(const unsigned char *l,
194	unsigned char *p,
195	int len,
196	int lc,
197	int encoding,
198	const unsigned char *tab)
199	{
200	unsigned char c1,
201	c2;
202
203	while (len > `0`)
204	{
205	c1 = *l;
206	if (c1 == `0`)
207	report_invalid_encoding(encoding, (const char *) l, len);
208	if (!IS_HIGHBIT_SET(c1))
209	*p++ = c1;
210	else
211	{
212	c2 = tab[c1 - HIGHBIT];
213	if (c2)
214	{
215	*p++ = lc;
216	*p++ = c2;
217	}
218	else
219	report_untranslatable_char(encoding, PG_MULE_INTERNAL,
220	(const char *) l, len);
221	}
222	l++;
223	len--;
224	}
225	*p = `'\0'`;
226	}
227
228	/*
229	* mic2latin_with_table: a generic single byte charset encoding
230	* conversion from the mule internal code to a local charset.
231	*
232	* mic points to the source string of length len
233	* p is the output area (must be large enough!)
234	* lc is the mule character set id for the local encoding
235	* encoding is the PG identifier for the local encoding
236	* tab holds conversion entries for the mule internal code's second byte,
237	* starting from 128 (0x80). each entry in the table holds the corresponding
238	* code point for the local charset, or 0 if there is no equivalent code.
239	*/
240	void
241	mic2latin_with_table(const unsigned char *mic,
242	unsigned char *p,
243	int len,
244	int lc,
245	int encoding,
246	const unsigned char *tab)
247	{
248	unsigned char c1,
249	c2;
250
251	while (len > `0`)
252	{
253	c1 = *mic;
254	if (c1 == `0`)
255	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256	if (!IS_HIGHBIT_SET(c1))
257	{
258	/ easy for ASCII /
259	*p++ = c1;
260	mic++;
261	len--;
262	}
263	else
264	{
265	int l = pg_mic_mblen(mic);
266
267	if (len < l)
268	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
269	len);
270	if (l != `2` \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[`1`]) \|\|
271	(c2 = tab[mic[`1`] - HIGHBIT]) == `0`)
272	{
273	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
274	(const char *) mic, len);
275	break; / keep compiler quiet /
276	}
277	*p++ = c2;
278	mic += `2`;
279	len -= `2`;
280	}
281	}
282	*p = `'\0'`;
283	}
284
285	/*
286	* comparison routine for bsearch()
287	* this routine is intended for combined UTF8 -> local code
288	*/
289	static int
290	compare3(const void p1, const* void *p2)
291	{
292	uint32 s1,
293	s2,
294	d1,
295	d2;
296
297	s1 = (const* uint32 *) p1;
298	s2 = ((const* uint32 *) p1 + `1`);
299	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
300	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
301	return (s1 > d1 \|\| (s1 == d1 && s2 > d2)) ? `1` : ((s1 == d1 && s2 == d2) ? `0` : -`1`);
302	}
303
304	/*
305	* comparison routine for bsearch()
306	* this routine is intended for local code -> combined UTF8
307	*/
308	static int
309	compare4(const void p1, const* void *p2)
310	{
311	uint32 v1,
312	v2;
313
314	v1 = (const* uint32 *) p1;
315	v2 = ((const pg_local_to_utf_combined *) p2)->code;
316	return (v1 > v2) ? `1` : ((v1 == v2) ? `0` : -`1`);
317	}
318
319	/*
320	* store 32bit character representation into multibyte stream
321	*/
322	static inline unsigned char *
323	store_coded_char(unsigned char *dest, uint32 code)
324	{
325	if (code & `0xff000000`)
326	*dest++ = code >> `24`;
327	if (code & `0x00ff0000`)
328	*dest++ = code >> `16`;
329	if (code & `0x0000ff00`)
330	*dest++ = code >> `8`;
331	if (code & `0x000000ff`)
332	*dest++ = code;
333	return dest;
334	}
335
336	/*
337	* Convert a character using a conversion radix tree.
338	*
339	* 'l' is the length of the input character in bytes, and b1-b4 are
340	* the input character's bytes.
341	*/
342	static inline uint32
343	pg_mb_radix_conv(const pg_mb_radix_tree *rt,
344	int l,
345	unsigned char b1,
346	unsigned char b2,
347	unsigned char b3,
348	unsigned char b4)
349	{
350	if (l == `4`)
351	{
352	/ 4-byte code /
353
354	/ check code validity /
355	if (b1 < rt->b4_1_lower \|\| b1 > rt->b4_1_upper \|\|
356	b2 < rt->b4_2_lower \|\| b2 > rt->b4_2_upper \|\|
357	b3 < rt->b4_3_lower \|\| b3 > rt->b4_3_upper \|\|
358	b4 < rt->b4_4_lower \|\| b4 > rt->b4_4_upper)
359	return `0`;
360
361	/ perform lookup /
362	if (rt->chars32)
363	{
364	uint32 idx = rt->b4root;
365
366	idx = rt->chars32[b1 + idx - rt->b4_1_lower];
367	idx = rt->chars32[b2 + idx - rt->b4_2_lower];
368	idx = rt->chars32[b3 + idx - rt->b4_3_lower];
369	return rt->chars32[b4 + idx - rt->b4_4_lower];
370	}
371	else
372	{
373	uint16 idx = rt->b4root;
374
375	idx = rt->chars16[b1 + idx - rt->b4_1_lower];
376	idx = rt->chars16[b2 + idx - rt->b4_2_lower];
377	idx = rt->chars16[b3 + idx - rt->b4_3_lower];
378	return rt->chars16[b4 + idx - rt->b4_4_lower];
379	}
380	}
381	else if (l == `3`)
382	{
383	/ 3-byte code /
384
385	/ check code validity /
386	if (b2 < rt->b3_1_lower \|\| b2 > rt->b3_1_upper \|\|
387	b3 < rt->b3_2_lower \|\| b3 > rt->b3_2_upper \|\|
388	b4 < rt->b3_3_lower \|\| b4 > rt->b3_3_upper)
389	return `0`;
390
391	/ perform lookup /
392	if (rt->chars32)
393	{
394	uint32 idx = rt->b3root;
395
396	idx = rt->chars32[b2 + idx - rt->b3_1_lower];
397	idx = rt->chars32[b3 + idx - rt->b3_2_lower];
398	return rt->chars32[b4 + idx - rt->b3_3_lower];
399	}
400	else
401	{
402	uint16 idx = rt->b3root;
403
404	idx = rt->chars16[b2 + idx - rt->b3_1_lower];
405	idx = rt->chars16[b3 + idx - rt->b3_2_lower];
406	return rt->chars16[b4 + idx - rt->b3_3_lower];
407	}
408	}
409	else if (l == `2`)
410	{
411	/ 2-byte code /
412
413	/ check code validity - first byte /
414	if (b3 < rt->b2_1_lower \|\| b3 > rt->b2_1_upper \|\|
415	b4 < rt->b2_2_lower \|\| b4 > rt->b2_2_upper)
416	return `0`;
417
418	/ perform lookup /
419	if (rt->chars32)
420	{
421	uint32 idx = rt->b2root;
422
423	idx = rt->chars32[b3 + idx - rt->b2_1_lower];
424	return rt->chars32[b4 + idx - rt->b2_2_lower];
425	}
426	else
427	{
428	uint16 idx = rt->b2root;
429
430	idx = rt->chars16[b3 + idx - rt->b2_1_lower];
431	return rt->chars16[b4 + idx - rt->b2_2_lower];
432	}
433	}
434	else if (l == `1`)
435	{
436	/ 1-byte code /
437
438	/ check code validity - first byte /
439	if (b4 < rt->b1_lower \|\| b4 > rt->b1_upper)
440	return `0`;
441
442	/ perform lookup /
443	if (rt->chars32)
444	return rt->chars32[b4 + rt->b1root - rt->b1_lower];
445	else
446	return rt->chars16[b4 + rt->b1root - rt->b1_lower];
447	}
448	return `0`; / shouldn't happen /
449	}
450
451	/*
452	* UTF8 ---> local code
453	*
454	* utf: input string in UTF8 encoding (need not be null-terminated)
455	* len: length of input string (in bytes)
456	* iso: pointer to the output area (must be large enough!)
457	(output string will be null-terminated)
458	* map: conversion map for single characters
459	* cmap: conversion map for combined characters
460	* (optional, pass NULL if none)
461	* cmapsize: number of entries in the conversion map for combined characters
462	* (optional, pass 0 if none)
463	* conv_func: algorithmic encoding conversion function
464	* (optional, pass NULL if none)
465	* encoding: PG identifier for the local encoding
466	*
467	* For each character, the cmap (if provided) is consulted first; if no match,
468	* the map is consulted next; if still no match, the conv_func (if provided)
469	* is applied. An error is raised if no match is found.
470	*
471	* See pg_wchar.h for more details about the data structures used here.
472	*/
473	void
474	UtfToLocal(const unsigned char utf, int* len,
475	unsigned char *iso,
476	const pg_mb_radix_tree *map,
477	const pg_utf_to_local_combined cmap, int* cmapsize,
478	utf_local_conversion_func conv_func,
479	int encoding)
480	{
481	uint32 iutf;
482	int l;
483	const pg_utf_to_local_combined *cp;
484
485	if (!PG_VALID_ENCODING(encoding))
486	ereport(ERROR,
487	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
488	errmsg("invalid encoding number: %d", encoding)));
489
490	for (; len > `0`; len -= l)
491	{
492	unsigned char b1 = `0`;
493	unsigned char b2 = `0`;
494	unsigned char b3 = `0`;
495	unsigned char b4 = `0`;
496
497	/ "break" cases all represent errors /
498	if (*utf == `'\0'`)
499	break;
500
501	l = pg_utf_mblen(utf);
502	if (len < l)
503	break;
504
505	if (!pg_utf8_islegal(utf, l))
506	break;
507
508	if (l == `1`)
509	{
510	/ ASCII case is easy, assume it's one-to-one conversion /
511	iso++ = utf++;
512	continue;
513	}
514
515	/ collect coded char of length l /
516	if (l == `2`)
517	{
518	b3 = *utf++;
519	b4 = *utf++;
520	}
521	else if (l == `3`)
522	{
523	b2 = *utf++;
524	b3 = *utf++;
525	b4 = *utf++;
526	}
527	else if (l == `4`)
528	{
529	b1 = *utf++;
530	b2 = *utf++;
531	b3 = *utf++;
532	b4 = *utf++;
533	}
534	else
535	{
536	elog(ERROR, "unsupported character length %d", l);
537	iutf = `0`; / keep compiler quiet /
538	}
539	iutf = (b1 << `24` \| b2 << `16` \| b3 << `8` \| b4);
540
541	/ First, try with combined map if possible /
542	if (cmap && len > l)
543	{
544	const unsigned char *utf_save = utf;
545	int len_save = len;
546	int l_save = l;
547
548	/ collect next character, same as above /
549	len -= l;
550
551	l = pg_utf_mblen(utf);
552	if (len < l)
553	break;
554
555	if (!pg_utf8_islegal(utf, l))
556	break;
557
558	/ We assume ASCII character cannot be in combined map /
559	if (l > `1`)
560	{
561	uint32 iutf2;
562	uint32 cutf[`2`];
563
564	if (l == `2`)
565	{
566	iutf2 = *utf++ << `8`;
567	iutf2 \|= *utf++;
568	}
569	else if (l == `3`)
570	{
571	iutf2 = *utf++ << `16`;
572	iutf2 \|= *utf++ << `8`;
573	iutf2 \|= *utf++;
574	}
575	else if (l == `4`)
576	{
577	iutf2 = *utf++ << `24`;
578	iutf2 \|= *utf++ << `16`;
579	iutf2 \|= *utf++ << `8`;
580	iutf2 \|= *utf++;
581	}
582	else
583	{
584	elog(ERROR, "unsupported character length %d", l);
585	iutf2 = `0`; / keep compiler quiet /
586	}
587
588	cutf[`0`] = iutf;
589	cutf[`1`] = iutf2;
590
591	cp = bsearch(cutf, cmap, cmapsize,
592	sizeof(pg_utf_to_local_combined), compare3);
593
594	if (cp)
595	{
596	iso = store_coded_char(iso, cp->code);
597	continue;
598	}
599	}
600
601	/ fail, so back up to reprocess second character next time /
602	utf = utf_save;
603	len = len_save;
604	l = l_save;
605	}
606
607	/ Now check ordinary map /
608	if (map)
609	{
610	uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
611
612	if (converted)
613	{
614	iso = store_coded_char(iso, converted);
615	continue;
616	}
617	}
618
619	/ if there's a conversion function, try that /
620	if (conv_func)
621	{
622	uint32 converted = (*conv_func) (iutf);
623
624	if (converted)
625	{
626	iso = store_coded_char(iso, converted);
627	continue;
628	}
629	}
630
631	/ failed to translate this character /
632	report_untranslatable_char(PG_UTF8, encoding,
633	(const char *) (utf - l), len);
634	}
635
636	/ if we broke out of loop early, must be invalid input /
637	if (len > `0`)
638	report_invalid_encoding(PG_UTF8, (const char *) utf, len);
639
640	*iso = `'\0'`;
641	}
642
643	/*
644	* local code ---> UTF8
645	*
646	* iso: input string in local encoding (need not be null-terminated)
647	* len: length of input string (in bytes)
648	* utf: pointer to the output area (must be large enough!)
649	(output string will be null-terminated)
650	* map: conversion map for single characters
651	* cmap: conversion map for combined characters
652	* (optional, pass NULL if none)
653	* cmapsize: number of entries in the conversion map for combined characters
654	* (optional, pass 0 if none)
655	* conv_func: algorithmic encoding conversion function
656	* (optional, pass NULL if none)
657	* encoding: PG identifier for the local encoding
658	*
659	* For each character, the map is consulted first; if no match, the cmap
660	* (if provided) is consulted next; if still no match, the conv_func
661	* (if provided) is applied. An error is raised if no match is found.
662	*
663	* See pg_wchar.h for more details about the data structures used here.
664	*/
665	void
666	LocalToUtf(const unsigned char iso, int* len,
667	unsigned char *utf,
668	const pg_mb_radix_tree *map,
669	const pg_local_to_utf_combined cmap, int* cmapsize,
670	utf_local_conversion_func conv_func,
671	int encoding)
672	{
673	uint32 iiso;
674	int l;
675	const pg_local_to_utf_combined *cp;
676
677	if (!PG_VALID_ENCODING(encoding))
678	ereport(ERROR,
679	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
680	errmsg("invalid encoding number: %d", encoding)));
681
682	for (; len > `0`; len -= l)
683	{
684	unsigned char b1 = `0`;
685	unsigned char b2 = `0`;
686	unsigned char b3 = `0`;
687	unsigned char b4 = `0`;
688
689	/ "break" cases all represent errors /
690	if (*iso == `'\0'`)
691	break;
692
693	if (!IS_HIGHBIT_SET(*iso))
694	{
695	/ ASCII case is easy, assume it's one-to-one conversion /
696	utf++ = iso++;
697	l = `1`;
698	continue;
699	}
700
701	l = pg_encoding_verifymb(encoding, (const char *) iso, len);
702	if (l < `0`)
703	break;
704
705	/ collect coded char of length l /
706	if (l == `1`)
707	b4 = *iso++;
708	else if (l == `2`)
709	{
710	b3 = *iso++;
711	b4 = *iso++;
712	}
713	else if (l == `3`)
714	{
715	b2 = *iso++;
716	b3 = *iso++;
717	b4 = *iso++;
718	}
719	else if (l == `4`)
720	{
721	b1 = *iso++;
722	b2 = *iso++;
723	b3 = *iso++;
724	b4 = *iso++;
725	}
726	else
727	{
728	elog(ERROR, "unsupported character length %d", l);
729	iiso = `0`; / keep compiler quiet /
730	}
731	iiso = (b1 << `24` \| b2 << `16` \| b3 << `8` \| b4);
732
733	if (map)
734	{
735	uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
736
737	if (converted)
738	{
739	utf = store_coded_char(utf, converted);
740	continue;
741	}
742
743	/ If there's a combined character map, try that /
744	if (cmap)
745	{
746	cp = bsearch(&iiso, cmap, cmapsize,
747	sizeof(pg_local_to_utf_combined), compare4);
748
749	if (cp)
750	{
751	utf = store_coded_char(utf, cp->utf1);
752	utf = store_coded_char(utf, cp->utf2);
753	continue;
754	}
755	}
756	}
757
758	/ if there's a conversion function, try that /
759	if (conv_func)
760	{
761	uint32 converted = (*conv_func) (iiso);
762
763	if (converted)
764	{
765	utf = store_coded_char(utf, converted);
766	continue;
767	}
768	}
769
770	/ failed to translate this character /
771	report_untranslatable_char(encoding, PG_UTF8,
772	(const char *) (iso - l), len);
773	}
774
775	/ if we broke out of loop early, must be invalid input /
776	if (len > `0`)
777	report_invalid_encoding(encoding, (const char *) iso, len);
778
779	*utf = `'\0'`;
780	}
781

Browse the source code of PostgreSQL/src/backend/utils/mb/conv.c