mbutils.c source code [PostgreSQL/src/backend/utils/mb/mbutils.c]

/*-------------------------------------------------------------------------
2	*
3	* mbutils.c
4	* This file contains functions for encoding conversion.
5	*
6	* The string-conversion functions in this file share some API quirks.
7	* Note the following:
8	*
9	* The functions return a palloc'd, null-terminated string if conversion
10	* is required. However, if no conversion is performed, the given source
11	* string pointer is returned as-is.
12	*
13	* Although the presence of a length argument means that callers can pass
14	* non-null-terminated strings, care is required because the same string
15	* will be passed back if no conversion occurs. Such callers must check
16	* whether result == src and handle that case differently.
17	*
18	* If the source and destination encodings are the same, the source string
19	* is returned without any verification; it's assumed to be valid data.
20	* If that might not be the case, the caller is responsible for validating
21	* the string using a separate call to pg_verify_mbstr(). Whenever the
22	* source and destination encodings are different, the functions ensure that
23	* the result is validly encoded according to the destination encoding.
24	*
25	*
26	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
27	* Portions Copyright (c) 1994, Regents of the University of California
28	*
29	*
30	* IDENTIFICATION
31	* src/backend/utils/mb/mbutils.c
32	*
33	*-------------------------------------------------------------------------
34	*/
35	#include "postgres.h"
36
37	#include "access/xact.h"
38	#include "catalog/namespace.h"
39	#include "mb/pg_wchar.h"
40	#include "utils/builtins.h"
41	#include "utils/memutils.h"
42	#include "utils/syscache.h"
43
44	/*
45	* We maintain a simple linked list caching the fmgr lookup info for the
46	* currently selected conversion functions, as well as any that have been
47	* selected previously in the current session. (We remember previous
48	* settings because we must be able to restore a previous setting during
49	* transaction rollback, without doing any fresh catalog accesses.)
50	*
51	* Since we'll never release this data, we just keep it in TopMemoryContext.
52	*/
53	typedef struct ConvProcInfo
54	{
55	int s_encoding; / server and client encoding IDs /
56	int c_encoding;
57	FmgrInfo to_server_info; / lookup info for conversion procs /
58	FmgrInfo to_client_info;
59	} ConvProcInfo;
60
61	static List ConvProcList = NIL; /* List of ConvProcInfo /
62
63	/*
64	* These variables point to the currently active conversion functions,
65	* or are NULL when no conversion is needed.
66	*/
67	static FmgrInfo *ToServerConvProc = NULL;
68	static FmgrInfo *ToClientConvProc = NULL;
69
70	/*
71	* These variables track the currently-selected encodings.
72	*/
73	static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
74	static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
75	static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
76
77	/*
78	* During backend startup we can't set client encoding because we (a)
79	* can't look up the conversion functions, and (b) may not know the database
80	* encoding yet either. So SetClientEncoding() just accepts anything and
81	* remembers it for InitializeClientEncoding() to apply later.
82	*/
83	static bool backend_startup_complete = false;
84	static int pending_client_encoding = PG_SQL_ASCII;
85
86
87	/ Internal functions /
88	static char perform_default_encoding_conversion(const* char *src,
89	int len, bool is_client_to_server);
90	static int cliplen(const char str, int* len, int limit);
91
92
93	/*
94	* Prepare for a future call to SetClientEncoding. Success should mean
95	* that SetClientEncoding is guaranteed to succeed for this encoding request.
96	*
97	* (But note that success before backend_startup_complete does not guarantee
98	* success after ...)
99	*
100	* Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
101	*/
102	int
103	PrepareClientEncoding(int encoding)
104	{
105	int current_server_encoding;
106	ListCell *lc;
107
108	if (!PG_VALID_FE_ENCODING(encoding))
109	return -`1`;
110
111	/ Can't do anything during startup, per notes above /
112	if (!backend_startup_complete)
113	return `0`;
114
115	current_server_encoding = GetDatabaseEncoding();
116
117	/*
118	* Check for cases that require no conversion function.
119	*/
120	if (current_server_encoding == encoding \|\|
121	current_server_encoding == PG_SQL_ASCII \|\|
122	encoding == PG_SQL_ASCII)
123	return `0`;
124
125	if (IsTransactionState())
126	{
127	/*
128	* If we're in a live transaction, it's safe to access the catalogs,
129	* so look up the functions. We repeat the lookup even if the info is
130	* already cached, so that we can react to changes in the contents of
131	* pg_conversion.
132	*/
133	Oid to_server_proc,
134	to_client_proc;
135	ConvProcInfo *convinfo;
136	MemoryContext oldcontext;
137
138	to_server_proc = FindDefaultConversionProc(encoding,
139	current_server_encoding);
140	if (!OidIsValid(to_server_proc))
141	return -`1`;
142	to_client_proc = FindDefaultConversionProc(current_server_encoding,
143	encoding);
144	if (!OidIsValid(to_client_proc))
145	return -`1`;
146
147	/*
148	* Load the fmgr info into TopMemoryContext (could still fail here)
149	*/
150	convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
151	sizeof(ConvProcInfo));
152	convinfo->s_encoding = current_server_encoding;
153	convinfo->c_encoding = encoding;
154	fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
155	TopMemoryContext);
156	fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
157	TopMemoryContext);
158
159	/ Attach new info to head of list /
160	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
161	ConvProcList = lcons(convinfo, ConvProcList);
162	MemoryContextSwitchTo(oldcontext);
163
164	/*
165	* We cannot yet remove any older entry for the same encoding pair,
166	* since it could still be in use. SetClientEncoding will clean up.
167	*/
168
169	return `0`; / success /
170	}
171	else
172	{
173	/*
174	* If we're not in a live transaction, the only thing we can do is
175	* restore a previous setting using the cache. This covers all
176	* transaction-rollback cases. The only case it might not work for is
177	* trying to change client_encoding on the fly by editing
178	* postgresql.conf and SIGHUP'ing. Which would probably be a stupid
179	* thing to do anyway.
180	*/
181	foreach(lc, ConvProcList)
182	{
183	ConvProcInfo oldinfo = (ConvProcInfo ) lfirst(lc);
184
185	if (oldinfo->s_encoding == current_server_encoding &&
186	oldinfo->c_encoding == encoding)
187	return `0`;
188	}
189
190	return -`1`; / it's not cached, so fail /
191	}
192	}
193
194	/*
195	* Set the active client encoding and set up the conversion-function pointers.
196	* PrepareClientEncoding should have been called previously for this encoding.
197	*
198	* Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
199	*/
200	int
201	SetClientEncoding(int encoding)
202	{
203	int current_server_encoding;
204	bool found;
205	ListCell *lc;
206	ListCell *prev;
207	ListCell *next;
208
209	if (!PG_VALID_FE_ENCODING(encoding))
210	return -`1`;
211
212	/ Can't do anything during startup, per notes above /
213	if (!backend_startup_complete)
214	{
215	pending_client_encoding = encoding;
216	return `0`;
217	}
218
219	current_server_encoding = GetDatabaseEncoding();
220
221	/*
222	* Check for cases that require no conversion function.
223	*/
224	if (current_server_encoding == encoding \|\|
225	current_server_encoding == PG_SQL_ASCII \|\|
226	encoding == PG_SQL_ASCII)
227	{
228	ClientEncoding = &pg_enc2name_tbl[encoding];
229	ToServerConvProc = NULL;
230	ToClientConvProc = NULL;
231	return `0`;
232	}
233
234	/*
235	* Search the cache for the entry previously prepared by
236	* PrepareClientEncoding; if there isn't one, we lose. While at it,
237	* release any duplicate entries so that repeated Prepare/Set cycles don't
238	* leak memory.
239	*/
240	found = false;
241	prev = NULL;
242	for (lc = list_head(ConvProcList); lc; lc = next)
243	{
244	ConvProcInfo convinfo = (ConvProcInfo ) lfirst(lc);
245
246	next = lnext(lc);
247
248	if (convinfo->s_encoding == current_server_encoding &&
249	convinfo->c_encoding == encoding)
250	{
251	if (!found)
252	{
253	/ Found newest entry, so set up /
254	ClientEncoding = &pg_enc2name_tbl[encoding];
255	ToServerConvProc = &convinfo->to_server_info;
256	ToClientConvProc = &convinfo->to_client_info;
257	found = true;
258	}
259	else
260	{
261	/ Duplicate entry, release it /
262	ConvProcList = list_delete_cell(ConvProcList, lc, prev);
263	pfree(convinfo);
264	continue; / prev mustn't advance /
265	}
266	}
267
268	prev = lc;
269	}
270
271	if (found)
272	return `0`; / success /
273	else
274	return -`1`; / it's not cached, so fail /
275	}
276
277	/*
278	* Initialize client encoding conversions.
279	* Called from InitPostgres() once during backend startup.
280	*/
281	void
282	InitializeClientEncoding(void)
283	{
284	Assert(!backend_startup_complete);
285	backend_startup_complete = true;
286
287	if (PrepareClientEncoding(pending_client_encoding) < `0` \|\|
288	SetClientEncoding(pending_client_encoding) < `0`)
289	{
290	/*
291	* Oops, the requested conversion is not available. We couldn't fail
292	* before, but we can now.
293	*/
294	ereport(FATAL,
295	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
296	errmsg("conversion between %s and %s is not supported",
297	pg_enc2name_tbl[pending_client_encoding].name,
298	GetDatabaseEncodingName())));
299	}
300	}
301
302	/*
303	* returns the current client encoding
304	*/
305	int
306	pg_get_client_encoding(void)
307	{
308	return ClientEncoding->encoding;
309	}
310
311	/*
312	* returns the current client encoding name
313	*/
314	const char *
315	pg_get_client_encoding_name(void)
316	{
317	return ClientEncoding->name;
318	}
319
320	/*
321	* Convert src string to another encoding (general case).
322	*
323	* See the notes about string conversion functions at the top of this file.
324	*/
325	unsigned char *
326	pg_do_encoding_conversion(unsigned char src, int* len,
327	int src_encoding, int dest_encoding)
328	{
329	unsigned char *result;
330	Oid proc;
331
332	if (len <= `0`)
333	return src; / empty string is always valid /
334
335	if (src_encoding == dest_encoding)
336	return src; / no conversion required, assume valid /
337
338	if (dest_encoding == PG_SQL_ASCII)
339	return src; / any string is valid in SQL_ASCII /
340
341	if (src_encoding == PG_SQL_ASCII)
342	{
343	/ No conversion is possible, but we must validate the result /
344	(void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
345	return src;
346	}
347
348	if (!IsTransactionState()) / shouldn't happen /
349	elog(ERROR, "cannot perform encoding conversion outside a transaction");
350
351	proc = FindDefaultConversionProc(src_encoding, dest_encoding);
352	if (!OidIsValid(proc))
353	ereport(ERROR,
354	(errcode(ERRCODE_UNDEFINED_FUNCTION),
355	errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
356	pg_encoding_to_char(src_encoding),
357	pg_encoding_to_char(dest_encoding))));
358
359	/*
360	* Allocate space for conversion result, being wary of integer overflow
361	*/
362	if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
363	ereport(ERROR,
364	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
365	errmsg("out of memory"),
366	errdetail("String of %d bytes is too long for encoding conversion.",
367	len)));
368
369	result = palloc(len * MAX_CONVERSION_GROWTH + `1`);
370
371	OidFunctionCall5(proc,
372	Int32GetDatum(src_encoding),
373	Int32GetDatum(dest_encoding),
374	CStringGetDatum(src),
375	CStringGetDatum(result),
376	Int32GetDatum(len));
377	return result;
378	}
379
380	/*
381	* Convert string to encoding encoding_name. The source
382	* encoding is the DB encoding.
383	*
384	* BYTEA convert_to(TEXT string, NAME encoding_name) */
385	Datum
386	pg_convert_to(PG_FUNCTION_ARGS)
387	{
388	Datum string = PG_GETARG_DATUM(`0`);
389	Datum dest_encoding_name = PG_GETARG_DATUM(`1`);
390	Datum src_encoding_name = DirectFunctionCall1(namein,
391	CStringGetDatum(DatabaseEncoding->name));
392	Datum result;
393
394	/*
395	* pg_convert expects a bytea as its first argument. We're passing it a
396	* text argument here, relying on the fact that they are both in fact
397	* varlena types, and thus structurally identical.
398	*/
399	result = DirectFunctionCall3(pg_convert, string,
400	src_encoding_name, dest_encoding_name);
401
402	PG_RETURN_DATUM(result);
403	}
404
405	/*
406	* Convert string from encoding encoding_name. The destination
407	* encoding is the DB encoding.
408	*
409	* TEXT convert_from(BYTEA string, NAME encoding_name) */
410	Datum
411	pg_convert_from(PG_FUNCTION_ARGS)
412	{
413	Datum string = PG_GETARG_DATUM(`0`);
414	Datum src_encoding_name = PG_GETARG_DATUM(`1`);
415	Datum dest_encoding_name = DirectFunctionCall1(namein,
416	CStringGetDatum(DatabaseEncoding->name));
417	Datum result;
418
419	result = DirectFunctionCall3(pg_convert, string,
420	src_encoding_name, dest_encoding_name);
421
422	/*
423	* pg_convert returns a bytea, which we in turn return as text, relying on
424	* the fact that they are both in fact varlena types, and thus
425	* structurally identical. Although not all bytea values are valid text,
426	* in this case it will be because we've told pg_convert to return one
427	* that is valid as text in the current database encoding.
428	*/
429	PG_RETURN_DATUM(result);
430	}
431
432	/*
433	* Convert string between two arbitrary encodings.
434	*
435	* BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
436	*/
437	Datum
438	pg_convert(PG_FUNCTION_ARGS)
439	{
440	bytea *string = PG_GETARG_BYTEA_PP(`0`);
441	char src_encoding_name = NameStr(PG_GETARG_NAME(`1`));
442	int src_encoding = pg_char_to_encoding(src_encoding_name);
443	char dest_encoding_name = NameStr(PG_GETARG_NAME(`2`));
444	int dest_encoding = pg_char_to_encoding(dest_encoding_name);
445	const char *src_str;
446	char *dest_str;
447	bytea *retval;
448	int len;
449
450	if (src_encoding < `0`)
451	ereport(ERROR,
452	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
453	errmsg("invalid source encoding name \"%s\"",
454	src_encoding_name)));
455	if (dest_encoding < `0`)
456	ereport(ERROR,
457	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
458	errmsg("invalid destination encoding name \"%s\"",
459	dest_encoding_name)));
460
461	/ make sure that source string is valid /
462	len = VARSIZE_ANY_EXHDR(string);
463	src_str = VARDATA_ANY(string);
464	pg_verify_mbstr_len(src_encoding, src_str, len, false);
465
466	/ perform conversion /
467	dest_str = (char ) pg_do_encoding_conversion((unsigned* char ) unconstify(char* *, src_str),
468	len,
469	src_encoding,
470	dest_encoding);
471
472	/ update len if conversion actually happened /
473	if (dest_str != src_str)
474	len = strlen(dest_str);
475
476	/*
477	* build bytea data type structure.
478	*/
479	retval = (bytea *) palloc(len + VARHDRSZ);
480	SET_VARSIZE(retval, len + VARHDRSZ);
481	memcpy(VARDATA(retval), dest_str, len);
482
483	if (dest_str != src_str)
484	pfree(dest_str);
485
486	/ free memory if allocated by the toaster /
487	PG_FREE_IF_COPY(string, `0`);
488
489	PG_RETURN_BYTEA_P(retval);
490	}
491
492	/*
493	* get the length of the string considered as text in the specified
494	* encoding. Raises an error if the data is not valid in that
495	* encoding.
496	*
497	* INT4 length (BYTEA string, NAME src_encoding_name)
498	*/
499	Datum
500	length_in_encoding(PG_FUNCTION_ARGS)
501	{
502	bytea *string = PG_GETARG_BYTEA_PP(`0`);
503	char src_encoding_name = NameStr(PG_GETARG_NAME(`1`));
504	int src_encoding = pg_char_to_encoding(src_encoding_name);
505	const char *src_str;
506	int len;
507	int retval;
508
509	if (src_encoding < `0`)
510	ereport(ERROR,
511	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
512	errmsg("invalid encoding name \"%s\"",
513	src_encoding_name)));
514
515	len = VARSIZE_ANY_EXHDR(string);
516	src_str = VARDATA_ANY(string);
517
518	retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
519
520	PG_RETURN_INT32(retval);
521	}
522
523	/*
524	* Get maximum multibyte character length in the specified encoding.
525	*
526	* Note encoding is specified numerically, not by name as above.
527	*/
528	Datum
529	pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
530	{
531	int encoding = PG_GETARG_INT32(`0`);
532
533	if (PG_VALID_ENCODING(encoding))
534	PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
535	else
536	PG_RETURN_NULL();
537	}
538
539	/*
540	* Convert client encoding to server encoding.
541	*
542	* See the notes about string conversion functions at the top of this file.
543	*/
544	char *
545	pg_client_to_server(const char s, int* len)
546	{
547	return pg_any_to_server(s, len, ClientEncoding->encoding);
548	}
549
550	/*
551	* Convert any encoding to server encoding.
552	*
553	* See the notes about string conversion functions at the top of this file.
554	*
555	* Unlike the other string conversion functions, this will apply validation
556	* even if encoding == DatabaseEncoding->encoding. This is because this is
557	* used to process data coming in from outside the database, and we never
558	* want to just assume validity.
559	*/
560	char *
561	pg_any_to_server(const char s, int* len, int encoding)
562	{
563	if (len <= `0`)
564	return unconstify(char , s); /* empty string is always valid /
565
566	if (encoding == DatabaseEncoding->encoding \|\|
567	encoding == PG_SQL_ASCII)
568	{
569	/*
570	* No conversion is needed, but we must still validate the data.
571	*/
572	(void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
573	return unconstify(char *, s);
574	}
575
576	if (DatabaseEncoding->encoding == PG_SQL_ASCII)
577	{
578	/*
579	* No conversion is possible, but we must still validate the data,
580	* because the client-side code might have done string escaping using
581	* the selected client_encoding. If the client encoding is ASCII-safe
582	* then we just do a straight validation under that encoding. For an
583	* ASCII-unsafe encoding we have a problem: we dare not pass such data
584	* to the parser but we have no way to convert it. We compromise by
585	* rejecting the data if it contains any non-ASCII characters.
586	*/
587	if (PG_VALID_BE_ENCODING(encoding))
588	(void) pg_verify_mbstr(encoding, s, len, false);
589	else
590	{
591	int i;
592
593	for (i = `0`; i < len; i++)
594	{
595	if (s[i] == `'\0'` \|\| IS_HIGHBIT_SET(s[i]))
596	ereport(ERROR,
597	(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
598	errmsg("invalid byte value for encoding \"%s\": 0x%02x",
599	pg_enc2name_tbl[PG_SQL_ASCII].name,
600	(unsigned char) s[i])));
601	}
602	}
603	return unconstify(char *, s);
604	}
605
606	/ Fast path if we can use cached conversion function /
607	if (encoding == ClientEncoding->encoding)
608	return perform_default_encoding_conversion(s, len, true);
609
610	/ General case ... will not work outside transactions /
611	return (char ) pg_do_encoding_conversion((unsigned* char ) unconstify(char* *, s),
612	len,
613	encoding,
614	DatabaseEncoding->encoding);
615	}
616
617	/*
618	* Convert server encoding to client encoding.
619	*
620	* See the notes about string conversion functions at the top of this file.
621	*/
622	char *
623	pg_server_to_client(const char s, int* len)
624	{
625	return pg_server_to_any(s, len, ClientEncoding->encoding);
626	}
627
628	/*
629	* Convert server encoding to any encoding.
630	*
631	* See the notes about string conversion functions at the top of this file.
632	*/
633	char *
634	pg_server_to_any(const char s, int* len, int encoding)
635	{
636	if (len <= `0`)
637	return unconstify(char , s); /* empty string is always valid /
638
639	if (encoding == DatabaseEncoding->encoding \|\|
640	encoding == PG_SQL_ASCII)
641	return unconstify(char , s); /* assume data is valid /
642
643	if (DatabaseEncoding->encoding == PG_SQL_ASCII)
644	{
645	/ No conversion is possible, but we must validate the result /
646	(void) pg_verify_mbstr(encoding, s, len, false);
647	return unconstify(char *, s);
648	}
649
650	/ Fast path if we can use cached conversion function /
651	if (encoding == ClientEncoding->encoding)
652	return perform_default_encoding_conversion(s, len, false);
653
654	/ General case ... will not work outside transactions /
655	return (char ) pg_do_encoding_conversion((unsigned* char ) unconstify(char* *, s),
656	len,
657	DatabaseEncoding->encoding,
658	encoding);
659	}
660
661	/*
662	* Perform default encoding conversion using cached FmgrInfo. Since
663	* this function does not access database at all, it is safe to call
664	* outside transactions. If the conversion has not been set up by
665	* SetClientEncoding(), no conversion is performed.
666	*/
667	static char *
668	perform_default_encoding_conversion(const char src, int* len,
669	bool is_client_to_server)
670	{
671	char *result;
672	int src_encoding,
673	dest_encoding;
674	FmgrInfo *flinfo;
675
676	if (is_client_to_server)
677	{
678	src_encoding = ClientEncoding->encoding;
679	dest_encoding = DatabaseEncoding->encoding;
680	flinfo = ToServerConvProc;
681	}
682	else
683	{
684	src_encoding = DatabaseEncoding->encoding;
685	dest_encoding = ClientEncoding->encoding;
686	flinfo = ToClientConvProc;
687	}
688
689	if (flinfo == NULL)
690	return unconstify(char *, src);
691
692	/*
693	* Allocate space for conversion result, being wary of integer overflow
694	*/
695	if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
696	ereport(ERROR,
697	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
698	errmsg("out of memory"),
699	errdetail("String of %d bytes is too long for encoding conversion.",
700	len)));
701
702	result = palloc(len * MAX_CONVERSION_GROWTH + `1`);
703
704	FunctionCall5(flinfo,
705	Int32GetDatum(src_encoding),
706	Int32GetDatum(dest_encoding),
707	CStringGetDatum(src),
708	CStringGetDatum(result),
709	Int32GetDatum(len));
710	return result;
711	}
712
713
714	/ convert a multibyte string to a wchar /
715	int
716	pg_mb2wchar(const char from, pg_wchar to)
717	{
718	return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
719	}
720
721	/ convert a multibyte string to a wchar with a limited length /
722	int
723	pg_mb2wchar_with_len(const char from, pg_wchar to, int len)
724	{
725	return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
726	}
727
728	/ same, with any encoding /
729	int
730	pg_encoding_mb2wchar_with_len(int encoding,
731	const char from, pg_wchar to, int len)
732	{
733	return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
734	}
735
736	/ convert a wchar string to a multibyte /
737	int
738	pg_wchar2mb(const pg_wchar from, char* *to)
739	{
740	return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
741	}
742
743	/ convert a wchar string to a multibyte with a limited length /
744	int
745	pg_wchar2mb_with_len(const pg_wchar from, char* to, int* len)
746	{
747	return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
748	}
749
750	/ same, with any encoding /
751	int
752	pg_encoding_wchar2mb_with_len(int encoding,
753	const pg_wchar from, char* to, int* len)
754	{
755	return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
756	}
757
758	/ returns the byte length of a multibyte character /
759	int
760	pg_mblen(const char *mbstr)
761	{
762	return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
763	}
764
765	/ returns the display length of a multibyte character /
766	int
767	pg_dsplen(const char *mbstr)
768	{
769	return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
770	}
771
/*
 * returns the length (counted in wchars) of a multibyte string
 *
 * mbstr must be null-terminated.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* step over one whole character per iteration */
	while (*mbstr)
	{
		mbstr += pg_mblen(mbstr);
		len++;
	}
	return len;
}
789
/*
 * returns the length (counted in wchars) of a multibyte string
 * (not necessarily NULL terminated)
 *
 * limit is the number of bytes available at mbstr; counting also stops at
 * a null byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			len = 0;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	while (limit > 0 && *mbstr)
	{
		int			l = pg_mblen(mbstr);

		limit -= l;
		mbstr += l;
		len++;
	}
	return len;
}
812
813	/*
814	* returns the byte length of a multibyte string
815	* (not necessarily NULL terminated)
816	* that is no longer than limit.
817	* this function does not break multibyte character boundary.
818	*/
819	int
820	pg_mbcliplen(const char mbstr, int* len, int limit)
821	{
822	return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
823	len, limit);
824	}
825
826	/*
827	* pg_mbcliplen with specified encoding
828	*/
829	int
830	pg_encoding_mbcliplen(int encoding, const char *mbstr,
831	int len, int limit)
832	{
833	mblen_converter mblen_fn;
834	int clen = `0`;
835	int l;
836
837	/ optimization for single byte encoding /
838	if (pg_encoding_max_length(encoding) == `1`)
839	return cliplen(mbstr, len, limit);
840
841	mblen_fn = pg_wchar_table[encoding].mblen;
842
843	while (len > `0` && *mbstr)
844	{
845	l = (mblen_fn) ((const* unsigned char *) mbstr);
846	if ((clen + l) > limit)
847	break;
848	clen += l;
849	if (clen == limit)
850	break;
851	len -= l;
852	mbstr += l;
853	}
854	return clen;
855	}
856
/*
 * Similar to pg_mbcliplen except the limit parameter specifies the
 * character length, not the byte length.
 *
 * Returns the number of bytes occupied by at most "limit" characters of
 * mbstr, looking at no more than "len" bytes and stopping at a null byte.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;
	int			nch = 0;
	int			l;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);
		nch++;
		if (nch > limit)
			break;
		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
884
/*
 * mbcliplen for any single-byte encoding
 *
 * Returns the number of bytes of str to keep: at most the smaller of len
 * and limit, and fewer if a null byte is encountered first.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			l = 0;

	/* never look past either bound */
	if (limit < len)
		len = limit;
	while (l < len && str[l])
		l++;
	return l;
}
896
897	void
898	SetDatabaseEncoding(int encoding)
899	{
900	if (!PG_VALID_BE_ENCODING(encoding))
901	elog(ERROR, "invalid database encoding: %d", encoding);
902
903	DatabaseEncoding = &pg_enc2name_tbl[encoding];
904	Assert(DatabaseEncoding->encoding == encoding);
905	}
906
907	void
908	SetMessageEncoding(int encoding)
909	{
910	/ Some calls happen before we can elog()! /
911	Assert(PG_VALID_ENCODING(encoding));
912
913	MessageEncoding = &pg_enc2name_tbl[encoding];
914	Assert(MessageEncoding->encoding == encoding);
915	}
916
917	#ifdef ENABLE_NLS
918	/*
919	* Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
920	* codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
921	* fail for gettext-internal causes like out-of-memory.
922	*/
923	static bool
924	raw_pg_bind_textdomain_codeset(const char domainname, int* encoding)
925	{
926	bool elog_ok = (CurrentMemoryContext != NULL);
927	int i;
928
929	for (i = `0`; pg_enc2gettext_tbl[i].name != NULL; i++)
930	{
931	if (pg_enc2gettext_tbl[i].encoding == encoding)
932	{
933	if (bind_textdomain_codeset(domainname,
934	pg_enc2gettext_tbl[i].name) != NULL)
935	return true;
936
937	if (elog_ok)
938	elog(LOG, "bind_textdomain_codeset failed");
939	else
940	write_stderr("bind_textdomain_codeset failed");
941
942	break;
943	}
944	}
945
946	return false;
947	}
948
949	/*
950	* Bind a gettext message domain to the codeset corresponding to the database
951	* encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
952	* Return the MessageEncoding implied by the new settings.
953	*
954	* On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
955	* When that matches the database encoding, we don't need to do anything. In
956	* CREATE DATABASE, we enforce or trust that the locale's codeset matches the
957	* database encoding, except for the C locale. (On Windows, we also permit a
958	* discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
959	* gettext to the right codeset.
960	*
961	* On Windows, gettext defaults to the Windows ANSI code page. This is a
962	* convenient departure for software that passes the strings to Windows ANSI
963	* APIs, but we don't do that. Compel gettext to use database encoding or,
964	* failing that, the LC_CTYPE encoding as it would on other platforms.
965	*
966	* This function is called before elog() and palloc() are usable.
967	*/
968	int
969	pg_bind_textdomain_codeset(const char *domainname)
970	{
971	bool elog_ok = (CurrentMemoryContext != NULL);
972	int encoding = GetDatabaseEncoding();
973	int new_msgenc;
974
975	#ifndef WIN32
976	const char *ctype = setlocale(LC_CTYPE, NULL);
977
978	if (pg_strcasecmp(ctype, "C") == `0` \|\| pg_strcasecmp(ctype, "POSIX") == `0`)
979	#endif
980	if (encoding != PG_SQL_ASCII &&
981	raw_pg_bind_textdomain_codeset(domainname, encoding))
982	return encoding;
983
984	new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
985	if (new_msgenc < `0`)
986	new_msgenc = PG_SQL_ASCII;
987
988	#ifdef WIN32
989	if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
990	/ On failure, the old message encoding remains valid. /
991	return GetMessageEncoding();
992	#endif
993
994	return new_msgenc;
995	}
996	#endif
997
998	/*
999	* The database encoding, also called the server encoding, represents the
1000	* encoding of data stored in text-like data types. Affected types include
1001	* cstring, text, varchar, name, xml, and json.
1002	*/
1003	int
1004	GetDatabaseEncoding(void)
1005	{
1006	return DatabaseEncoding->encoding;
1007	}
1008
1009	const char *
1010	GetDatabaseEncodingName(void)
1011	{
1012	return DatabaseEncoding->name;
1013	}
1014
1015	Datum
1016	getdatabaseencoding(PG_FUNCTION_ARGS)
1017	{
1018	return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1019	}
1020
1021	Datum
1022	pg_client_encoding(PG_FUNCTION_ARGS)
1023	{
1024	return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1025	}
1026
1027	/*
1028	* gettext() returns messages in this encoding. This often matches the
1029	* database encoding, but it differs for SQL_ASCII databases, for processes
1030	* not attached to a database, and under a database encoding lacking iconv
1031	* support (MULE_INTERNAL).
1032	*/
1033	int
1034	GetMessageEncoding(void)
1035	{
1036	return MessageEncoding->encoding;
1037	}
1038
1039	#ifdef WIN32
1040	/*
1041	* Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1042	* string. The character length is also passed to utf16len if not
1043	* null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1044	* should be ASCII-only; this will function as though MessageEncoding is UTF8.
1045	*/
1046	WCHAR *
1047	pgwin32_message_to_UTF16(const char str, int* len, int *utf16len)
1048	{
1049	int msgenc = GetMessageEncoding();
1050	WCHAR *utf16;
1051	int dstlen;
1052	UINT codepage;
1053
1054	if (msgenc == PG_SQL_ASCII)
1055	/ No conversion is possible, and SQL_ASCII is never utf16. /
1056	return NULL;
1057
1058	codepage = pg_enc2name_tbl[msgenc].codepage;
1059
1060	/*
1061	* Use MultiByteToWideChar directly if there is a corresponding codepage,
1062	* or double conversion through UTF8 if not. Double conversion is needed,
1063	* for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1064	*/
1065	if (codepage != `0`)
1066	{
1067	utf16 = (WCHAR ) palloc(sizeof(WCHAR) (len + `1`));
1068	dstlen = MultiByteToWideChar(codepage, `0`, str, len, utf16, len);
1069	utf16[dstlen] = (WCHAR) `0`;
1070	}
1071	else
1072	{
1073	char *utf8;
1074
1075	/*
1076	* XXX pg_do_encoding_conversion() requires a transaction. In the
1077	* absence of one, hope for the input to be valid UTF8.
1078	*/
1079	if (IsTransactionState())
1080	{
1081	utf8 = (char ) pg_do_encoding_conversion((unsigned* char *) str,
1082	len,
1083	msgenc,
1084	PG_UTF8);
1085	if (utf8 != str)
1086	len = strlen(utf8);
1087	}
1088	else
1089	utf8 = (char *) str;
1090
1091	utf16 = (WCHAR ) palloc(sizeof(WCHAR) (len + `1`));
1092	dstlen = MultiByteToWideChar(CP_UTF8, `0`, utf8, len, utf16, len);
1093	utf16[dstlen] = (WCHAR) `0`;
1094
1095	if (utf8 != str)
1096	pfree(utf8);
1097	}
1098
1099	if (dstlen == `0` && len > `0`)
1100	{
1101	pfree(utf16);
1102	return NULL; / error /
1103	}
1104
1105	if (utf16len)
1106	*utf16len = dstlen;
1107	return utf16;
1108	}
1109
1110	#endif
1111

Browse the source code of PostgreSQL/src/backend/utils/mb/mbutils.c