1/*-------------------------------------------------------------------------
2 *
3 * mbutils.c
4 * This file contains functions for encoding conversion.
5 *
6 * The string-conversion functions in this file share some API quirks.
7 * Note the following:
8 *
9 * The functions return a palloc'd, null-terminated string if conversion
10 * is required. However, if no conversion is performed, the given source
11 * string pointer is returned as-is.
12 *
13 * Although the presence of a length argument means that callers can pass
14 * non-null-terminated strings, care is required because the same string
15 * will be passed back if no conversion occurs. Such callers *must* check
16 * whether result == src and handle that case differently.
17 *
18 * If the source and destination encodings are the same, the source string
19 * is returned without any verification; it's assumed to be valid data.
20 * If that might not be the case, the caller is responsible for validating
21 * the string using a separate call to pg_verify_mbstr(). Whenever the
22 * source and destination encodings are different, the functions ensure that
23 * the result is validly encoded according to the destination encoding.
24 *
25 *
26 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/mb/mbutils.c
32 *
33 *-------------------------------------------------------------------------
34 */
35#include "postgres.h"
36
37#include "access/xact.h"
38#include "catalog/namespace.h"
39#include "mb/pg_wchar.h"
40#include "utils/builtins.h"
41#include "utils/memutils.h"
42#include "utils/syscache.h"
43
44/*
45 * We maintain a simple linked list caching the fmgr lookup info for the
46 * currently selected conversion functions, as well as any that have been
47 * selected previously in the current session. (We remember previous
48 * settings because we must be able to restore a previous setting during
49 * transaction rollback, without doing any fresh catalog accesses.)
50 *
51 * Since we'll never release this data, we just keep it in TopMemoryContext.
52 */
53typedef struct ConvProcInfo
54{
55 int s_encoding; /* server and client encoding IDs */
56 int c_encoding;
57 FmgrInfo to_server_info; /* lookup info for conversion procs */
58 FmgrInfo to_client_info;
59} ConvProcInfo;
60
61static List *ConvProcList = NIL; /* List of ConvProcInfo */
62
63/*
64 * These variables point to the currently active conversion functions,
65 * or are NULL when no conversion is needed.
66 */
67static FmgrInfo *ToServerConvProc = NULL;
68static FmgrInfo *ToClientConvProc = NULL;
69
70/*
71 * These variables track the currently-selected encodings.
72 */
73static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
74static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
75static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
76
77/*
78 * During backend startup we can't set client encoding because we (a)
79 * can't look up the conversion functions, and (b) may not know the database
80 * encoding yet either. So SetClientEncoding() just accepts anything and
81 * remembers it for InitializeClientEncoding() to apply later.
82 */
83static bool backend_startup_complete = false;
84static int pending_client_encoding = PG_SQL_ASCII;
85
86
87/* Internal functions */
88static char *perform_default_encoding_conversion(const char *src,
89 int len, bool is_client_to_server);
90static int cliplen(const char *str, int len, int limit);
91
92
93/*
94 * Prepare for a future call to SetClientEncoding. Success should mean
95 * that SetClientEncoding is guaranteed to succeed for this encoding request.
96 *
97 * (But note that success before backend_startup_complete does not guarantee
98 * success after ...)
99 *
100 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
101 */
102int
103PrepareClientEncoding(int encoding)
104{
105 int current_server_encoding;
106 ListCell *lc;
107
108 if (!PG_VALID_FE_ENCODING(encoding))
109 return -1;
110
111 /* Can't do anything during startup, per notes above */
112 if (!backend_startup_complete)
113 return 0;
114
115 current_server_encoding = GetDatabaseEncoding();
116
117 /*
118 * Check for cases that require no conversion function.
119 */
120 if (current_server_encoding == encoding ||
121 current_server_encoding == PG_SQL_ASCII ||
122 encoding == PG_SQL_ASCII)
123 return 0;
124
125 if (IsTransactionState())
126 {
127 /*
128 * If we're in a live transaction, it's safe to access the catalogs,
129 * so look up the functions. We repeat the lookup even if the info is
130 * already cached, so that we can react to changes in the contents of
131 * pg_conversion.
132 */
133 Oid to_server_proc,
134 to_client_proc;
135 ConvProcInfo *convinfo;
136 MemoryContext oldcontext;
137
138 to_server_proc = FindDefaultConversionProc(encoding,
139 current_server_encoding);
140 if (!OidIsValid(to_server_proc))
141 return -1;
142 to_client_proc = FindDefaultConversionProc(current_server_encoding,
143 encoding);
144 if (!OidIsValid(to_client_proc))
145 return -1;
146
147 /*
148 * Load the fmgr info into TopMemoryContext (could still fail here)
149 */
150 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
151 sizeof(ConvProcInfo));
152 convinfo->s_encoding = current_server_encoding;
153 convinfo->c_encoding = encoding;
154 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
155 TopMemoryContext);
156 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
157 TopMemoryContext);
158
159 /* Attach new info to head of list */
160 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
161 ConvProcList = lcons(convinfo, ConvProcList);
162 MemoryContextSwitchTo(oldcontext);
163
164 /*
165 * We cannot yet remove any older entry for the same encoding pair,
166 * since it could still be in use. SetClientEncoding will clean up.
167 */
168
169 return 0; /* success */
170 }
171 else
172 {
173 /*
174 * If we're not in a live transaction, the only thing we can do is
175 * restore a previous setting using the cache. This covers all
176 * transaction-rollback cases. The only case it might not work for is
177 * trying to change client_encoding on the fly by editing
178 * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
179 * thing to do anyway.
180 */
181 foreach(lc, ConvProcList)
182 {
183 ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
184
185 if (oldinfo->s_encoding == current_server_encoding &&
186 oldinfo->c_encoding == encoding)
187 return 0;
188 }
189
190 return -1; /* it's not cached, so fail */
191 }
192}
193
194/*
195 * Set the active client encoding and set up the conversion-function pointers.
196 * PrepareClientEncoding should have been called previously for this encoding.
197 *
198 * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
199 */
200int
201SetClientEncoding(int encoding)
202{
203 int current_server_encoding;
204 bool found;
205 ListCell *lc;
206 ListCell *prev;
207 ListCell *next;
208
209 if (!PG_VALID_FE_ENCODING(encoding))
210 return -1;
211
212 /* Can't do anything during startup, per notes above */
213 if (!backend_startup_complete)
214 {
215 pending_client_encoding = encoding;
216 return 0;
217 }
218
219 current_server_encoding = GetDatabaseEncoding();
220
221 /*
222 * Check for cases that require no conversion function.
223 */
224 if (current_server_encoding == encoding ||
225 current_server_encoding == PG_SQL_ASCII ||
226 encoding == PG_SQL_ASCII)
227 {
228 ClientEncoding = &pg_enc2name_tbl[encoding];
229 ToServerConvProc = NULL;
230 ToClientConvProc = NULL;
231 return 0;
232 }
233
234 /*
235 * Search the cache for the entry previously prepared by
236 * PrepareClientEncoding; if there isn't one, we lose. While at it,
237 * release any duplicate entries so that repeated Prepare/Set cycles don't
238 * leak memory.
239 */
240 found = false;
241 prev = NULL;
242 for (lc = list_head(ConvProcList); lc; lc = next)
243 {
244 ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
245
246 next = lnext(lc);
247
248 if (convinfo->s_encoding == current_server_encoding &&
249 convinfo->c_encoding == encoding)
250 {
251 if (!found)
252 {
253 /* Found newest entry, so set up */
254 ClientEncoding = &pg_enc2name_tbl[encoding];
255 ToServerConvProc = &convinfo->to_server_info;
256 ToClientConvProc = &convinfo->to_client_info;
257 found = true;
258 }
259 else
260 {
261 /* Duplicate entry, release it */
262 ConvProcList = list_delete_cell(ConvProcList, lc, prev);
263 pfree(convinfo);
264 continue; /* prev mustn't advance */
265 }
266 }
267
268 prev = lc;
269 }
270
271 if (found)
272 return 0; /* success */
273 else
274 return -1; /* it's not cached, so fail */
275}
276
277/*
278 * Initialize client encoding conversions.
279 * Called from InitPostgres() once during backend startup.
280 */
281void
282InitializeClientEncoding(void)
283{
284 Assert(!backend_startup_complete);
285 backend_startup_complete = true;
286
287 if (PrepareClientEncoding(pending_client_encoding) < 0 ||
288 SetClientEncoding(pending_client_encoding) < 0)
289 {
290 /*
291 * Oops, the requested conversion is not available. We couldn't fail
292 * before, but we can now.
293 */
294 ereport(FATAL,
295 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
296 errmsg("conversion between %s and %s is not supported",
297 pg_enc2name_tbl[pending_client_encoding].name,
298 GetDatabaseEncodingName())));
299 }
300}
301
302/*
303 * returns the current client encoding
304 */
305int
306pg_get_client_encoding(void)
307{
308 return ClientEncoding->encoding;
309}
310
311/*
312 * returns the current client encoding name
313 */
314const char *
315pg_get_client_encoding_name(void)
316{
317 return ClientEncoding->name;
318}
319
320/*
321 * Convert src string to another encoding (general case).
322 *
323 * See the notes about string conversion functions at the top of this file.
324 */
325unsigned char *
326pg_do_encoding_conversion(unsigned char *src, int len,
327 int src_encoding, int dest_encoding)
328{
329 unsigned char *result;
330 Oid proc;
331
332 if (len <= 0)
333 return src; /* empty string is always valid */
334
335 if (src_encoding == dest_encoding)
336 return src; /* no conversion required, assume valid */
337
338 if (dest_encoding == PG_SQL_ASCII)
339 return src; /* any string is valid in SQL_ASCII */
340
341 if (src_encoding == PG_SQL_ASCII)
342 {
343 /* No conversion is possible, but we must validate the result */
344 (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
345 return src;
346 }
347
348 if (!IsTransactionState()) /* shouldn't happen */
349 elog(ERROR, "cannot perform encoding conversion outside a transaction");
350
351 proc = FindDefaultConversionProc(src_encoding, dest_encoding);
352 if (!OidIsValid(proc))
353 ereport(ERROR,
354 (errcode(ERRCODE_UNDEFINED_FUNCTION),
355 errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
356 pg_encoding_to_char(src_encoding),
357 pg_encoding_to_char(dest_encoding))));
358
359 /*
360 * Allocate space for conversion result, being wary of integer overflow
361 */
362 if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
363 ereport(ERROR,
364 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
365 errmsg("out of memory"),
366 errdetail("String of %d bytes is too long for encoding conversion.",
367 len)));
368
369 result = palloc(len * MAX_CONVERSION_GROWTH + 1);
370
371 OidFunctionCall5(proc,
372 Int32GetDatum(src_encoding),
373 Int32GetDatum(dest_encoding),
374 CStringGetDatum(src),
375 CStringGetDatum(result),
376 Int32GetDatum(len));
377 return result;
378}
379
380/*
381 * Convert string to encoding encoding_name. The source
382 * encoding is the DB encoding.
383 *
384 * BYTEA convert_to(TEXT string, NAME encoding_name) */
385Datum
386pg_convert_to(PG_FUNCTION_ARGS)
387{
388 Datum string = PG_GETARG_DATUM(0);
389 Datum dest_encoding_name = PG_GETARG_DATUM(1);
390 Datum src_encoding_name = DirectFunctionCall1(namein,
391 CStringGetDatum(DatabaseEncoding->name));
392 Datum result;
393
394 /*
395 * pg_convert expects a bytea as its first argument. We're passing it a
396 * text argument here, relying on the fact that they are both in fact
397 * varlena types, and thus structurally identical.
398 */
399 result = DirectFunctionCall3(pg_convert, string,
400 src_encoding_name, dest_encoding_name);
401
402 PG_RETURN_DATUM(result);
403}
404
405/*
406 * Convert string from encoding encoding_name. The destination
407 * encoding is the DB encoding.
408 *
409 * TEXT convert_from(BYTEA string, NAME encoding_name) */
410Datum
411pg_convert_from(PG_FUNCTION_ARGS)
412{
413 Datum string = PG_GETARG_DATUM(0);
414 Datum src_encoding_name = PG_GETARG_DATUM(1);
415 Datum dest_encoding_name = DirectFunctionCall1(namein,
416 CStringGetDatum(DatabaseEncoding->name));
417 Datum result;
418
419 result = DirectFunctionCall3(pg_convert, string,
420 src_encoding_name, dest_encoding_name);
421
422 /*
423 * pg_convert returns a bytea, which we in turn return as text, relying on
424 * the fact that they are both in fact varlena types, and thus
425 * structurally identical. Although not all bytea values are valid text,
426 * in this case it will be because we've told pg_convert to return one
427 * that is valid as text in the current database encoding.
428 */
429 PG_RETURN_DATUM(result);
430}
431
432/*
433 * Convert string between two arbitrary encodings.
434 *
435 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
436 */
437Datum
438pg_convert(PG_FUNCTION_ARGS)
439{
440 bytea *string = PG_GETARG_BYTEA_PP(0);
441 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
442 int src_encoding = pg_char_to_encoding(src_encoding_name);
443 char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
444 int dest_encoding = pg_char_to_encoding(dest_encoding_name);
445 const char *src_str;
446 char *dest_str;
447 bytea *retval;
448 int len;
449
450 if (src_encoding < 0)
451 ereport(ERROR,
452 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
453 errmsg("invalid source encoding name \"%s\"",
454 src_encoding_name)));
455 if (dest_encoding < 0)
456 ereport(ERROR,
457 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
458 errmsg("invalid destination encoding name \"%s\"",
459 dest_encoding_name)));
460
461 /* make sure that source string is valid */
462 len = VARSIZE_ANY_EXHDR(string);
463 src_str = VARDATA_ANY(string);
464 pg_verify_mbstr_len(src_encoding, src_str, len, false);
465
466 /* perform conversion */
467 dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
468 len,
469 src_encoding,
470 dest_encoding);
471
472 /* update len if conversion actually happened */
473 if (dest_str != src_str)
474 len = strlen(dest_str);
475
476 /*
477 * build bytea data type structure.
478 */
479 retval = (bytea *) palloc(len + VARHDRSZ);
480 SET_VARSIZE(retval, len + VARHDRSZ);
481 memcpy(VARDATA(retval), dest_str, len);
482
483 if (dest_str != src_str)
484 pfree(dest_str);
485
486 /* free memory if allocated by the toaster */
487 PG_FREE_IF_COPY(string, 0);
488
489 PG_RETURN_BYTEA_P(retval);
490}
491
492/*
493 * get the length of the string considered as text in the specified
494 * encoding. Raises an error if the data is not valid in that
495 * encoding.
496 *
497 * INT4 length (BYTEA string, NAME src_encoding_name)
498 */
499Datum
500length_in_encoding(PG_FUNCTION_ARGS)
501{
502 bytea *string = PG_GETARG_BYTEA_PP(0);
503 char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
504 int src_encoding = pg_char_to_encoding(src_encoding_name);
505 const char *src_str;
506 int len;
507 int retval;
508
509 if (src_encoding < 0)
510 ereport(ERROR,
511 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
512 errmsg("invalid encoding name \"%s\"",
513 src_encoding_name)));
514
515 len = VARSIZE_ANY_EXHDR(string);
516 src_str = VARDATA_ANY(string);
517
518 retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
519
520 PG_RETURN_INT32(retval);
521}
522
523/*
524 * Get maximum multibyte character length in the specified encoding.
525 *
526 * Note encoding is specified numerically, not by name as above.
527 */
528Datum
529pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
530{
531 int encoding = PG_GETARG_INT32(0);
532
533 if (PG_VALID_ENCODING(encoding))
534 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
535 else
536 PG_RETURN_NULL();
537}
538
539/*
540 * Convert client encoding to server encoding.
541 *
542 * See the notes about string conversion functions at the top of this file.
543 */
544char *
545pg_client_to_server(const char *s, int len)
546{
547 return pg_any_to_server(s, len, ClientEncoding->encoding);
548}
549
550/*
551 * Convert any encoding to server encoding.
552 *
553 * See the notes about string conversion functions at the top of this file.
554 *
555 * Unlike the other string conversion functions, this will apply validation
556 * even if encoding == DatabaseEncoding->encoding. This is because this is
557 * used to process data coming in from outside the database, and we never
558 * want to just assume validity.
559 */
560char *
561pg_any_to_server(const char *s, int len, int encoding)
562{
563 if (len <= 0)
564 return unconstify(char *, s); /* empty string is always valid */
565
566 if (encoding == DatabaseEncoding->encoding ||
567 encoding == PG_SQL_ASCII)
568 {
569 /*
570 * No conversion is needed, but we must still validate the data.
571 */
572 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
573 return unconstify(char *, s);
574 }
575
576 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
577 {
578 /*
579 * No conversion is possible, but we must still validate the data,
580 * because the client-side code might have done string escaping using
581 * the selected client_encoding. If the client encoding is ASCII-safe
582 * then we just do a straight validation under that encoding. For an
583 * ASCII-unsafe encoding we have a problem: we dare not pass such data
584 * to the parser but we have no way to convert it. We compromise by
585 * rejecting the data if it contains any non-ASCII characters.
586 */
587 if (PG_VALID_BE_ENCODING(encoding))
588 (void) pg_verify_mbstr(encoding, s, len, false);
589 else
590 {
591 int i;
592
593 for (i = 0; i < len; i++)
594 {
595 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
596 ereport(ERROR,
597 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
598 errmsg("invalid byte value for encoding \"%s\": 0x%02x",
599 pg_enc2name_tbl[PG_SQL_ASCII].name,
600 (unsigned char) s[i])));
601 }
602 }
603 return unconstify(char *, s);
604 }
605
606 /* Fast path if we can use cached conversion function */
607 if (encoding == ClientEncoding->encoding)
608 return perform_default_encoding_conversion(s, len, true);
609
610 /* General case ... will not work outside transactions */
611 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
612 len,
613 encoding,
614 DatabaseEncoding->encoding);
615}
616
617/*
618 * Convert server encoding to client encoding.
619 *
620 * See the notes about string conversion functions at the top of this file.
621 */
622char *
623pg_server_to_client(const char *s, int len)
624{
625 return pg_server_to_any(s, len, ClientEncoding->encoding);
626}
627
628/*
629 * Convert server encoding to any encoding.
630 *
631 * See the notes about string conversion functions at the top of this file.
632 */
633char *
634pg_server_to_any(const char *s, int len, int encoding)
635{
636 if (len <= 0)
637 return unconstify(char *, s); /* empty string is always valid */
638
639 if (encoding == DatabaseEncoding->encoding ||
640 encoding == PG_SQL_ASCII)
641 return unconstify(char *, s); /* assume data is valid */
642
643 if (DatabaseEncoding->encoding == PG_SQL_ASCII)
644 {
645 /* No conversion is possible, but we must validate the result */
646 (void) pg_verify_mbstr(encoding, s, len, false);
647 return unconstify(char *, s);
648 }
649
650 /* Fast path if we can use cached conversion function */
651 if (encoding == ClientEncoding->encoding)
652 return perform_default_encoding_conversion(s, len, false);
653
654 /* General case ... will not work outside transactions */
655 return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
656 len,
657 DatabaseEncoding->encoding,
658 encoding);
659}
660
661/*
662 * Perform default encoding conversion using cached FmgrInfo. Since
663 * this function does not access database at all, it is safe to call
664 * outside transactions. If the conversion has not been set up by
665 * SetClientEncoding(), no conversion is performed.
666 */
667static char *
668perform_default_encoding_conversion(const char *src, int len,
669 bool is_client_to_server)
670{
671 char *result;
672 int src_encoding,
673 dest_encoding;
674 FmgrInfo *flinfo;
675
676 if (is_client_to_server)
677 {
678 src_encoding = ClientEncoding->encoding;
679 dest_encoding = DatabaseEncoding->encoding;
680 flinfo = ToServerConvProc;
681 }
682 else
683 {
684 src_encoding = DatabaseEncoding->encoding;
685 dest_encoding = ClientEncoding->encoding;
686 flinfo = ToClientConvProc;
687 }
688
689 if (flinfo == NULL)
690 return unconstify(char *, src);
691
692 /*
693 * Allocate space for conversion result, being wary of integer overflow
694 */
695 if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
696 ereport(ERROR,
697 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
698 errmsg("out of memory"),
699 errdetail("String of %d bytes is too long for encoding conversion.",
700 len)));
701
702 result = palloc(len * MAX_CONVERSION_GROWTH + 1);
703
704 FunctionCall5(flinfo,
705 Int32GetDatum(src_encoding),
706 Int32GetDatum(dest_encoding),
707 CStringGetDatum(src),
708 CStringGetDatum(result),
709 Int32GetDatum(len));
710 return result;
711}
712
713
714/* convert a multibyte string to a wchar */
715int
716pg_mb2wchar(const char *from, pg_wchar *to)
717{
718 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
719}
720
721/* convert a multibyte string to a wchar with a limited length */
722int
723pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
724{
725 return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
726}
727
728/* same, with any encoding */
729int
730pg_encoding_mb2wchar_with_len(int encoding,
731 const char *from, pg_wchar *to, int len)
732{
733 return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
734}
735
736/* convert a wchar string to a multibyte */
737int
738pg_wchar2mb(const pg_wchar *from, char *to)
739{
740 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
741}
742
743/* convert a wchar string to a multibyte with a limited length */
744int
745pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
746{
747 return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
748}
749
750/* same, with any encoding */
751int
752pg_encoding_wchar2mb_with_len(int encoding,
753 const pg_wchar *from, char *to, int len)
754{
755 return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
756}
757
758/* returns the byte length of a multibyte character */
759int
760pg_mblen(const char *mbstr)
761{
762 return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
763}
764
765/* returns the display length of a multibyte character */
766int
767pg_dsplen(const char *mbstr)
768{
769 return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
770}
771
772/* returns the length (counted in wchars) of a multibyte string */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* With a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise step through the string one character at a time */
	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
789
/*
 * returns the length (counted in wchars) of a multibyte string
 * (not necessarily NUL-terminated)
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars = 0;

	/* With a single-byte encoding the byte count is the answer */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	/* Count characters until the byte budget runs out or a zero byte is hit */
	for (; limit > 0 && *mbstr != '\0'; nchars++)
	{
		int			step = pg_mblen(mbstr);

		mbstr += step;
		limit -= step;
	}

	return nchars;
}
812
/*
 * Returns the byte length of the longest prefix of a multibyte string
 * (not necessarily NUL-terminated) that is no longer than limit bytes.
 * This function never splits a multibyte character.
 */
819int
820pg_mbcliplen(const char *mbstr, int len, int limit)
821{
822 return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
823 len, limit);
824}
825
826/*
827 * pg_mbcliplen with specified encoding
828 */
829int
830pg_encoding_mbcliplen(int encoding, const char *mbstr,
831 int len, int limit)
832{
833 mblen_converter mblen_fn;
834 int clen = 0;
835 int l;
836
837 /* optimization for single byte encoding */
838 if (pg_encoding_max_length(encoding) == 1)
839 return cliplen(mbstr, len, limit);
840
841 mblen_fn = pg_wchar_table[encoding].mblen;
842
843 while (len > 0 && *mbstr)
844 {
845 l = (*mblen_fn) ((const unsigned char *) mbstr);
846 if ((clen + l) > limit)
847 break;
848 clen += l;
849 if (clen == limit)
850 break;
851 len -= l;
852 mbstr += l;
853 }
854 return clen;
855}
856
857/*
858 * Similar to pg_mbcliplen except the limit parameter specifies the
859 * character length, not the byte length.
860 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			clen = 0;		/* bytes accepted so far */
	int			nch = 0;		/* characters examined so far */
	int			l;

	/* optimization for single byte encoding */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		l = pg_mblen(mbstr);
		nch++;

		/* Stop once the character budget is used up */
		if (nch > limit)
			break;

		/*
		 * If the input ends in the middle of a character, stop in front of
		 * it, so that the returned byte length can never exceed the number
		 * of bytes the caller actually supplied.
		 */
		if (l > len)
			break;

		clen += l;
		len -= l;
		mbstr += l;
	}
	return clen;
}
884
885/* mbcliplen for any single-byte encoding */
static int
cliplen(const char *str, int len, int limit)
{
	int			maxbytes = Min(len, limit);
	int			n;

	/* Advance until the byte budget is spent or a zero byte is found */
	for (n = 0; n < maxbytes; n++)
	{
		if (str[n] == '\0')
			break;
	}

	return n;
}
896
897void
898SetDatabaseEncoding(int encoding)
899{
900 if (!PG_VALID_BE_ENCODING(encoding))
901 elog(ERROR, "invalid database encoding: %d", encoding);
902
903 DatabaseEncoding = &pg_enc2name_tbl[encoding];
904 Assert(DatabaseEncoding->encoding == encoding);
905}
906
907void
908SetMessageEncoding(int encoding)
909{
910 /* Some calls happen before we can elog()! */
911 Assert(PG_VALID_ENCODING(encoding));
912
913 MessageEncoding = &pg_enc2name_tbl[encoding];
914 Assert(MessageEncoding->encoding == encoding);
915}
916
917#ifdef ENABLE_NLS
918/*
919 * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
920 * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
921 * fail for gettext-internal causes like out-of-memory.
922 */
923static bool
924raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
925{
926 bool elog_ok = (CurrentMemoryContext != NULL);
927 int i;
928
929 for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
930 {
931 if (pg_enc2gettext_tbl[i].encoding == encoding)
932 {
933 if (bind_textdomain_codeset(domainname,
934 pg_enc2gettext_tbl[i].name) != NULL)
935 return true;
936
937 if (elog_ok)
938 elog(LOG, "bind_textdomain_codeset failed");
939 else
940 write_stderr("bind_textdomain_codeset failed");
941
942 break;
943 }
944 }
945
946 return false;
947}
948
949/*
950 * Bind a gettext message domain to the codeset corresponding to the database
951 * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
952 * Return the MessageEncoding implied by the new settings.
953 *
954 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
955 * When that matches the database encoding, we don't need to do anything. In
956 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
957 * database encoding, except for the C locale. (On Windows, we also permit a
958 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
959 * gettext to the right codeset.
960 *
961 * On Windows, gettext defaults to the Windows ANSI code page. This is a
962 * convenient departure for software that passes the strings to Windows ANSI
963 * APIs, but we don't do that. Compel gettext to use database encoding or,
964 * failing that, the LC_CTYPE encoding as it would on other platforms.
965 *
966 * This function is called before elog() and palloc() are usable.
967 */
968int
969pg_bind_textdomain_codeset(const char *domainname)
970{
971 bool elog_ok = (CurrentMemoryContext != NULL);
972 int encoding = GetDatabaseEncoding();
973 int new_msgenc;
974
975#ifndef WIN32
976 const char *ctype = setlocale(LC_CTYPE, NULL);
977
978 if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
979#endif
980 if (encoding != PG_SQL_ASCII &&
981 raw_pg_bind_textdomain_codeset(domainname, encoding))
982 return encoding;
983
984 new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
985 if (new_msgenc < 0)
986 new_msgenc = PG_SQL_ASCII;
987
988#ifdef WIN32
989 if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
990 /* On failure, the old message encoding remains valid. */
991 return GetMessageEncoding();
992#endif
993
994 return new_msgenc;
995}
996#endif
997
998/*
999 * The database encoding, also called the server encoding, represents the
1000 * encoding of data stored in text-like data types. Affected types include
1001 * cstring, text, varchar, name, xml, and json.
1002 */
1003int
1004GetDatabaseEncoding(void)
1005{
1006 return DatabaseEncoding->encoding;
1007}
1008
1009const char *
1010GetDatabaseEncodingName(void)
1011{
1012 return DatabaseEncoding->name;
1013}
1014
1015Datum
1016getdatabaseencoding(PG_FUNCTION_ARGS)
1017{
1018 return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1019}
1020
1021Datum
1022pg_client_encoding(PG_FUNCTION_ARGS)
1023{
1024 return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1025}
1026
1027/*
1028 * gettext() returns messages in this encoding. This often matches the
1029 * database encoding, but it differs for SQL_ASCII databases, for processes
1030 * not attached to a database, and under a database encoding lacking iconv
1031 * support (MULE_INTERNAL).
1032 */
1033int
1034GetMessageEncoding(void)
1035{
1036 return MessageEncoding->encoding;
1037}
1038
1039#ifdef WIN32
1040/*
1041 * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1042 * string. The character length is also passed to utf16len if not
1043 * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1044 * should be ASCII-only; this will function as though MessageEncoding is UTF8.
1045 */
1046WCHAR *
1047pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1048{
1049 int msgenc = GetMessageEncoding();
1050 WCHAR *utf16;
1051 int dstlen;
1052 UINT codepage;
1053
1054 if (msgenc == PG_SQL_ASCII)
1055 /* No conversion is possible, and SQL_ASCII is never utf16. */
1056 return NULL;
1057
1058 codepage = pg_enc2name_tbl[msgenc].codepage;
1059
1060 /*
1061 * Use MultiByteToWideChar directly if there is a corresponding codepage,
1062 * or double conversion through UTF8 if not. Double conversion is needed,
1063 * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1064 */
1065 if (codepage != 0)
1066 {
1067 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1068 dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1069 utf16[dstlen] = (WCHAR) 0;
1070 }
1071 else
1072 {
1073 char *utf8;
1074
1075 /*
1076 * XXX pg_do_encoding_conversion() requires a transaction. In the
1077 * absence of one, hope for the input to be valid UTF8.
1078 */
1079 if (IsTransactionState())
1080 {
1081 utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1082 len,
1083 msgenc,
1084 PG_UTF8);
1085 if (utf8 != str)
1086 len = strlen(utf8);
1087 }
1088 else
1089 utf8 = (char *) str;
1090
1091 utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1092 dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1093 utf16[dstlen] = (WCHAR) 0;
1094
1095 if (utf8 != str)
1096 pfree(utf8);
1097 }
1098
1099 if (dstlen == 0 && len > 0)
1100 {
1101 pfree(utf16);
1102 return NULL; /* error */
1103 }
1104
1105 if (utf16len)
1106 *utf16len = dstlen;
1107 return utf16;
1108}
1109
1110#endif
1111