1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * mbutils.c |
4 | * This file contains functions for encoding conversion. |
5 | * |
6 | * The string-conversion functions in this file share some API quirks. |
7 | * Note the following: |
8 | * |
9 | * The functions return a palloc'd, null-terminated string if conversion |
10 | * is required. However, if no conversion is performed, the given source |
11 | * string pointer is returned as-is. |
12 | * |
13 | * Although the presence of a length argument means that callers can pass |
14 | * non-null-terminated strings, care is required because the same string |
15 | * will be passed back if no conversion occurs. Such callers *must* check |
16 | * whether result == src and handle that case differently. |
17 | * |
18 | * If the source and destination encodings are the same, the source string |
19 | * is returned without any verification; it's assumed to be valid data. |
20 | * If that might not be the case, the caller is responsible for validating |
21 | * the string using a separate call to pg_verify_mbstr(). Whenever the |
22 | * source and destination encodings are different, the functions ensure that |
23 | * the result is validly encoded according to the destination encoding. |
24 | * |
25 | * |
26 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
27 | * Portions Copyright (c) 1994, Regents of the University of California |
28 | * |
29 | * |
30 | * IDENTIFICATION |
31 | * src/backend/utils/mb/mbutils.c |
32 | * |
33 | *------------------------------------------------------------------------- |
34 | */ |
35 | #include "postgres.h" |
36 | |
37 | #include "access/xact.h" |
38 | #include "catalog/namespace.h" |
39 | #include "mb/pg_wchar.h" |
40 | #include "utils/builtins.h" |
41 | #include "utils/memutils.h" |
42 | #include "utils/syscache.h" |
43 | |
44 | /* |
45 | * We maintain a simple linked list caching the fmgr lookup info for the |
46 | * currently selected conversion functions, as well as any that have been |
47 | * selected previously in the current session. (We remember previous |
48 | * settings because we must be able to restore a previous setting during |
49 | * transaction rollback, without doing any fresh catalog accesses.) |
50 | * |
51 | * Since we'll never release this data, we just keep it in TopMemoryContext. |
52 | */ |
53 | typedef struct ConvProcInfo |
54 | { |
55 | int s_encoding; /* server and client encoding IDs */ |
56 | int c_encoding; |
57 | FmgrInfo to_server_info; /* lookup info for conversion procs */ |
58 | FmgrInfo to_client_info; |
59 | } ConvProcInfo; |
60 | |
/*
 * List of ConvProcInfo.  PrepareClientEncoding pushes new entries onto the
 * head, so the newest entry for an encoding pair is always found first.
 */
static List *ConvProcList = NIL;	/* List of ConvProcInfo */

/*
 * These variables point to the currently active conversion functions,
 * or are NULL when no conversion is needed.  When non-NULL they point into
 * a ConvProcInfo that is a member of ConvProcList.
 */
static FmgrInfo *ToServerConvProc = NULL;
static FmgrInfo *ToClientConvProc = NULL;

/*
 * These variables track the currently-selected encodings.  Each one starts
 * out as SQL_ASCII until the corresponding setter has been called.
 */
static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];

/*
 * During backend startup we can't set client encoding because we (a)
 * can't look up the conversion functions, and (b) may not know the database
 * encoding yet either.  So SetClientEncoding() just accepts anything and
 * remembers it for InitializeClientEncoding() to apply later.
 *
 * backend_startup_complete is flipped to true by InitializeClientEncoding();
 * pending_client_encoding holds the request remembered in the meantime.
 */
static bool backend_startup_complete = false;
static int	pending_client_encoding = PG_SQL_ASCII;


/* Internal functions (file-local helpers defined later in this file) */
static char *perform_default_encoding_conversion(const char *src,
												 int len, bool is_client_to_server);
static int	cliplen(const char *str, int len, int limit);
91 | |
92 | |
93 | /* |
94 | * Prepare for a future call to SetClientEncoding. Success should mean |
95 | * that SetClientEncoding is guaranteed to succeed for this encoding request. |
96 | * |
97 | * (But note that success before backend_startup_complete does not guarantee |
98 | * success after ...) |
99 | * |
100 | * Returns 0 if okay, -1 if not (bad encoding or can't support conversion) |
101 | */ |
102 | int |
103 | PrepareClientEncoding(int encoding) |
104 | { |
105 | int current_server_encoding; |
106 | ListCell *lc; |
107 | |
108 | if (!PG_VALID_FE_ENCODING(encoding)) |
109 | return -1; |
110 | |
111 | /* Can't do anything during startup, per notes above */ |
112 | if (!backend_startup_complete) |
113 | return 0; |
114 | |
115 | current_server_encoding = GetDatabaseEncoding(); |
116 | |
117 | /* |
118 | * Check for cases that require no conversion function. |
119 | */ |
120 | if (current_server_encoding == encoding || |
121 | current_server_encoding == PG_SQL_ASCII || |
122 | encoding == PG_SQL_ASCII) |
123 | return 0; |
124 | |
125 | if (IsTransactionState()) |
126 | { |
127 | /* |
128 | * If we're in a live transaction, it's safe to access the catalogs, |
129 | * so look up the functions. We repeat the lookup even if the info is |
130 | * already cached, so that we can react to changes in the contents of |
131 | * pg_conversion. |
132 | */ |
133 | Oid to_server_proc, |
134 | to_client_proc; |
135 | ConvProcInfo *convinfo; |
136 | MemoryContext oldcontext; |
137 | |
138 | to_server_proc = FindDefaultConversionProc(encoding, |
139 | current_server_encoding); |
140 | if (!OidIsValid(to_server_proc)) |
141 | return -1; |
142 | to_client_proc = FindDefaultConversionProc(current_server_encoding, |
143 | encoding); |
144 | if (!OidIsValid(to_client_proc)) |
145 | return -1; |
146 | |
147 | /* |
148 | * Load the fmgr info into TopMemoryContext (could still fail here) |
149 | */ |
150 | convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext, |
151 | sizeof(ConvProcInfo)); |
152 | convinfo->s_encoding = current_server_encoding; |
153 | convinfo->c_encoding = encoding; |
154 | fmgr_info_cxt(to_server_proc, &convinfo->to_server_info, |
155 | TopMemoryContext); |
156 | fmgr_info_cxt(to_client_proc, &convinfo->to_client_info, |
157 | TopMemoryContext); |
158 | |
159 | /* Attach new info to head of list */ |
160 | oldcontext = MemoryContextSwitchTo(TopMemoryContext); |
161 | ConvProcList = lcons(convinfo, ConvProcList); |
162 | MemoryContextSwitchTo(oldcontext); |
163 | |
164 | /* |
165 | * We cannot yet remove any older entry for the same encoding pair, |
166 | * since it could still be in use. SetClientEncoding will clean up. |
167 | */ |
168 | |
169 | return 0; /* success */ |
170 | } |
171 | else |
172 | { |
173 | /* |
174 | * If we're not in a live transaction, the only thing we can do is |
175 | * restore a previous setting using the cache. This covers all |
176 | * transaction-rollback cases. The only case it might not work for is |
177 | * trying to change client_encoding on the fly by editing |
178 | * postgresql.conf and SIGHUP'ing. Which would probably be a stupid |
179 | * thing to do anyway. |
180 | */ |
181 | foreach(lc, ConvProcList) |
182 | { |
183 | ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc); |
184 | |
185 | if (oldinfo->s_encoding == current_server_encoding && |
186 | oldinfo->c_encoding == encoding) |
187 | return 0; |
188 | } |
189 | |
190 | return -1; /* it's not cached, so fail */ |
191 | } |
192 | } |
193 | |
194 | /* |
195 | * Set the active client encoding and set up the conversion-function pointers. |
196 | * PrepareClientEncoding should have been called previously for this encoding. |
197 | * |
198 | * Returns 0 if okay, -1 if not (bad encoding or can't support conversion) |
199 | */ |
200 | int |
201 | SetClientEncoding(int encoding) |
202 | { |
203 | int current_server_encoding; |
204 | bool found; |
205 | ListCell *lc; |
206 | ListCell *prev; |
207 | ListCell *next; |
208 | |
209 | if (!PG_VALID_FE_ENCODING(encoding)) |
210 | return -1; |
211 | |
212 | /* Can't do anything during startup, per notes above */ |
213 | if (!backend_startup_complete) |
214 | { |
215 | pending_client_encoding = encoding; |
216 | return 0; |
217 | } |
218 | |
219 | current_server_encoding = GetDatabaseEncoding(); |
220 | |
221 | /* |
222 | * Check for cases that require no conversion function. |
223 | */ |
224 | if (current_server_encoding == encoding || |
225 | current_server_encoding == PG_SQL_ASCII || |
226 | encoding == PG_SQL_ASCII) |
227 | { |
228 | ClientEncoding = &pg_enc2name_tbl[encoding]; |
229 | ToServerConvProc = NULL; |
230 | ToClientConvProc = NULL; |
231 | return 0; |
232 | } |
233 | |
234 | /* |
235 | * Search the cache for the entry previously prepared by |
236 | * PrepareClientEncoding; if there isn't one, we lose. While at it, |
237 | * release any duplicate entries so that repeated Prepare/Set cycles don't |
238 | * leak memory. |
239 | */ |
240 | found = false; |
241 | prev = NULL; |
242 | for (lc = list_head(ConvProcList); lc; lc = next) |
243 | { |
244 | ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc); |
245 | |
246 | next = lnext(lc); |
247 | |
248 | if (convinfo->s_encoding == current_server_encoding && |
249 | convinfo->c_encoding == encoding) |
250 | { |
251 | if (!found) |
252 | { |
253 | /* Found newest entry, so set up */ |
254 | ClientEncoding = &pg_enc2name_tbl[encoding]; |
255 | ToServerConvProc = &convinfo->to_server_info; |
256 | ToClientConvProc = &convinfo->to_client_info; |
257 | found = true; |
258 | } |
259 | else |
260 | { |
261 | /* Duplicate entry, release it */ |
262 | ConvProcList = list_delete_cell(ConvProcList, lc, prev); |
263 | pfree(convinfo); |
264 | continue; /* prev mustn't advance */ |
265 | } |
266 | } |
267 | |
268 | prev = lc; |
269 | } |
270 | |
271 | if (found) |
272 | return 0; /* success */ |
273 | else |
274 | return -1; /* it's not cached, so fail */ |
275 | } |
276 | |
277 | /* |
278 | * Initialize client encoding conversions. |
279 | * Called from InitPostgres() once during backend startup. |
280 | */ |
281 | void |
282 | InitializeClientEncoding(void) |
283 | { |
284 | Assert(!backend_startup_complete); |
285 | backend_startup_complete = true; |
286 | |
287 | if (PrepareClientEncoding(pending_client_encoding) < 0 || |
288 | SetClientEncoding(pending_client_encoding) < 0) |
289 | { |
290 | /* |
291 | * Oops, the requested conversion is not available. We couldn't fail |
292 | * before, but we can now. |
293 | */ |
294 | ereport(FATAL, |
295 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
296 | errmsg("conversion between %s and %s is not supported" , |
297 | pg_enc2name_tbl[pending_client_encoding].name, |
298 | GetDatabaseEncodingName()))); |
299 | } |
300 | } |
301 | |
302 | /* |
303 | * returns the current client encoding |
304 | */ |
305 | int |
306 | pg_get_client_encoding(void) |
307 | { |
308 | return ClientEncoding->encoding; |
309 | } |
310 | |
311 | /* |
312 | * returns the current client encoding name |
313 | */ |
314 | const char * |
315 | pg_get_client_encoding_name(void) |
316 | { |
317 | return ClientEncoding->name; |
318 | } |
319 | |
320 | /* |
321 | * Convert src string to another encoding (general case). |
322 | * |
323 | * See the notes about string conversion functions at the top of this file. |
324 | */ |
325 | unsigned char * |
326 | pg_do_encoding_conversion(unsigned char *src, int len, |
327 | int src_encoding, int dest_encoding) |
328 | { |
329 | unsigned char *result; |
330 | Oid proc; |
331 | |
332 | if (len <= 0) |
333 | return src; /* empty string is always valid */ |
334 | |
335 | if (src_encoding == dest_encoding) |
336 | return src; /* no conversion required, assume valid */ |
337 | |
338 | if (dest_encoding == PG_SQL_ASCII) |
339 | return src; /* any string is valid in SQL_ASCII */ |
340 | |
341 | if (src_encoding == PG_SQL_ASCII) |
342 | { |
343 | /* No conversion is possible, but we must validate the result */ |
344 | (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false); |
345 | return src; |
346 | } |
347 | |
348 | if (!IsTransactionState()) /* shouldn't happen */ |
349 | elog(ERROR, "cannot perform encoding conversion outside a transaction" ); |
350 | |
351 | proc = FindDefaultConversionProc(src_encoding, dest_encoding); |
352 | if (!OidIsValid(proc)) |
353 | ereport(ERROR, |
354 | (errcode(ERRCODE_UNDEFINED_FUNCTION), |
355 | errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist" , |
356 | pg_encoding_to_char(src_encoding), |
357 | pg_encoding_to_char(dest_encoding)))); |
358 | |
359 | /* |
360 | * Allocate space for conversion result, being wary of integer overflow |
361 | */ |
362 | if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH)) |
363 | ereport(ERROR, |
364 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
365 | errmsg("out of memory" ), |
366 | errdetail("String of %d bytes is too long for encoding conversion." , |
367 | len))); |
368 | |
369 | result = palloc(len * MAX_CONVERSION_GROWTH + 1); |
370 | |
371 | OidFunctionCall5(proc, |
372 | Int32GetDatum(src_encoding), |
373 | Int32GetDatum(dest_encoding), |
374 | CStringGetDatum(src), |
375 | CStringGetDatum(result), |
376 | Int32GetDatum(len)); |
377 | return result; |
378 | } |
379 | |
380 | /* |
381 | * Convert string to encoding encoding_name. The source |
382 | * encoding is the DB encoding. |
383 | * |
384 | * BYTEA convert_to(TEXT string, NAME encoding_name) */ |
385 | Datum |
386 | pg_convert_to(PG_FUNCTION_ARGS) |
387 | { |
388 | Datum string = PG_GETARG_DATUM(0); |
389 | Datum dest_encoding_name = PG_GETARG_DATUM(1); |
390 | Datum src_encoding_name = DirectFunctionCall1(namein, |
391 | CStringGetDatum(DatabaseEncoding->name)); |
392 | Datum result; |
393 | |
394 | /* |
395 | * pg_convert expects a bytea as its first argument. We're passing it a |
396 | * text argument here, relying on the fact that they are both in fact |
397 | * varlena types, and thus structurally identical. |
398 | */ |
399 | result = DirectFunctionCall3(pg_convert, string, |
400 | src_encoding_name, dest_encoding_name); |
401 | |
402 | PG_RETURN_DATUM(result); |
403 | } |
404 | |
405 | /* |
406 | * Convert string from encoding encoding_name. The destination |
407 | * encoding is the DB encoding. |
408 | * |
409 | * TEXT convert_from(BYTEA string, NAME encoding_name) */ |
410 | Datum |
411 | pg_convert_from(PG_FUNCTION_ARGS) |
412 | { |
413 | Datum string = PG_GETARG_DATUM(0); |
414 | Datum src_encoding_name = PG_GETARG_DATUM(1); |
415 | Datum dest_encoding_name = DirectFunctionCall1(namein, |
416 | CStringGetDatum(DatabaseEncoding->name)); |
417 | Datum result; |
418 | |
419 | result = DirectFunctionCall3(pg_convert, string, |
420 | src_encoding_name, dest_encoding_name); |
421 | |
422 | /* |
423 | * pg_convert returns a bytea, which we in turn return as text, relying on |
424 | * the fact that they are both in fact varlena types, and thus |
425 | * structurally identical. Although not all bytea values are valid text, |
426 | * in this case it will be because we've told pg_convert to return one |
427 | * that is valid as text in the current database encoding. |
428 | */ |
429 | PG_RETURN_DATUM(result); |
430 | } |
431 | |
432 | /* |
433 | * Convert string between two arbitrary encodings. |
434 | * |
435 | * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name) |
436 | */ |
437 | Datum |
438 | pg_convert(PG_FUNCTION_ARGS) |
439 | { |
440 | bytea *string = PG_GETARG_BYTEA_PP(0); |
441 | char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); |
442 | int src_encoding = pg_char_to_encoding(src_encoding_name); |
443 | char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2)); |
444 | int dest_encoding = pg_char_to_encoding(dest_encoding_name); |
445 | const char *src_str; |
446 | char *dest_str; |
447 | bytea *retval; |
448 | int len; |
449 | |
450 | if (src_encoding < 0) |
451 | ereport(ERROR, |
452 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
453 | errmsg("invalid source encoding name \"%s\"" , |
454 | src_encoding_name))); |
455 | if (dest_encoding < 0) |
456 | ereport(ERROR, |
457 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
458 | errmsg("invalid destination encoding name \"%s\"" , |
459 | dest_encoding_name))); |
460 | |
461 | /* make sure that source string is valid */ |
462 | len = VARSIZE_ANY_EXHDR(string); |
463 | src_str = VARDATA_ANY(string); |
464 | pg_verify_mbstr_len(src_encoding, src_str, len, false); |
465 | |
466 | /* perform conversion */ |
467 | dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str), |
468 | len, |
469 | src_encoding, |
470 | dest_encoding); |
471 | |
472 | /* update len if conversion actually happened */ |
473 | if (dest_str != src_str) |
474 | len = strlen(dest_str); |
475 | |
476 | /* |
477 | * build bytea data type structure. |
478 | */ |
479 | retval = (bytea *) palloc(len + VARHDRSZ); |
480 | SET_VARSIZE(retval, len + VARHDRSZ); |
481 | memcpy(VARDATA(retval), dest_str, len); |
482 | |
483 | if (dest_str != src_str) |
484 | pfree(dest_str); |
485 | |
486 | /* free memory if allocated by the toaster */ |
487 | PG_FREE_IF_COPY(string, 0); |
488 | |
489 | PG_RETURN_BYTEA_P(retval); |
490 | } |
491 | |
492 | /* |
493 | * get the length of the string considered as text in the specified |
494 | * encoding. Raises an error if the data is not valid in that |
495 | * encoding. |
496 | * |
497 | * INT4 length (BYTEA string, NAME src_encoding_name) |
498 | */ |
499 | Datum |
500 | length_in_encoding(PG_FUNCTION_ARGS) |
501 | { |
502 | bytea *string = PG_GETARG_BYTEA_PP(0); |
503 | char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); |
504 | int src_encoding = pg_char_to_encoding(src_encoding_name); |
505 | const char *src_str; |
506 | int len; |
507 | int retval; |
508 | |
509 | if (src_encoding < 0) |
510 | ereport(ERROR, |
511 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
512 | errmsg("invalid encoding name \"%s\"" , |
513 | src_encoding_name))); |
514 | |
515 | len = VARSIZE_ANY_EXHDR(string); |
516 | src_str = VARDATA_ANY(string); |
517 | |
518 | retval = pg_verify_mbstr_len(src_encoding, src_str, len, false); |
519 | |
520 | PG_RETURN_INT32(retval); |
521 | } |
522 | |
523 | /* |
524 | * Get maximum multibyte character length in the specified encoding. |
525 | * |
526 | * Note encoding is specified numerically, not by name as above. |
527 | */ |
528 | Datum |
529 | pg_encoding_max_length_sql(PG_FUNCTION_ARGS) |
530 | { |
531 | int encoding = PG_GETARG_INT32(0); |
532 | |
533 | if (PG_VALID_ENCODING(encoding)) |
534 | PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen); |
535 | else |
536 | PG_RETURN_NULL(); |
537 | } |
538 | |
539 | /* |
540 | * Convert client encoding to server encoding. |
541 | * |
542 | * See the notes about string conversion functions at the top of this file. |
543 | */ |
544 | char * |
545 | pg_client_to_server(const char *s, int len) |
546 | { |
547 | return pg_any_to_server(s, len, ClientEncoding->encoding); |
548 | } |
549 | |
550 | /* |
551 | * Convert any encoding to server encoding. |
552 | * |
553 | * See the notes about string conversion functions at the top of this file. |
554 | * |
555 | * Unlike the other string conversion functions, this will apply validation |
556 | * even if encoding == DatabaseEncoding->encoding. This is because this is |
557 | * used to process data coming in from outside the database, and we never |
558 | * want to just assume validity. |
559 | */ |
560 | char * |
561 | pg_any_to_server(const char *s, int len, int encoding) |
562 | { |
563 | if (len <= 0) |
564 | return unconstify(char *, s); /* empty string is always valid */ |
565 | |
566 | if (encoding == DatabaseEncoding->encoding || |
567 | encoding == PG_SQL_ASCII) |
568 | { |
569 | /* |
570 | * No conversion is needed, but we must still validate the data. |
571 | */ |
572 | (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false); |
573 | return unconstify(char *, s); |
574 | } |
575 | |
576 | if (DatabaseEncoding->encoding == PG_SQL_ASCII) |
577 | { |
578 | /* |
579 | * No conversion is possible, but we must still validate the data, |
580 | * because the client-side code might have done string escaping using |
581 | * the selected client_encoding. If the client encoding is ASCII-safe |
582 | * then we just do a straight validation under that encoding. For an |
583 | * ASCII-unsafe encoding we have a problem: we dare not pass such data |
584 | * to the parser but we have no way to convert it. We compromise by |
585 | * rejecting the data if it contains any non-ASCII characters. |
586 | */ |
587 | if (PG_VALID_BE_ENCODING(encoding)) |
588 | (void) pg_verify_mbstr(encoding, s, len, false); |
589 | else |
590 | { |
591 | int i; |
592 | |
593 | for (i = 0; i < len; i++) |
594 | { |
595 | if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) |
596 | ereport(ERROR, |
597 | (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
598 | errmsg("invalid byte value for encoding \"%s\": 0x%02x" , |
599 | pg_enc2name_tbl[PG_SQL_ASCII].name, |
600 | (unsigned char) s[i]))); |
601 | } |
602 | } |
603 | return unconstify(char *, s); |
604 | } |
605 | |
606 | /* Fast path if we can use cached conversion function */ |
607 | if (encoding == ClientEncoding->encoding) |
608 | return perform_default_encoding_conversion(s, len, true); |
609 | |
610 | /* General case ... will not work outside transactions */ |
611 | return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s), |
612 | len, |
613 | encoding, |
614 | DatabaseEncoding->encoding); |
615 | } |
616 | |
617 | /* |
618 | * Convert server encoding to client encoding. |
619 | * |
620 | * See the notes about string conversion functions at the top of this file. |
621 | */ |
622 | char * |
623 | pg_server_to_client(const char *s, int len) |
624 | { |
625 | return pg_server_to_any(s, len, ClientEncoding->encoding); |
626 | } |
627 | |
628 | /* |
629 | * Convert server encoding to any encoding. |
630 | * |
631 | * See the notes about string conversion functions at the top of this file. |
632 | */ |
633 | char * |
634 | pg_server_to_any(const char *s, int len, int encoding) |
635 | { |
636 | if (len <= 0) |
637 | return unconstify(char *, s); /* empty string is always valid */ |
638 | |
639 | if (encoding == DatabaseEncoding->encoding || |
640 | encoding == PG_SQL_ASCII) |
641 | return unconstify(char *, s); /* assume data is valid */ |
642 | |
643 | if (DatabaseEncoding->encoding == PG_SQL_ASCII) |
644 | { |
645 | /* No conversion is possible, but we must validate the result */ |
646 | (void) pg_verify_mbstr(encoding, s, len, false); |
647 | return unconstify(char *, s); |
648 | } |
649 | |
650 | /* Fast path if we can use cached conversion function */ |
651 | if (encoding == ClientEncoding->encoding) |
652 | return perform_default_encoding_conversion(s, len, false); |
653 | |
654 | /* General case ... will not work outside transactions */ |
655 | return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s), |
656 | len, |
657 | DatabaseEncoding->encoding, |
658 | encoding); |
659 | } |
660 | |
661 | /* |
662 | * Perform default encoding conversion using cached FmgrInfo. Since |
663 | * this function does not access database at all, it is safe to call |
664 | * outside transactions. If the conversion has not been set up by |
665 | * SetClientEncoding(), no conversion is performed. |
666 | */ |
667 | static char * |
668 | perform_default_encoding_conversion(const char *src, int len, |
669 | bool is_client_to_server) |
670 | { |
671 | char *result; |
672 | int src_encoding, |
673 | dest_encoding; |
674 | FmgrInfo *flinfo; |
675 | |
676 | if (is_client_to_server) |
677 | { |
678 | src_encoding = ClientEncoding->encoding; |
679 | dest_encoding = DatabaseEncoding->encoding; |
680 | flinfo = ToServerConvProc; |
681 | } |
682 | else |
683 | { |
684 | src_encoding = DatabaseEncoding->encoding; |
685 | dest_encoding = ClientEncoding->encoding; |
686 | flinfo = ToClientConvProc; |
687 | } |
688 | |
689 | if (flinfo == NULL) |
690 | return unconstify(char *, src); |
691 | |
692 | /* |
693 | * Allocate space for conversion result, being wary of integer overflow |
694 | */ |
695 | if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH)) |
696 | ereport(ERROR, |
697 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
698 | errmsg("out of memory" ), |
699 | errdetail("String of %d bytes is too long for encoding conversion." , |
700 | len))); |
701 | |
702 | result = palloc(len * MAX_CONVERSION_GROWTH + 1); |
703 | |
704 | FunctionCall5(flinfo, |
705 | Int32GetDatum(src_encoding), |
706 | Int32GetDatum(dest_encoding), |
707 | CStringGetDatum(src), |
708 | CStringGetDatum(result), |
709 | Int32GetDatum(len)); |
710 | return result; |
711 | } |
712 | |
713 | |
714 | /* convert a multibyte string to a wchar */ |
715 | int |
716 | pg_mb2wchar(const char *from, pg_wchar *to) |
717 | { |
718 | return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from)); |
719 | } |
720 | |
721 | /* convert a multibyte string to a wchar with a limited length */ |
722 | int |
723 | pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len) |
724 | { |
725 | return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len); |
726 | } |
727 | |
728 | /* same, with any encoding */ |
729 | int |
730 | pg_encoding_mb2wchar_with_len(int encoding, |
731 | const char *from, pg_wchar *to, int len) |
732 | { |
733 | return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len); |
734 | } |
735 | |
736 | /* convert a wchar string to a multibyte */ |
737 | int |
738 | pg_wchar2mb(const pg_wchar *from, char *to) |
739 | { |
740 | return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from)); |
741 | } |
742 | |
743 | /* convert a wchar string to a multibyte with a limited length */ |
744 | int |
745 | pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len) |
746 | { |
747 | return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len); |
748 | } |
749 | |
750 | /* same, with any encoding */ |
751 | int |
752 | pg_encoding_wchar2mb_with_len(int encoding, |
753 | const pg_wchar *from, char *to, int len) |
754 | { |
755 | return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len); |
756 | } |
757 | |
758 | /* returns the byte length of a multibyte character */ |
759 | int |
760 | pg_mblen(const char *mbstr) |
761 | { |
762 | return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); |
763 | } |
764 | |
765 | /* returns the display length of a multibyte character */ |
766 | int |
767 | pg_dsplen(const char *mbstr) |
768 | { |
769 | return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr); |
770 | } |
771 | |
772 | /* returns the length (counted in wchars) of a multibyte string */ |
int
pg_mbstrlen(const char *mbstr)
{
	const char *p;
	int			nchars;

	/* In a single-byte encoding, characters and bytes are the same thing */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise hop from character to character until the terminator */
	nchars = 0;
	for (p = mbstr; *p; p += pg_mblen(p))
		nchars++;

	return nchars;
}
789 | |
790 | /* returns the length (counted in wchars) of a multibyte string |
791 | * (not necessarily NULL terminated) |
792 | */ |
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;
	int			remaining;

	/* In a single-byte encoding the byte count is the character count */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	/* Count characters until the byte budget runs out or a NUL is hit */
	nchars = 0;
	for (remaining = limit; remaining > 0 && *mbstr; nchars++)
	{
		int			chlen = pg_mblen(mbstr);

		remaining -= chlen;
		mbstr += chlen;
	}

	return nchars;
}
812 | |
813 | /* |
814 | * returns the byte length of a multibyte string |
815 | * (not necessarily NULL terminated) |
816 | * that is no longer than limit. |
817 | * this function does not break multibyte character boundary. |
818 | */ |
819 | int |
820 | pg_mbcliplen(const char *mbstr, int len, int limit) |
821 | { |
822 | return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr, |
823 | len, limit); |
824 | } |
825 | |
826 | /* |
827 | * pg_mbcliplen with specified encoding |
828 | */ |
829 | int |
830 | pg_encoding_mbcliplen(int encoding, const char *mbstr, |
831 | int len, int limit) |
832 | { |
833 | mblen_converter mblen_fn; |
834 | int clen = 0; |
835 | int l; |
836 | |
837 | /* optimization for single byte encoding */ |
838 | if (pg_encoding_max_length(encoding) == 1) |
839 | return cliplen(mbstr, len, limit); |
840 | |
841 | mblen_fn = pg_wchar_table[encoding].mblen; |
842 | |
843 | while (len > 0 && *mbstr) |
844 | { |
845 | l = (*mblen_fn) ((const unsigned char *) mbstr); |
846 | if ((clen + l) > limit) |
847 | break; |
848 | clen += l; |
849 | if (clen == limit) |
850 | break; |
851 | len -= l; |
852 | mbstr += l; |
853 | } |
854 | return clen; |
855 | } |
856 | |
857 | /* |
858 | * Similar to pg_mbcliplen except the limit parameter specifies the |
859 | * character length, not the byte length. |
860 | */ |
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			nbytes = 0;		/* bytes accepted so far */
	int			nchars = 0;		/* characters looked at so far */

	/* In a single-byte encoding, character and byte limits coincide */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr)
	{
		int			chlen = pg_mblen(mbstr);

		/* Stop once this character would exceed the character limit */
		if (++nchars > limit)
			break;

		nbytes += chlen;
		mbstr += chlen;
		len -= chlen;
	}

	return nbytes;
}
884 | |
885 | /* mbcliplen for any single-byte encoding */ |
static int
cliplen(const char *str, int len, int limit)
{
	int			maxbytes = Min(len, limit);
	int			n;

	/* Scan up to maxbytes bytes, stopping early at a NUL */
	for (n = 0; n < maxbytes && str[n]; n++)
		;

	return n;
}
896 | |
897 | void |
898 | SetDatabaseEncoding(int encoding) |
899 | { |
900 | if (!PG_VALID_BE_ENCODING(encoding)) |
901 | elog(ERROR, "invalid database encoding: %d" , encoding); |
902 | |
903 | DatabaseEncoding = &pg_enc2name_tbl[encoding]; |
904 | Assert(DatabaseEncoding->encoding == encoding); |
905 | } |
906 | |
907 | void |
908 | SetMessageEncoding(int encoding) |
909 | { |
910 | /* Some calls happen before we can elog()! */ |
911 | Assert(PG_VALID_ENCODING(encoding)); |
912 | |
913 | MessageEncoding = &pg_enc2name_tbl[encoding]; |
914 | Assert(MessageEncoding->encoding == encoding); |
915 | } |
916 | |
917 | #ifdef ENABLE_NLS |
918 | /* |
919 | * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext |
920 | * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also |
921 | * fail for gettext-internal causes like out-of-memory. |
922 | */ |
923 | static bool |
924 | raw_pg_bind_textdomain_codeset(const char *domainname, int encoding) |
925 | { |
926 | bool elog_ok = (CurrentMemoryContext != NULL); |
927 | int i; |
928 | |
929 | for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++) |
930 | { |
931 | if (pg_enc2gettext_tbl[i].encoding == encoding) |
932 | { |
933 | if (bind_textdomain_codeset(domainname, |
934 | pg_enc2gettext_tbl[i].name) != NULL) |
935 | return true; |
936 | |
937 | if (elog_ok) |
938 | elog(LOG, "bind_textdomain_codeset failed" ); |
939 | else |
940 | write_stderr("bind_textdomain_codeset failed" ); |
941 | |
942 | break; |
943 | } |
944 | } |
945 | |
946 | return false; |
947 | } |
948 | |
949 | /* |
950 | * Bind a gettext message domain to the codeset corresponding to the database |
951 | * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE. |
952 | * Return the MessageEncoding implied by the new settings. |
953 | * |
954 | * On most platforms, gettext defaults to the codeset implied by LC_CTYPE. |
955 | * When that matches the database encoding, we don't need to do anything. In |
956 | * CREATE DATABASE, we enforce or trust that the locale's codeset matches the |
957 | * database encoding, except for the C locale. (On Windows, we also permit a |
958 | * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind |
959 | * gettext to the right codeset. |
960 | * |
961 | * On Windows, gettext defaults to the Windows ANSI code page. This is a |
962 | * convenient departure for software that passes the strings to Windows ANSI |
963 | * APIs, but we don't do that. Compel gettext to use database encoding or, |
964 | * failing that, the LC_CTYPE encoding as it would on other platforms. |
965 | * |
966 | * This function is called before elog() and palloc() are usable. |
967 | */ |
968 | int |
969 | pg_bind_textdomain_codeset(const char *domainname) |
970 | { |
971 | bool elog_ok = (CurrentMemoryContext != NULL); |
972 | int encoding = GetDatabaseEncoding(); |
973 | int new_msgenc; |
974 | |
975 | #ifndef WIN32 |
976 | const char *ctype = setlocale(LC_CTYPE, NULL); |
977 | |
978 | if (pg_strcasecmp(ctype, "C" ) == 0 || pg_strcasecmp(ctype, "POSIX" ) == 0) |
979 | #endif |
980 | if (encoding != PG_SQL_ASCII && |
981 | raw_pg_bind_textdomain_codeset(domainname, encoding)) |
982 | return encoding; |
983 | |
984 | new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok); |
985 | if (new_msgenc < 0) |
986 | new_msgenc = PG_SQL_ASCII; |
987 | |
988 | #ifdef WIN32 |
989 | if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc)) |
990 | /* On failure, the old message encoding remains valid. */ |
991 | return GetMessageEncoding(); |
992 | #endif |
993 | |
994 | return new_msgenc; |
995 | } |
996 | #endif |
997 | |
998 | /* |
999 | * The database encoding, also called the server encoding, represents the |
1000 | * encoding of data stored in text-like data types. Affected types include |
1001 | * cstring, text, varchar, name, xml, and json. |
1002 | */ |
1003 | int |
1004 | GetDatabaseEncoding(void) |
1005 | { |
1006 | return DatabaseEncoding->encoding; |
1007 | } |
1008 | |
1009 | const char * |
1010 | GetDatabaseEncodingName(void) |
1011 | { |
1012 | return DatabaseEncoding->name; |
1013 | } |
1014 | |
1015 | Datum |
1016 | getdatabaseencoding(PG_FUNCTION_ARGS) |
1017 | { |
1018 | return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name)); |
1019 | } |
1020 | |
1021 | Datum |
1022 | pg_client_encoding(PG_FUNCTION_ARGS) |
1023 | { |
1024 | return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name)); |
1025 | } |
1026 | |
1027 | /* |
1028 | * gettext() returns messages in this encoding. This often matches the |
1029 | * database encoding, but it differs for SQL_ASCII databases, for processes |
1030 | * not attached to a database, and under a database encoding lacking iconv |
1031 | * support (MULE_INTERNAL). |
1032 | */ |
1033 | int |
1034 | GetMessageEncoding(void) |
1035 | { |
1036 | return MessageEncoding->encoding; |
1037 | } |
1038 | |
1039 | #ifdef WIN32 |
1040 | /* |
1041 | * Convert from MessageEncoding to a palloc'ed, null-terminated utf16 |
1042 | * string. The character length is also passed to utf16len if not |
1043 | * null. Returns NULL iff failed. Before MessageEncoding initialization, "str" |
1044 | * should be ASCII-only; this will function as though MessageEncoding is UTF8. |
1045 | */ |
1046 | WCHAR * |
1047 | pgwin32_message_to_UTF16(const char *str, int len, int *utf16len) |
1048 | { |
1049 | int msgenc = GetMessageEncoding(); |
1050 | WCHAR *utf16; |
1051 | int dstlen; |
1052 | UINT codepage; |
1053 | |
1054 | if (msgenc == PG_SQL_ASCII) |
1055 | /* No conversion is possible, and SQL_ASCII is never utf16. */ |
1056 | return NULL; |
1057 | |
1058 | codepage = pg_enc2name_tbl[msgenc].codepage; |
1059 | |
1060 | /* |
1061 | * Use MultiByteToWideChar directly if there is a corresponding codepage, |
1062 | * or double conversion through UTF8 if not. Double conversion is needed, |
1063 | * for example, in an ENCODING=LATIN8, LC_CTYPE=C database. |
1064 | */ |
1065 | if (codepage != 0) |
1066 | { |
1067 | utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1)); |
1068 | dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len); |
1069 | utf16[dstlen] = (WCHAR) 0; |
1070 | } |
1071 | else |
1072 | { |
1073 | char *utf8; |
1074 | |
1075 | /* |
1076 | * XXX pg_do_encoding_conversion() requires a transaction. In the |
1077 | * absence of one, hope for the input to be valid UTF8. |
1078 | */ |
1079 | if (IsTransactionState()) |
1080 | { |
1081 | utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str, |
1082 | len, |
1083 | msgenc, |
1084 | PG_UTF8); |
1085 | if (utf8 != str) |
1086 | len = strlen(utf8); |
1087 | } |
1088 | else |
1089 | utf8 = (char *) str; |
1090 | |
1091 | utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1)); |
1092 | dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len); |
1093 | utf16[dstlen] = (WCHAR) 0; |
1094 | |
1095 | if (utf8 != str) |
1096 | pfree(utf8); |
1097 | } |
1098 | |
1099 | if (dstlen == 0 && len > 0) |
1100 | { |
1101 | pfree(utf16); |
1102 | return NULL; /* error */ |
1103 | } |
1104 | |
1105 | if (utf16len) |
1106 | *utf16len = dstlen; |
1107 | return utf16; |
1108 | } |
1109 | |
1110 | #endif |
1111 | |