1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * varlena.c |
4 | * Functions for the variable-length built-in types. |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/utils/adt/varlena.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | #include "postgres.h" |
16 | |
17 | #include <ctype.h> |
18 | #include <limits.h> |
19 | |
20 | #include "access/tuptoaster.h" |
21 | #include "catalog/pg_collation.h" |
22 | #include "catalog/pg_type.h" |
23 | #include "common/int.h" |
24 | #include "lib/hyperloglog.h" |
25 | #include "libpq/pqformat.h" |
26 | #include "miscadmin.h" |
27 | #include "parser/scansup.h" |
28 | #include "port/pg_bswap.h" |
29 | #include "regex/regex.h" |
30 | #include "utils/builtins.h" |
31 | #include "utils/bytea.h" |
32 | #include "utils/hashutils.h" |
33 | #include "utils/lsyscache.h" |
34 | #include "utils/memutils.h" |
35 | #include "utils/pg_locale.h" |
36 | #include "utils/sortsupport.h" |
37 | #include "utils/varlena.h" |
38 | |
39 | |
/*
 * GUC variable: selects the textual format produced by byteaout().
 *
 * byteaout() handles BYTEA_OUTPUT_HEX (the initial value set here) and
 * BYTEA_OUTPUT_ESCAPE, and reports an error for any other value.
 */
int			bytea_output = BYTEA_OUTPUT_HEX;
42 | |
/*
 * File-local aliases for the generic varlena struct.  "unknown" is the
 * pointer target of the DatumGetUnknownP() macro family and VarString that
 * of the DatumGetVarStringP() macros defined later in this file.
 */
typedef struct varlena unknown;
typedef struct varlena VarString;
45 | |
/*
 * State for text_position_* functions.
 *
 * A pointer to one of these is taken by text_position_setup(),
 * text_position_next(), text_position_next_internal(), the
 * text_position_get_match_*() accessors and text_position_cleanup(), all of
 * which are declared further down in this file.
 */
typedef struct
{
	bool		is_multibyte;	/* T if multibyte encoding */

	/*
	 * NOTE(review): undocumented in the original.  The name suggests an
	 * encoding in which the bytes of one character can occur inside another
	 * character -- confirm against text_position_setup().
	 */
	bool		is_multibyte_char_in_char;

	char	   *str1;			/* haystack string */
	char	   *str2;			/* needle string */
	int			len1;			/* string lengths in bytes */
	int			len2;

	/* Skip table for Boyer-Moore-Horspool search algorithm: */
	int			skiptablemask;	/* mask for ANDing with skiptable subscripts */
	int			skiptable[256]; /* skip distance for given mismatched char */

	char	   *last_match;		/* pointer to last match in 'str1' */

	/*
	 * Sometimes we need to convert the byte position of a match to a
	 * character position.  These store the last position that was converted,
	 * so that on the next call, we can continue from that point, rather than
	 * count characters from the very beginning.
	 */
	char	   *refpoint;		/* pointer within original haystack string */
	int			refpos;			/* 0-based character offset of the same point */
} TextPositionState;
74 | |
/*
 * Working state for string sorting.
 *
 * NOTE(review): presumably this is the private state used by the
 * varstr, bpchar and name sort-support routines declared below; the code
 * that allocates and fills it is not visible from here -- confirm.
 */
typedef struct
{
	char	   *buf1;			/* 1st string, or abbreviation original string
								 * buf */
	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
	int			buflen1;		/* NOTE(review): presumably allocated size of
								 * buf1 -- confirm */
	int			buflen2;		/* NOTE(review): presumably allocated size of
								 * buf2 -- confirm */
	int			last_len1;		/* Length of last buf1 string/strxfrm() input */
	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
	int			last_returned;	/* Last comparison result (cache) */
	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
	bool		collate_c;		/* NOTE(review): presumably true when sorting
								 * under the C collation -- confirm */
	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
	hyperLogLogState full_card; /* Full key cardinality state */
	double		prop_card;		/* Required cardinality proportion */
	pg_locale_t locale;			/* NOTE(review): presumably the locale used
								 * for comparisons -- confirm */
} VarStringSortSupport;
93 | |
/*
 * This should be large enough that most strings will fit, but small enough
 * that we feel comfortable putting it on the stack
 */
#define TEXTBUFLEN		1024

/*
 * Datum access macros for the file-local "unknown" alias.  The plain forms
 * go through PG_DETOAST_DATUM, the "Copy" forms through
 * PG_DETOAST_DATUM_COPY; the PG_GETARG_ forms apply them to function
 * argument number n, and PG_RETURN_UNKNOWN_P is simply PG_RETURN_POINTER.
 */
#define DatumGetUnknownP(X)			((unknown *) PG_DETOAST_DATUM(X))
#define DatumGetUnknownPCopy(X)		((unknown *) PG_DETOAST_DATUM_COPY(X))
#define PG_GETARG_UNKNOWN_P(n)		DatumGetUnknownP(PG_GETARG_DATUM(n))
#define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)

/*
 * The same for the VarString alias; the "PP" form uses
 * PG_DETOAST_DATUM_PACKED instead of PG_DETOAST_DATUM.
 */
#define DatumGetVarStringP(X)		((VarString *) PG_DETOAST_DATUM(X))
#define DatumGetVarStringPP(X)		((VarString *) PG_DETOAST_DATUM_PACKED(X))
108 | |
/*
 * Prototypes for functions private to this file.
 */

/* sort-support comparators and abbreviated-key routines */
static int	varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_c(Datum x, Datum y, SortSupport ssup);
static int	varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
static int	varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
static int	varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);

/* text helpers */
static int32 text_length(Datum str);
static text *text_catenate(text *t1, text *t2);
static text *text_substring(Datum str,
							int32 start,
							int32 length,
							bool length_not_specified);
static text *text_overlay(text *t1, text *t2, int sp, int sl);
static int	text_position(text *t1, text *t2, Oid collid);
static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
static bool text_position_next(TextPositionState *state);
static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
static char *text_position_get_match_ptr(TextPositionState *state);
static int	text_position_get_match_pos(TextPositionState *state);
static void text_position_cleanup(TextPositionState *state);
static void check_collation_set(Oid collid);
static int	text_cmp(text *arg1, text *arg2, Oid collid);

/* bytea helpers */
static bytea *bytea_catenate(bytea *t1, bytea *t2);
static bytea *bytea_substring(Datum str,
							  int S,
							  int L,
							  bool length_not_specified);
static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

/* array / aggregate / format helpers */
static void appendStringInfoText(StringInfo str, const text *t);
static Datum text_to_array_internal(PG_FUNCTION_ARGS);
static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
									const char *fldsep, const char *null_string);
static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
									 int *value);
static const char *text_format_parse_format(const char *start_ptr,
											const char *end_ptr,
											int *argpos, int *widthpos,
											int *flags, int *width);
static void text_format_string_conversion(StringInfo buf, char conversion,
										  FmgrInfo *typOutputInfo,
										  Datum value, bool isNull,
										  int flags, int width);
static void text_format_append_string(StringInfo buf, const char *str,
									  int flags, int width);
157 | |
158 | |
159 | /***************************************************************************** |
160 | * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE * |
161 | *****************************************************************************/ |
162 | |
163 | /* |
164 | * cstring_to_text |
165 | * |
166 | * Create a text value from a null-terminated C string. |
167 | * |
168 | * The new text value is freshly palloc'd with a full-size VARHDR. |
169 | */ |
170 | text * |
171 | cstring_to_text(const char *s) |
172 | { |
173 | return cstring_to_text_with_len(s, strlen(s)); |
174 | } |
175 | |
176 | /* |
177 | * cstring_to_text_with_len |
178 | * |
179 | * Same as cstring_to_text except the caller specifies the string length; |
180 | * the string need not be null_terminated. |
181 | */ |
182 | text * |
183 | cstring_to_text_with_len(const char *s, int len) |
184 | { |
185 | text *result = (text *) palloc(len + VARHDRSZ); |
186 | |
187 | SET_VARSIZE(result, len + VARHDRSZ); |
188 | memcpy(VARDATA(result), s, len); |
189 | |
190 | return result; |
191 | } |
192 | |
193 | /* |
194 | * text_to_cstring |
195 | * |
196 | * Create a palloc'd, null-terminated C string from a text value. |
197 | * |
198 | * We support being passed a compressed or toasted text value. |
199 | * This is a bit bogus since such values shouldn't really be referred to as |
200 | * "text *", but it seems useful for robustness. If we didn't handle that |
201 | * case here, we'd need another routine that did, anyway. |
202 | */ |
203 | char * |
204 | text_to_cstring(const text *t) |
205 | { |
206 | /* must cast away the const, unfortunately */ |
207 | text *tunpacked = pg_detoast_datum_packed(unconstify(text *, t)); |
208 | int len = VARSIZE_ANY_EXHDR(tunpacked); |
209 | char *result; |
210 | |
211 | result = (char *) palloc(len + 1); |
212 | memcpy(result, VARDATA_ANY(tunpacked), len); |
213 | result[len] = '\0'; |
214 | |
215 | if (tunpacked != t) |
216 | pfree(tunpacked); |
217 | |
218 | return result; |
219 | } |
220 | |
221 | /* |
222 | * text_to_cstring_buffer |
223 | * |
224 | * Copy a text value into a caller-supplied buffer of size dst_len. |
225 | * |
226 | * The text string is truncated if necessary to fit. The result is |
227 | * guaranteed null-terminated (unless dst_len == 0). |
228 | * |
229 | * We support being passed a compressed or toasted text value. |
230 | * This is a bit bogus since such values shouldn't really be referred to as |
231 | * "text *", but it seems useful for robustness. If we didn't handle that |
232 | * case here, we'd need another routine that did, anyway. |
233 | */ |
234 | void |
235 | text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) |
236 | { |
237 | /* must cast away the const, unfortunately */ |
238 | text *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src)); |
239 | size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked); |
240 | |
241 | if (dst_len > 0) |
242 | { |
243 | dst_len--; |
244 | if (dst_len >= src_len) |
245 | dst_len = src_len; |
246 | else /* ensure truncation is encoding-safe */ |
247 | dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len); |
248 | memcpy(dst, VARDATA_ANY(srcunpacked), dst_len); |
249 | dst[dst_len] = '\0'; |
250 | } |
251 | |
252 | if (srcunpacked != src) |
253 | pfree(srcunpacked); |
254 | } |
255 | |
256 | |
257 | /***************************************************************************** |
258 | * USER I/O ROUTINES * |
259 | *****************************************************************************/ |
260 | |
261 | |
/* VAL: digit character -> its numeric value; DIG: numeric value -> digit character */
#define VAL(ch)		((ch) - '0')
#define DIG(num)	((num) + '0')
264 | |
265 | /* |
266 | * byteain - converts from printable representation of byte array |
267 | * |
268 | * Non-printable characters must be passed as '\nnn' (octal) and are |
269 | * converted to internal form. '\' must be passed as '\\'. |
270 | * ereport(ERROR, ...) if bad form. |
271 | * |
272 | * BUGS: |
273 | * The input is scanned twice. |
274 | * The error checking of input is minimal. |
275 | */ |
276 | Datum |
277 | byteain(PG_FUNCTION_ARGS) |
278 | { |
279 | char *inputText = PG_GETARG_CSTRING(0); |
280 | char *tp; |
281 | char *rp; |
282 | int bc; |
283 | bytea *result; |
284 | |
285 | /* Recognize hex input */ |
286 | if (inputText[0] == '\\' && inputText[1] == 'x') |
287 | { |
288 | size_t len = strlen(inputText); |
289 | |
290 | bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ |
291 | result = palloc(bc); |
292 | bc = hex_decode(inputText + 2, len - 2, VARDATA(result)); |
293 | SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ |
294 | |
295 | PG_RETURN_BYTEA_P(result); |
296 | } |
297 | |
298 | /* Else, it's the traditional escaped style */ |
299 | for (bc = 0, tp = inputText; *tp != '\0'; bc++) |
300 | { |
301 | if (tp[0] != '\\') |
302 | tp++; |
303 | else if ((tp[0] == '\\') && |
304 | (tp[1] >= '0' && tp[1] <= '3') && |
305 | (tp[2] >= '0' && tp[2] <= '7') && |
306 | (tp[3] >= '0' && tp[3] <= '7')) |
307 | tp += 4; |
308 | else if ((tp[0] == '\\') && |
309 | (tp[1] == '\\')) |
310 | tp += 2; |
311 | else |
312 | { |
313 | /* |
314 | * one backslash, not followed by another or ### valid octal |
315 | */ |
316 | ereport(ERROR, |
317 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
318 | errmsg("invalid input syntax for type %s" , "bytea" ))); |
319 | } |
320 | } |
321 | |
322 | bc += VARHDRSZ; |
323 | |
324 | result = (bytea *) palloc(bc); |
325 | SET_VARSIZE(result, bc); |
326 | |
327 | tp = inputText; |
328 | rp = VARDATA(result); |
329 | while (*tp != '\0') |
330 | { |
331 | if (tp[0] != '\\') |
332 | *rp++ = *tp++; |
333 | else if ((tp[0] == '\\') && |
334 | (tp[1] >= '0' && tp[1] <= '3') && |
335 | (tp[2] >= '0' && tp[2] <= '7') && |
336 | (tp[3] >= '0' && tp[3] <= '7')) |
337 | { |
338 | bc = VAL(tp[1]); |
339 | bc <<= 3; |
340 | bc += VAL(tp[2]); |
341 | bc <<= 3; |
342 | *rp++ = bc + VAL(tp[3]); |
343 | |
344 | tp += 4; |
345 | } |
346 | else if ((tp[0] == '\\') && |
347 | (tp[1] == '\\')) |
348 | { |
349 | *rp++ = '\\'; |
350 | tp += 2; |
351 | } |
352 | else |
353 | { |
354 | /* |
355 | * We should never get here. The first pass should not allow it. |
356 | */ |
357 | ereport(ERROR, |
358 | (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
359 | errmsg("invalid input syntax for type %s" , "bytea" ))); |
360 | } |
361 | } |
362 | |
363 | PG_RETURN_BYTEA_P(result); |
364 | } |
365 | |
366 | /* |
367 | * byteaout - converts to printable representation of byte array |
368 | * |
369 | * In the traditional escaped format, non-printable characters are |
370 | * printed as '\nnn' (octal) and '\' as '\\'. |
371 | */ |
372 | Datum |
373 | byteaout(PG_FUNCTION_ARGS) |
374 | { |
375 | bytea *vlena = PG_GETARG_BYTEA_PP(0); |
376 | char *result; |
377 | char *rp; |
378 | |
379 | if (bytea_output == BYTEA_OUTPUT_HEX) |
380 | { |
381 | /* Print hex format */ |
382 | rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); |
383 | *rp++ = '\\'; |
384 | *rp++ = 'x'; |
385 | rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); |
386 | } |
387 | else if (bytea_output == BYTEA_OUTPUT_ESCAPE) |
388 | { |
389 | /* Print traditional escaped format */ |
390 | char *vp; |
391 | int len; |
392 | int i; |
393 | |
394 | len = 1; /* empty string has 1 char */ |
395 | vp = VARDATA_ANY(vlena); |
396 | for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) |
397 | { |
398 | if (*vp == '\\') |
399 | len += 2; |
400 | else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) |
401 | len += 4; |
402 | else |
403 | len++; |
404 | } |
405 | rp = result = (char *) palloc(len); |
406 | vp = VARDATA_ANY(vlena); |
407 | for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) |
408 | { |
409 | if (*vp == '\\') |
410 | { |
411 | *rp++ = '\\'; |
412 | *rp++ = '\\'; |
413 | } |
414 | else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) |
415 | { |
416 | int val; /* holds unprintable chars */ |
417 | |
418 | val = *vp; |
419 | rp[0] = '\\'; |
420 | rp[3] = DIG(val & 07); |
421 | val >>= 3; |
422 | rp[2] = DIG(val & 07); |
423 | val >>= 3; |
424 | rp[1] = DIG(val & 03); |
425 | rp += 4; |
426 | } |
427 | else |
428 | *rp++ = *vp; |
429 | } |
430 | } |
431 | else |
432 | { |
433 | elog(ERROR, "unrecognized bytea_output setting: %d" , |
434 | bytea_output); |
435 | rp = result = NULL; /* keep compiler quiet */ |
436 | } |
437 | *rp = '\0'; |
438 | PG_RETURN_CSTRING(result); |
439 | } |
440 | |
441 | /* |
442 | * bytearecv - converts external binary format to bytea |
443 | */ |
444 | Datum |
445 | bytearecv(PG_FUNCTION_ARGS) |
446 | { |
447 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
448 | bytea *result; |
449 | int nbytes; |
450 | |
451 | nbytes = buf->len - buf->cursor; |
452 | result = (bytea *) palloc(nbytes + VARHDRSZ); |
453 | SET_VARSIZE(result, nbytes + VARHDRSZ); |
454 | pq_copymsgbytes(buf, VARDATA(result), nbytes); |
455 | PG_RETURN_BYTEA_P(result); |
456 | } |
457 | |
458 | /* |
459 | * byteasend - converts bytea to binary format |
460 | * |
461 | * This is a special case: just copy the input... |
462 | */ |
463 | Datum |
464 | byteasend(PG_FUNCTION_ARGS) |
465 | { |
466 | bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); |
467 | |
468 | PG_RETURN_BYTEA_P(vlena); |
469 | } |
470 | |
471 | Datum |
472 | bytea_string_agg_transfn(PG_FUNCTION_ARGS) |
473 | { |
474 | StringInfo state; |
475 | |
476 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
477 | |
478 | /* Append the value unless null. */ |
479 | if (!PG_ARGISNULL(1)) |
480 | { |
481 | bytea *value = PG_GETARG_BYTEA_PP(1); |
482 | |
483 | /* On the first time through, we ignore the delimiter. */ |
484 | if (state == NULL) |
485 | state = makeStringAggState(fcinfo); |
486 | else if (!PG_ARGISNULL(2)) |
487 | { |
488 | bytea *delim = PG_GETARG_BYTEA_PP(2); |
489 | |
490 | appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim)); |
491 | } |
492 | |
493 | appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value)); |
494 | } |
495 | |
496 | /* |
497 | * The transition type for string_agg() is declared to be "internal", |
498 | * which is a pass-by-value type the same size as a pointer. |
499 | */ |
500 | PG_RETURN_POINTER(state); |
501 | } |
502 | |
503 | Datum |
504 | bytea_string_agg_finalfn(PG_FUNCTION_ARGS) |
505 | { |
506 | StringInfo state; |
507 | |
508 | /* cannot be called directly because of internal-type argument */ |
509 | Assert(AggCheckCallContext(fcinfo, NULL)); |
510 | |
511 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
512 | |
513 | if (state != NULL) |
514 | { |
515 | bytea *result; |
516 | |
517 | result = (bytea *) palloc(state->len + VARHDRSZ); |
518 | SET_VARSIZE(result, state->len + VARHDRSZ); |
519 | memcpy(VARDATA(result), state->data, state->len); |
520 | PG_RETURN_BYTEA_P(result); |
521 | } |
522 | else |
523 | PG_RETURN_NULL(); |
524 | } |
525 | |
526 | /* |
527 | * textin - converts "..." to internal representation |
528 | */ |
529 | Datum |
530 | textin(PG_FUNCTION_ARGS) |
531 | { |
532 | char *inputText = PG_GETARG_CSTRING(0); |
533 | |
534 | PG_RETURN_TEXT_P(cstring_to_text(inputText)); |
535 | } |
536 | |
537 | /* |
538 | * textout - converts internal representation to "..." |
539 | */ |
540 | Datum |
541 | textout(PG_FUNCTION_ARGS) |
542 | { |
543 | Datum txt = PG_GETARG_DATUM(0); |
544 | |
545 | PG_RETURN_CSTRING(TextDatumGetCString(txt)); |
546 | } |
547 | |
548 | /* |
549 | * textrecv - converts external binary format to text |
550 | */ |
551 | Datum |
552 | textrecv(PG_FUNCTION_ARGS) |
553 | { |
554 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
555 | text *result; |
556 | char *str; |
557 | int nbytes; |
558 | |
559 | str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); |
560 | |
561 | result = cstring_to_text_with_len(str, nbytes); |
562 | pfree(str); |
563 | PG_RETURN_TEXT_P(result); |
564 | } |
565 | |
566 | /* |
567 | * textsend - converts text to binary format |
568 | */ |
569 | Datum |
570 | textsend(PG_FUNCTION_ARGS) |
571 | { |
572 | text *t = PG_GETARG_TEXT_PP(0); |
573 | StringInfoData buf; |
574 | |
575 | pq_begintypsend(&buf); |
576 | pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); |
577 | PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
578 | } |
579 | |
580 | |
581 | /* |
582 | * unknownin - converts "..." to internal representation |
583 | */ |
584 | Datum |
585 | unknownin(PG_FUNCTION_ARGS) |
586 | { |
587 | char *str = PG_GETARG_CSTRING(0); |
588 | |
589 | /* representation is same as cstring */ |
590 | PG_RETURN_CSTRING(pstrdup(str)); |
591 | } |
592 | |
593 | /* |
594 | * unknownout - converts internal representation to "..." |
595 | */ |
596 | Datum |
597 | unknownout(PG_FUNCTION_ARGS) |
598 | { |
599 | /* representation is same as cstring */ |
600 | char *str = PG_GETARG_CSTRING(0); |
601 | |
602 | PG_RETURN_CSTRING(pstrdup(str)); |
603 | } |
604 | |
605 | /* |
606 | * unknownrecv - converts external binary format to unknown |
607 | */ |
608 | Datum |
609 | unknownrecv(PG_FUNCTION_ARGS) |
610 | { |
611 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
612 | char *str; |
613 | int nbytes; |
614 | |
615 | str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); |
616 | /* representation is same as cstring */ |
617 | PG_RETURN_CSTRING(str); |
618 | } |
619 | |
620 | /* |
621 | * unknownsend - converts unknown to binary format |
622 | */ |
623 | Datum |
624 | unknownsend(PG_FUNCTION_ARGS) |
625 | { |
626 | /* representation is same as cstring */ |
627 | char *str = PG_GETARG_CSTRING(0); |
628 | StringInfoData buf; |
629 | |
630 | pq_begintypsend(&buf); |
631 | pq_sendtext(&buf, str, strlen(str)); |
632 | PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
633 | } |
634 | |
635 | |
636 | /* ========== PUBLIC ROUTINES ========== */ |
637 | |
638 | /* |
639 | * textlen - |
640 | * returns the logical length of a text* |
641 | * (which is less than the VARSIZE of the text*) |
642 | */ |
643 | Datum |
644 | textlen(PG_FUNCTION_ARGS) |
645 | { |
646 | Datum str = PG_GETARG_DATUM(0); |
647 | |
648 | /* try to avoid decompressing argument */ |
649 | PG_RETURN_INT32(text_length(str)); |
650 | } |
651 | |
652 | /* |
653 | * text_length - |
654 | * Does the real work for textlen() |
655 | * |
656 | * This is broken out so it can be called directly by other string processing |
657 | * functions. Note that the argument is passed as a Datum, to indicate that |
658 | * it may still be in compressed form. We can avoid decompressing it at all |
659 | * in some cases. |
660 | */ |
661 | static int32 |
662 | text_length(Datum str) |
663 | { |
664 | /* fastpath when max encoding length is one */ |
665 | if (pg_database_encoding_max_length() == 1) |
666 | PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); |
667 | else |
668 | { |
669 | text *t = DatumGetTextPP(str); |
670 | |
671 | PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t), |
672 | VARSIZE_ANY_EXHDR(t))); |
673 | } |
674 | } |
675 | |
676 | /* |
677 | * textoctetlen - |
678 | * returns the physical length of a text* |
679 | * (which is less than the VARSIZE of the text*) |
680 | */ |
681 | Datum |
682 | textoctetlen(PG_FUNCTION_ARGS) |
683 | { |
684 | Datum str = PG_GETARG_DATUM(0); |
685 | |
686 | /* We need not detoast the input at all */ |
687 | PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); |
688 | } |
689 | |
690 | /* |
691 | * textcat - |
692 | * takes two text* and returns a text* that is the concatenation of |
693 | * the two. |
694 | * |
695 | * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96. |
696 | * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10. |
697 | * Allocate space for output in all cases. |
698 | * XXX - thomas 1997-07-10 |
699 | */ |
700 | Datum |
701 | textcat(PG_FUNCTION_ARGS) |
702 | { |
703 | text *t1 = PG_GETARG_TEXT_PP(0); |
704 | text *t2 = PG_GETARG_TEXT_PP(1); |
705 | |
706 | PG_RETURN_TEXT_P(text_catenate(t1, t2)); |
707 | } |
708 | |
709 | /* |
710 | * text_catenate |
711 | * Guts of textcat(), broken out so it can be used by other functions |
712 | * |
713 | * Arguments can be in short-header form, but not compressed or out-of-line |
714 | */ |
715 | static text * |
716 | text_catenate(text *t1, text *t2) |
717 | { |
718 | text *result; |
719 | int len1, |
720 | len2, |
721 | len; |
722 | char *ptr; |
723 | |
724 | len1 = VARSIZE_ANY_EXHDR(t1); |
725 | len2 = VARSIZE_ANY_EXHDR(t2); |
726 | |
727 | /* paranoia ... probably should throw error instead? */ |
728 | if (len1 < 0) |
729 | len1 = 0; |
730 | if (len2 < 0) |
731 | len2 = 0; |
732 | |
733 | len = len1 + len2 + VARHDRSZ; |
734 | result = (text *) palloc(len); |
735 | |
736 | /* Set size of result string... */ |
737 | SET_VARSIZE(result, len); |
738 | |
739 | /* Fill data field of result string... */ |
740 | ptr = VARDATA(result); |
741 | if (len1 > 0) |
742 | memcpy(ptr, VARDATA_ANY(t1), len1); |
743 | if (len2 > 0) |
744 | memcpy(ptr + len1, VARDATA_ANY(t2), len2); |
745 | |
746 | return result; |
747 | } |
748 | |
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * It is caller's responsibility that there actually are n characters;
 * the string need not be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* Optimization for single-byte encodings: chars and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* otherwise step over the characters one at a time */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}
774 | |
775 | /* |
776 | * text_substr() |
777 | * Return a substring starting at the specified position. |
778 | * - thomas 1997-12-31 |
779 | * |
780 | * Input: |
781 | * - string |
782 | * - starting position (is one-based) |
783 | * - string length |
784 | * |
785 | * If the starting position is zero or less, then return from the start of the string |
786 | * adjusting the length to be consistent with the "negative start" per SQL. |
787 | * If the length is less than zero, return the remaining string. |
788 | * |
789 | * Added multibyte support. |
790 | * - Tatsuo Ishii 1998-4-21 |
791 | * Changed behavior if starting position is less than one to conform to SQL behavior. |
792 | * Formerly returned the entire string; now returns a portion. |
793 | * - Thomas Lockhart 1998-12-10 |
794 | * Now uses faster TOAST-slicing interface |
795 | * - John Gray 2002-02-22 |
796 | * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change |
797 | * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw |
798 | * error; if E < 1, return '', not entire string). Fixed MB related bug when |
799 | * S > LC and < LC + 4 sometimes garbage characters are returned. |
800 | * - Joe Conway 2002-08-10 |
801 | */ |
802 | Datum |
803 | text_substr(PG_FUNCTION_ARGS) |
804 | { |
805 | PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), |
806 | PG_GETARG_INT32(1), |
807 | PG_GETARG_INT32(2), |
808 | false)); |
809 | } |
810 | |
811 | /* |
812 | * text_substr_no_len - |
813 | * Wrapper to avoid opr_sanity failure due to |
814 | * one function accepting a different number of args. |
815 | */ |
816 | Datum |
817 | text_substr_no_len(PG_FUNCTION_ARGS) |
818 | { |
819 | PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), |
820 | PG_GETARG_INT32(1), |
821 | -1, true)); |
822 | } |
823 | |
824 | /* |
825 | * text_substring - |
826 | * Does the real work for text_substr() and text_substr_no_len() |
827 | * |
828 | * This is broken out so it can be called directly by other string processing |
829 | * functions. Note that the argument is passed as a Datum, to indicate that |
830 | * it may still be in compressed/toasted form. We can avoid detoasting all |
831 | * of it in some cases. |
832 | * |
833 | * The result is always a freshly palloc'd datum. |
834 | */ |
835 | static text * |
836 | text_substring(Datum str, int32 start, int32 length, bool length_not_specified) |
837 | { |
838 | int32 eml = pg_database_encoding_max_length(); |
839 | int32 S = start; /* start position */ |
840 | int32 S1; /* adjusted start position */ |
841 | int32 L1; /* adjusted substring length */ |
842 | |
843 | /* life is easy if the encoding max length is 1 */ |
844 | if (eml == 1) |
845 | { |
846 | S1 = Max(S, 1); |
847 | |
848 | if (length_not_specified) /* special case - get length to end of |
849 | * string */ |
850 | L1 = -1; |
851 | else |
852 | { |
853 | /* end position */ |
854 | int E = S + length; |
855 | |
856 | /* |
857 | * A negative value for L is the only way for the end position to |
858 | * be before the start. SQL99 says to throw an error. |
859 | */ |
860 | if (E < S) |
861 | ereport(ERROR, |
862 | (errcode(ERRCODE_SUBSTRING_ERROR), |
863 | errmsg("negative substring length not allowed" ))); |
864 | |
865 | /* |
866 | * A zero or negative value for the end position can happen if the |
867 | * start was negative or one. SQL99 says to return a zero-length |
868 | * string. |
869 | */ |
870 | if (E < 1) |
871 | return cstring_to_text("" ); |
872 | |
873 | L1 = E - S1; |
874 | } |
875 | |
876 | /* |
877 | * If the start position is past the end of the string, SQL99 says to |
878 | * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do |
879 | * that for us. Convert to zero-based starting position |
880 | */ |
881 | return DatumGetTextPSlice(str, S1 - 1, L1); |
882 | } |
883 | else if (eml > 1) |
884 | { |
885 | /* |
886 | * When encoding max length is > 1, we can't get LC without |
887 | * detoasting, so we'll grab a conservatively large slice now and go |
888 | * back later to do the right thing |
889 | */ |
890 | int32 slice_start; |
891 | int32 slice_size; |
892 | int32 slice_strlen; |
893 | text *slice; |
894 | int32 E1; |
895 | int32 i; |
896 | char *p; |
897 | char *s; |
898 | text *ret; |
899 | |
900 | /* |
901 | * if S is past the end of the string, the tuple toaster will return a |
902 | * zero-length string to us |
903 | */ |
904 | S1 = Max(S, 1); |
905 | |
906 | /* |
907 | * We need to start at position zero because there is no way to know |
908 | * in advance which byte offset corresponds to the supplied start |
909 | * position. |
910 | */ |
911 | slice_start = 0; |
912 | |
913 | if (length_not_specified) /* special case - get length to end of |
914 | * string */ |
915 | slice_size = L1 = -1; |
916 | else |
917 | { |
918 | int E = S + length; |
919 | |
920 | /* |
921 | * A negative value for L is the only way for the end position to |
922 | * be before the start. SQL99 says to throw an error. |
923 | */ |
924 | if (E < S) |
925 | ereport(ERROR, |
926 | (errcode(ERRCODE_SUBSTRING_ERROR), |
927 | errmsg("negative substring length not allowed" ))); |
928 | |
929 | /* |
930 | * A zero or negative value for the end position can happen if the |
931 | * start was negative or one. SQL99 says to return a zero-length |
932 | * string. |
933 | */ |
934 | if (E < 1) |
935 | return cstring_to_text("" ); |
936 | |
937 | /* |
938 | * if E is past the end of the string, the tuple toaster will |
939 | * truncate the length for us |
940 | */ |
941 | L1 = E - S1; |
942 | |
943 | /* |
944 | * Total slice size in bytes can't be any longer than the start |
945 | * position plus substring length times the encoding max length. |
946 | */ |
947 | slice_size = (S1 + L1) * eml; |
948 | } |
949 | |
950 | /* |
951 | * If we're working with an untoasted source, no need to do an extra |
952 | * copying step. |
953 | */ |
954 | if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) || |
955 | VARATT_IS_EXTERNAL(DatumGetPointer(str))) |
956 | slice = DatumGetTextPSlice(str, slice_start, slice_size); |
957 | else |
958 | slice = (text *) DatumGetPointer(str); |
959 | |
960 | /* see if we got back an empty string */ |
961 | if (VARSIZE_ANY_EXHDR(slice) == 0) |
962 | { |
963 | if (slice != (text *) DatumGetPointer(str)) |
964 | pfree(slice); |
965 | return cstring_to_text("" ); |
966 | } |
967 | |
968 | /* Now we can get the actual length of the slice in MB characters */ |
969 | slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), |
970 | VARSIZE_ANY_EXHDR(slice)); |
971 | |
972 | /* |
973 | * Check that the start position wasn't > slice_strlen. If so, SQL99 |
974 | * says to return a zero-length string. |
975 | */ |
976 | if (S1 > slice_strlen) |
977 | { |
978 | if (slice != (text *) DatumGetPointer(str)) |
979 | pfree(slice); |
980 | return cstring_to_text("" ); |
981 | } |
982 | |
983 | /* |
984 | * Adjust L1 and E1 now that we know the slice string length. Again |
985 | * remember that S1 is one based, and slice_start is zero based. |
986 | */ |
987 | if (L1 > -1) |
988 | E1 = Min(S1 + L1, slice_start + 1 + slice_strlen); |
989 | else |
990 | E1 = slice_start + 1 + slice_strlen; |
991 | |
992 | /* |
993 | * Find the start position in the slice; remember S1 is not zero based |
994 | */ |
995 | p = VARDATA_ANY(slice); |
996 | for (i = 0; i < S1 - 1; i++) |
997 | p += pg_mblen(p); |
998 | |
999 | /* hang onto a pointer to our start position */ |
1000 | s = p; |
1001 | |
1002 | /* |
1003 | * Count the actual bytes used by the substring of the requested |
1004 | * length. |
1005 | */ |
1006 | for (i = S1; i < E1; i++) |
1007 | p += pg_mblen(p); |
1008 | |
1009 | ret = (text *) palloc(VARHDRSZ + (p - s)); |
1010 | SET_VARSIZE(ret, VARHDRSZ + (p - s)); |
1011 | memcpy(VARDATA(ret), s, (p - s)); |
1012 | |
1013 | if (slice != (text *) DatumGetPointer(str)) |
1014 | pfree(slice); |
1015 | |
1016 | return ret; |
1017 | } |
1018 | else |
1019 | elog(ERROR, "invalid backend encoding: encoding max length < 1" ); |
1020 | |
1021 | /* not reached: suppress compiler warning */ |
1022 | return NULL; |
1023 | } |
1024 | |
1025 | /* |
1026 | * textoverlay |
1027 | * Replace specified substring of first string with second |
1028 | * |
1029 | * The SQL standard defines OVERLAY() in terms of substring and concatenation. |
1030 | * This code is a direct implementation of what the standard says. |
1031 | */ |
1032 | Datum |
1033 | textoverlay(PG_FUNCTION_ARGS) |
1034 | { |
1035 | text *t1 = PG_GETARG_TEXT_PP(0); |
1036 | text *t2 = PG_GETARG_TEXT_PP(1); |
1037 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
1038 | int sl = PG_GETARG_INT32(3); /* substring length */ |
1039 | |
1040 | PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl)); |
1041 | } |
1042 | |
1043 | Datum |
1044 | textoverlay_no_len(PG_FUNCTION_ARGS) |
1045 | { |
1046 | text *t1 = PG_GETARG_TEXT_PP(0); |
1047 | text *t2 = PG_GETARG_TEXT_PP(1); |
1048 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
1049 | int sl; |
1050 | |
1051 | sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */ |
1052 | PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl)); |
1053 | } |
1054 | |
1055 | static text * |
1056 | text_overlay(text *t1, text *t2, int sp, int sl) |
1057 | { |
1058 | text *result; |
1059 | text *s1; |
1060 | text *s2; |
1061 | int sp_pl_sl; |
1062 | |
1063 | /* |
1064 | * Check for possible integer-overflow cases. For negative sp, throw a |
1065 | * "substring length" error because that's what should be expected |
1066 | * according to the spec's definition of OVERLAY(). |
1067 | */ |
1068 | if (sp <= 0) |
1069 | ereport(ERROR, |
1070 | (errcode(ERRCODE_SUBSTRING_ERROR), |
1071 | errmsg("negative substring length not allowed" ))); |
1072 | if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) |
1073 | ereport(ERROR, |
1074 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
1075 | errmsg("integer out of range" ))); |
1076 | |
1077 | s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false); |
1078 | s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); |
1079 | result = text_catenate(s1, t2); |
1080 | result = text_catenate(result, s2); |
1081 | |
1082 | return result; |
1083 | } |
1084 | |
1085 | /* |
1086 | * textpos - |
1087 | * Return the position of the specified substring. |
1088 | * Implements the SQL POSITION() function. |
1089 | * Ref: A Guide To The SQL Standard, Date & Darwen, 1997 |
1090 | * - thomas 1997-07-27 |
1091 | */ |
1092 | Datum |
1093 | textpos(PG_FUNCTION_ARGS) |
1094 | { |
1095 | text *str = PG_GETARG_TEXT_PP(0); |
1096 | text *search_str = PG_GETARG_TEXT_PP(1); |
1097 | |
1098 | PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION())); |
1099 | } |
1100 | |
1101 | /* |
1102 | * text_position - |
1103 | * Does the real work for textpos() |
1104 | * |
1105 | * Inputs: |
1106 | * t1 - string to be searched |
1107 | * t2 - pattern to match within t1 |
1108 | * Result: |
1109 | * Character index of the first matched char, starting from 1, |
1110 | * or 0 if no match. |
1111 | * |
1112 | * This is broken out so it can be called directly by other string processing |
1113 | * functions. |
1114 | */ |
1115 | static int |
1116 | text_position(text *t1, text *t2, Oid collid) |
1117 | { |
1118 | TextPositionState state; |
1119 | int result; |
1120 | |
1121 | if (VARSIZE_ANY_EXHDR(t1) < 1 || VARSIZE_ANY_EXHDR(t2) < 1) |
1122 | return 0; |
1123 | |
1124 | text_position_setup(t1, t2, collid, &state); |
1125 | if (!text_position_next(&state)) |
1126 | result = 0; |
1127 | else |
1128 | result = text_position_get_match_pos(&state); |
1129 | text_position_cleanup(&state); |
1130 | return result; |
1131 | } |
1132 | |
1133 | |
1134 | /* |
1135 | * text_position_setup, text_position_next, text_position_cleanup - |
1136 | * Component steps of text_position() |
1137 | * |
1138 | * These are broken out so that a string can be efficiently searched for |
1139 | * multiple occurrences of the same pattern. text_position_next may be |
1140 | * called multiple times, and it advances to the next match on each call. |
1141 | * text_position_get_match_ptr() and text_position_get_match_pos() return |
1142 | * a pointer or 1-based character position of the last match, respectively. |
1143 | * |
1144 | * The "state" variable is normally just a local variable in the caller. |
1145 | * |
1146 | * NOTE: text_position_next skips over the matched portion. For example, |
1147 | * searching for "xx" in "xxx" returns only one match, not two. |
1148 | */ |
1149 | |
1150 | static void |
1151 | text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state) |
1152 | { |
1153 | int len1 = VARSIZE_ANY_EXHDR(t1); |
1154 | int len2 = VARSIZE_ANY_EXHDR(t2); |
1155 | pg_locale_t mylocale = 0; |
1156 | |
1157 | check_collation_set(collid); |
1158 | |
1159 | if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) |
1160 | mylocale = pg_newlocale_from_collation(collid); |
1161 | |
1162 | if (mylocale && !mylocale->deterministic) |
1163 | ereport(ERROR, |
1164 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
1165 | errmsg("nondeterministic collations are not supported for substring searches" ))); |
1166 | |
1167 | Assert(len1 > 0); |
1168 | Assert(len2 > 0); |
1169 | |
1170 | /* |
1171 | * Even with a multi-byte encoding, we perform the search using the raw |
1172 | * byte sequence, ignoring multibyte issues. For UTF-8, that works fine, |
1173 | * because in UTF-8 the byte sequence of one character cannot contain |
1174 | * another character. For other multi-byte encodings, we do the search |
1175 | * initially as a simple byte search, ignoring multibyte issues, but |
1176 | * verify afterwards that the match we found is at a character boundary, |
1177 | * and continue the search if it was a false match. |
1178 | */ |
1179 | if (pg_database_encoding_max_length() == 1) |
1180 | { |
1181 | state->is_multibyte = false; |
1182 | state->is_multibyte_char_in_char = false; |
1183 | } |
1184 | else if (GetDatabaseEncoding() == PG_UTF8) |
1185 | { |
1186 | state->is_multibyte = true; |
1187 | state->is_multibyte_char_in_char = false; |
1188 | } |
1189 | else |
1190 | { |
1191 | state->is_multibyte = true; |
1192 | state->is_multibyte_char_in_char = true; |
1193 | } |
1194 | |
1195 | state->str1 = VARDATA_ANY(t1); |
1196 | state->str2 = VARDATA_ANY(t2); |
1197 | state->len1 = len1; |
1198 | state->len2 = len2; |
1199 | state->last_match = NULL; |
1200 | state->refpoint = state->str1; |
1201 | state->refpos = 0; |
1202 | |
1203 | /* |
1204 | * Prepare the skip table for Boyer-Moore-Horspool searching. In these |
1205 | * notes we use the terminology that the "haystack" is the string to be |
1206 | * searched (t1) and the "needle" is the pattern being sought (t2). |
1207 | * |
1208 | * If the needle is empty or bigger than the haystack then there is no |
1209 | * point in wasting cycles initializing the table. We also choose not to |
1210 | * use B-M-H for needles of length 1, since the skip table can't possibly |
1211 | * save anything in that case. |
1212 | */ |
1213 | if (len1 >= len2 && len2 > 1) |
1214 | { |
1215 | int searchlength = len1 - len2; |
1216 | int skiptablemask; |
1217 | int last; |
1218 | int i; |
1219 | const char *str2 = state->str2; |
1220 | |
1221 | /* |
1222 | * First we must determine how much of the skip table to use. The |
1223 | * declaration of TextPositionState allows up to 256 elements, but for |
1224 | * short search problems we don't really want to have to initialize so |
1225 | * many elements --- it would take too long in comparison to the |
1226 | * actual search time. So we choose a useful skip table size based on |
1227 | * the haystack length minus the needle length. The closer the needle |
1228 | * length is to the haystack length the less useful skipping becomes. |
1229 | * |
1230 | * Note: since we use bit-masking to select table elements, the skip |
1231 | * table size MUST be a power of 2, and so the mask must be 2^N-1. |
1232 | */ |
1233 | if (searchlength < 16) |
1234 | skiptablemask = 3; |
1235 | else if (searchlength < 64) |
1236 | skiptablemask = 7; |
1237 | else if (searchlength < 128) |
1238 | skiptablemask = 15; |
1239 | else if (searchlength < 512) |
1240 | skiptablemask = 31; |
1241 | else if (searchlength < 2048) |
1242 | skiptablemask = 63; |
1243 | else if (searchlength < 4096) |
1244 | skiptablemask = 127; |
1245 | else |
1246 | skiptablemask = 255; |
1247 | state->skiptablemask = skiptablemask; |
1248 | |
1249 | /* |
1250 | * Initialize the skip table. We set all elements to the needle |
1251 | * length, since this is the correct skip distance for any character |
1252 | * not found in the needle. |
1253 | */ |
1254 | for (i = 0; i <= skiptablemask; i++) |
1255 | state->skiptable[i] = len2; |
1256 | |
1257 | /* |
1258 | * Now examine the needle. For each character except the last one, |
1259 | * set the corresponding table element to the appropriate skip |
1260 | * distance. Note that when two characters share the same skip table |
1261 | * entry, the one later in the needle must determine the skip |
1262 | * distance. |
1263 | */ |
1264 | last = len2 - 1; |
1265 | |
1266 | for (i = 0; i < last; i++) |
1267 | state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i; |
1268 | } |
1269 | } |
1270 | |
1271 | /* |
1272 | * Advance to the next match, starting from the end of the previous match |
1273 | * (or the beginning of the string, on first call). Returns true if a match |
1274 | * is found. |
1275 | */ |
1276 | static bool |
1277 | text_position_next(TextPositionState *state) |
1278 | { |
1279 | int needle_len = state->len2; |
1280 | char *start_ptr; |
1281 | char *matchptr; |
1282 | |
1283 | if (needle_len <= 0) |
1284 | return false; /* result for empty pattern */ |
1285 | |
1286 | /* Start from the point right after the previous match. */ |
1287 | if (state->last_match) |
1288 | start_ptr = state->last_match + needle_len; |
1289 | else |
1290 | start_ptr = state->str1; |
1291 | |
1292 | retry: |
1293 | matchptr = text_position_next_internal(start_ptr, state); |
1294 | |
1295 | if (!matchptr) |
1296 | return false; |
1297 | |
1298 | /* |
1299 | * Found a match for the byte sequence. If this is a multibyte encoding, |
1300 | * where one character's byte sequence can appear inside a longer |
1301 | * multi-byte character, we need to verify that the match was at a |
1302 | * character boundary, not in the middle of a multi-byte character. |
1303 | */ |
1304 | if (state->is_multibyte_char_in_char) |
1305 | { |
1306 | /* Walk one character at a time, until we reach the match. */ |
1307 | |
1308 | /* the search should never move backwards. */ |
1309 | Assert(state->refpoint <= matchptr); |
1310 | |
1311 | while (state->refpoint < matchptr) |
1312 | { |
1313 | /* step to next character. */ |
1314 | state->refpoint += pg_mblen(state->refpoint); |
1315 | state->refpos++; |
1316 | |
1317 | /* |
1318 | * If we stepped over the match's start position, then it was a |
1319 | * false positive, where the byte sequence appeared in the middle |
1320 | * of a multi-byte character. Skip it, and continue the search at |
1321 | * the next character boundary. |
1322 | */ |
1323 | if (state->refpoint > matchptr) |
1324 | { |
1325 | start_ptr = state->refpoint; |
1326 | goto retry; |
1327 | } |
1328 | } |
1329 | } |
1330 | |
1331 | state->last_match = matchptr; |
1332 | return true; |
1333 | } |
1334 | |
1335 | /* |
1336 | * Subroutine of text_position_next(). This searches for the raw byte |
1337 | * sequence, ignoring any multi-byte encoding issues. Returns the first |
1338 | * match starting at 'start_ptr', or NULL if no match is found. |
1339 | */ |
1340 | static char * |
1341 | text_position_next_internal(char *start_ptr, TextPositionState *state) |
1342 | { |
1343 | int haystack_len = state->len1; |
1344 | int needle_len = state->len2; |
1345 | int skiptablemask = state->skiptablemask; |
1346 | const char *haystack = state->str1; |
1347 | const char *needle = state->str2; |
1348 | const char *haystack_end = &haystack[haystack_len]; |
1349 | const char *hptr; |
1350 | |
1351 | Assert(start_ptr >= haystack && start_ptr <= haystack_end); |
1352 | |
1353 | if (needle_len == 1) |
1354 | { |
1355 | /* No point in using B-M-H for a one-character needle */ |
1356 | char nchar = *needle; |
1357 | |
1358 | hptr = start_ptr; |
1359 | while (hptr < haystack_end) |
1360 | { |
1361 | if (*hptr == nchar) |
1362 | return (char *) hptr; |
1363 | hptr++; |
1364 | } |
1365 | } |
1366 | else |
1367 | { |
1368 | const char *needle_last = &needle[needle_len - 1]; |
1369 | |
1370 | /* Start at startpos plus the length of the needle */ |
1371 | hptr = start_ptr + needle_len - 1; |
1372 | while (hptr < haystack_end) |
1373 | { |
1374 | /* Match the needle scanning *backward* */ |
1375 | const char *nptr; |
1376 | const char *p; |
1377 | |
1378 | nptr = needle_last; |
1379 | p = hptr; |
1380 | while (*nptr == *p) |
1381 | { |
1382 | /* Matched it all? If so, return 1-based position */ |
1383 | if (nptr == needle) |
1384 | return (char *) p; |
1385 | nptr--, p--; |
1386 | } |
1387 | |
1388 | /* |
1389 | * No match, so use the haystack char at hptr to decide how far to |
1390 | * advance. If the needle had any occurrence of that character |
1391 | * (or more precisely, one sharing the same skiptable entry) |
1392 | * before its last character, then we advance far enough to align |
1393 | * the last such needle character with that haystack position. |
1394 | * Otherwise we can advance by the whole needle length. |
1395 | */ |
1396 | hptr += state->skiptable[(unsigned char) *hptr & skiptablemask]; |
1397 | } |
1398 | } |
1399 | |
1400 | return 0; /* not found */ |
1401 | } |
1402 | |
1403 | /* |
1404 | * Return a pointer to the current match. |
1405 | * |
1406 | * The returned pointer points into correct position in the original |
1407 | * the haystack string. |
1408 | */ |
1409 | static char * |
1410 | text_position_get_match_ptr(TextPositionState *state) |
1411 | { |
1412 | return state->last_match; |
1413 | } |
1414 | |
1415 | /* |
1416 | * Return the offset of the current match. |
1417 | * |
1418 | * The offset is in characters, 1-based. |
1419 | */ |
1420 | static int |
1421 | text_position_get_match_pos(TextPositionState *state) |
1422 | { |
1423 | if (!state->is_multibyte) |
1424 | return state->last_match - state->str1 + 1; |
1425 | else |
1426 | { |
1427 | /* Convert the byte position to char position. */ |
1428 | while (state->refpoint < state->last_match) |
1429 | { |
1430 | state->refpoint += pg_mblen(state->refpoint); |
1431 | state->refpos++; |
1432 | } |
1433 | Assert(state->refpoint == state->last_match); |
1434 | return state->refpos + 1; |
1435 | } |
1436 | } |
1437 | |
1438 | static void |
1439 | text_position_cleanup(TextPositionState *state) |
1440 | { |
1441 | /* no cleanup needed */ |
1442 | } |
1443 | |
1444 | static void |
1445 | check_collation_set(Oid collid) |
1446 | { |
1447 | if (!OidIsValid(collid)) |
1448 | { |
1449 | /* |
1450 | * This typically means that the parser could not resolve a conflict |
1451 | * of implicit collations, so report it that way. |
1452 | */ |
1453 | ereport(ERROR, |
1454 | (errcode(ERRCODE_INDETERMINATE_COLLATION), |
1455 | errmsg("could not determine which collation to use for string comparison" ), |
1456 | errhint("Use the COLLATE clause to set the collation explicitly." ))); |
1457 | } |
1458 | } |
1459 | |
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 * to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * arg1/len1 and arg2/len2 give the two strings as pointer plus byte length;
 * they need not be null-terminated.  collid selects the collation and must
 * be valid (checked via check_collation_set()).
 *
 * Note: many functions that depend on this are marked leakproof; therefore,
 * avoid reporting the actual contents of the input when throwing errors.
 * All errors herein should be things that can't happen except on corrupt
 * data, anyway; otherwise we will have trouble with indexing strings that
 * would cause them.
 */
int
varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
{
	int			result;

	check_collation_set(collid);

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		/* Bytewise comparison; on a common prefix the shorter string sorts first. */
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		/* Stack buffers used when the null-terminated copies fit. */
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;
		/* Stays 0 for the default collation. */
		pg_locale_t mylocale = 0;

		if (collid != DEFAULT_COLLATION_OID)
			mylocale = pg_newlocale_from_collation(collid);

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;
			int			a2len;
			int			r;

			/*
			 * Buffer sizes are in bytes.  The palloc'd size of len * 2 + 2
			 * gives a capacity of len + 1 two-byte units, which is what
			 * MultiByteToWideChar is told below (a1len / 2).
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* stupid Microsloth API does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* r is the number of wide chars written; terminate after them. */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* Clear errno so that %m below reflects only the comparison. */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/*
			 * Break tie if necessary.  The early-exit above already returned
			 * for bytewise-equal inputs, so strings that collate as equal
			 * here differ in their bytes; order them bytewise unless the
			 * collation is nondeterministic.
			 */
			if (result == 0 &&
				(!mylocale || mylocale->deterministic))
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			/* Free only what was palloc'd, not the stack buffers. */
			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/* Make null-terminated copies, on the stack when they fit. */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				/* In UTF-8 the inputs are handed to ICU directly, with lengths. */
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					/* Otherwise convert both inputs to UChar strings first. */
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/*
		 * Break tie if necessary.  Strings that collate as equal are ordered
		 * by strcmp() on the null-terminated copies, unless the collation is
		 * nondeterministic.
		 */
		if (result == 0 &&
			(!mylocale || mylocale->deterministic))
			result = strcmp(a1p, a2p);

		/* Free only what was palloc'd, not the stack buffers. */
		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1684 | |
1685 | /* text_cmp() |
1686 | * Internal comparison function for text strings. |
1687 | * Returns -1, 0 or 1 |
1688 | */ |
1689 | static int |
1690 | text_cmp(text *arg1, text *arg2, Oid collid) |
1691 | { |
1692 | char *a1p, |
1693 | *a2p; |
1694 | int len1, |
1695 | len2; |
1696 | |
1697 | a1p = VARDATA_ANY(arg1); |
1698 | a2p = VARDATA_ANY(arg2); |
1699 | |
1700 | len1 = VARSIZE_ANY_EXHDR(arg1); |
1701 | len2 = VARSIZE_ANY_EXHDR(arg2); |
1702 | |
1703 | return varstr_cmp(a1p, len1, a2p, len2, collid); |
1704 | } |
1705 | |
1706 | /* |
1707 | * Comparison functions for text strings. |
1708 | * |
1709 | * Note: btree indexes need these routines not to leak memory; therefore, |
1710 | * be careful to free working copies of toasted datums. Most places don't |
1711 | * need to be so careful. |
1712 | */ |
1713 | |
1714 | Datum |
1715 | texteq(PG_FUNCTION_ARGS) |
1716 | { |
1717 | Oid collid = PG_GET_COLLATION(); |
1718 | bool result; |
1719 | |
1720 | check_collation_set(collid); |
1721 | |
1722 | if (lc_collate_is_c(collid) || |
1723 | collid == DEFAULT_COLLATION_OID || |
1724 | pg_newlocale_from_collation(collid)->deterministic) |
1725 | { |
1726 | Datum arg1 = PG_GETARG_DATUM(0); |
1727 | Datum arg2 = PG_GETARG_DATUM(1); |
1728 | Size len1, |
1729 | len2; |
1730 | |
1731 | /* |
1732 | * Since we only care about equality or not-equality, we can avoid all |
1733 | * the expense of strcoll() here, and just do bitwise comparison. In |
1734 | * fact, we don't even have to do a bitwise comparison if we can show |
1735 | * the lengths of the strings are unequal; which might save us from |
1736 | * having to detoast one or both values. |
1737 | */ |
1738 | len1 = toast_raw_datum_size(arg1); |
1739 | len2 = toast_raw_datum_size(arg2); |
1740 | if (len1 != len2) |
1741 | result = false; |
1742 | else |
1743 | { |
1744 | text *targ1 = DatumGetTextPP(arg1); |
1745 | text *targ2 = DatumGetTextPP(arg2); |
1746 | |
1747 | result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2), |
1748 | len1 - VARHDRSZ) == 0); |
1749 | |
1750 | PG_FREE_IF_COPY(targ1, 0); |
1751 | PG_FREE_IF_COPY(targ2, 1); |
1752 | } |
1753 | } |
1754 | else |
1755 | { |
1756 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1757 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1758 | |
1759 | result = (text_cmp(arg1, arg2, collid) == 0); |
1760 | |
1761 | PG_FREE_IF_COPY(arg1, 0); |
1762 | PG_FREE_IF_COPY(arg2, 1); |
1763 | } |
1764 | |
1765 | PG_RETURN_BOOL(result); |
1766 | } |
1767 | |
1768 | Datum |
1769 | textne(PG_FUNCTION_ARGS) |
1770 | { |
1771 | Oid collid = PG_GET_COLLATION(); |
1772 | bool result; |
1773 | |
1774 | check_collation_set(collid); |
1775 | |
1776 | if (lc_collate_is_c(collid) || |
1777 | collid == DEFAULT_COLLATION_OID || |
1778 | pg_newlocale_from_collation(collid)->deterministic) |
1779 | { |
1780 | Datum arg1 = PG_GETARG_DATUM(0); |
1781 | Datum arg2 = PG_GETARG_DATUM(1); |
1782 | Size len1, |
1783 | len2; |
1784 | |
1785 | /* See comment in texteq() */ |
1786 | len1 = toast_raw_datum_size(arg1); |
1787 | len2 = toast_raw_datum_size(arg2); |
1788 | if (len1 != len2) |
1789 | result = true; |
1790 | else |
1791 | { |
1792 | text *targ1 = DatumGetTextPP(arg1); |
1793 | text *targ2 = DatumGetTextPP(arg2); |
1794 | |
1795 | result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2), |
1796 | len1 - VARHDRSZ) != 0); |
1797 | |
1798 | PG_FREE_IF_COPY(targ1, 0); |
1799 | PG_FREE_IF_COPY(targ2, 1); |
1800 | } |
1801 | } |
1802 | else |
1803 | { |
1804 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1805 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1806 | |
1807 | result = (text_cmp(arg1, arg2, collid) != 0); |
1808 | |
1809 | PG_FREE_IF_COPY(arg1, 0); |
1810 | PG_FREE_IF_COPY(arg2, 1); |
1811 | } |
1812 | |
1813 | PG_RETURN_BOOL(result); |
1814 | } |
1815 | |
1816 | Datum |
1817 | text_lt(PG_FUNCTION_ARGS) |
1818 | { |
1819 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1820 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1821 | bool result; |
1822 | |
1823 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0); |
1824 | |
1825 | PG_FREE_IF_COPY(arg1, 0); |
1826 | PG_FREE_IF_COPY(arg2, 1); |
1827 | |
1828 | PG_RETURN_BOOL(result); |
1829 | } |
1830 | |
1831 | Datum |
1832 | text_le(PG_FUNCTION_ARGS) |
1833 | { |
1834 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1835 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1836 | bool result; |
1837 | |
1838 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0); |
1839 | |
1840 | PG_FREE_IF_COPY(arg1, 0); |
1841 | PG_FREE_IF_COPY(arg2, 1); |
1842 | |
1843 | PG_RETURN_BOOL(result); |
1844 | } |
1845 | |
1846 | Datum |
1847 | text_gt(PG_FUNCTION_ARGS) |
1848 | { |
1849 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1850 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1851 | bool result; |
1852 | |
1853 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0); |
1854 | |
1855 | PG_FREE_IF_COPY(arg1, 0); |
1856 | PG_FREE_IF_COPY(arg2, 1); |
1857 | |
1858 | PG_RETURN_BOOL(result); |
1859 | } |
1860 | |
1861 | Datum |
1862 | text_ge(PG_FUNCTION_ARGS) |
1863 | { |
1864 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1865 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1866 | bool result; |
1867 | |
1868 | result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0); |
1869 | |
1870 | PG_FREE_IF_COPY(arg1, 0); |
1871 | PG_FREE_IF_COPY(arg2, 1); |
1872 | |
1873 | PG_RETURN_BOOL(result); |
1874 | } |
1875 | |
1876 | Datum |
1877 | text_starts_with(PG_FUNCTION_ARGS) |
1878 | { |
1879 | Datum arg1 = PG_GETARG_DATUM(0); |
1880 | Datum arg2 = PG_GETARG_DATUM(1); |
1881 | Oid collid = PG_GET_COLLATION(); |
1882 | pg_locale_t mylocale = 0; |
1883 | bool result; |
1884 | Size len1, |
1885 | len2; |
1886 | |
1887 | check_collation_set(collid); |
1888 | |
1889 | if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) |
1890 | mylocale = pg_newlocale_from_collation(collid); |
1891 | |
1892 | if (mylocale && !mylocale->deterministic) |
1893 | ereport(ERROR, |
1894 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
1895 | errmsg("nondeterministic collations are not supported for substring searches" ))); |
1896 | |
1897 | len1 = toast_raw_datum_size(arg1); |
1898 | len2 = toast_raw_datum_size(arg2); |
1899 | if (len2 > len1) |
1900 | result = false; |
1901 | else |
1902 | { |
1903 | text *targ1 = text_substring(arg1, 1, len2, false); |
1904 | text *targ2 = DatumGetTextPP(arg2); |
1905 | |
1906 | result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2), |
1907 | VARSIZE_ANY_EXHDR(targ2)) == 0); |
1908 | |
1909 | PG_FREE_IF_COPY(targ1, 0); |
1910 | PG_FREE_IF_COPY(targ2, 1); |
1911 | } |
1912 | |
1913 | PG_RETURN_BOOL(result); |
1914 | } |
1915 | |
1916 | Datum |
1917 | bttextcmp(PG_FUNCTION_ARGS) |
1918 | { |
1919 | text *arg1 = PG_GETARG_TEXT_PP(0); |
1920 | text *arg2 = PG_GETARG_TEXT_PP(1); |
1921 | int32 result; |
1922 | |
1923 | result = text_cmp(arg1, arg2, PG_GET_COLLATION()); |
1924 | |
1925 | PG_FREE_IF_COPY(arg1, 0); |
1926 | PG_FREE_IF_COPY(arg2, 1); |
1927 | |
1928 | PG_RETURN_INT32(result); |
1929 | } |
1930 | |
1931 | Datum |
1932 | bttextsortsupport(PG_FUNCTION_ARGS) |
1933 | { |
1934 | SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); |
1935 | Oid collid = ssup->ssup_collation; |
1936 | MemoryContext oldcontext; |
1937 | |
1938 | oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); |
1939 | |
1940 | /* Use generic string SortSupport */ |
1941 | varstr_sortsupport(ssup, TEXTOID, collid); |
1942 | |
1943 | MemoryContextSwitchTo(oldcontext); |
1944 | |
1945 | PG_RETURN_VOID(); |
1946 | } |
1947 | |
1948 | /* |
1949 | * Generic sortsupport interface for character type's operator classes. |
1950 | * Includes locale support, and support for BpChar semantics (i.e. removing |
1951 | * trailing spaces before comparison). |
1952 | * |
1953 | * Relies on the assumption that text, VarChar, BpChar, and bytea all have the |
1954 | * same representation. Callers that always use the C collation (e.g. |
1955 | * non-collatable type callers like bytea) may have NUL bytes in their strings; |
1956 | * this will not work with any other collation, though. |
1957 | */ |
1958 | void |
1959 | varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) |
1960 | { |
1961 | bool abbreviate = ssup->abbreviate; |
1962 | bool collate_c = false; |
1963 | VarStringSortSupport *sss; |
1964 | pg_locale_t locale = 0; |
1965 | |
1966 | check_collation_set(collid); |
1967 | |
1968 | /* |
1969 | * If possible, set ssup->comparator to a function which can be used to |
1970 | * directly compare two datums. If we can do this, we'll avoid the |
1971 | * overhead of a trip through the fmgr layer for every comparison, which |
1972 | * can be substantial. |
1973 | * |
1974 | * Most typically, we'll set the comparator to varlenafastcmp_locale, |
1975 | * which uses strcoll() to perform comparisons. We use that for the |
1976 | * BpChar case too, but type NAME uses namefastcmp_locale. However, if |
1977 | * LC_COLLATE = C, we can make things quite a bit faster with |
1978 | * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use |
1979 | * memcmp() rather than strcoll(). |
1980 | */ |
1981 | if (lc_collate_is_c(collid)) |
1982 | { |
1983 | if (typid == BPCHAROID) |
1984 | ssup->comparator = bpcharfastcmp_c; |
1985 | else if (typid == NAMEOID) |
1986 | { |
1987 | ssup->comparator = namefastcmp_c; |
1988 | /* Not supporting abbreviation with type NAME, for now */ |
1989 | abbreviate = false; |
1990 | } |
1991 | else |
1992 | ssup->comparator = varstrfastcmp_c; |
1993 | |
1994 | collate_c = true; |
1995 | } |
1996 | else |
1997 | { |
1998 | /* |
1999 | * We need a collation-sensitive comparison. To make things faster, |
2000 | * we'll figure out the collation based on the locale id and cache the |
2001 | * result. |
2002 | */ |
2003 | if (collid != DEFAULT_COLLATION_OID) |
2004 | locale = pg_newlocale_from_collation(collid); |
2005 | |
2006 | /* |
2007 | * There is a further exception on Windows. When the database |
2008 | * encoding is UTF-8 and we are not using the C collation, complex |
2009 | * hacks are required. We don't currently have a comparator that |
2010 | * handles that case, so we fall back on the slow method of having the |
2011 | * sort code invoke bttextcmp() (in the case of text) via the fmgr |
2012 | * trampoline. ICU locales work just the same on Windows, however. |
2013 | */ |
2014 | #ifdef WIN32 |
2015 | if (GetDatabaseEncoding() == PG_UTF8 && |
2016 | !(locale && locale->provider == COLLPROVIDER_ICU)) |
2017 | return; |
2018 | #endif |
2019 | |
2020 | /* |
2021 | * We use varlenafastcmp_locale except for type NAME. |
2022 | */ |
2023 | if (typid == NAMEOID) |
2024 | { |
2025 | ssup->comparator = namefastcmp_locale; |
2026 | /* Not supporting abbreviation with type NAME, for now */ |
2027 | abbreviate = false; |
2028 | } |
2029 | else |
2030 | ssup->comparator = varlenafastcmp_locale; |
2031 | } |
2032 | |
2033 | /* |
2034 | * Unfortunately, it seems that abbreviation for non-C collations is |
2035 | * broken on many common platforms; testing of multiple versions of glibc |
2036 | * reveals that, for many locales, strcoll() and strxfrm() do not return |
2037 | * consistent results, which is fatal to this optimization. While no |
2038 | * other libc other than Cygwin has so far been shown to have a problem, |
2039 | * we take the conservative course of action for right now and disable |
2040 | * this categorically. (Users who are certain this isn't a problem on |
2041 | * their system can define TRUST_STRXFRM.) |
2042 | * |
2043 | * Even apart from the risk of broken locales, it's possible that there |
2044 | * are platforms where the use of abbreviated keys should be disabled at |
2045 | * compile time. Having only 4 byte datums could make worst-case |
2046 | * performance drastically more likely, for example. Moreover, macOS's |
2047 | * strxfrm() implementation is known to not effectively concentrate a |
2048 | * significant amount of entropy from the original string in earlier |
2049 | * transformed blobs. It's possible that other supported platforms are |
2050 | * similarly encumbered. So, if we ever get past disabling this |
2051 | * categorically, we may still want or need to disable it for particular |
2052 | * platforms. |
2053 | */ |
2054 | #ifndef TRUST_STRXFRM |
2055 | if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU)) |
2056 | abbreviate = false; |
2057 | #endif |
2058 | |
2059 | /* |
2060 | * If we're using abbreviated keys, or if we're using a locale-aware |
2061 | * comparison, we need to initialize a StringSortSupport object. Both |
2062 | * cases will make use of the temporary buffers we initialize here for |
2063 | * scratch space (and to detect requirement for BpChar semantics from |
2064 | * caller), and the abbreviation case requires additional state. |
2065 | */ |
2066 | if (abbreviate || !collate_c) |
2067 | { |
2068 | sss = palloc(sizeof(VarStringSortSupport)); |
2069 | sss->buf1 = palloc(TEXTBUFLEN); |
2070 | sss->buflen1 = TEXTBUFLEN; |
2071 | sss->buf2 = palloc(TEXTBUFLEN); |
2072 | sss->buflen2 = TEXTBUFLEN; |
2073 | /* Start with invalid values */ |
2074 | sss->last_len1 = -1; |
2075 | sss->last_len2 = -1; |
2076 | /* Initialize */ |
2077 | sss->last_returned = 0; |
2078 | sss->locale = locale; |
2079 | |
2080 | /* |
2081 | * To avoid somehow confusing a strxfrm() blob and an original string, |
2082 | * constantly keep track of the variety of data that buf1 and buf2 |
2083 | * currently contain. |
2084 | * |
2085 | * Comparisons may be interleaved with conversion calls. Frequently, |
2086 | * conversions and comparisons are batched into two distinct phases, |
2087 | * but the correctness of caching cannot hinge upon this. For |
2088 | * comparison caching, buffer state is only trusted if cache_blob is |
2089 | * found set to false, whereas strxfrm() caching only trusts the state |
2090 | * when cache_blob is found set to true. |
2091 | * |
2092 | * Arbitrarily initialize cache_blob to true. |
2093 | */ |
2094 | sss->cache_blob = true; |
2095 | sss->collate_c = collate_c; |
2096 | sss->typid = typid; |
2097 | ssup->ssup_extra = sss; |
2098 | |
2099 | /* |
2100 | * If possible, plan to use the abbreviated keys optimization. The |
2101 | * core code may switch back to authoritative comparator should |
2102 | * abbreviation be aborted. |
2103 | */ |
2104 | if (abbreviate) |
2105 | { |
2106 | sss->prop_card = 0.20; |
2107 | initHyperLogLog(&sss->abbr_card, 10); |
2108 | initHyperLogLog(&sss->full_card, 10); |
2109 | ssup->abbrev_full_comparator = ssup->comparator; |
2110 | ssup->comparator = varstrcmp_abbrev; |
2111 | ssup->abbrev_converter = varstr_abbrev_convert; |
2112 | ssup->abbrev_abort = varstr_abbrev_abort; |
2113 | } |
2114 | } |
2115 | } |
2116 | |
2117 | /* |
2118 | * sortsupport comparison func (for C locale case) |
2119 | */ |
2120 | static int |
2121 | varstrfastcmp_c(Datum x, Datum y, SortSupport ssup) |
2122 | { |
2123 | VarString *arg1 = DatumGetVarStringPP(x); |
2124 | VarString *arg2 = DatumGetVarStringPP(y); |
2125 | char *a1p, |
2126 | *a2p; |
2127 | int len1, |
2128 | len2, |
2129 | result; |
2130 | |
2131 | a1p = VARDATA_ANY(arg1); |
2132 | a2p = VARDATA_ANY(arg2); |
2133 | |
2134 | len1 = VARSIZE_ANY_EXHDR(arg1); |
2135 | len2 = VARSIZE_ANY_EXHDR(arg2); |
2136 | |
2137 | result = memcmp(a1p, a2p, Min(len1, len2)); |
2138 | if ((result == 0) && (len1 != len2)) |
2139 | result = (len1 < len2) ? -1 : 1; |
2140 | |
2141 | /* We can't afford to leak memory here. */ |
2142 | if (PointerGetDatum(arg1) != x) |
2143 | pfree(arg1); |
2144 | if (PointerGetDatum(arg2) != y) |
2145 | pfree(arg2); |
2146 | |
2147 | return result; |
2148 | } |
2149 | |
2150 | /* |
2151 | * sortsupport comparison func (for BpChar C locale case) |
2152 | * |
2153 | * BpChar outsources its sortsupport to this module. Specialization for the |
2154 | * varstr_sortsupport BpChar case, modeled on |
2155 | * internal_bpchar_pattern_compare(). |
2156 | */ |
2157 | static int |
2158 | bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup) |
2159 | { |
2160 | BpChar *arg1 = DatumGetBpCharPP(x); |
2161 | BpChar *arg2 = DatumGetBpCharPP(y); |
2162 | char *a1p, |
2163 | *a2p; |
2164 | int len1, |
2165 | len2, |
2166 | result; |
2167 | |
2168 | a1p = VARDATA_ANY(arg1); |
2169 | a2p = VARDATA_ANY(arg2); |
2170 | |
2171 | len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1)); |
2172 | len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2)); |
2173 | |
2174 | result = memcmp(a1p, a2p, Min(len1, len2)); |
2175 | if ((result == 0) && (len1 != len2)) |
2176 | result = (len1 < len2) ? -1 : 1; |
2177 | |
2178 | /* We can't afford to leak memory here. */ |
2179 | if (PointerGetDatum(arg1) != x) |
2180 | pfree(arg1); |
2181 | if (PointerGetDatum(arg2) != y) |
2182 | pfree(arg2); |
2183 | |
2184 | return result; |
2185 | } |
2186 | |
2187 | /* |
2188 | * sortsupport comparison func (for NAME C locale case) |
2189 | */ |
2190 | static int |
2191 | namefastcmp_c(Datum x, Datum y, SortSupport ssup) |
2192 | { |
2193 | Name arg1 = DatumGetName(x); |
2194 | Name arg2 = DatumGetName(y); |
2195 | |
2196 | return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN); |
2197 | } |
2198 | |
2199 | /* |
2200 | * sortsupport comparison func (for locale case with all varlena types) |
2201 | */ |
2202 | static int |
2203 | varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup) |
2204 | { |
2205 | VarString *arg1 = DatumGetVarStringPP(x); |
2206 | VarString *arg2 = DatumGetVarStringPP(y); |
2207 | char *a1p, |
2208 | *a2p; |
2209 | int len1, |
2210 | len2, |
2211 | result; |
2212 | |
2213 | a1p = VARDATA_ANY(arg1); |
2214 | a2p = VARDATA_ANY(arg2); |
2215 | |
2216 | len1 = VARSIZE_ANY_EXHDR(arg1); |
2217 | len2 = VARSIZE_ANY_EXHDR(arg2); |
2218 | |
2219 | result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup); |
2220 | |
2221 | /* We can't afford to leak memory here. */ |
2222 | if (PointerGetDatum(arg1) != x) |
2223 | pfree(arg1); |
2224 | if (PointerGetDatum(arg2) != y) |
2225 | pfree(arg2); |
2226 | |
2227 | return result; |
2228 | } |
2229 | |
2230 | /* |
2231 | * sortsupport comparison func (for locale case with NAME type) |
2232 | */ |
2233 | static int |
2234 | namefastcmp_locale(Datum x, Datum y, SortSupport ssup) |
2235 | { |
2236 | Name arg1 = DatumGetName(x); |
2237 | Name arg2 = DatumGetName(y); |
2238 | |
2239 | return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)), |
2240 | NameStr(*arg2), strlen(NameStr(*arg2)), |
2241 | ssup); |
2242 | } |
2243 | |
2244 | /* |
2245 | * sortsupport comparison func for locale cases |
2246 | */ |
2247 | static int |
2248 | varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) |
2249 | { |
2250 | VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; |
2251 | int result; |
2252 | bool arg1_match; |
2253 | |
2254 | /* Fast pre-check for equality, as discussed in varstr_cmp() */ |
2255 | if (len1 == len2 && memcmp(a1p, a2p, len1) == 0) |
2256 | { |
2257 | /* |
2258 | * No change in buf1 or buf2 contents, so avoid changing last_len1 or |
2259 | * last_len2. Existing contents of buffers might still be used by |
2260 | * next call. |
2261 | * |
2262 | * It's fine to allow the comparison of BpChar padding bytes here, |
2263 | * even though that implies that the memcmp() will usually be |
2264 | * performed for BpChar callers (though multibyte characters could |
2265 | * still prevent that from occurring). The memcmp() is still very |
2266 | * cheap, and BpChar's funny semantics have us remove trailing spaces |
2267 | * (not limited to padding), so we need make no distinction between |
2268 | * padding space characters and "real" space characters. |
2269 | */ |
2270 | return 0; |
2271 | } |
2272 | |
2273 | if (sss->typid == BPCHAROID) |
2274 | { |
2275 | /* Get true number of bytes, ignoring trailing spaces */ |
2276 | len1 = bpchartruelen(a1p, len1); |
2277 | len2 = bpchartruelen(a2p, len2); |
2278 | } |
2279 | |
2280 | if (len1 >= sss->buflen1) |
2281 | { |
2282 | pfree(sss->buf1); |
2283 | sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize)); |
2284 | sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1); |
2285 | } |
2286 | if (len2 >= sss->buflen2) |
2287 | { |
2288 | pfree(sss->buf2); |
2289 | sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize)); |
2290 | sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2); |
2291 | } |
2292 | |
2293 | /* |
2294 | * We're likely to be asked to compare the same strings repeatedly, and |
2295 | * memcmp() is so much cheaper than strcoll() that it pays to try to cache |
2296 | * comparisons, even though in general there is no reason to think that |
2297 | * that will work out (every string datum may be unique). Caching does |
2298 | * not slow things down measurably when it doesn't work out, and can speed |
2299 | * things up by rather a lot when it does. In part, this is because the |
2300 | * memcmp() compares data from cachelines that are needed in L1 cache even |
2301 | * when the last comparison's result cannot be reused. |
2302 | */ |
2303 | arg1_match = true; |
2304 | if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0) |
2305 | { |
2306 | arg1_match = false; |
2307 | memcpy(sss->buf1, a1p, len1); |
2308 | sss->buf1[len1] = '\0'; |
2309 | sss->last_len1 = len1; |
2310 | } |
2311 | |
2312 | /* |
2313 | * If we're comparing the same two strings as last time, we can return the |
2314 | * same answer without calling strcoll() again. This is more likely than |
2315 | * it seems (at least with moderate to low cardinality sets), because |
2316 | * quicksort compares the same pivot against many values. |
2317 | */ |
2318 | if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0) |
2319 | { |
2320 | memcpy(sss->buf2, a2p, len2); |
2321 | sss->buf2[len2] = '\0'; |
2322 | sss->last_len2 = len2; |
2323 | } |
2324 | else if (arg1_match && !sss->cache_blob) |
2325 | { |
2326 | /* Use result cached following last actual strcoll() call */ |
2327 | return sss->last_returned; |
2328 | } |
2329 | |
2330 | if (sss->locale) |
2331 | { |
2332 | if (sss->locale->provider == COLLPROVIDER_ICU) |
2333 | { |
2334 | #ifdef USE_ICU |
2335 | #ifdef HAVE_UCOL_STRCOLLUTF8 |
2336 | if (GetDatabaseEncoding() == PG_UTF8) |
2337 | { |
2338 | UErrorCode status; |
2339 | |
2340 | status = U_ZERO_ERROR; |
2341 | result = ucol_strcollUTF8(sss->locale->info.icu.ucol, |
2342 | a1p, len1, |
2343 | a2p, len2, |
2344 | &status); |
2345 | if (U_FAILURE(status)) |
2346 | ereport(ERROR, |
2347 | (errmsg("collation failed: %s" , u_errorName(status)))); |
2348 | } |
2349 | else |
2350 | #endif |
2351 | { |
2352 | int32_t ulen1, |
2353 | ulen2; |
2354 | UChar *uchar1, |
2355 | *uchar2; |
2356 | |
2357 | ulen1 = icu_to_uchar(&uchar1, a1p, len1); |
2358 | ulen2 = icu_to_uchar(&uchar2, a2p, len2); |
2359 | |
2360 | result = ucol_strcoll(sss->locale->info.icu.ucol, |
2361 | uchar1, ulen1, |
2362 | uchar2, ulen2); |
2363 | |
2364 | pfree(uchar1); |
2365 | pfree(uchar2); |
2366 | } |
2367 | #else /* not USE_ICU */ |
2368 | /* shouldn't happen */ |
2369 | elog(ERROR, "unsupported collprovider: %c" , sss->locale->provider); |
2370 | #endif /* not USE_ICU */ |
2371 | } |
2372 | else |
2373 | { |
2374 | #ifdef HAVE_LOCALE_T |
2375 | result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); |
2376 | #else |
2377 | /* shouldn't happen */ |
2378 | elog(ERROR, "unsupported collprovider: %c" , sss->locale->provider); |
2379 | #endif |
2380 | } |
2381 | } |
2382 | else |
2383 | result = strcoll(sss->buf1, sss->buf2); |
2384 | |
2385 | /* Break tie if necessary. */ |
2386 | if (result == 0 && |
2387 | (!sss->locale || sss->locale->deterministic)) |
2388 | result = strcmp(sss->buf1, sss->buf2); |
2389 | |
2390 | /* Cache result, perhaps saving an expensive strcoll() call next time */ |
2391 | sss->cache_blob = false; |
2392 | sss->last_returned = result; |
2393 | return result; |
2394 | } |
2395 | |
2396 | /* |
2397 | * Abbreviated key comparison func |
2398 | */ |
2399 | static int |
2400 | varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup) |
2401 | { |
2402 | /* |
2403 | * When 0 is returned, the core system will call varstrfastcmp_c() |
2404 | * (bpcharfastcmp_c() in BpChar case) or varlenafastcmp_locale(). Even a |
2405 | * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality* |
2406 | * authoritatively, for the same reason that there is a strcoll() |
2407 | * tie-breaker call to strcmp() in varstr_cmp(). |
2408 | */ |
2409 | if (x > y) |
2410 | return 1; |
2411 | else if (x == y) |
2412 | return 0; |
2413 | else |
2414 | return -1; |
2415 | } |
2416 | |
2417 | /* |
2418 | * Conversion routine for sortsupport. Converts original to abbreviated key |
2419 | * representation. Our encoding strategy is simple -- pack the first 8 bytes |
2420 | * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are |
2421 | * stored in reverse order), and treat it as an unsigned integer. When the "C" |
2422 | * locale is used, or in case of bytea, just memcpy() from original instead. |
2423 | */ |
2424 | static Datum |
2425 | varstr_abbrev_convert(Datum original, SortSupport ssup) |
2426 | { |
2427 | VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; |
2428 | VarString *authoritative = DatumGetVarStringPP(original); |
2429 | char *authoritative_data = VARDATA_ANY(authoritative); |
2430 | |
2431 | /* working state */ |
2432 | Datum res; |
2433 | char *pres; |
2434 | int len; |
2435 | uint32 hash; |
2436 | |
2437 | pres = (char *) &res; |
2438 | /* memset(), so any non-overwritten bytes are NUL */ |
2439 | memset(pres, 0, sizeof(Datum)); |
2440 | len = VARSIZE_ANY_EXHDR(authoritative); |
2441 | |
2442 | /* Get number of bytes, ignoring trailing spaces */ |
2443 | if (sss->typid == BPCHAROID) |
2444 | len = bpchartruelen(authoritative_data, len); |
2445 | |
2446 | /* |
2447 | * If we're using the C collation, use memcpy(), rather than strxfrm(), to |
2448 | * abbreviate keys. The full comparator for the C locale is always |
2449 | * memcmp(). It would be incorrect to allow bytea callers (callers that |
2450 | * always force the C collation -- bytea isn't a collatable type, but this |
2451 | * approach is convenient) to use strxfrm(). This is because bytea |
2452 | * strings may contain NUL bytes. Besides, this should be faster, too. |
2453 | * |
2454 | * More generally, it's okay that bytea callers can have NUL bytes in |
2455 | * strings because varstrcmp_abbrev() need not make a distinction between |
2456 | * terminating NUL bytes, and NUL bytes representing actual NULs in the |
2457 | * authoritative representation. Hopefully a comparison at or past one |
2458 | * abbreviated key's terminating NUL byte will resolve the comparison |
2459 | * without consulting the authoritative representation; specifically, some |
2460 | * later non-NUL byte in the longer string can resolve the comparison |
2461 | * against a subsequent terminating NUL in the shorter string. There will |
2462 | * usually be what is effectively a "length-wise" resolution there and |
2463 | * then. |
2464 | * |
2465 | * If that doesn't work out -- if all bytes in the longer string |
2466 | * positioned at or past the offset of the smaller string's (first) |
2467 | * terminating NUL are actually representative of NUL bytes in the |
2468 | * authoritative binary string (perhaps with some *terminating* NUL bytes |
2469 | * towards the end of the longer string iff it happens to still be small) |
2470 | * -- then an authoritative tie-breaker will happen, and do the right |
2471 | * thing: explicitly consider string length. |
2472 | */ |
2473 | if (sss->collate_c) |
2474 | memcpy(pres, authoritative_data, Min(len, sizeof(Datum))); |
2475 | else |
2476 | { |
2477 | Size bsize; |
2478 | #ifdef USE_ICU |
2479 | int32_t ulen = -1; |
2480 | UChar *uchar = NULL; |
2481 | #endif |
2482 | |
2483 | /* |
2484 | * We're not using the C collation, so fall back on strxfrm or ICU |
2485 | * analogs. |
2486 | */ |
2487 | |
2488 | /* By convention, we use buffer 1 to store and NUL-terminate */ |
2489 | if (len >= sss->buflen1) |
2490 | { |
2491 | pfree(sss->buf1); |
2492 | sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize)); |
2493 | sss->buf1 = palloc(sss->buflen1); |
2494 | } |
2495 | |
2496 | /* Might be able to reuse strxfrm() blob from last call */ |
2497 | if (sss->last_len1 == len && sss->cache_blob && |
2498 | memcmp(sss->buf1, authoritative_data, len) == 0) |
2499 | { |
2500 | memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2)); |
2501 | /* No change affecting cardinality, so no hashing required */ |
2502 | goto done; |
2503 | } |
2504 | |
2505 | memcpy(sss->buf1, authoritative_data, len); |
2506 | |
2507 | /* |
2508 | * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not |
2509 | * necessary for ICU, but doesn't hurt. |
2510 | */ |
2511 | sss->buf1[len] = '\0'; |
2512 | sss->last_len1 = len; |
2513 | |
2514 | #ifdef USE_ICU |
2515 | /* When using ICU and not UTF8, convert string to UChar. */ |
2516 | if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU && |
2517 | GetDatabaseEncoding() != PG_UTF8) |
2518 | ulen = icu_to_uchar(&uchar, sss->buf1, len); |
2519 | #endif |
2520 | |
2521 | /* |
2522 | * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer, |
2523 | * and try again. Both of these functions have the result buffer |
2524 | * content undefined if the result did not fit, so we need to retry |
2525 | * until everything fits, even though we only need the first few bytes |
2526 | * in the end. When using ucol_nextSortKeyPart(), however, we only |
2527 | * ask for as many bytes as we actually need. |
2528 | */ |
2529 | for (;;) |
2530 | { |
2531 | #ifdef USE_ICU |
2532 | if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU) |
2533 | { |
2534 | /* |
2535 | * When using UTF8, use the iteration interface so we only |
2536 | * need to produce as many bytes as we actually need. |
2537 | */ |
2538 | if (GetDatabaseEncoding() == PG_UTF8) |
2539 | { |
2540 | UCharIterator iter; |
2541 | uint32_t state[2]; |
2542 | UErrorCode status; |
2543 | |
2544 | uiter_setUTF8(&iter, sss->buf1, len); |
2545 | state[0] = state[1] = 0; /* won't need that again */ |
2546 | status = U_ZERO_ERROR; |
2547 | bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol, |
2548 | &iter, |
2549 | state, |
2550 | (uint8_t *) sss->buf2, |
2551 | Min(sizeof(Datum), sss->buflen2), |
2552 | &status); |
2553 | if (U_FAILURE(status)) |
2554 | ereport(ERROR, |
2555 | (errmsg("sort key generation failed: %s" , |
2556 | u_errorName(status)))); |
2557 | } |
2558 | else |
2559 | bsize = ucol_getSortKey(sss->locale->info.icu.ucol, |
2560 | uchar, ulen, |
2561 | (uint8_t *) sss->buf2, sss->buflen2); |
2562 | } |
2563 | else |
2564 | #endif |
2565 | #ifdef HAVE_LOCALE_T |
2566 | if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC) |
2567 | bsize = strxfrm_l(sss->buf2, sss->buf1, |
2568 | sss->buflen2, sss->locale->info.lt); |
2569 | else |
2570 | #endif |
2571 | bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); |
2572 | |
2573 | sss->last_len2 = bsize; |
2574 | if (bsize < sss->buflen2) |
2575 | break; |
2576 | |
2577 | /* |
2578 | * Grow buffer and retry. |
2579 | */ |
2580 | pfree(sss->buf2); |
2581 | sss->buflen2 = Max(bsize + 1, |
2582 | Min(sss->buflen2 * 2, MaxAllocSize)); |
2583 | sss->buf2 = palloc(sss->buflen2); |
2584 | } |
2585 | |
2586 | /* |
2587 | * Every Datum byte is always compared. This is safe because the |
2588 | * strxfrm() blob is itself NUL terminated, leaving no danger of |
2589 | * misinterpreting any NUL bytes not intended to be interpreted as |
2590 | * logically representing termination. |
2591 | * |
2592 | * (Actually, even if there were NUL bytes in the blob it would be |
2593 | * okay. See remarks on bytea case above.) |
2594 | */ |
2595 | memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize)); |
2596 | |
2597 | #ifdef USE_ICU |
2598 | if (uchar) |
2599 | pfree(uchar); |
2600 | #endif |
2601 | } |
2602 | |
2603 | /* |
2604 | * Maintain approximate cardinality of both abbreviated keys and original, |
2605 | * authoritative keys using HyperLogLog. Used as cheap insurance against |
2606 | * the worst case, where we do many string transformations for no saving |
2607 | * in full strcoll()-based comparisons. These statistics are used by |
2608 | * varstr_abbrev_abort(). |
2609 | * |
2610 | * First, Hash key proper, or a significant fraction of it. Mix in length |
2611 | * in order to compensate for cases where differences are past |
2612 | * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. |
2613 | */ |
2614 | hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data, |
2615 | Min(len, PG_CACHE_LINE_SIZE))); |
2616 | |
2617 | if (len > PG_CACHE_LINE_SIZE) |
2618 | hash ^= DatumGetUInt32(hash_uint32((uint32) len)); |
2619 | |
2620 | addHyperLogLog(&sss->full_card, hash); |
2621 | |
2622 | /* Hash abbreviated key */ |
2623 | #if SIZEOF_DATUM == 8 |
2624 | { |
2625 | uint32 lohalf, |
2626 | hihalf; |
2627 | |
2628 | lohalf = (uint32) res; |
2629 | hihalf = (uint32) (res >> 32); |
2630 | hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf)); |
2631 | } |
2632 | #else /* SIZEOF_DATUM != 8 */ |
2633 | hash = DatumGetUInt32(hash_uint32((uint32) res)); |
2634 | #endif |
2635 | |
2636 | addHyperLogLog(&sss->abbr_card, hash); |
2637 | |
2638 | /* Cache result, perhaps saving an expensive strxfrm() call next time */ |
2639 | sss->cache_blob = true; |
2640 | done: |
2641 | |
2642 | /* |
2643 | * Byteswap on little-endian machines. |
2644 | * |
2645 | * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way |
2646 | * comparator) works correctly on all platforms. If we didn't do this, |
2647 | * the comparator would have to call memcmp() with a pair of pointers to |
2648 | * the first byte of each abbreviated key, which is slower. |
2649 | */ |
2650 | res = DatumBigEndianToNative(res); |
2651 | |
2652 | /* Don't leak memory here */ |
2653 | if (PointerGetDatum(authoritative) != original) |
2654 | pfree(authoritative); |
2655 | |
2656 | return res; |
2657 | } |
2658 | |
2659 | /* |
2660 | * Callback for estimating effectiveness of abbreviated key optimization, using |
2661 | * heuristic rules. Returns value indicating if the abbreviation optimization |
2662 | * should be aborted, based on its projected effectiveness. |
2663 | */ |
2664 | static bool |
2665 | varstr_abbrev_abort(int memtupcount, SortSupport ssup) |
2666 | { |
2667 | VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; |
2668 | double abbrev_distinct, |
2669 | key_distinct; |
2670 | |
2671 | Assert(ssup->abbreviate); |
2672 | |
2673 | /* Have a little patience */ |
2674 | if (memtupcount < 100) |
2675 | return false; |
2676 | |
2677 | abbrev_distinct = estimateHyperLogLog(&sss->abbr_card); |
2678 | key_distinct = estimateHyperLogLog(&sss->full_card); |
2679 | |
2680 | /* |
2681 | * Clamp cardinality estimates to at least one distinct value. While |
2682 | * NULLs are generally disregarded, if only NULL values were seen so far, |
2683 | * that might misrepresent costs if we failed to clamp. |
2684 | */ |
2685 | if (abbrev_distinct <= 1.0) |
2686 | abbrev_distinct = 1.0; |
2687 | |
2688 | if (key_distinct <= 1.0) |
2689 | key_distinct = 1.0; |
2690 | |
2691 | /* |
2692 | * In the worst case all abbreviated keys are identical, while at the same |
2693 | * time there are differences within full key strings not captured in |
2694 | * abbreviations. |
2695 | */ |
2696 | #ifdef TRACE_SORT |
2697 | if (trace_sort) |
2698 | { |
2699 | double norm_abbrev_card = abbrev_distinct / (double) memtupcount; |
2700 | |
2701 | elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f " |
2702 | "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)" , |
2703 | memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card, |
2704 | sss->prop_card); |
2705 | } |
2706 | #endif |
2707 | |
2708 | /* |
2709 | * If the number of distinct abbreviated keys approximately matches the |
2710 | * number of distinct authoritative original keys, that's reason enough to |
2711 | * proceed. We can win even with a very low cardinality set if most |
2712 | * tie-breakers only memcmp(). This is by far the most important |
2713 | * consideration. |
2714 | * |
2715 | * While comparisons that are resolved at the abbreviated key level are |
2716 | * considerably cheaper than tie-breakers resolved with memcmp(), both of |
2717 | * those two outcomes are so much cheaper than a full strcoll() once |
2718 | * sorting is underway that it doesn't seem worth it to weigh abbreviated |
2719 | * cardinality against the overall size of the set in order to more |
2720 | * accurately model costs. Assume that an abbreviated comparison, and an |
2721 | * abbreviated comparison with a cheap memcmp()-based authoritative |
2722 | * resolution are equivalent. |
2723 | */ |
2724 | if (abbrev_distinct > key_distinct * sss->prop_card) |
2725 | { |
2726 | /* |
2727 | * When we have exceeded 10,000 tuples, decay required cardinality |
2728 | * aggressively for next call. |
2729 | * |
2730 | * This is useful because the number of comparisons required on |
2731 | * average increases at a linearithmic rate, and at roughly 10,000 |
2732 | * tuples that factor will start to dominate over the linear costs of |
2733 | * string transformation (this is a conservative estimate). The decay |
2734 | * rate is chosen to be a little less aggressive than halving -- which |
2735 | * (since we're called at points at which memtupcount has doubled) |
2736 | * would never see the cost model actually abort past the first call |
2737 | * following a decay. This decay rate is mostly a precaution against |
2738 | * a sudden, violent swing in how well abbreviated cardinality tracks |
2739 | * full key cardinality. The decay also serves to prevent a marginal |
2740 | * case from being aborted too late, when too much has already been |
2741 | * invested in string transformation. |
2742 | * |
2743 | * It's possible for sets of several million distinct strings with |
2744 | * mere tens of thousands of distinct abbreviated keys to still |
2745 | * benefit very significantly. This will generally occur provided |
2746 | * each abbreviated key is a proxy for a roughly uniform number of the |
2747 | * set's full keys. If it isn't so, we hope to catch that early and |
2748 | * abort. If it isn't caught early, by the time the problem is |
2749 | * apparent it's probably not worth aborting. |
2750 | */ |
2751 | if (memtupcount > 10000) |
2752 | sss->prop_card *= 0.65; |
2753 | |
2754 | return false; |
2755 | } |
2756 | |
2757 | /* |
2758 | * Abort abbreviation strategy. |
2759 | * |
2760 | * The worst case, where all abbreviated keys are identical while all |
2761 | * original strings differ will typically only see a regression of about |
2762 | * 10% in execution time for small to medium sized lists of strings. |
2763 | * Whereas on modern CPUs where cache stalls are the dominant cost, we can |
2764 | * often expect very large improvements, particularly with sets of strings |
2765 | * of moderately high to high abbreviated cardinality. There is little to |
2766 | * lose but much to gain, which our strategy reflects. |
2767 | */ |
2768 | #ifdef TRACE_SORT |
2769 | if (trace_sort) |
2770 | elog(LOG, "varstr_abbrev: aborted abbreviation at %d " |
2771 | "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)" , |
2772 | memtupcount, abbrev_distinct, key_distinct, sss->prop_card); |
2773 | #endif |
2774 | |
2775 | return true; |
2776 | } |
2777 | |
2778 | Datum |
2779 | text_larger(PG_FUNCTION_ARGS) |
2780 | { |
2781 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2782 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2783 | text *result; |
2784 | |
2785 | result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2); |
2786 | |
2787 | PG_RETURN_TEXT_P(result); |
2788 | } |
2789 | |
2790 | Datum |
2791 | text_smaller(PG_FUNCTION_ARGS) |
2792 | { |
2793 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2794 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2795 | text *result; |
2796 | |
2797 | result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2); |
2798 | |
2799 | PG_RETURN_TEXT_P(result); |
2800 | } |
2801 | |
2802 | |
2803 | /* |
2804 | * Cross-type comparison functions for types text and name. |
2805 | */ |
2806 | |
2807 | Datum |
2808 | nameeqtext(PG_FUNCTION_ARGS) |
2809 | { |
2810 | Name arg1 = PG_GETARG_NAME(0); |
2811 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2812 | size_t len1 = strlen(NameStr(*arg1)); |
2813 | size_t len2 = VARSIZE_ANY_EXHDR(arg2); |
2814 | Oid collid = PG_GET_COLLATION(); |
2815 | bool result; |
2816 | |
2817 | check_collation_set(collid); |
2818 | |
2819 | if (collid == C_COLLATION_OID) |
2820 | result = (len1 == len2 && |
2821 | memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0); |
2822 | else |
2823 | result = (varstr_cmp(NameStr(*arg1), len1, |
2824 | VARDATA_ANY(arg2), len2, |
2825 | collid) == 0); |
2826 | |
2827 | PG_FREE_IF_COPY(arg2, 1); |
2828 | |
2829 | PG_RETURN_BOOL(result); |
2830 | } |
2831 | |
2832 | Datum |
2833 | texteqname(PG_FUNCTION_ARGS) |
2834 | { |
2835 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2836 | Name arg2 = PG_GETARG_NAME(1); |
2837 | size_t len1 = VARSIZE_ANY_EXHDR(arg1); |
2838 | size_t len2 = strlen(NameStr(*arg2)); |
2839 | Oid collid = PG_GET_COLLATION(); |
2840 | bool result; |
2841 | |
2842 | check_collation_set(collid); |
2843 | |
2844 | if (collid == C_COLLATION_OID) |
2845 | result = (len1 == len2 && |
2846 | memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0); |
2847 | else |
2848 | result = (varstr_cmp(VARDATA_ANY(arg1), len1, |
2849 | NameStr(*arg2), len2, |
2850 | collid) == 0); |
2851 | |
2852 | PG_FREE_IF_COPY(arg1, 0); |
2853 | |
2854 | PG_RETURN_BOOL(result); |
2855 | } |
2856 | |
2857 | Datum |
2858 | namenetext(PG_FUNCTION_ARGS) |
2859 | { |
2860 | Name arg1 = PG_GETARG_NAME(0); |
2861 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2862 | size_t len1 = strlen(NameStr(*arg1)); |
2863 | size_t len2 = VARSIZE_ANY_EXHDR(arg2); |
2864 | Oid collid = PG_GET_COLLATION(); |
2865 | bool result; |
2866 | |
2867 | check_collation_set(collid); |
2868 | |
2869 | if (collid == C_COLLATION_OID) |
2870 | result = !(len1 == len2 && |
2871 | memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0); |
2872 | else |
2873 | result = !(varstr_cmp(NameStr(*arg1), len1, |
2874 | VARDATA_ANY(arg2), len2, |
2875 | collid) == 0); |
2876 | |
2877 | PG_FREE_IF_COPY(arg2, 1); |
2878 | |
2879 | PG_RETURN_BOOL(result); |
2880 | } |
2881 | |
2882 | Datum |
2883 | textnename(PG_FUNCTION_ARGS) |
2884 | { |
2885 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2886 | Name arg2 = PG_GETARG_NAME(1); |
2887 | size_t len1 = VARSIZE_ANY_EXHDR(arg1); |
2888 | size_t len2 = strlen(NameStr(*arg2)); |
2889 | Oid collid = PG_GET_COLLATION(); |
2890 | bool result; |
2891 | |
2892 | check_collation_set(collid); |
2893 | |
2894 | if (collid == C_COLLATION_OID) |
2895 | result = !(len1 == len2 && |
2896 | memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0); |
2897 | else |
2898 | result = !(varstr_cmp(VARDATA_ANY(arg1), len1, |
2899 | NameStr(*arg2), len2, |
2900 | collid) == 0); |
2901 | |
2902 | PG_FREE_IF_COPY(arg1, 0); |
2903 | |
2904 | PG_RETURN_BOOL(result); |
2905 | } |
2906 | |
2907 | Datum |
2908 | btnametextcmp(PG_FUNCTION_ARGS) |
2909 | { |
2910 | Name arg1 = PG_GETARG_NAME(0); |
2911 | text *arg2 = PG_GETARG_TEXT_PP(1); |
2912 | int32 result; |
2913 | |
2914 | result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)), |
2915 | VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2), |
2916 | PG_GET_COLLATION()); |
2917 | |
2918 | PG_FREE_IF_COPY(arg2, 1); |
2919 | |
2920 | PG_RETURN_INT32(result); |
2921 | } |
2922 | |
2923 | Datum |
2924 | bttextnamecmp(PG_FUNCTION_ARGS) |
2925 | { |
2926 | text *arg1 = PG_GETARG_TEXT_PP(0); |
2927 | Name arg2 = PG_GETARG_NAME(1); |
2928 | int32 result; |
2929 | |
2930 | result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1), |
2931 | NameStr(*arg2), strlen(NameStr(*arg2)), |
2932 | PG_GET_COLLATION()); |
2933 | |
2934 | PG_FREE_IF_COPY(arg1, 0); |
2935 | |
2936 | PG_RETURN_INT32(result); |
2937 | } |
2938 | |
2939 | #define CmpCall(cmpfunc) \ |
2940 | DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \ |
2941 | PG_GET_COLLATION(), \ |
2942 | PG_GETARG_DATUM(0), \ |
2943 | PG_GETARG_DATUM(1))) |
2944 | |
2945 | Datum |
2946 | namelttext(PG_FUNCTION_ARGS) |
2947 | { |
2948 | PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0); |
2949 | } |
2950 | |
2951 | Datum |
2952 | nameletext(PG_FUNCTION_ARGS) |
2953 | { |
2954 | PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0); |
2955 | } |
2956 | |
2957 | Datum |
2958 | namegttext(PG_FUNCTION_ARGS) |
2959 | { |
2960 | PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0); |
2961 | } |
2962 | |
2963 | Datum |
2964 | namegetext(PG_FUNCTION_ARGS) |
2965 | { |
2966 | PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0); |
2967 | } |
2968 | |
2969 | Datum |
2970 | textltname(PG_FUNCTION_ARGS) |
2971 | { |
2972 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0); |
2973 | } |
2974 | |
2975 | Datum |
2976 | textlename(PG_FUNCTION_ARGS) |
2977 | { |
2978 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0); |
2979 | } |
2980 | |
2981 | Datum |
2982 | textgtname(PG_FUNCTION_ARGS) |
2983 | { |
2984 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0); |
2985 | } |
2986 | |
2987 | Datum |
2988 | textgename(PG_FUNCTION_ARGS) |
2989 | { |
2990 | PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0); |
2991 | } |
2992 | |
2993 | #undef CmpCall |
2994 | |
2995 | |
2996 | /* |
2997 | * The following operators support character-by-character comparison |
2998 | * of text datums, to allow building indexes suitable for LIKE clauses. |
2999 | * Note that the regular texteq/textne comparison operators, and regular |
3000 | * support functions 1 and 2 with "C" collation are assumed to be |
3001 | * compatible with these! |
3002 | */ |
3003 | |
3004 | static int |
3005 | internal_text_pattern_compare(text *arg1, text *arg2) |
3006 | { |
3007 | int result; |
3008 | int len1, |
3009 | len2; |
3010 | |
3011 | len1 = VARSIZE_ANY_EXHDR(arg1); |
3012 | len2 = VARSIZE_ANY_EXHDR(arg2); |
3013 | |
3014 | result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
3015 | if (result != 0) |
3016 | return result; |
3017 | else if (len1 < len2) |
3018 | return -1; |
3019 | else if (len1 > len2) |
3020 | return 1; |
3021 | else |
3022 | return 0; |
3023 | } |
3024 | |
3025 | |
3026 | Datum |
3027 | text_pattern_lt(PG_FUNCTION_ARGS) |
3028 | { |
3029 | text *arg1 = PG_GETARG_TEXT_PP(0); |
3030 | text *arg2 = PG_GETARG_TEXT_PP(1); |
3031 | int result; |
3032 | |
3033 | result = internal_text_pattern_compare(arg1, arg2); |
3034 | |
3035 | PG_FREE_IF_COPY(arg1, 0); |
3036 | PG_FREE_IF_COPY(arg2, 1); |
3037 | |
3038 | PG_RETURN_BOOL(result < 0); |
3039 | } |
3040 | |
3041 | |
3042 | Datum |
3043 | text_pattern_le(PG_FUNCTION_ARGS) |
3044 | { |
3045 | text *arg1 = PG_GETARG_TEXT_PP(0); |
3046 | text *arg2 = PG_GETARG_TEXT_PP(1); |
3047 | int result; |
3048 | |
3049 | result = internal_text_pattern_compare(arg1, arg2); |
3050 | |
3051 | PG_FREE_IF_COPY(arg1, 0); |
3052 | PG_FREE_IF_COPY(arg2, 1); |
3053 | |
3054 | PG_RETURN_BOOL(result <= 0); |
3055 | } |
3056 | |
3057 | |
3058 | Datum |
3059 | text_pattern_ge(PG_FUNCTION_ARGS) |
3060 | { |
3061 | text *arg1 = PG_GETARG_TEXT_PP(0); |
3062 | text *arg2 = PG_GETARG_TEXT_PP(1); |
3063 | int result; |
3064 | |
3065 | result = internal_text_pattern_compare(arg1, arg2); |
3066 | |
3067 | PG_FREE_IF_COPY(arg1, 0); |
3068 | PG_FREE_IF_COPY(arg2, 1); |
3069 | |
3070 | PG_RETURN_BOOL(result >= 0); |
3071 | } |
3072 | |
3073 | |
3074 | Datum |
3075 | text_pattern_gt(PG_FUNCTION_ARGS) |
3076 | { |
3077 | text *arg1 = PG_GETARG_TEXT_PP(0); |
3078 | text *arg2 = PG_GETARG_TEXT_PP(1); |
3079 | int result; |
3080 | |
3081 | result = internal_text_pattern_compare(arg1, arg2); |
3082 | |
3083 | PG_FREE_IF_COPY(arg1, 0); |
3084 | PG_FREE_IF_COPY(arg2, 1); |
3085 | |
3086 | PG_RETURN_BOOL(result > 0); |
3087 | } |
3088 | |
3089 | |
3090 | Datum |
3091 | bttext_pattern_cmp(PG_FUNCTION_ARGS) |
3092 | { |
3093 | text *arg1 = PG_GETARG_TEXT_PP(0); |
3094 | text *arg2 = PG_GETARG_TEXT_PP(1); |
3095 | int result; |
3096 | |
3097 | result = internal_text_pattern_compare(arg1, arg2); |
3098 | |
3099 | PG_FREE_IF_COPY(arg1, 0); |
3100 | PG_FREE_IF_COPY(arg2, 1); |
3101 | |
3102 | PG_RETURN_INT32(result); |
3103 | } |
3104 | |
3105 | |
3106 | Datum |
3107 | bttext_pattern_sortsupport(PG_FUNCTION_ARGS) |
3108 | { |
3109 | SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); |
3110 | MemoryContext oldcontext; |
3111 | |
3112 | oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); |
3113 | |
3114 | /* Use generic string SortSupport, forcing "C" collation */ |
3115 | varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID); |
3116 | |
3117 | MemoryContextSwitchTo(oldcontext); |
3118 | |
3119 | PG_RETURN_VOID(); |
3120 | } |
3121 | |
3122 | |
3123 | /*------------------------------------------------------------- |
3124 | * byteaoctetlen |
3125 | * |
3126 | * get the number of bytes contained in an instance of type 'bytea' |
3127 | *------------------------------------------------------------- |
3128 | */ |
3129 | Datum |
3130 | byteaoctetlen(PG_FUNCTION_ARGS) |
3131 | { |
3132 | Datum str = PG_GETARG_DATUM(0); |
3133 | |
3134 | /* We need not detoast the input at all */ |
3135 | PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); |
3136 | } |
3137 | |
3138 | /* |
3139 | * byteacat - |
3140 | * takes two bytea* and returns a bytea* that is the concatenation of |
3141 | * the two. |
3142 | * |
3143 | * Cloned from textcat and modified as required. |
3144 | */ |
3145 | Datum |
3146 | byteacat(PG_FUNCTION_ARGS) |
3147 | { |
3148 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3149 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3150 | |
3151 | PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); |
3152 | } |
3153 | |
3154 | /* |
3155 | * bytea_catenate |
3156 | * Guts of byteacat(), broken out so it can be used by other functions |
3157 | * |
3158 | * Arguments can be in short-header form, but not compressed or out-of-line |
3159 | */ |
3160 | static bytea * |
3161 | bytea_catenate(bytea *t1, bytea *t2) |
3162 | { |
3163 | bytea *result; |
3164 | int len1, |
3165 | len2, |
3166 | len; |
3167 | char *ptr; |
3168 | |
3169 | len1 = VARSIZE_ANY_EXHDR(t1); |
3170 | len2 = VARSIZE_ANY_EXHDR(t2); |
3171 | |
3172 | /* paranoia ... probably should throw error instead? */ |
3173 | if (len1 < 0) |
3174 | len1 = 0; |
3175 | if (len2 < 0) |
3176 | len2 = 0; |
3177 | |
3178 | len = len1 + len2 + VARHDRSZ; |
3179 | result = (bytea *) palloc(len); |
3180 | |
3181 | /* Set size of result string... */ |
3182 | SET_VARSIZE(result, len); |
3183 | |
3184 | /* Fill data field of result string... */ |
3185 | ptr = VARDATA(result); |
3186 | if (len1 > 0) |
3187 | memcpy(ptr, VARDATA_ANY(t1), len1); |
3188 | if (len2 > 0) |
3189 | memcpy(ptr + len1, VARDATA_ANY(t2), len2); |
3190 | |
3191 | return result; |
3192 | } |
3193 | |
/*
 * PG_STR_GET_BYTEA
 *		Build a bytea from the C string str_ by passing it through byteain
 *		via DirectFunctionCall1.
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
3196 | |
3197 | /* |
3198 | * bytea_substr() |
3199 | * Return a substring starting at the specified position. |
3200 | * Cloned from text_substr and modified as required. |
3201 | * |
3202 | * Input: |
3203 | * - string |
3204 | * - starting position (is one-based) |
3205 | * - string length (optional) |
3206 | * |
3207 | * If the starting position is zero or less, then return from the start of the string |
3208 | * adjusting the length to be consistent with the "negative start" per SQL. |
3209 | * If the length is less than zero, an ERROR is thrown. If no third argument |
3210 | * (length) is provided, the length to the end of the string is assumed. |
3211 | */ |
3212 | Datum |
3213 | bytea_substr(PG_FUNCTION_ARGS) |
3214 | { |
3215 | PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), |
3216 | PG_GETARG_INT32(1), |
3217 | PG_GETARG_INT32(2), |
3218 | false)); |
3219 | } |
3220 | |
3221 | /* |
3222 | * bytea_substr_no_len - |
3223 | * Wrapper to avoid opr_sanity failure due to |
3224 | * one function accepting a different number of args. |
3225 | */ |
3226 | Datum |
3227 | bytea_substr_no_len(PG_FUNCTION_ARGS) |
3228 | { |
3229 | PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), |
3230 | PG_GETARG_INT32(1), |
3231 | -1, |
3232 | true)); |
3233 | } |
3234 | |
3235 | static bytea * |
3236 | bytea_substring(Datum str, |
3237 | int S, |
3238 | int L, |
3239 | bool length_not_specified) |
3240 | { |
3241 | int S1; /* adjusted start position */ |
3242 | int L1; /* adjusted substring length */ |
3243 | |
3244 | S1 = Max(S, 1); |
3245 | |
3246 | if (length_not_specified) |
3247 | { |
3248 | /* |
3249 | * Not passed a length - DatumGetByteaPSlice() grabs everything to the |
3250 | * end of the string if we pass it a negative value for length. |
3251 | */ |
3252 | L1 = -1; |
3253 | } |
3254 | else |
3255 | { |
3256 | /* end position */ |
3257 | int E = S + L; |
3258 | |
3259 | /* |
3260 | * A negative value for L is the only way for the end position to be |
3261 | * before the start. SQL99 says to throw an error. |
3262 | */ |
3263 | if (E < S) |
3264 | ereport(ERROR, |
3265 | (errcode(ERRCODE_SUBSTRING_ERROR), |
3266 | errmsg("negative substring length not allowed" ))); |
3267 | |
3268 | /* |
3269 | * A zero or negative value for the end position can happen if the |
3270 | * start was negative or one. SQL99 says to return a zero-length |
3271 | * string. |
3272 | */ |
3273 | if (E < 1) |
3274 | return PG_STR_GET_BYTEA("" ); |
3275 | |
3276 | L1 = E - S1; |
3277 | } |
3278 | |
3279 | /* |
3280 | * If the start position is past the end of the string, SQL99 says to |
3281 | * return a zero-length string -- DatumGetByteaPSlice() will do that for |
3282 | * us. Convert to zero-based starting position |
3283 | */ |
3284 | return DatumGetByteaPSlice(str, S1 - 1, L1); |
3285 | } |
3286 | |
3287 | /* |
3288 | * byteaoverlay |
3289 | * Replace specified substring of first string with second |
3290 | * |
3291 | * The SQL standard defines OVERLAY() in terms of substring and concatenation. |
3292 | * This code is a direct implementation of what the standard says. |
3293 | */ |
3294 | Datum |
3295 | byteaoverlay(PG_FUNCTION_ARGS) |
3296 | { |
3297 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3298 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3299 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
3300 | int sl = PG_GETARG_INT32(3); /* substring length */ |
3301 | |
3302 | PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); |
3303 | } |
3304 | |
3305 | Datum |
3306 | byteaoverlay_no_len(PG_FUNCTION_ARGS) |
3307 | { |
3308 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3309 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3310 | int sp = PG_GETARG_INT32(2); /* substring start position */ |
3311 | int sl; |
3312 | |
3313 | sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ |
3314 | PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); |
3315 | } |
3316 | |
3317 | static bytea * |
3318 | bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) |
3319 | { |
3320 | bytea *result; |
3321 | bytea *s1; |
3322 | bytea *s2; |
3323 | int sp_pl_sl; |
3324 | |
3325 | /* |
3326 | * Check for possible integer-overflow cases. For negative sp, throw a |
3327 | * "substring length" error because that's what should be expected |
3328 | * according to the spec's definition of OVERLAY(). |
3329 | */ |
3330 | if (sp <= 0) |
3331 | ereport(ERROR, |
3332 | (errcode(ERRCODE_SUBSTRING_ERROR), |
3333 | errmsg("negative substring length not allowed" ))); |
3334 | if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) |
3335 | ereport(ERROR, |
3336 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
3337 | errmsg("integer out of range" ))); |
3338 | |
3339 | s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); |
3340 | s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); |
3341 | result = bytea_catenate(s1, t2); |
3342 | result = bytea_catenate(result, s2); |
3343 | |
3344 | return result; |
3345 | } |
3346 | |
3347 | /* |
3348 | * byteapos - |
3349 | * Return the position of the specified substring. |
3350 | * Implements the SQL POSITION() function. |
3351 | * Cloned from textpos and modified as required. |
3352 | */ |
3353 | Datum |
3354 | byteapos(PG_FUNCTION_ARGS) |
3355 | { |
3356 | bytea *t1 = PG_GETARG_BYTEA_PP(0); |
3357 | bytea *t2 = PG_GETARG_BYTEA_PP(1); |
3358 | int pos; |
3359 | int px, |
3360 | p; |
3361 | int len1, |
3362 | len2; |
3363 | char *p1, |
3364 | *p2; |
3365 | |
3366 | len1 = VARSIZE_ANY_EXHDR(t1); |
3367 | len2 = VARSIZE_ANY_EXHDR(t2); |
3368 | |
3369 | if (len2 <= 0) |
3370 | PG_RETURN_INT32(1); /* result for empty pattern */ |
3371 | |
3372 | p1 = VARDATA_ANY(t1); |
3373 | p2 = VARDATA_ANY(t2); |
3374 | |
3375 | pos = 0; |
3376 | px = (len1 - len2); |
3377 | for (p = 0; p <= px; p++) |
3378 | { |
3379 | if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) |
3380 | { |
3381 | pos = p + 1; |
3382 | break; |
3383 | }; |
3384 | p1++; |
3385 | }; |
3386 | |
3387 | PG_RETURN_INT32(pos); |
3388 | } |
3389 | |
3390 | /*------------------------------------------------------------- |
3391 | * byteaGetByte |
3392 | * |
3393 | * this routine treats "bytea" as an array of bytes. |
3394 | * It returns the Nth byte (a number between 0 and 255). |
3395 | *------------------------------------------------------------- |
3396 | */ |
3397 | Datum |
3398 | byteaGetByte(PG_FUNCTION_ARGS) |
3399 | { |
3400 | bytea *v = PG_GETARG_BYTEA_PP(0); |
3401 | int32 n = PG_GETARG_INT32(1); |
3402 | int len; |
3403 | int byte; |
3404 | |
3405 | len = VARSIZE_ANY_EXHDR(v); |
3406 | |
3407 | if (n < 0 || n >= len) |
3408 | ereport(ERROR, |
3409 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3410 | errmsg("index %d out of valid range, 0..%d" , |
3411 | n, len - 1))); |
3412 | |
3413 | byte = ((unsigned char *) VARDATA_ANY(v))[n]; |
3414 | |
3415 | PG_RETURN_INT32(byte); |
3416 | } |
3417 | |
3418 | /*------------------------------------------------------------- |
3419 | * byteaGetBit |
3420 | * |
3421 | * This routine treats a "bytea" type like an array of bits. |
3422 | * It returns the value of the Nth bit (0 or 1). |
3423 | * |
3424 | *------------------------------------------------------------- |
3425 | */ |
3426 | Datum |
3427 | byteaGetBit(PG_FUNCTION_ARGS) |
3428 | { |
3429 | bytea *v = PG_GETARG_BYTEA_PP(0); |
3430 | int32 n = PG_GETARG_INT32(1); |
3431 | int byteNo, |
3432 | bitNo; |
3433 | int len; |
3434 | int byte; |
3435 | |
3436 | len = VARSIZE_ANY_EXHDR(v); |
3437 | |
3438 | if (n < 0 || n >= len * 8) |
3439 | ereport(ERROR, |
3440 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3441 | errmsg("index %d out of valid range, 0..%d" , |
3442 | n, len * 8 - 1))); |
3443 | |
3444 | byteNo = n / 8; |
3445 | bitNo = n % 8; |
3446 | |
3447 | byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; |
3448 | |
3449 | if (byte & (1 << bitNo)) |
3450 | PG_RETURN_INT32(1); |
3451 | else |
3452 | PG_RETURN_INT32(0); |
3453 | } |
3454 | |
3455 | /*------------------------------------------------------------- |
3456 | * byteaSetByte |
3457 | * |
3458 | * Given an instance of type 'bytea' creates a new one with |
3459 | * the Nth byte set to the given value. |
3460 | * |
3461 | *------------------------------------------------------------- |
3462 | */ |
3463 | Datum |
3464 | byteaSetByte(PG_FUNCTION_ARGS) |
3465 | { |
3466 | bytea *res = PG_GETARG_BYTEA_P_COPY(0); |
3467 | int32 n = PG_GETARG_INT32(1); |
3468 | int32 newByte = PG_GETARG_INT32(2); |
3469 | int len; |
3470 | |
3471 | len = VARSIZE(res) - VARHDRSZ; |
3472 | |
3473 | if (n < 0 || n >= len) |
3474 | ereport(ERROR, |
3475 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3476 | errmsg("index %d out of valid range, 0..%d" , |
3477 | n, len - 1))); |
3478 | |
3479 | /* |
3480 | * Now set the byte. |
3481 | */ |
3482 | ((unsigned char *) VARDATA(res))[n] = newByte; |
3483 | |
3484 | PG_RETURN_BYTEA_P(res); |
3485 | } |
3486 | |
3487 | /*------------------------------------------------------------- |
3488 | * byteaSetBit |
3489 | * |
3490 | * Given an instance of type 'bytea' creates a new one with |
3491 | * the Nth bit set to the given value. |
3492 | * |
3493 | *------------------------------------------------------------- |
3494 | */ |
3495 | Datum |
3496 | byteaSetBit(PG_FUNCTION_ARGS) |
3497 | { |
3498 | bytea *res = PG_GETARG_BYTEA_P_COPY(0); |
3499 | int32 n = PG_GETARG_INT32(1); |
3500 | int32 newBit = PG_GETARG_INT32(2); |
3501 | int len; |
3502 | int oldByte, |
3503 | newByte; |
3504 | int byteNo, |
3505 | bitNo; |
3506 | |
3507 | len = VARSIZE(res) - VARHDRSZ; |
3508 | |
3509 | if (n < 0 || n >= len * 8) |
3510 | ereport(ERROR, |
3511 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
3512 | errmsg("index %d out of valid range, 0..%d" , |
3513 | n, len * 8 - 1))); |
3514 | |
3515 | byteNo = n / 8; |
3516 | bitNo = n % 8; |
3517 | |
3518 | /* |
3519 | * sanity check! |
3520 | */ |
3521 | if (newBit != 0 && newBit != 1) |
3522 | ereport(ERROR, |
3523 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
3524 | errmsg("new bit must be 0 or 1" ))); |
3525 | |
3526 | /* |
3527 | * Update the byte. |
3528 | */ |
3529 | oldByte = ((unsigned char *) VARDATA(res))[byteNo]; |
3530 | |
3531 | if (newBit == 0) |
3532 | newByte = oldByte & (~(1 << bitNo)); |
3533 | else |
3534 | newByte = oldByte | (1 << bitNo); |
3535 | |
3536 | ((unsigned char *) VARDATA(res))[byteNo] = newByte; |
3537 | |
3538 | PG_RETURN_BYTEA_P(res); |
3539 | } |
3540 | |
3541 | |
3542 | /* text_name() |
3543 | * Converts a text type to a Name type. |
3544 | */ |
3545 | Datum |
3546 | text_name(PG_FUNCTION_ARGS) |
3547 | { |
3548 | text *s = PG_GETARG_TEXT_PP(0); |
3549 | Name result; |
3550 | int len; |
3551 | |
3552 | len = VARSIZE_ANY_EXHDR(s); |
3553 | |
3554 | /* Truncate oversize input */ |
3555 | if (len >= NAMEDATALEN) |
3556 | len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1); |
3557 | |
3558 | /* We use palloc0 here to ensure result is zero-padded */ |
3559 | result = (Name) palloc0(NAMEDATALEN); |
3560 | memcpy(NameStr(*result), VARDATA_ANY(s), len); |
3561 | |
3562 | PG_RETURN_NAME(result); |
3563 | } |
3564 | |
3565 | /* name_text() |
3566 | * Converts a Name type to a text type. |
3567 | */ |
3568 | Datum |
3569 | name_text(PG_FUNCTION_ARGS) |
3570 | { |
3571 | Name s = PG_GETARG_NAME(0); |
3572 | |
3573 | PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s))); |
3574 | } |
3575 | |
3576 | |
3577 | /* |
3578 | * textToQualifiedNameList - convert a text object to list of names |
3579 | * |
3580 | * This implements the input parsing needed by nextval() and other |
3581 | * functions that take a text parameter representing a qualified name. |
3582 | * We split the name at dots, downcase if not double-quoted, and |
3583 | * truncate names if they're too long. |
3584 | */ |
3585 | List * |
3586 | textToQualifiedNameList(text *textval) |
3587 | { |
3588 | char *rawname; |
3589 | List *result = NIL; |
3590 | List *namelist; |
3591 | ListCell *l; |
3592 | |
3593 | /* Convert to C string (handles possible detoasting). */ |
3594 | /* Note we rely on being able to modify rawname below. */ |
3595 | rawname = text_to_cstring(textval); |
3596 | |
3597 | if (!SplitIdentifierString(rawname, '.', &namelist)) |
3598 | ereport(ERROR, |
3599 | (errcode(ERRCODE_INVALID_NAME), |
3600 | errmsg("invalid name syntax" ))); |
3601 | |
3602 | if (namelist == NIL) |
3603 | ereport(ERROR, |
3604 | (errcode(ERRCODE_INVALID_NAME), |
3605 | errmsg("invalid name syntax" ))); |
3606 | |
3607 | foreach(l, namelist) |
3608 | { |
3609 | char *curname = (char *) lfirst(l); |
3610 | |
3611 | result = lappend(result, makeString(pstrdup(curname))); |
3612 | } |
3613 | |
3614 | pfree(rawname); |
3615 | list_free(namelist); |
3616 | |
3617 | return result; |
3618 | } |
3619 | |
3620 | /* |
3621 | * SplitIdentifierString --- parse a string containing identifiers |
3622 | * |
3623 | * This is the guts of textToQualifiedNameList, and is exported for use in |
3624 | * other situations such as parsing GUC variables. In the GUC case, it's |
3625 | * important to avoid memory leaks, so the API is designed to minimize the |
3626 | * amount of stuff that needs to be allocated and freed. |
3627 | * |
3628 | * Inputs: |
3629 | * rawstring: the input string; must be overwritable! On return, it's |
3630 | * been modified to contain the separated identifiers. |
3631 | * separator: the separator punctuation expected between identifiers |
3632 | * (typically '.' or ','). Whitespace may also appear around |
3633 | * identifiers. |
3634 | * Outputs: |
3635 | * namelist: filled with a palloc'd list of pointers to identifiers within |
3636 | * rawstring. Caller should list_free() this even on error return. |
3637 | * |
3638 | * Returns true if okay, false if there is a syntax error in the string. |
3639 | * |
3640 | * Note that an empty string is considered okay here, though not in |
3641 | * textToQualifiedNameList. |
3642 | */ |
/*
 * Split rawstring in place into identifiers separated by 'separator',
 * appending a pointer to each isolated identifier to *namelist.
 * Returns false on a syntax error; *namelist may then hold the identifiers
 * isolated before the error was detected.
 */
bool
SplitIdentifierString(char *rawstring, char separator,
					  List **namelist)
{
	char	   *nextp = rawstring;	/* scan cursor into rawstring */
	bool		done = false;	/* set once end of string follows a name */

	/* Initialize the output first so it is valid on every return path. */
	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new identifier. */
	do
	{
		char	   *curname;	/* start of the identifier being isolated */
		char	   *endp;		/* where its terminating NUL will be written */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs, no downcasing */
			curname = nextp + 1;
			for (;;)
			{
				/*
				 * Search resumes one past nextp, so after a collapse the
				 * quote character that was kept is not examined again.
				 */
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				/* strlen(endp) bytes from endp + 1 include the final NUL */
				memmove(endp, endp + 1, strlen(endp));
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or whitespace */
			char	   *downname;
			int			len;

			curname = nextp;
			while (*nextp && *nextp != separator &&
				   !scanner_isspace(*nextp))
				nextp++;
			endp = nextp;
			if (curname == nextp)
				return false;	/* empty unquoted name not allowed */

			/*
			 * Downcase the identifier, using same code as main lexer does.
			 *
			 * XXX because we want to overwrite the input in-place, we cannot
			 * support a downcasing transformation that increases the string
			 * length.  This is not a problem given the current implementation
			 * of downcase_truncate_identifier, but we'll probably have to do
			 * something about this someday.
			 */
			len = endp - curname;
			downname = downcase_truncate_identifier(curname, len, false);
			Assert(strlen(downname) <= len);

			/*
			 * strncpy writes exactly len bytes: it NUL-pads when downname is
			 * shorter and adds no terminator when it is the same length, so
			 * the character at endp is left intact for the checks below.
			 */
			strncpy(curname, downname, len);	/* strncpy is required here */
			pfree(downname);
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/*
		 * Now safe to overwrite separator with a null.  This had to wait
		 * until after the checks above, since endp may point at the very
		 * separator (or whitespace, or closing quote) that was just examined.
		 */
		*endp = '\0';

		/* Truncate name if it's overlength */
		truncate_identifier(curname, strlen(curname), false);

		/*
		 * Finished isolating current name --- add it to list.  The list
		 * stores a pointer into rawstring, not a copy.
		 */
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3743 | |
3744 | |
3745 | /* |
3746 | * SplitDirectoriesString --- parse a string containing file/directory names |
3747 | * |
3748 | * This works fine on file names too; the function name is historical. |
3749 | * |
3750 | * This is similar to SplitIdentifierString, except that the parsing |
3751 | * rules are meant to handle pathnames instead of identifiers: there is |
3752 | * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1, |
3753 | * and we apply canonicalize_path() to each extracted string. Because of the |
3754 | * last, the returned strings are separately palloc'd rather than being |
3755 | * pointers into rawstring --- but we still scribble on rawstring. |
3756 | * |
3757 | * Inputs: |
3758 | * rawstring: the input string; must be modifiable! |
3759 | * separator: the separator punctuation expected between directories |
3760 | * (typically ',' or ';'). Whitespace may also appear around |
3761 | * directories. |
3762 | * Outputs: |
3763 | * namelist: filled with a palloc'd list of directory names. |
3764 | * Caller should list_free_deep() this even on error return. |
3765 | * |
3766 | * Returns true if okay, false if there is a syntax error in the string. |
3767 | * |
3768 | * Note that an empty string is considered okay here. |
3769 | */ |
/*
 * Split rawstring into path names separated by 'separator', appending a
 * separately allocated, canonicalized copy of each one to *namelist.
 * Returns false on a syntax error; *namelist may then hold the names
 * extracted before the error was detected.
 */
bool
SplitDirectoriesString(char *rawstring, char separator,
					   List **namelist)
{
	char	   *nextp = rawstring;	/* scan cursor into rawstring */
	bool		done = false;	/* set once end of string follows a name */

	/* Initialize the output first so it is valid on every return path. */
	*namelist = NIL;

	while (scanner_isspace(*nextp))
		nextp++;				/* skip leading whitespace */

	if (*nextp == '\0')
		return true;			/* allow empty string */

	/* At the top of the loop, we are at start of a new directory. */
	do
	{
		char	   *curname;	/* start of the name being isolated */
		char	   *endp;		/* where its terminating NUL will be written */

		if (*nextp == '"')
		{
			/* Quoted name --- collapse quote-quote pairs */
			curname = nextp + 1;
			for (;;)
			{
				/*
				 * Search resumes one past nextp, so after a collapse the
				 * quote character that was kept is not examined again.
				 */
				endp = strchr(nextp + 1, '"');
				if (endp == NULL)
					return false;	/* mismatched quotes */
				if (endp[1] != '"')
					break;		/* found end of quoted name */
				/* Collapse adjacent quotes into one quote, and look again */
				/* strlen(endp) bytes from endp + 1 include the final NUL */
				memmove(endp, endp + 1, strlen(endp));
				nextp = endp;
			}
			/* endp now points at the terminating quote */
			nextp = endp + 1;
		}
		else
		{
			/* Unquoted name --- extends to separator or end of string */
			curname = endp = nextp;
			while (*nextp && *nextp != separator)
			{
				/*
				 * trailing whitespace should not be included in name: endp
				 * only advances past non-whitespace characters, so embedded
				 * spaces are kept while trailing ones are cut off.
				 */
				if (!scanner_isspace(*nextp))
					endp = nextp + 1;
				nextp++;
			}
			if (curname == endp)
				return false;	/* empty unquoted name not allowed */
		}

		while (scanner_isspace(*nextp))
			nextp++;			/* skip trailing whitespace */

		if (*nextp == separator)
		{
			nextp++;
			while (scanner_isspace(*nextp))
				nextp++;		/* skip leading whitespace for next */
			/* we expect another name, so done remains false */
		}
		else if (*nextp == '\0')
			done = true;
		else
			return false;		/* invalid syntax */

		/*
		 * Now safe to overwrite separator with a null.  This had to wait
		 * until after the checks above, since endp may point at the very
		 * separator (or closing quote) that was just examined.
		 */
		*endp = '\0';

		/* Truncate path if it's overlength (done in rawstring, pre-copy) */
		if (strlen(curname) >= MAXPGPATH)
			curname[MAXPGPATH - 1] = '\0';

		/*
		 * Finished isolating current name --- add it to list.  The name is
		 * copied first, so canonicalize_path() modifies the copy and the
		 * list owns storage independent of rawstring.
		 */
		curname = pstrdup(curname);
		canonicalize_path(curname);
		*namelist = lappend(*namelist, curname);

		/* Loop back if we didn't reach end of string */
	} while (!done);

	return true;
}
3858 | |
3859 | |
3860 | /* |
3861 | * SplitGUCList --- parse a string containing identifiers or file names |
3862 | * |
3863 | * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without |
3864 | * presuming whether the elements will be taken as identifiers or file names. |
3865 | * We assume the input has already been through flatten_set_variable_args(), |
3866 | * so that we need never downcase (if appropriate, that was done already). |
3867 | * Nor do we ever truncate, since we don't know the correct max length. |
3868 | * We disallow embedded whitespace for simplicity (it shouldn't matter, |
3869 | * because any embedded whitespace should have led to double-quoting). |
3870 | * Otherwise the API is identical to SplitIdentifierString. |
3871 | * |
3872 | * XXX it's annoying to have so many copies of this string-splitting logic. |
3873 | * However, it's not clear that having one function with a bunch of option |
3874 | * flags would be much better. |
3875 | * |
3876 | * XXX there is a version of this function in src/bin/pg_dump/dumputils.c. |
3877 | * Be sure to update that if you have to change this. |
3878 | * |
3879 | * Inputs: |
3880 | * rawstring: the input string; must be overwritable! On return, it's |
3881 | * been modified to contain the separated identifiers. |
3882 | * separator: the separator punctuation expected between identifiers |
3883 | * (typically '.' or ','). Whitespace may also appear around |
3884 | * identifiers. |
3885 | * Outputs: |
3886 | * namelist: filled with a palloc'd list of pointers to identifiers within |
3887 | * rawstring. Caller should list_free() this even on error return. |
3888 | * |
3889 | * Returns true if okay, false if there is a syntax error in the string. |
3890 | */ |
3891 | bool |
3892 | SplitGUCList(char *rawstring, char separator, |
3893 | List **namelist) |
3894 | { |
3895 | char *nextp = rawstring; |
3896 | bool done = false; |
3897 | |
3898 | *namelist = NIL; |
3899 | |
3900 | while (scanner_isspace(*nextp)) |
3901 | nextp++; /* skip leading whitespace */ |
3902 | |
3903 | if (*nextp == '\0') |
3904 | return true; /* allow empty string */ |
3905 | |
3906 | /* At the top of the loop, we are at start of a new identifier. */ |
3907 | do |
3908 | { |
3909 | char *curname; |
3910 | char *endp; |
3911 | |
3912 | if (*nextp == '"') |
3913 | { |
3914 | /* Quoted name --- collapse quote-quote pairs */ |
3915 | curname = nextp + 1; |
3916 | for (;;) |
3917 | { |
3918 | endp = strchr(nextp + 1, '"'); |
3919 | if (endp == NULL) |
3920 | return false; /* mismatched quotes */ |
3921 | if (endp[1] != '"') |
3922 | break; /* found end of quoted name */ |
3923 | /* Collapse adjacent quotes into one quote, and look again */ |
3924 | memmove(endp, endp + 1, strlen(endp)); |
3925 | nextp = endp; |
3926 | } |
3927 | /* endp now points at the terminating quote */ |
3928 | nextp = endp + 1; |
3929 | } |
3930 | else |
3931 | { |
3932 | /* Unquoted name --- extends to separator or whitespace */ |
3933 | curname = nextp; |
3934 | while (*nextp && *nextp != separator && |
3935 | !scanner_isspace(*nextp)) |
3936 | nextp++; |
3937 | endp = nextp; |
3938 | if (curname == nextp) |
3939 | return false; /* empty unquoted name not allowed */ |
3940 | } |
3941 | |
3942 | while (scanner_isspace(*nextp)) |
3943 | nextp++; /* skip trailing whitespace */ |
3944 | |
3945 | if (*nextp == separator) |
3946 | { |
3947 | nextp++; |
3948 | while (scanner_isspace(*nextp)) |
3949 | nextp++; /* skip leading whitespace for next */ |
3950 | /* we expect another name, so done remains false */ |
3951 | } |
3952 | else if (*nextp == '\0') |
3953 | done = true; |
3954 | else |
3955 | return false; /* invalid syntax */ |
3956 | |
3957 | /* Now safe to overwrite separator with a null */ |
3958 | *endp = '\0'; |
3959 | |
3960 | /* |
3961 | * Finished isolating current name --- add it to list |
3962 | */ |
3963 | *namelist = lappend(*namelist, curname); |
3964 | |
3965 | /* Loop back if we didn't reach end of string */ |
3966 | } while (!done); |
3967 | |
3968 | return true; |
3969 | } |
3970 | |
3971 | |
3972 | /***************************************************************************** |
3973 | * Comparison Functions used for bytea |
3974 | * |
3975 | * Note: btree indexes need these routines not to leak memory; therefore, |
3976 | * be careful to free working copies of toasted datums. Most places don't |
3977 | * need to be so careful. |
3978 | *****************************************************************************/ |
3979 | |
3980 | Datum |
3981 | byteaeq(PG_FUNCTION_ARGS) |
3982 | { |
3983 | Datum arg1 = PG_GETARG_DATUM(0); |
3984 | Datum arg2 = PG_GETARG_DATUM(1); |
3985 | bool result; |
3986 | Size len1, |
3987 | len2; |
3988 | |
3989 | /* |
3990 | * We can use a fast path for unequal lengths, which might save us from |
3991 | * having to detoast one or both values. |
3992 | */ |
3993 | len1 = toast_raw_datum_size(arg1); |
3994 | len2 = toast_raw_datum_size(arg2); |
3995 | if (len1 != len2) |
3996 | result = false; |
3997 | else |
3998 | { |
3999 | bytea *barg1 = DatumGetByteaPP(arg1); |
4000 | bytea *barg2 = DatumGetByteaPP(arg2); |
4001 | |
4002 | result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), |
4003 | len1 - VARHDRSZ) == 0); |
4004 | |
4005 | PG_FREE_IF_COPY(barg1, 0); |
4006 | PG_FREE_IF_COPY(barg2, 1); |
4007 | } |
4008 | |
4009 | PG_RETURN_BOOL(result); |
4010 | } |
4011 | |
4012 | Datum |
4013 | byteane(PG_FUNCTION_ARGS) |
4014 | { |
4015 | Datum arg1 = PG_GETARG_DATUM(0); |
4016 | Datum arg2 = PG_GETARG_DATUM(1); |
4017 | bool result; |
4018 | Size len1, |
4019 | len2; |
4020 | |
4021 | /* |
4022 | * We can use a fast path for unequal lengths, which might save us from |
4023 | * having to detoast one or both values. |
4024 | */ |
4025 | len1 = toast_raw_datum_size(arg1); |
4026 | len2 = toast_raw_datum_size(arg2); |
4027 | if (len1 != len2) |
4028 | result = true; |
4029 | else |
4030 | { |
4031 | bytea *barg1 = DatumGetByteaPP(arg1); |
4032 | bytea *barg2 = DatumGetByteaPP(arg2); |
4033 | |
4034 | result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), |
4035 | len1 - VARHDRSZ) != 0); |
4036 | |
4037 | PG_FREE_IF_COPY(barg1, 0); |
4038 | PG_FREE_IF_COPY(barg2, 1); |
4039 | } |
4040 | |
4041 | PG_RETURN_BOOL(result); |
4042 | } |
4043 | |
4044 | Datum |
4045 | bytealt(PG_FUNCTION_ARGS) |
4046 | { |
4047 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4048 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4049 | int len1, |
4050 | len2; |
4051 | int cmp; |
4052 | |
4053 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4054 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4055 | |
4056 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4057 | |
4058 | PG_FREE_IF_COPY(arg1, 0); |
4059 | PG_FREE_IF_COPY(arg2, 1); |
4060 | |
4061 | PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); |
4062 | } |
4063 | |
4064 | Datum |
4065 | byteale(PG_FUNCTION_ARGS) |
4066 | { |
4067 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4068 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4069 | int len1, |
4070 | len2; |
4071 | int cmp; |
4072 | |
4073 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4074 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4075 | |
4076 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4077 | |
4078 | PG_FREE_IF_COPY(arg1, 0); |
4079 | PG_FREE_IF_COPY(arg2, 1); |
4080 | |
4081 | PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); |
4082 | } |
4083 | |
4084 | Datum |
4085 | byteagt(PG_FUNCTION_ARGS) |
4086 | { |
4087 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4088 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4089 | int len1, |
4090 | len2; |
4091 | int cmp; |
4092 | |
4093 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4094 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4095 | |
4096 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4097 | |
4098 | PG_FREE_IF_COPY(arg1, 0); |
4099 | PG_FREE_IF_COPY(arg2, 1); |
4100 | |
4101 | PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); |
4102 | } |
4103 | |
4104 | Datum |
4105 | byteage(PG_FUNCTION_ARGS) |
4106 | { |
4107 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4108 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4109 | int len1, |
4110 | len2; |
4111 | int cmp; |
4112 | |
4113 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4114 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4115 | |
4116 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4117 | |
4118 | PG_FREE_IF_COPY(arg1, 0); |
4119 | PG_FREE_IF_COPY(arg2, 1); |
4120 | |
4121 | PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); |
4122 | } |
4123 | |
4124 | Datum |
4125 | byteacmp(PG_FUNCTION_ARGS) |
4126 | { |
4127 | bytea *arg1 = PG_GETARG_BYTEA_PP(0); |
4128 | bytea *arg2 = PG_GETARG_BYTEA_PP(1); |
4129 | int len1, |
4130 | len2; |
4131 | int cmp; |
4132 | |
4133 | len1 = VARSIZE_ANY_EXHDR(arg1); |
4134 | len2 = VARSIZE_ANY_EXHDR(arg2); |
4135 | |
4136 | cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); |
4137 | if ((cmp == 0) && (len1 != len2)) |
4138 | cmp = (len1 < len2) ? -1 : 1; |
4139 | |
4140 | PG_FREE_IF_COPY(arg1, 0); |
4141 | PG_FREE_IF_COPY(arg2, 1); |
4142 | |
4143 | PG_RETURN_INT32(cmp); |
4144 | } |
4145 | |
4146 | Datum |
4147 | bytea_sortsupport(PG_FUNCTION_ARGS) |
4148 | { |
4149 | SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); |
4150 | MemoryContext oldcontext; |
4151 | |
4152 | oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); |
4153 | |
4154 | /* Use generic string SortSupport, forcing "C" collation */ |
4155 | varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); |
4156 | |
4157 | MemoryContextSwitchTo(oldcontext); |
4158 | |
4159 | PG_RETURN_VOID(); |
4160 | } |
4161 | |
4162 | /* |
4163 | * appendStringInfoText |
4164 | * |
4165 | * Append a text to str. |
4166 | * Like appendStringInfoString(str, text_to_cstring(t)) but faster. |
4167 | */ |
4168 | static void |
4169 | appendStringInfoText(StringInfo str, const text *t) |
4170 | { |
4171 | appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); |
4172 | } |
4173 | |
4174 | /* |
4175 | * replace_text |
4176 | * replace all occurrences of 'old_sub_str' in 'orig_str' |
4177 | * with 'new_sub_str' to form 'new_str' |
4178 | * |
4179 | * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == '' |
4180 | * otherwise returns 'new_str' |
4181 | */ |
4182 | Datum |
4183 | replace_text(PG_FUNCTION_ARGS) |
4184 | { |
4185 | text *src_text = PG_GETARG_TEXT_PP(0); |
4186 | text *from_sub_text = PG_GETARG_TEXT_PP(1); |
4187 | text *to_sub_text = PG_GETARG_TEXT_PP(2); |
4188 | int src_text_len; |
4189 | int from_sub_text_len; |
4190 | TextPositionState state; |
4191 | text *ret_text; |
4192 | int chunk_len; |
4193 | char *curr_ptr; |
4194 | char *start_ptr; |
4195 | StringInfoData str; |
4196 | bool found; |
4197 | |
4198 | src_text_len = VARSIZE_ANY_EXHDR(src_text); |
4199 | from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text); |
4200 | |
4201 | /* Return unmodified source string if empty source or pattern */ |
4202 | if (src_text_len < 1 || from_sub_text_len < 1) |
4203 | { |
4204 | PG_RETURN_TEXT_P(src_text); |
4205 | } |
4206 | |
4207 | text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state); |
4208 | |
4209 | found = text_position_next(&state); |
4210 | |
4211 | /* When the from_sub_text is not found, there is nothing to do. */ |
4212 | if (!found) |
4213 | { |
4214 | text_position_cleanup(&state); |
4215 | PG_RETURN_TEXT_P(src_text); |
4216 | } |
4217 | curr_ptr = text_position_get_match_ptr(&state); |
4218 | start_ptr = VARDATA_ANY(src_text); |
4219 | |
4220 | initStringInfo(&str); |
4221 | |
4222 | do |
4223 | { |
4224 | CHECK_FOR_INTERRUPTS(); |
4225 | |
4226 | /* copy the data skipped over by last text_position_next() */ |
4227 | chunk_len = curr_ptr - start_ptr; |
4228 | appendBinaryStringInfo(&str, start_ptr, chunk_len); |
4229 | |
4230 | appendStringInfoText(&str, to_sub_text); |
4231 | |
4232 | start_ptr = curr_ptr + from_sub_text_len; |
4233 | |
4234 | found = text_position_next(&state); |
4235 | if (found) |
4236 | curr_ptr = text_position_get_match_ptr(&state); |
4237 | } |
4238 | while (found); |
4239 | |
4240 | /* copy trailing data */ |
4241 | chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr; |
4242 | appendBinaryStringInfo(&str, start_ptr, chunk_len); |
4243 | |
4244 | text_position_cleanup(&state); |
4245 | |
4246 | ret_text = cstring_to_text_with_len(str.data, str.len); |
4247 | pfree(str.data); |
4248 | |
4249 | PG_RETURN_TEXT_P(ret_text); |
4250 | } |
4251 | |
4252 | /* |
4253 | * check_replace_text_has_escape_char |
4254 | * |
4255 | * check whether replace_text contains escape char. |
4256 | */ |
4257 | static bool |
4258 | check_replace_text_has_escape_char(const text *replace_text) |
4259 | { |
4260 | const char *p = VARDATA_ANY(replace_text); |
4261 | const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text); |
4262 | |
4263 | if (pg_database_encoding_max_length() == 1) |
4264 | { |
4265 | for (; p < p_end; p++) |
4266 | { |
4267 | if (*p == '\\') |
4268 | return true; |
4269 | } |
4270 | } |
4271 | else |
4272 | { |
4273 | for (; p < p_end; p += pg_mblen(p)) |
4274 | { |
4275 | if (*p == '\\') |
4276 | return true; |
4277 | } |
4278 | } |
4279 | |
4280 | return false; |
4281 | } |
4282 | |
4283 | /* |
4284 | * appendStringInfoRegexpSubstr |
4285 | * |
4286 | * Append replace_text to str, substituting regexp back references for |
4287 | * \n escapes. start_ptr is the start of the match in the source string, |
4288 | * at logical character position data_pos. |
4289 | */ |
4290 | static void |
4291 | appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, |
4292 | regmatch_t *pmatch, |
4293 | char *start_ptr, int data_pos) |
4294 | { |
4295 | const char *p = VARDATA_ANY(replace_text); |
4296 | const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text); |
4297 | int eml = pg_database_encoding_max_length(); |
4298 | |
4299 | for (;;) |
4300 | { |
4301 | const char *chunk_start = p; |
4302 | int so; |
4303 | int eo; |
4304 | |
4305 | /* Find next escape char. */ |
4306 | if (eml == 1) |
4307 | { |
4308 | for (; p < p_end && *p != '\\'; p++) |
4309 | /* nothing */ ; |
4310 | } |
4311 | else |
4312 | { |
4313 | for (; p < p_end && *p != '\\'; p += pg_mblen(p)) |
4314 | /* nothing */ ; |
4315 | } |
4316 | |
4317 | /* Copy the text we just scanned over, if any. */ |
4318 | if (p > chunk_start) |
4319 | appendBinaryStringInfo(str, chunk_start, p - chunk_start); |
4320 | |
4321 | /* Done if at end of string, else advance over escape char. */ |
4322 | if (p >= p_end) |
4323 | break; |
4324 | p++; |
4325 | |
4326 | if (p >= p_end) |
4327 | { |
4328 | /* Escape at very end of input. Treat same as unexpected char */ |
4329 | appendStringInfoChar(str, '\\'); |
4330 | break; |
4331 | } |
4332 | |
4333 | if (*p >= '1' && *p <= '9') |
4334 | { |
4335 | /* Use the back reference of regexp. */ |
4336 | int idx = *p - '0'; |
4337 | |
4338 | so = pmatch[idx].rm_so; |
4339 | eo = pmatch[idx].rm_eo; |
4340 | p++; |
4341 | } |
4342 | else if (*p == '&') |
4343 | { |
4344 | /* Use the entire matched string. */ |
4345 | so = pmatch[0].rm_so; |
4346 | eo = pmatch[0].rm_eo; |
4347 | p++; |
4348 | } |
4349 | else if (*p == '\\') |
4350 | { |
4351 | /* \\ means transfer one \ to output. */ |
4352 | appendStringInfoChar(str, '\\'); |
4353 | p++; |
4354 | continue; |
4355 | } |
4356 | else |
4357 | { |
4358 | /* |
4359 | * If escape char is not followed by any expected char, just treat |
4360 | * it as ordinary data to copy. (XXX would it be better to throw |
4361 | * an error?) |
4362 | */ |
4363 | appendStringInfoChar(str, '\\'); |
4364 | continue; |
4365 | } |
4366 | |
4367 | if (so != -1 && eo != -1) |
4368 | { |
4369 | /* |
4370 | * Copy the text that is back reference of regexp. Note so and eo |
4371 | * are counted in characters not bytes. |
4372 | */ |
4373 | char *chunk_start; |
4374 | int chunk_len; |
4375 | |
4376 | Assert(so >= data_pos); |
4377 | chunk_start = start_ptr; |
4378 | chunk_start += charlen_to_bytelen(chunk_start, so - data_pos); |
4379 | chunk_len = charlen_to_bytelen(chunk_start, eo - so); |
4380 | appendBinaryStringInfo(str, chunk_start, chunk_len); |
4381 | } |
4382 | } |
4383 | } |
4384 | |
4385 | #define REGEXP_REPLACE_BACKREF_CNT 10 |
4386 | |
4387 | /* |
4388 | * replace_text_regexp |
4389 | * |
4390 | * replace text that matches to regexp in src_text to replace_text. |
4391 | * |
4392 | * Note: to avoid having to include regex.h in builtins.h, we declare |
4393 | * the regexp argument as void *, but really it's regex_t *. |
4394 | */ |
4395 | text * |
4396 | replace_text_regexp(text *src_text, void *regexp, |
4397 | text *replace_text, bool glob) |
4398 | { |
4399 | text *ret_text; |
4400 | regex_t *re = (regex_t *) regexp; |
4401 | int src_text_len = VARSIZE_ANY_EXHDR(src_text); |
4402 | StringInfoData buf; |
4403 | regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT]; |
4404 | pg_wchar *data; |
4405 | size_t data_len; |
4406 | int search_start; |
4407 | int data_pos; |
4408 | char *start_ptr; |
4409 | bool have_escape; |
4410 | |
4411 | initStringInfo(&buf); |
4412 | |
4413 | /* Convert data string to wide characters. */ |
4414 | data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar)); |
4415 | data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len); |
4416 | |
4417 | /* Check whether replace_text has escape char. */ |
4418 | have_escape = check_replace_text_has_escape_char(replace_text); |
4419 | |
4420 | /* start_ptr points to the data_pos'th character of src_text */ |
4421 | start_ptr = (char *) VARDATA_ANY(src_text); |
4422 | data_pos = 0; |
4423 | |
4424 | search_start = 0; |
4425 | while (search_start <= data_len) |
4426 | { |
4427 | int regexec_result; |
4428 | |
4429 | CHECK_FOR_INTERRUPTS(); |
4430 | |
4431 | regexec_result = pg_regexec(re, |
4432 | data, |
4433 | data_len, |
4434 | search_start, |
4435 | NULL, /* no details */ |
4436 | REGEXP_REPLACE_BACKREF_CNT, |
4437 | pmatch, |
4438 | 0); |
4439 | |
4440 | if (regexec_result == REG_NOMATCH) |
4441 | break; |
4442 | |
4443 | if (regexec_result != REG_OKAY) |
4444 | { |
4445 | char errMsg[100]; |
4446 | |
4447 | CHECK_FOR_INTERRUPTS(); |
4448 | pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); |
4449 | ereport(ERROR, |
4450 | (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), |
4451 | errmsg("regular expression failed: %s" , errMsg))); |
4452 | } |
4453 | |
4454 | /* |
4455 | * Copy the text to the left of the match position. Note we are given |
4456 | * character not byte indexes. |
4457 | */ |
4458 | if (pmatch[0].rm_so - data_pos > 0) |
4459 | { |
4460 | int chunk_len; |
4461 | |
4462 | chunk_len = charlen_to_bytelen(start_ptr, |
4463 | pmatch[0].rm_so - data_pos); |
4464 | appendBinaryStringInfo(&buf, start_ptr, chunk_len); |
4465 | |
4466 | /* |
4467 | * Advance start_ptr over that text, to avoid multiple rescans of |
4468 | * it if the replace_text contains multiple back-references. |
4469 | */ |
4470 | start_ptr += chunk_len; |
4471 | data_pos = pmatch[0].rm_so; |
4472 | } |
4473 | |
4474 | /* |
4475 | * Copy the replace_text. Process back references when the |
4476 | * replace_text has escape characters. |
4477 | */ |
4478 | if (have_escape) |
4479 | appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, |
4480 | start_ptr, data_pos); |
4481 | else |
4482 | appendStringInfoText(&buf, replace_text); |
4483 | |
4484 | /* Advance start_ptr and data_pos over the matched text. */ |
4485 | start_ptr += charlen_to_bytelen(start_ptr, |
4486 | pmatch[0].rm_eo - data_pos); |
4487 | data_pos = pmatch[0].rm_eo; |
4488 | |
4489 | /* |
4490 | * When global option is off, replace the first instance only. |
4491 | */ |
4492 | if (!glob) |
4493 | break; |
4494 | |
4495 | /* |
4496 | * Advance search position. Normally we start the next search at the |
4497 | * end of the previous match; but if the match was of zero length, we |
4498 | * have to advance by one character, or we'd just find the same match |
4499 | * again. |
4500 | */ |
4501 | search_start = data_pos; |
4502 | if (pmatch[0].rm_so == pmatch[0].rm_eo) |
4503 | search_start++; |
4504 | } |
4505 | |
4506 | /* |
4507 | * Copy the text to the right of the last match. |
4508 | */ |
4509 | if (data_pos < data_len) |
4510 | { |
4511 | int chunk_len; |
4512 | |
4513 | chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr; |
4514 | appendBinaryStringInfo(&buf, start_ptr, chunk_len); |
4515 | } |
4516 | |
4517 | ret_text = cstring_to_text_with_len(buf.data, buf.len); |
4518 | pfree(buf.data); |
4519 | pfree(data); |
4520 | |
4521 | return ret_text; |
4522 | } |
4523 | |
4524 | /* |
4525 | * split_text |
4526 | * parse input string |
4527 | * return ord item (1 based) |
4528 | * based on provided field separator |
4529 | */ |
4530 | Datum |
4531 | split_text(PG_FUNCTION_ARGS) |
4532 | { |
4533 | text *inputstring = PG_GETARG_TEXT_PP(0); |
4534 | text *fldsep = PG_GETARG_TEXT_PP(1); |
4535 | int fldnum = PG_GETARG_INT32(2); |
4536 | int inputstring_len; |
4537 | int fldsep_len; |
4538 | TextPositionState state; |
4539 | char *start_ptr; |
4540 | char *end_ptr; |
4541 | text *result_text; |
4542 | bool found; |
4543 | |
4544 | /* field number is 1 based */ |
4545 | if (fldnum < 1) |
4546 | ereport(ERROR, |
4547 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
4548 | errmsg("field position must be greater than zero" ))); |
4549 | |
4550 | inputstring_len = VARSIZE_ANY_EXHDR(inputstring); |
4551 | fldsep_len = VARSIZE_ANY_EXHDR(fldsep); |
4552 | |
4553 | /* return empty string for empty input string */ |
4554 | if (inputstring_len < 1) |
4555 | PG_RETURN_TEXT_P(cstring_to_text("" )); |
4556 | |
4557 | /* empty field separator */ |
4558 | if (fldsep_len < 1) |
4559 | { |
4560 | text_position_cleanup(&state); |
4561 | /* if first field, return input string, else empty string */ |
4562 | if (fldnum == 1) |
4563 | PG_RETURN_TEXT_P(inputstring); |
4564 | else |
4565 | PG_RETURN_TEXT_P(cstring_to_text("" )); |
4566 | } |
4567 | |
4568 | text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state); |
4569 | |
4570 | /* identify bounds of first field */ |
4571 | start_ptr = VARDATA_ANY(inputstring); |
4572 | found = text_position_next(&state); |
4573 | |
4574 | /* special case if fldsep not found at all */ |
4575 | if (!found) |
4576 | { |
4577 | text_position_cleanup(&state); |
4578 | /* if field 1 requested, return input string, else empty string */ |
4579 | if (fldnum == 1) |
4580 | PG_RETURN_TEXT_P(inputstring); |
4581 | else |
4582 | PG_RETURN_TEXT_P(cstring_to_text("" )); |
4583 | } |
4584 | end_ptr = text_position_get_match_ptr(&state); |
4585 | |
4586 | while (found && --fldnum > 0) |
4587 | { |
4588 | /* identify bounds of next field */ |
4589 | start_ptr = end_ptr + fldsep_len; |
4590 | found = text_position_next(&state); |
4591 | if (found) |
4592 | end_ptr = text_position_get_match_ptr(&state); |
4593 | } |
4594 | |
4595 | text_position_cleanup(&state); |
4596 | |
4597 | if (fldnum > 0) |
4598 | { |
4599 | /* N'th field separator not found */ |
4600 | /* if last field requested, return it, else empty string */ |
4601 | if (fldnum == 1) |
4602 | { |
4603 | int last_len = start_ptr - VARDATA_ANY(inputstring); |
4604 | |
4605 | result_text = cstring_to_text_with_len(start_ptr, |
4606 | inputstring_len - last_len); |
4607 | } |
4608 | else |
4609 | result_text = cstring_to_text("" ); |
4610 | } |
4611 | else |
4612 | { |
4613 | /* non-last field requested */ |
4614 | result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr); |
4615 | } |
4616 | |
4617 | PG_RETURN_TEXT_P(result_text); |
4618 | } |
4619 | |
4620 | /* |
4621 | * Convenience function to return true when two text params are equal. |
4622 | */ |
4623 | static bool |
4624 | text_isequal(text *txt1, text *txt2, Oid collid) |
4625 | { |
4626 | return DatumGetBool(DirectFunctionCall2Coll(texteq, |
4627 | collid, |
4628 | PointerGetDatum(txt1), |
4629 | PointerGetDatum(txt2))); |
4630 | } |
4631 | |
4632 | /* |
4633 | * text_to_array |
4634 | * parse input string and return text array of elements, |
4635 | * based on provided field separator |
4636 | */ |
4637 | Datum |
4638 | text_to_array(PG_FUNCTION_ARGS) |
4639 | { |
4640 | return text_to_array_internal(fcinfo); |
4641 | } |
4642 | |
4643 | /* |
4644 | * text_to_array_null |
4645 | * parse input string and return text array of elements, |
4646 | * based on provided field separator and null string |
4647 | * |
4648 | * This is a separate entry point only to prevent the regression tests from |
4649 | * complaining about different argument sets for the same internal function. |
4650 | */ |
4651 | Datum |
4652 | text_to_array_null(PG_FUNCTION_ARGS) |
4653 | { |
4654 | return text_to_array_internal(fcinfo); |
4655 | } |
4656 | |
4657 | /* |
4658 | * common code for text_to_array and text_to_array_null functions |
4659 | * |
4660 | * These are not strict so we have to test for null inputs explicitly. |
4661 | */ |
4662 | static Datum |
4663 | text_to_array_internal(PG_FUNCTION_ARGS) |
4664 | { |
4665 | text *inputstring; |
4666 | text *fldsep; |
4667 | text *null_string; |
4668 | int inputstring_len; |
4669 | int fldsep_len; |
4670 | char *start_ptr; |
4671 | text *result_text; |
4672 | bool is_null; |
4673 | ArrayBuildState *astate = NULL; |
4674 | |
4675 | /* when input string is NULL, then result is NULL too */ |
4676 | if (PG_ARGISNULL(0)) |
4677 | PG_RETURN_NULL(); |
4678 | |
4679 | inputstring = PG_GETARG_TEXT_PP(0); |
4680 | |
4681 | /* fldsep can be NULL */ |
4682 | if (!PG_ARGISNULL(1)) |
4683 | fldsep = PG_GETARG_TEXT_PP(1); |
4684 | else |
4685 | fldsep = NULL; |
4686 | |
4687 | /* null_string can be NULL or omitted */ |
4688 | if (PG_NARGS() > 2 && !PG_ARGISNULL(2)) |
4689 | null_string = PG_GETARG_TEXT_PP(2); |
4690 | else |
4691 | null_string = NULL; |
4692 | |
4693 | if (fldsep != NULL) |
4694 | { |
4695 | /* |
4696 | * Normal case with non-null fldsep. Use the text_position machinery |
4697 | * to search for occurrences of fldsep. |
4698 | */ |
4699 | TextPositionState state; |
4700 | |
4701 | inputstring_len = VARSIZE_ANY_EXHDR(inputstring); |
4702 | fldsep_len = VARSIZE_ANY_EXHDR(fldsep); |
4703 | |
4704 | /* return empty array for empty input string */ |
4705 | if (inputstring_len < 1) |
4706 | PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); |
4707 | |
4708 | /* |
4709 | * empty field separator: return the input string as a one-element |
4710 | * array |
4711 | */ |
4712 | if (fldsep_len < 1) |
4713 | { |
4714 | Datum elems[1]; |
4715 | bool nulls[1]; |
4716 | int dims[1]; |
4717 | int lbs[1]; |
4718 | |
4719 | /* single element can be a NULL too */ |
4720 | is_null = null_string ? text_isequal(inputstring, null_string, PG_GET_COLLATION()) : false; |
4721 | |
4722 | elems[0] = PointerGetDatum(inputstring); |
4723 | nulls[0] = is_null; |
4724 | dims[0] = 1; |
4725 | lbs[0] = 1; |
4726 | /* XXX: this hardcodes assumptions about the text type */ |
4727 | PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls, |
4728 | 1, dims, lbs, |
4729 | TEXTOID, -1, false, 'i')); |
4730 | } |
4731 | |
4732 | text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state); |
4733 | |
4734 | start_ptr = VARDATA_ANY(inputstring); |
4735 | |
4736 | for (;;) |
4737 | { |
4738 | bool found; |
4739 | char *end_ptr; |
4740 | int chunk_len; |
4741 | |
4742 | CHECK_FOR_INTERRUPTS(); |
4743 | |
4744 | found = text_position_next(&state); |
4745 | if (!found) |
4746 | { |
4747 | /* fetch last field */ |
4748 | chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr; |
4749 | end_ptr = NULL; /* not used, but some compilers complain */ |
4750 | } |
4751 | else |
4752 | { |
4753 | /* fetch non-last field */ |
4754 | end_ptr = text_position_get_match_ptr(&state); |
4755 | chunk_len = end_ptr - start_ptr; |
4756 | } |
4757 | |
4758 | /* must build a temp text datum to pass to accumArrayResult */ |
4759 | result_text = cstring_to_text_with_len(start_ptr, chunk_len); |
4760 | is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false; |
4761 | |
4762 | /* stash away this field */ |
4763 | astate = accumArrayResult(astate, |
4764 | PointerGetDatum(result_text), |
4765 | is_null, |
4766 | TEXTOID, |
4767 | CurrentMemoryContext); |
4768 | |
4769 | pfree(result_text); |
4770 | |
4771 | if (!found) |
4772 | break; |
4773 | |
4774 | start_ptr = end_ptr + fldsep_len; |
4775 | } |
4776 | |
4777 | text_position_cleanup(&state); |
4778 | } |
4779 | else |
4780 | { |
4781 | /* |
4782 | * When fldsep is NULL, each character in the inputstring becomes an |
4783 | * element in the result array. The separator is effectively the |
4784 | * space between characters. |
4785 | */ |
4786 | inputstring_len = VARSIZE_ANY_EXHDR(inputstring); |
4787 | |
4788 | /* return empty array for empty input string */ |
4789 | if (inputstring_len < 1) |
4790 | PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); |
4791 | |
4792 | start_ptr = VARDATA_ANY(inputstring); |
4793 | |
4794 | while (inputstring_len > 0) |
4795 | { |
4796 | int chunk_len = pg_mblen(start_ptr); |
4797 | |
4798 | CHECK_FOR_INTERRUPTS(); |
4799 | |
4800 | /* must build a temp text datum to pass to accumArrayResult */ |
4801 | result_text = cstring_to_text_with_len(start_ptr, chunk_len); |
4802 | is_null = null_string ? text_isequal(result_text, null_string, PG_GET_COLLATION()) : false; |
4803 | |
4804 | /* stash away this field */ |
4805 | astate = accumArrayResult(astate, |
4806 | PointerGetDatum(result_text), |
4807 | is_null, |
4808 | TEXTOID, |
4809 | CurrentMemoryContext); |
4810 | |
4811 | pfree(result_text); |
4812 | |
4813 | start_ptr += chunk_len; |
4814 | inputstring_len -= chunk_len; |
4815 | } |
4816 | } |
4817 | |
4818 | PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, |
4819 | CurrentMemoryContext)); |
4820 | } |
4821 | |
4822 | /* |
4823 | * array_to_text |
4824 | * concatenate Cstring representation of input array elements |
4825 | * using provided field separator |
4826 | */ |
4827 | Datum |
4828 | array_to_text(PG_FUNCTION_ARGS) |
4829 | { |
4830 | ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); |
4831 | char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
4832 | |
4833 | PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL)); |
4834 | } |
4835 | |
4836 | /* |
4837 | * array_to_text_null |
4838 | * concatenate Cstring representation of input array elements |
4839 | * using provided field separator and null string |
4840 | * |
4841 | * This version is not strict so we have to test for null inputs explicitly. |
4842 | */ |
4843 | Datum |
4844 | array_to_text_null(PG_FUNCTION_ARGS) |
4845 | { |
4846 | ArrayType *v; |
4847 | char *fldsep; |
4848 | char *null_string; |
4849 | |
4850 | /* returns NULL when first or second parameter is NULL */ |
4851 | if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) |
4852 | PG_RETURN_NULL(); |
4853 | |
4854 | v = PG_GETARG_ARRAYTYPE_P(0); |
4855 | fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
4856 | |
4857 | /* NULL null string is passed through as a null pointer */ |
4858 | if (!PG_ARGISNULL(2)) |
4859 | null_string = text_to_cstring(PG_GETARG_TEXT_PP(2)); |
4860 | else |
4861 | null_string = NULL; |
4862 | |
4863 | PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string)); |
4864 | } |
4865 | |
4866 | /* |
4867 | * common code for array_to_text and array_to_text_null functions |
4868 | */ |
4869 | static text * |
4870 | array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v, |
4871 | const char *fldsep, const char *null_string) |
4872 | { |
4873 | text *result; |
4874 | int nitems, |
4875 | *dims, |
4876 | ndims; |
4877 | Oid element_type; |
4878 | int typlen; |
4879 | bool typbyval; |
4880 | char typalign; |
4881 | StringInfoData buf; |
4882 | bool printed = false; |
4883 | char *p; |
4884 | bits8 *bitmap; |
4885 | int bitmask; |
4886 | int i; |
4887 | ArrayMetaState *; |
4888 | |
4889 | ndims = ARR_NDIM(v); |
4890 | dims = ARR_DIMS(v); |
4891 | nitems = ArrayGetNItems(ndims, dims); |
4892 | |
4893 | /* if there are no elements, return an empty string */ |
4894 | if (nitems == 0) |
4895 | return cstring_to_text_with_len("" , 0); |
4896 | |
4897 | element_type = ARR_ELEMTYPE(v); |
4898 | initStringInfo(&buf); |
4899 | |
4900 | /* |
4901 | * We arrange to look up info about element type, including its output |
4902 | * conversion proc, only once per series of calls, assuming the element |
4903 | * type doesn't change underneath us. |
4904 | */ |
4905 | my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra; |
4906 | if (my_extra == NULL) |
4907 | { |
4908 | fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
4909 | sizeof(ArrayMetaState)); |
4910 | my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra; |
4911 | my_extra->element_type = ~element_type; |
4912 | } |
4913 | |
4914 | if (my_extra->element_type != element_type) |
4915 | { |
4916 | /* |
4917 | * Get info about element type, including its output conversion proc |
4918 | */ |
4919 | get_type_io_data(element_type, IOFunc_output, |
4920 | &my_extra->typlen, &my_extra->typbyval, |
4921 | &my_extra->typalign, &my_extra->typdelim, |
4922 | &my_extra->typioparam, &my_extra->typiofunc); |
4923 | fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc, |
4924 | fcinfo->flinfo->fn_mcxt); |
4925 | my_extra->element_type = element_type; |
4926 | } |
4927 | typlen = my_extra->typlen; |
4928 | typbyval = my_extra->typbyval; |
4929 | typalign = my_extra->typalign; |
4930 | |
4931 | p = ARR_DATA_PTR(v); |
4932 | bitmap = ARR_NULLBITMAP(v); |
4933 | bitmask = 1; |
4934 | |
4935 | for (i = 0; i < nitems; i++) |
4936 | { |
4937 | Datum itemvalue; |
4938 | char *value; |
4939 | |
4940 | /* Get source element, checking for NULL */ |
4941 | if (bitmap && (*bitmap & bitmask) == 0) |
4942 | { |
4943 | /* if null_string is NULL, we just ignore null elements */ |
4944 | if (null_string != NULL) |
4945 | { |
4946 | if (printed) |
4947 | appendStringInfo(&buf, "%s%s" , fldsep, null_string); |
4948 | else |
4949 | appendStringInfoString(&buf, null_string); |
4950 | printed = true; |
4951 | } |
4952 | } |
4953 | else |
4954 | { |
4955 | itemvalue = fetch_att(p, typbyval, typlen); |
4956 | |
4957 | value = OutputFunctionCall(&my_extra->proc, itemvalue); |
4958 | |
4959 | if (printed) |
4960 | appendStringInfo(&buf, "%s%s" , fldsep, value); |
4961 | else |
4962 | appendStringInfoString(&buf, value); |
4963 | printed = true; |
4964 | |
4965 | p = att_addlength_pointer(p, typlen, p); |
4966 | p = (char *) att_align_nominal(p, typalign); |
4967 | } |
4968 | |
4969 | /* advance bitmap pointer if any */ |
4970 | if (bitmap) |
4971 | { |
4972 | bitmask <<= 1; |
4973 | if (bitmask == 0x100) |
4974 | { |
4975 | bitmap++; |
4976 | bitmask = 1; |
4977 | } |
4978 | } |
4979 | } |
4980 | |
4981 | result = cstring_to_text_with_len(buf.data, buf.len); |
4982 | pfree(buf.data); |
4983 | |
4984 | return result; |
4985 | } |
4986 | |
4987 | #define HEXBASE 16 |
4988 | /* |
4989 | * Convert an int32 to a string containing a base 16 (hex) representation of |
4990 | * the number. |
4991 | */ |
4992 | Datum |
4993 | to_hex32(PG_FUNCTION_ARGS) |
4994 | { |
4995 | uint32 value = (uint32) PG_GETARG_INT32(0); |
4996 | char *ptr; |
4997 | const char *digits = "0123456789abcdef" ; |
4998 | char buf[32]; /* bigger than needed, but reasonable */ |
4999 | |
5000 | ptr = buf + sizeof(buf) - 1; |
5001 | *ptr = '\0'; |
5002 | |
5003 | do |
5004 | { |
5005 | *--ptr = digits[value % HEXBASE]; |
5006 | value /= HEXBASE; |
5007 | } while (ptr > buf && value); |
5008 | |
5009 | PG_RETURN_TEXT_P(cstring_to_text(ptr)); |
5010 | } |
5011 | |
5012 | /* |
5013 | * Convert an int64 to a string containing a base 16 (hex) representation of |
5014 | * the number. |
5015 | */ |
5016 | Datum |
5017 | to_hex64(PG_FUNCTION_ARGS) |
5018 | { |
5019 | uint64 value = (uint64) PG_GETARG_INT64(0); |
5020 | char *ptr; |
5021 | const char *digits = "0123456789abcdef" ; |
5022 | char buf[32]; /* bigger than needed, but reasonable */ |
5023 | |
5024 | ptr = buf + sizeof(buf) - 1; |
5025 | *ptr = '\0'; |
5026 | |
5027 | do |
5028 | { |
5029 | *--ptr = digits[value % HEXBASE]; |
5030 | value /= HEXBASE; |
5031 | } while (ptr > buf && value); |
5032 | |
5033 | PG_RETURN_TEXT_P(cstring_to_text(ptr)); |
5034 | } |
5035 | |
5036 | /* |
5037 | * Return the size of a datum, possibly compressed |
5038 | * |
5039 | * Works on any data type |
5040 | */ |
5041 | Datum |
5042 | pg_column_size(PG_FUNCTION_ARGS) |
5043 | { |
5044 | Datum value = PG_GETARG_DATUM(0); |
5045 | int32 result; |
5046 | int typlen; |
5047 | |
5048 | /* On first call, get the input type's typlen, and save at *fn_extra */ |
5049 | if (fcinfo->flinfo->fn_extra == NULL) |
5050 | { |
5051 | /* Lookup the datatype of the supplied argument */ |
5052 | Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0); |
5053 | |
5054 | typlen = get_typlen(argtypeid); |
5055 | if (typlen == 0) /* should not happen */ |
5056 | elog(ERROR, "cache lookup failed for type %u" , argtypeid); |
5057 | |
5058 | fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5059 | sizeof(int)); |
5060 | *((int *) fcinfo->flinfo->fn_extra) = typlen; |
5061 | } |
5062 | else |
5063 | typlen = *((int *) fcinfo->flinfo->fn_extra); |
5064 | |
5065 | if (typlen == -1) |
5066 | { |
5067 | /* varlena type, possibly toasted */ |
5068 | result = toast_datum_size(value); |
5069 | } |
5070 | else if (typlen == -2) |
5071 | { |
5072 | /* cstring */ |
5073 | result = strlen(DatumGetCString(value)) + 1; |
5074 | } |
5075 | else |
5076 | { |
5077 | /* ordinary fixed-width type */ |
5078 | result = typlen; |
5079 | } |
5080 | |
5081 | PG_RETURN_INT32(result); |
5082 | } |
5083 | |
5084 | /* |
5085 | * string_agg - Concatenates values and returns string. |
5086 | * |
5087 | * Syntax: string_agg(value text, delimiter text) RETURNS text |
5088 | * |
5089 | * Note: Any NULL values are ignored. The first-call delimiter isn't |
5090 | * actually used at all, and on subsequent calls the delimiter precedes |
5091 | * the associated value. |
5092 | */ |
5093 | |
5094 | /* subroutine to initialize state */ |
5095 | static StringInfo |
5096 | makeStringAggState(FunctionCallInfo fcinfo) |
5097 | { |
5098 | StringInfo state; |
5099 | MemoryContext aggcontext; |
5100 | MemoryContext oldcontext; |
5101 | |
5102 | if (!AggCheckCallContext(fcinfo, &aggcontext)) |
5103 | { |
5104 | /* cannot be called directly because of internal-type argument */ |
5105 | elog(ERROR, "string_agg_transfn called in non-aggregate context" ); |
5106 | } |
5107 | |
5108 | /* |
5109 | * Create state in aggregate context. It'll stay there across subsequent |
5110 | * calls. |
5111 | */ |
5112 | oldcontext = MemoryContextSwitchTo(aggcontext); |
5113 | state = makeStringInfo(); |
5114 | MemoryContextSwitchTo(oldcontext); |
5115 | |
5116 | return state; |
5117 | } |
5118 | |
5119 | Datum |
5120 | string_agg_transfn(PG_FUNCTION_ARGS) |
5121 | { |
5122 | StringInfo state; |
5123 | |
5124 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
5125 | |
5126 | /* Append the value unless null. */ |
5127 | if (!PG_ARGISNULL(1)) |
5128 | { |
5129 | /* On the first time through, we ignore the delimiter. */ |
5130 | if (state == NULL) |
5131 | state = makeStringAggState(fcinfo); |
5132 | else if (!PG_ARGISNULL(2)) |
5133 | appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */ |
5134 | |
5135 | appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */ |
5136 | } |
5137 | |
5138 | /* |
5139 | * The transition type for string_agg() is declared to be "internal", |
5140 | * which is a pass-by-value type the same size as a pointer. |
5141 | */ |
5142 | PG_RETURN_POINTER(state); |
5143 | } |
5144 | |
5145 | Datum |
5146 | string_agg_finalfn(PG_FUNCTION_ARGS) |
5147 | { |
5148 | StringInfo state; |
5149 | |
5150 | /* cannot be called directly because of internal-type argument */ |
5151 | Assert(AggCheckCallContext(fcinfo, NULL)); |
5152 | |
5153 | state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); |
5154 | |
5155 | if (state != NULL) |
5156 | PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len)); |
5157 | else |
5158 | PG_RETURN_NULL(); |
5159 | } |
5160 | |
5161 | /* |
5162 | * Prepare cache with fmgr info for the output functions of the datatypes of |
5163 | * the arguments of a concat-like function, beginning with argument "argidx". |
5164 | * (Arguments before that will have corresponding slots in the resulting |
5165 | * FmgrInfo array, but we don't fill those slots.) |
5166 | */ |
5167 | static FmgrInfo * |
5168 | build_concat_foutcache(FunctionCallInfo fcinfo, int argidx) |
5169 | { |
5170 | FmgrInfo *foutcache; |
5171 | int i; |
5172 | |
5173 | /* We keep the info in fn_mcxt so it survives across calls */ |
5174 | foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, |
5175 | PG_NARGS() * sizeof(FmgrInfo)); |
5176 | |
5177 | for (i = argidx; i < PG_NARGS(); i++) |
5178 | { |
5179 | Oid valtype; |
5180 | Oid typOutput; |
5181 | bool typIsVarlena; |
5182 | |
5183 | valtype = get_fn_expr_argtype(fcinfo->flinfo, i); |
5184 | if (!OidIsValid(valtype)) |
5185 | elog(ERROR, "could not determine data type of concat() input" ); |
5186 | |
5187 | getTypeOutputInfo(valtype, &typOutput, &typIsVarlena); |
5188 | fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt); |
5189 | } |
5190 | |
5191 | fcinfo->flinfo->fn_extra = foutcache; |
5192 | |
5193 | return foutcache; |
5194 | } |
5195 | |
5196 | /* |
5197 | * Implementation of both concat() and concat_ws(). |
5198 | * |
5199 | * sepstr is the separator string to place between values. |
5200 | * argidx identifies the first argument to concatenate (counting from zero); |
5201 | * note that this must be constant across any one series of calls. |
5202 | * |
5203 | * Returns NULL if result should be NULL, else text value. |
5204 | */ |
5205 | static text * |
5206 | concat_internal(const char *sepstr, int argidx, |
5207 | FunctionCallInfo fcinfo) |
5208 | { |
5209 | text *result; |
5210 | StringInfoData str; |
5211 | FmgrInfo *foutcache; |
5212 | bool first_arg = true; |
5213 | int i; |
5214 | |
5215 | /* |
5216 | * concat(VARIADIC some-array) is essentially equivalent to |
5217 | * array_to_text(), ie concat the array elements with the given separator. |
5218 | * So we just pass the case off to that code. |
5219 | */ |
5220 | if (get_fn_expr_variadic(fcinfo->flinfo)) |
5221 | { |
5222 | ArrayType *arr; |
5223 | |
5224 | /* Should have just the one argument */ |
5225 | Assert(argidx == PG_NARGS() - 1); |
5226 | |
5227 | /* concat(VARIADIC NULL) is defined as NULL */ |
5228 | if (PG_ARGISNULL(argidx)) |
5229 | return NULL; |
5230 | |
5231 | /* |
5232 | * Non-null argument had better be an array. We assume that any call |
5233 | * context that could let get_fn_expr_variadic return true will have |
5234 | * checked that a VARIADIC-labeled parameter actually is an array. So |
5235 | * it should be okay to just Assert that it's an array rather than |
5236 | * doing a full-fledged error check. |
5237 | */ |
5238 | Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx)))); |
5239 | |
5240 | /* OK, safe to fetch the array value */ |
5241 | arr = PG_GETARG_ARRAYTYPE_P(argidx); |
5242 | |
5243 | /* |
5244 | * And serialize the array. We tell array_to_text to ignore null |
5245 | * elements, which matches the behavior of the loop below. |
5246 | */ |
5247 | return array_to_text_internal(fcinfo, arr, sepstr, NULL); |
5248 | } |
5249 | |
5250 | /* Normal case without explicit VARIADIC marker */ |
5251 | initStringInfo(&str); |
5252 | |
5253 | /* Get output function info, building it if first time through */ |
5254 | foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra; |
5255 | if (foutcache == NULL) |
5256 | foutcache = build_concat_foutcache(fcinfo, argidx); |
5257 | |
5258 | for (i = argidx; i < PG_NARGS(); i++) |
5259 | { |
5260 | if (!PG_ARGISNULL(i)) |
5261 | { |
5262 | Datum value = PG_GETARG_DATUM(i); |
5263 | |
5264 | /* add separator if appropriate */ |
5265 | if (first_arg) |
5266 | first_arg = false; |
5267 | else |
5268 | appendStringInfoString(&str, sepstr); |
5269 | |
5270 | /* call the appropriate type output function, append the result */ |
5271 | appendStringInfoString(&str, |
5272 | OutputFunctionCall(&foutcache[i], value)); |
5273 | } |
5274 | } |
5275 | |
5276 | result = cstring_to_text_with_len(str.data, str.len); |
5277 | pfree(str.data); |
5278 | |
5279 | return result; |
5280 | } |
5281 | |
5282 | /* |
5283 | * Concatenate all arguments. NULL arguments are ignored. |
5284 | */ |
5285 | Datum |
5286 | text_concat(PG_FUNCTION_ARGS) |
5287 | { |
5288 | text *result; |
5289 | |
5290 | result = concat_internal("" , 0, fcinfo); |
5291 | if (result == NULL) |
5292 | PG_RETURN_NULL(); |
5293 | PG_RETURN_TEXT_P(result); |
5294 | } |
5295 | |
5296 | /* |
5297 | * Concatenate all but first argument value with separators. The first |
5298 | * parameter is used as the separator. NULL arguments are ignored. |
5299 | */ |
5300 | Datum |
5301 | text_concat_ws(PG_FUNCTION_ARGS) |
5302 | { |
5303 | char *sep; |
5304 | text *result; |
5305 | |
5306 | /* return NULL when separator is NULL */ |
5307 | if (PG_ARGISNULL(0)) |
5308 | PG_RETURN_NULL(); |
5309 | sep = text_to_cstring(PG_GETARG_TEXT_PP(0)); |
5310 | |
5311 | result = concat_internal(sep, 1, fcinfo); |
5312 | if (result == NULL) |
5313 | PG_RETURN_NULL(); |
5314 | PG_RETURN_TEXT_P(result); |
5315 | } |
5316 | |
5317 | /* |
5318 | * Return first n characters in the string. When n is negative, |
5319 | * return all but last |n| characters. |
5320 | */ |
5321 | Datum |
5322 | text_left(PG_FUNCTION_ARGS) |
5323 | { |
5324 | int n = PG_GETARG_INT32(1); |
5325 | |
5326 | if (n < 0) |
5327 | { |
5328 | text *str = PG_GETARG_TEXT_PP(0); |
5329 | const char *p = VARDATA_ANY(str); |
5330 | int len = VARSIZE_ANY_EXHDR(str); |
5331 | int rlen; |
5332 | |
5333 | n = pg_mbstrlen_with_len(p, len) + n; |
5334 | rlen = pg_mbcharcliplen(p, len, n); |
5335 | PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen)); |
5336 | } |
5337 | else |
5338 | PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false)); |
5339 | } |
5340 | |
5341 | /* |
5342 | * Return last n characters in the string. When n is negative, |
5343 | * return all but first |n| characters. |
5344 | */ |
5345 | Datum |
5346 | text_right(PG_FUNCTION_ARGS) |
5347 | { |
5348 | text *str = PG_GETARG_TEXT_PP(0); |
5349 | const char *p = VARDATA_ANY(str); |
5350 | int len = VARSIZE_ANY_EXHDR(str); |
5351 | int n = PG_GETARG_INT32(1); |
5352 | int off; |
5353 | |
5354 | if (n < 0) |
5355 | n = -n; |
5356 | else |
5357 | n = pg_mbstrlen_with_len(p, len) - n; |
5358 | off = pg_mbcharcliplen(p, len, n); |
5359 | |
5360 | PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off)); |
5361 | } |
5362 | |
5363 | /* |
5364 | * Return reversed string |
5365 | */ |
5366 | Datum |
5367 | text_reverse(PG_FUNCTION_ARGS) |
5368 | { |
5369 | text *str = PG_GETARG_TEXT_PP(0); |
5370 | const char *p = VARDATA_ANY(str); |
5371 | int len = VARSIZE_ANY_EXHDR(str); |
5372 | const char *endp = p + len; |
5373 | text *result; |
5374 | char *dst; |
5375 | |
5376 | result = palloc(len + VARHDRSZ); |
5377 | dst = (char *) VARDATA(result) + len; |
5378 | SET_VARSIZE(result, len + VARHDRSZ); |
5379 | |
5380 | if (pg_database_encoding_max_length() > 1) |
5381 | { |
5382 | /* multibyte version */ |
5383 | while (p < endp) |
5384 | { |
5385 | int sz; |
5386 | |
5387 | sz = pg_mblen(p); |
5388 | dst -= sz; |
5389 | memcpy(dst, p, sz); |
5390 | p += sz; |
5391 | } |
5392 | } |
5393 | else |
5394 | { |
5395 | /* single byte version */ |
5396 | while (p < endp) |
5397 | *(--dst) = *p++; |
5398 | } |
5399 | |
5400 | PG_RETURN_TEXT_P(result); |
5401 | } |
5402 | |
5403 | |
/*
 * Support macros for text_format()
 */

/* Bit in the "flags" bitmask of a format specifier. */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * ADVANCE_PARSE_POINTER - step "ptr" to the next character of the format
 * string.
 *
 * "ptr" is incremented in place, so it must be a modifiable lvalue.  If the
 * incremented pointer reaches or passes "end_ptr", an "unterminated format()
 * type specifier" error is raised; thus after a successful expansion at least
 * one character is available at *ptr.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5417 | |
5418 | /* |
5419 | * Returns a formatted string |
5420 | */ |
5421 | Datum |
5422 | text_format(PG_FUNCTION_ARGS) |
5423 | { |
5424 | text *fmt; |
5425 | StringInfoData str; |
5426 | const char *cp; |
5427 | const char *start_ptr; |
5428 | const char *end_ptr; |
5429 | text *result; |
5430 | int arg; |
5431 | bool funcvariadic; |
5432 | int nargs; |
5433 | Datum *elements = NULL; |
5434 | bool *nulls = NULL; |
5435 | Oid element_type = InvalidOid; |
5436 | Oid prev_type = InvalidOid; |
5437 | Oid prev_width_type = InvalidOid; |
5438 | FmgrInfo typoutputfinfo; |
5439 | FmgrInfo typoutputinfo_width; |
5440 | |
5441 | /* When format string is null, immediately return null */ |
5442 | if (PG_ARGISNULL(0)) |
5443 | PG_RETURN_NULL(); |
5444 | |
5445 | /* If argument is marked VARIADIC, expand array into elements */ |
5446 | if (get_fn_expr_variadic(fcinfo->flinfo)) |
5447 | { |
5448 | ArrayType *arr; |
5449 | int16 elmlen; |
5450 | bool elmbyval; |
5451 | char elmalign; |
5452 | int nitems; |
5453 | |
5454 | /* Should have just the one argument */ |
5455 | Assert(PG_NARGS() == 2); |
5456 | |
5457 | /* If argument is NULL, we treat it as zero-length array */ |
5458 | if (PG_ARGISNULL(1)) |
5459 | nitems = 0; |
5460 | else |
5461 | { |
5462 | /* |
5463 | * Non-null argument had better be an array. We assume that any |
5464 | * call context that could let get_fn_expr_variadic return true |
5465 | * will have checked that a VARIADIC-labeled parameter actually is |
5466 | * an array. So it should be okay to just Assert that it's an |
5467 | * array rather than doing a full-fledged error check. |
5468 | */ |
5469 | Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1)))); |
5470 | |
5471 | /* OK, safe to fetch the array value */ |
5472 | arr = PG_GETARG_ARRAYTYPE_P(1); |
5473 | |
5474 | /* Get info about array element type */ |
5475 | element_type = ARR_ELEMTYPE(arr); |
5476 | get_typlenbyvalalign(element_type, |
5477 | &elmlen, &elmbyval, &elmalign); |
5478 | |
5479 | /* Extract all array elements */ |
5480 | deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign, |
5481 | &elements, &nulls, &nitems); |
5482 | } |
5483 | |
5484 | nargs = nitems + 1; |
5485 | funcvariadic = true; |
5486 | } |
5487 | else |
5488 | { |
5489 | /* Non-variadic case, we'll process the arguments individually */ |
5490 | nargs = PG_NARGS(); |
5491 | funcvariadic = false; |
5492 | } |
5493 | |
5494 | /* Setup for main loop. */ |
5495 | fmt = PG_GETARG_TEXT_PP(0); |
5496 | start_ptr = VARDATA_ANY(fmt); |
5497 | end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt); |
5498 | initStringInfo(&str); |
5499 | arg = 1; /* next argument position to print */ |
5500 | |
5501 | /* Scan format string, looking for conversion specifiers. */ |
5502 | for (cp = start_ptr; cp < end_ptr; cp++) |
5503 | { |
5504 | int argpos; |
5505 | int widthpos; |
5506 | int flags; |
5507 | int width; |
5508 | Datum value; |
5509 | bool isNull; |
5510 | Oid typid; |
5511 | |
5512 | /* |
5513 | * If it's not the start of a conversion specifier, just copy it to |
5514 | * the output buffer. |
5515 | */ |
5516 | if (*cp != '%') |
5517 | { |
5518 | appendStringInfoCharMacro(&str, *cp); |
5519 | continue; |
5520 | } |
5521 | |
5522 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5523 | |
5524 | /* Easy case: %% outputs a single % */ |
5525 | if (*cp == '%') |
5526 | { |
5527 | appendStringInfoCharMacro(&str, *cp); |
5528 | continue; |
5529 | } |
5530 | |
5531 | /* Parse the optional portions of the format specifier */ |
5532 | cp = text_format_parse_format(cp, end_ptr, |
5533 | &argpos, &widthpos, |
5534 | &flags, &width); |
5535 | |
5536 | /* |
5537 | * Next we should see the main conversion specifier. Whether or not |
5538 | * an argument position was present, it's known that at least one |
5539 | * character remains in the string at this point. Experience suggests |
5540 | * that it's worth checking that that character is one of the expected |
5541 | * ones before we try to fetch arguments, so as to produce the least |
5542 | * confusing response to a mis-formatted specifier. |
5543 | */ |
5544 | if (strchr("sIL" , *cp) == NULL) |
5545 | ereport(ERROR, |
5546 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5547 | errmsg("unrecognized format() type specifier \"%c\"" , |
5548 | *cp), |
5549 | errhint("For a single \"%%\" use \"%%%%\"." ))); |
5550 | |
5551 | /* If indirect width was specified, get its value */ |
5552 | if (widthpos >= 0) |
5553 | { |
5554 | /* Collect the specified or next argument position */ |
5555 | if (widthpos > 0) |
5556 | arg = widthpos; |
5557 | if (arg >= nargs) |
5558 | ereport(ERROR, |
5559 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5560 | errmsg("too few arguments for format()" ))); |
5561 | |
5562 | /* Get the value and type of the selected argument */ |
5563 | if (!funcvariadic) |
5564 | { |
5565 | value = PG_GETARG_DATUM(arg); |
5566 | isNull = PG_ARGISNULL(arg); |
5567 | typid = get_fn_expr_argtype(fcinfo->flinfo, arg); |
5568 | } |
5569 | else |
5570 | { |
5571 | value = elements[arg - 1]; |
5572 | isNull = nulls[arg - 1]; |
5573 | typid = element_type; |
5574 | } |
5575 | if (!OidIsValid(typid)) |
5576 | elog(ERROR, "could not determine data type of format() input" ); |
5577 | |
5578 | arg++; |
5579 | |
5580 | /* We can treat NULL width the same as zero */ |
5581 | if (isNull) |
5582 | width = 0; |
5583 | else if (typid == INT4OID) |
5584 | width = DatumGetInt32(value); |
5585 | else if (typid == INT2OID) |
5586 | width = DatumGetInt16(value); |
5587 | else |
5588 | { |
5589 | /* For less-usual datatypes, convert to text then to int */ |
5590 | char *str; |
5591 | |
5592 | if (typid != prev_width_type) |
5593 | { |
5594 | Oid typoutputfunc; |
5595 | bool typIsVarlena; |
5596 | |
5597 | getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena); |
5598 | fmgr_info(typoutputfunc, &typoutputinfo_width); |
5599 | prev_width_type = typid; |
5600 | } |
5601 | |
5602 | str = OutputFunctionCall(&typoutputinfo_width, value); |
5603 | |
5604 | /* pg_strtoint32 will complain about bad data or overflow */ |
5605 | width = pg_strtoint32(str); |
5606 | |
5607 | pfree(str); |
5608 | } |
5609 | } |
5610 | |
5611 | /* Collect the specified or next argument position */ |
5612 | if (argpos > 0) |
5613 | arg = argpos; |
5614 | if (arg >= nargs) |
5615 | ereport(ERROR, |
5616 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5617 | errmsg("too few arguments for format()" ))); |
5618 | |
5619 | /* Get the value and type of the selected argument */ |
5620 | if (!funcvariadic) |
5621 | { |
5622 | value = PG_GETARG_DATUM(arg); |
5623 | isNull = PG_ARGISNULL(arg); |
5624 | typid = get_fn_expr_argtype(fcinfo->flinfo, arg); |
5625 | } |
5626 | else |
5627 | { |
5628 | value = elements[arg - 1]; |
5629 | isNull = nulls[arg - 1]; |
5630 | typid = element_type; |
5631 | } |
5632 | if (!OidIsValid(typid)) |
5633 | elog(ERROR, "could not determine data type of format() input" ); |
5634 | |
5635 | arg++; |
5636 | |
5637 | /* |
5638 | * Get the appropriate typOutput function, reusing previous one if |
5639 | * same type as previous argument. That's particularly useful in the |
5640 | * variadic-array case, but often saves work even for ordinary calls. |
5641 | */ |
5642 | if (typid != prev_type) |
5643 | { |
5644 | Oid typoutputfunc; |
5645 | bool typIsVarlena; |
5646 | |
5647 | getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena); |
5648 | fmgr_info(typoutputfunc, &typoutputfinfo); |
5649 | prev_type = typid; |
5650 | } |
5651 | |
5652 | /* |
5653 | * And now we can format the value. |
5654 | */ |
5655 | switch (*cp) |
5656 | { |
5657 | case 's': |
5658 | case 'I': |
5659 | case 'L': |
5660 | text_format_string_conversion(&str, *cp, &typoutputfinfo, |
5661 | value, isNull, |
5662 | flags, width); |
5663 | break; |
5664 | default: |
5665 | /* should not get here, because of previous check */ |
5666 | ereport(ERROR, |
5667 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5668 | errmsg("unrecognized format() type specifier \"%c\"" , |
5669 | *cp), |
5670 | errhint("For a single \"%%\" use \"%%%%\"." ))); |
5671 | break; |
5672 | } |
5673 | } |
5674 | |
5675 | /* Don't need deconstruct_array results anymore. */ |
5676 | if (elements != NULL) |
5677 | pfree(elements); |
5678 | if (nulls != NULL) |
5679 | pfree(nulls); |
5680 | |
5681 | /* Generate results. */ |
5682 | result = cstring_to_text_with_len(str.data, str.len); |
5683 | pfree(str.data); |
5684 | |
5685 | PG_RETURN_TEXT_P(result); |
5686 | } |
5687 | |
5688 | /* |
5689 | * Parse contiguous digits as a decimal number. |
5690 | * |
5691 | * Returns true if some digits could be parsed. |
5692 | * The value is returned into *value, and *ptr is advanced to the next |
5693 | * character to be parsed. |
5694 | * |
5695 | * Note parsing invariant: at least one character is known available before |
5696 | * string end (end_ptr) at entry, and this is still true at exit. |
5697 | */ |
5698 | static bool |
5699 | text_format_parse_digits(const char **ptr, const char *end_ptr, int *value) |
5700 | { |
5701 | bool found = false; |
5702 | const char *cp = *ptr; |
5703 | int val = 0; |
5704 | |
5705 | while (*cp >= '0' && *cp <= '9') |
5706 | { |
5707 | int8 digit = (*cp - '0'); |
5708 | |
5709 | if (unlikely(pg_mul_s32_overflow(val, 10, &val)) || |
5710 | unlikely(pg_add_s32_overflow(val, digit, &val))) |
5711 | ereport(ERROR, |
5712 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
5713 | errmsg("number is out of range" ))); |
5714 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5715 | found = true; |
5716 | } |
5717 | |
5718 | *ptr = cp; |
5719 | *value = val; |
5720 | |
5721 | return found; |
5722 | } |
5723 | |
5724 | /* |
5725 | * Parse a format specifier (generally following the SUS printf spec). |
5726 | * |
5727 | * We have already advanced over the initial '%', and we are looking for |
5728 | * [argpos][flags][width]type (but the type character is not consumed here). |
5729 | * |
5730 | * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1). |
5731 | * Output parameters: |
5732 | * argpos: argument position for value to be printed. -1 means unspecified. |
5733 | * widthpos: argument position for width. Zero means the argument position |
5734 | * was unspecified (ie, take the next arg) and -1 means no width |
5735 | * argument (width was omitted or specified as a constant). |
5736 | * flags: bitmask of flags. |
5737 | * width: directly-specified width value. Zero means the width was omitted |
5738 | * (note it's not necessary to distinguish this case from an explicit |
5739 | * zero width value). |
5740 | * |
5741 | * The function result is the next character position to be parsed, ie, the |
5742 | * location where the type character is/should be. |
5743 | * |
5744 | * Note parsing invariant: at least one character is known available before |
5745 | * string end (end_ptr) at entry, and this is still true at exit. |
5746 | */ |
5747 | static const char * |
5748 | text_format_parse_format(const char *start_ptr, const char *end_ptr, |
5749 | int *argpos, int *widthpos, |
5750 | int *flags, int *width) |
5751 | { |
5752 | const char *cp = start_ptr; |
5753 | int n; |
5754 | |
5755 | /* set defaults for output parameters */ |
5756 | *argpos = -1; |
5757 | *widthpos = -1; |
5758 | *flags = 0; |
5759 | *width = 0; |
5760 | |
5761 | /* try to identify first number */ |
5762 | if (text_format_parse_digits(&cp, end_ptr, &n)) |
5763 | { |
5764 | if (*cp != '$') |
5765 | { |
5766 | /* Must be just a width and a type, so we're done */ |
5767 | *width = n; |
5768 | return cp; |
5769 | } |
5770 | /* The number was argument position */ |
5771 | *argpos = n; |
5772 | /* Explicit 0 for argument index is immediately refused */ |
5773 | if (n == 0) |
5774 | ereport(ERROR, |
5775 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5776 | errmsg("format specifies argument 0, but arguments are numbered from 1" ))); |
5777 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5778 | } |
5779 | |
5780 | /* Handle flags (only minus is supported now) */ |
5781 | while (*cp == '-') |
5782 | { |
5783 | *flags |= TEXT_FORMAT_FLAG_MINUS; |
5784 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5785 | } |
5786 | |
5787 | if (*cp == '*') |
5788 | { |
5789 | /* Handle indirect width */ |
5790 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5791 | if (text_format_parse_digits(&cp, end_ptr, &n)) |
5792 | { |
5793 | /* number in this position must be closed by $ */ |
5794 | if (*cp != '$') |
5795 | ereport(ERROR, |
5796 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5797 | errmsg("width argument position must be ended by \"$\"" ))); |
5798 | /* The number was width argument position */ |
5799 | *widthpos = n; |
5800 | /* Explicit 0 for argument index is immediately refused */ |
5801 | if (n == 0) |
5802 | ereport(ERROR, |
5803 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
5804 | errmsg("format specifies argument 0, but arguments are numbered from 1" ))); |
5805 | ADVANCE_PARSE_POINTER(cp, end_ptr); |
5806 | } |
5807 | else |
5808 | *widthpos = 0; /* width's argument position is unspecified */ |
5809 | } |
5810 | else |
5811 | { |
5812 | /* Check for direct width specification */ |
5813 | if (text_format_parse_digits(&cp, end_ptr, &n)) |
5814 | *width = n; |
5815 | } |
5816 | |
5817 | /* cp should now be pointing at type character */ |
5818 | return cp; |
5819 | } |
5820 | |
5821 | /* |
5822 | * Format a %s, %I, or %L conversion |
5823 | */ |
5824 | static void |
5825 | text_format_string_conversion(StringInfo buf, char conversion, |
5826 | FmgrInfo *typOutputInfo, |
5827 | Datum value, bool isNull, |
5828 | int flags, int width) |
5829 | { |
5830 | char *str; |
5831 | |
5832 | /* Handle NULL arguments before trying to stringify the value. */ |
5833 | if (isNull) |
5834 | { |
5835 | if (conversion == 's') |
5836 | text_format_append_string(buf, "" , flags, width); |
5837 | else if (conversion == 'L') |
5838 | text_format_append_string(buf, "NULL" , flags, width); |
5839 | else if (conversion == 'I') |
5840 | ereport(ERROR, |
5841 | (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
5842 | errmsg("null values cannot be formatted as an SQL identifier" ))); |
5843 | return; |
5844 | } |
5845 | |
5846 | /* Stringify. */ |
5847 | str = OutputFunctionCall(typOutputInfo, value); |
5848 | |
5849 | /* Escape. */ |
5850 | if (conversion == 'I') |
5851 | { |
5852 | /* quote_identifier may or may not allocate a new string. */ |
5853 | text_format_append_string(buf, quote_identifier(str), flags, width); |
5854 | } |
5855 | else if (conversion == 'L') |
5856 | { |
5857 | char *qstr = quote_literal_cstr(str); |
5858 | |
5859 | text_format_append_string(buf, qstr, flags, width); |
5860 | /* quote_literal_cstr() always allocates a new string */ |
5861 | pfree(qstr); |
5862 | } |
5863 | else |
5864 | text_format_append_string(buf, str, flags, width); |
5865 | |
5866 | /* Cleanup. */ |
5867 | pfree(str); |
5868 | } |
5869 | |
5870 | /* |
5871 | * Append str to buf, padding as directed by flags/width |
5872 | */ |
5873 | static void |
5874 | text_format_append_string(StringInfo buf, const char *str, |
5875 | int flags, int width) |
5876 | { |
5877 | bool align_to_left = false; |
5878 | int len; |
5879 | |
5880 | /* fast path for typical easy case */ |
5881 | if (width == 0) |
5882 | { |
5883 | appendStringInfoString(buf, str); |
5884 | return; |
5885 | } |
5886 | |
5887 | if (width < 0) |
5888 | { |
5889 | /* Negative width: implicit '-' flag, then take absolute value */ |
5890 | align_to_left = true; |
5891 | /* -INT_MIN is undefined */ |
5892 | if (width <= INT_MIN) |
5893 | ereport(ERROR, |
5894 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
5895 | errmsg("number is out of range" ))); |
5896 | width = -width; |
5897 | } |
5898 | else if (flags & TEXT_FORMAT_FLAG_MINUS) |
5899 | align_to_left = true; |
5900 | |
5901 | len = pg_mbstrlen(str); |
5902 | if (align_to_left) |
5903 | { |
5904 | /* left justify */ |
5905 | appendStringInfoString(buf, str); |
5906 | if (len < width) |
5907 | appendStringInfoSpaces(buf, width - len); |
5908 | } |
5909 | else |
5910 | { |
5911 | /* right justify */ |
5912 | if (len < width) |
5913 | appendStringInfoSpaces(buf, width - len); |
5914 | appendStringInfoString(buf, str); |
5915 | } |
5916 | } |
5917 | |
5918 | /* |
5919 | * text_format_nv - nonvariadic wrapper for text_format function. |
5920 | * |
5921 | * note: this wrapper is necessary to pass the sanity check in opr_sanity, |
5922 | * which checks that all built-in functions that share the implementing C |
5923 | * function take the same number of arguments. |
5924 | */ |
5925 | Datum |
5926 | text_format_nv(PG_FUNCTION_ARGS) |
5927 | { |
5928 | return text_format(fcinfo); |
5929 | } |
5930 | |
/*
 * rest_of_char_same - do the first "len" bytes of s1 and s2 match?
 *
 * Helper function for Levenshtein distance functions.  Faster than memcmp(),
 * for this use case.  Bytes are compared from the last one backwards; a
 * "len" of zero or less compares nothing and reports a match.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5946 | |
5947 | /* Expand each Levenshtein distance variant */ |
5948 | #include "levenshtein.c" |
5949 | #define LEVENSHTEIN_LESS_EQUAL |
5950 | #include "levenshtein.c" |
5951 | |