1/*-------------------------------------------------------------------------
2 *
3 * like_support.c
4 * Planner support functions for LIKE, regex, and related operators.
5 *
6 * These routines handle special optimization of operators that can be
7 * used with index scans even though they are not known to the executor's
8 * indexscan machinery. The key idea is that these operators allow us
9 * to derive approximate indexscan qual clauses, such that any tuples
10 * that pass the operator clause itself must also satisfy the simpler
11 * indexscan condition(s). Then we can use the indexscan machinery
12 * to avoid scanning as much of the table as we'd otherwise have to,
13 * while applying the original operator as a qpqual condition to ensure
14 * we deliver only the tuples we want. (In essence, we're using a regular
15 * index as if it were a lossy index.)
16 *
17 * An example of what we're doing is
18 * textfield LIKE 'abc%def'
19 * from which we can generate the indexscanable conditions
20 * textfield >= 'abc' AND textfield < 'abd'
21 * which allow efficient scanning of an index on textfield.
22 * (In reality, character set and collation issues make the transformation
23 * from LIKE to indexscan limits rather harder than one might think ...
24 * but that's the basic idea.)
25 *
26 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
27 * Portions Copyright (c) 1994, Regents of the University of California
28 *
29 *
30 * IDENTIFICATION
31 * src/backend/utils/adt/like_support.c
32 *
33 *-------------------------------------------------------------------------
34 */
35#include "postgres.h"
36
37#include <math.h>
38
39#include "access/htup_details.h"
40#include "access/stratnum.h"
41#include "catalog/pg_collation.h"
42#include "catalog/pg_opfamily.h"
43#include "catalog/pg_statistic.h"
44#include "catalog/pg_type.h"
45#include "mb/pg_wchar.h"
46#include "nodes/makefuncs.h"
47#include "nodes/nodeFuncs.h"
48#include "nodes/supportnodes.h"
49#include "utils/builtins.h"
50#include "utils/datum.h"
51#include "utils/lsyscache.h"
52#include "utils/pg_locale.h"
53#include "utils/selfuncs.h"
54#include "utils/varlena.h"
55
56
57typedef enum
58{
59 Pattern_Type_Like,
60 Pattern_Type_Like_IC,
61 Pattern_Type_Regex,
62 Pattern_Type_Regex_IC,
63 Pattern_Type_Prefix
64} Pattern_Type;
65
66typedef enum
67{
68 Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
69} Pattern_Prefix_Status;
70
71static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
72static List *match_pattern_prefix(Node *leftop,
73 Node *rightop,
74 Pattern_Type ptype,
75 Oid expr_coll,
76 Oid opfamily,
77 Oid indexcollation);
78static double patternsel_common(PlannerInfo *root,
79 Oid oprid,
80 Oid opfuncid,
81 List *args,
82 int varRelid,
83 Oid collation,
84 Pattern_Type ptype,
85 bool negate);
86static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
87 Pattern_Type ptype,
88 Oid collation,
89 Const **prefix,
90 Selectivity *rest_selec);
91static Selectivity prefix_selectivity(PlannerInfo *root,
92 VariableStatData *vardata,
93 Oid vartype, Oid opfamily, Const *prefixcon);
94static Selectivity like_selectivity(const char *patt, int pattlen,
95 bool case_insensitive);
96static Selectivity regex_selectivity(const char *patt, int pattlen,
97 bool case_insensitive,
98 int fixed_prefix_len);
99static int pattern_char_isalpha(char c, bool is_multibyte,
100 pg_locale_t locale, bool locale_is_c);
101static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
102 Oid collation);
103static Datum string_to_datum(const char *str, Oid datatype);
104static Const *string_to_const(const char *str, Oid datatype);
105static Const *string_to_bytea_const(const char *str, size_t str_len);
106
107
108/*
109 * Planner support functions for LIKE, regex, and related operators
110 */
111Datum
112textlike_support(PG_FUNCTION_ARGS)
113{
114 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
115
116 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
117}
118
119Datum
120texticlike_support(PG_FUNCTION_ARGS)
121{
122 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
123
124 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
125}
126
127Datum
128textregexeq_support(PG_FUNCTION_ARGS)
129{
130 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
131
132 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
133}
134
135Datum
136texticregexeq_support(PG_FUNCTION_ARGS)
137{
138 Node *rawreq = (Node *) PG_GETARG_POINTER(0);
139
140 PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
141}
142
143/* Common code for the above */
144static Node *
145like_regex_support(Node *rawreq, Pattern_Type ptype)
146{
147 Node *ret = NULL;
148
149 if (IsA(rawreq, SupportRequestSelectivity))
150 {
151 /*
152 * Make a selectivity estimate for a function call, just as we'd do if
153 * the call was via the corresponding operator.
154 */
155 SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
156 Selectivity s1;
157
158 if (req->is_join)
159 {
160 /*
161 * For the moment we just punt. If patternjoinsel is ever
162 * improved to do better, this should be made to call it.
163 */
164 s1 = DEFAULT_MATCH_SEL;
165 }
166 else
167 {
168 /* Share code with operator restriction selectivity functions */
169 s1 = patternsel_common(req->root,
170 InvalidOid,
171 req->funcid,
172 req->args,
173 req->varRelid,
174 req->inputcollid,
175 ptype,
176 false);
177 }
178 req->selectivity = s1;
179 ret = (Node *) req;
180 }
181 else if (IsA(rawreq, SupportRequestIndexCondition))
182 {
183 /* Try to convert operator/function call to index conditions */
184 SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
185
186 /*
187 * Currently we have no "reverse" match operators with the pattern on
188 * the left, so we only need consider cases with the indexkey on the
189 * left.
190 */
191 if (req->indexarg != 0)
192 return NULL;
193
194 if (is_opclause(req->node))
195 {
196 OpExpr *clause = (OpExpr *) req->node;
197
198 Assert(list_length(clause->args) == 2);
199 ret = (Node *)
200 match_pattern_prefix((Node *) linitial(clause->args),
201 (Node *) lsecond(clause->args),
202 ptype,
203 clause->inputcollid,
204 req->opfamily,
205 req->indexcollation);
206 }
207 else if (is_funcclause(req->node)) /* be paranoid */
208 {
209 FuncExpr *clause = (FuncExpr *) req->node;
210
211 Assert(list_length(clause->args) == 2);
212 ret = (Node *)
213 match_pattern_prefix((Node *) linitial(clause->args),
214 (Node *) lsecond(clause->args),
215 ptype,
216 clause->inputcollid,
217 req->opfamily,
218 req->indexcollation);
219 }
220 }
221
222 return ret;
223}
224
225/*
226 * match_pattern_prefix
227 * Try to generate an indexqual for a LIKE or regex operator.
228 */
229static List *
230match_pattern_prefix(Node *leftop,
231 Node *rightop,
232 Pattern_Type ptype,
233 Oid expr_coll,
234 Oid opfamily,
235 Oid indexcollation)
236{
237 List *result;
238 Const *patt;
239 Const *prefix;
240 Pattern_Prefix_Status pstatus;
241 Oid ldatatype;
242 Oid rdatatype;
243 Oid oproid;
244 Expr *expr;
245 FmgrInfo ltproc;
246 Const *greaterstr;
247
248 /*
249 * Can't do anything with a non-constant or NULL pattern argument.
250 *
251 * Note that since we restrict ourselves to cases with a hard constant on
252 * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
253 * about verifying that.
254 */
255 if (!IsA(rightop, Const) ||
256 ((Const *) rightop)->constisnull)
257 return NIL;
258 patt = (Const *) rightop;
259
260 /*
261 * Not supported if the expression collation is nondeterministic. The
262 * optimized equality or prefix tests use bytewise comparisons, which is
263 * not consistent with nondeterministic collations. The actual
264 * pattern-matching implementation functions will later error out that
265 * pattern-matching is not supported with nondeterministic collations. (We
266 * could also error out here, but by doing it later we get more precise
267 * error messages.) (It should be possible to support at least
268 * Pattern_Prefix_Exact, but no point as along as the actual
269 * pattern-matching implementations don't support it.)
270 *
271 * expr_coll is not set for a non-collation-aware data type such as bytea.
272 */
273 if (expr_coll && !get_collation_isdeterministic(expr_coll))
274 return NIL;
275
276 /*
277 * Try to extract a fixed prefix from the pattern.
278 */
279 pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
280 &prefix, NULL);
281
282 /* fail if no fixed prefix */
283 if (pstatus == Pattern_Prefix_None)
284 return NIL;
285
286 /*
287 * Must also check that index's opfamily supports the operators we will
288 * want to apply. (A hash index, for example, will not support ">=".)
289 * Currently, only btree and spgist support the operators we need.
290 *
291 * Note: actually, in the Pattern_Prefix_Exact case, we only need "=" so a
292 * hash index would work. Currently it doesn't seem worth checking for
293 * that, however.
294 *
295 * We insist on the opfamily being one of the specific ones we expect,
296 * else we'd do the wrong thing if someone were to make a reverse-sort
297 * opfamily with the same operators.
298 *
299 * The non-pattern opclasses will not sort the way we need in most non-C
300 * locales. We can use such an index anyway for an exact match (simple
301 * equality), but not for prefix-match cases. Note that here we are
302 * looking at the index's collation, not the expression's collation --
303 * this test is *not* dependent on the LIKE/regex operator's collation.
304 *
305 * While we're at it, identify the type the comparison constant(s) should
306 * have, based on the opfamily.
307 */
308 switch (opfamily)
309 {
310 case TEXT_BTREE_FAM_OID:
311 if (!(pstatus == Pattern_Prefix_Exact ||
312 lc_collate_is_c(indexcollation)))
313 return NIL;
314 rdatatype = TEXTOID;
315 break;
316
317 case TEXT_PATTERN_BTREE_FAM_OID:
318 case TEXT_SPGIST_FAM_OID:
319 rdatatype = TEXTOID;
320 break;
321
322 case BPCHAR_BTREE_FAM_OID:
323 if (!(pstatus == Pattern_Prefix_Exact ||
324 lc_collate_is_c(indexcollation)))
325 return NIL;
326 rdatatype = BPCHAROID;
327 break;
328
329 case BPCHAR_PATTERN_BTREE_FAM_OID:
330 rdatatype = BPCHAROID;
331 break;
332
333 case BYTEA_BTREE_FAM_OID:
334 rdatatype = BYTEAOID;
335 break;
336
337 default:
338 return NIL;
339 }
340
341 /* OK, prepare to create the indexqual(s) */
342 ldatatype = exprType(leftop);
343
344 /*
345 * If necessary, coerce the prefix constant to the right type. The given
346 * prefix constant is either text or bytea type, therefore the only case
347 * where we need to do anything is when converting text to bpchar. Those
348 * two types are binary-compatible, so relabeling the Const node is
349 * sufficient.
350 */
351 if (prefix->consttype != rdatatype)
352 {
353 Assert(prefix->consttype == TEXTOID &&
354 rdatatype == BPCHAROID);
355 prefix->consttype = rdatatype;
356 }
357
358 /*
359 * If we found an exact-match pattern, generate an "=" indexqual.
360 */
361 if (pstatus == Pattern_Prefix_Exact)
362 {
363 oproid = get_opfamily_member(opfamily, ldatatype, rdatatype,
364 BTEqualStrategyNumber);
365 if (oproid == InvalidOid)
366 elog(ERROR, "no = operator for opfamily %u", opfamily);
367 expr = make_opclause(oproid, BOOLOID, false,
368 (Expr *) leftop, (Expr *) prefix,
369 InvalidOid, indexcollation);
370 result = list_make1(expr);
371 return result;
372 }
373
374 /*
375 * Otherwise, we have a nonempty required prefix of the values.
376 *
377 * We can always say "x >= prefix".
378 */
379 oproid = get_opfamily_member(opfamily, ldatatype, rdatatype,
380 BTGreaterEqualStrategyNumber);
381 if (oproid == InvalidOid)
382 elog(ERROR, "no >= operator for opfamily %u", opfamily);
383 expr = make_opclause(oproid, BOOLOID, false,
384 (Expr *) leftop, (Expr *) prefix,
385 InvalidOid, indexcollation);
386 result = list_make1(expr);
387
388 /*-------
389 * If we can create a string larger than the prefix, we can say
390 * "x < greaterstr". NB: we rely on make_greater_string() to generate
391 * a guaranteed-greater string, not just a probably-greater string.
392 * In general this is only guaranteed in C locale, so we'd better be
393 * using a C-locale index collation.
394 *-------
395 */
396 oproid = get_opfamily_member(opfamily, ldatatype, rdatatype,
397 BTLessStrategyNumber);
398 if (oproid == InvalidOid)
399 elog(ERROR, "no < operator for opfamily %u", opfamily);
400 fmgr_info(get_opcode(oproid), &ltproc);
401 greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
402 if (greaterstr)
403 {
404 expr = make_opclause(oproid, BOOLOID, false,
405 (Expr *) leftop, (Expr *) greaterstr,
406 InvalidOid, indexcollation);
407 result = lappend(result, expr);
408 }
409
410 return result;
411}
412
413
414/*
415 * patternsel_common - generic code for pattern-match restriction selectivity.
416 *
417 * To support using this from either the operator or function paths, caller
418 * may pass either operator OID or underlying function OID; we look up the
419 * latter from the former if needed. (We could just have patternsel() call
420 * get_opcode(), but the work would be wasted if we don't have a need to
421 * compare a fixed prefix to the pg_statistic data.)
422 *
423 * Note that oprid and/or opfuncid should be for the positive-match operator
424 * even when negate is true.
425 */
426static double
427patternsel_common(PlannerInfo *root,
428 Oid oprid,
429 Oid opfuncid,
430 List *args,
431 int varRelid,
432 Oid collation,
433 Pattern_Type ptype,
434 bool negate)
435{
436 VariableStatData vardata;
437 Node *other;
438 bool varonleft;
439 Datum constval;
440 Oid consttype;
441 Oid vartype;
442 Oid opfamily;
443 Pattern_Prefix_Status pstatus;
444 Const *patt;
445 Const *prefix = NULL;
446 Selectivity rest_selec = 0;
447 double nullfrac = 0.0;
448 double result;
449
450 /*
451 * Initialize result to the appropriate default estimate depending on
452 * whether it's a match or not-match operator.
453 */
454 if (negate)
455 result = 1.0 - DEFAULT_MATCH_SEL;
456 else
457 result = DEFAULT_MATCH_SEL;
458
459 /*
460 * If expression is not variable op constant, then punt and return the
461 * default estimate.
462 */
463 if (!get_restriction_variable(root, args, varRelid,
464 &vardata, &other, &varonleft))
465 return result;
466 if (!varonleft || !IsA(other, Const))
467 {
468 ReleaseVariableStats(vardata);
469 return result;
470 }
471
472 /*
473 * If the constant is NULL, assume operator is strict and return zero, ie,
474 * operator will never return TRUE. (It's zero even for a negator op.)
475 */
476 if (((Const *) other)->constisnull)
477 {
478 ReleaseVariableStats(vardata);
479 return 0.0;
480 }
481 constval = ((Const *) other)->constvalue;
482 consttype = ((Const *) other)->consttype;
483
484 /*
485 * The right-hand const is type text or bytea for all supported operators.
486 * We do not expect to see binary-compatible types here, since
487 * const-folding should have relabeled the const to exactly match the
488 * operator's declared type.
489 */
490 if (consttype != TEXTOID && consttype != BYTEAOID)
491 {
492 ReleaseVariableStats(vardata);
493 return result;
494 }
495
496 /*
497 * Similarly, the exposed type of the left-hand side should be one of
498 * those we know. (Do not look at vardata.atttype, which might be
499 * something binary-compatible but different.) We can use it to choose
500 * the index opfamily from which we must draw the comparison operators.
501 *
502 * NOTE: It would be more correct to use the PATTERN opfamilies than the
503 * simple ones, but at the moment ANALYZE will not generate statistics for
504 * the PATTERN operators. But our results are so approximate anyway that
505 * it probably hardly matters.
506 */
507 vartype = vardata.vartype;
508
509 switch (vartype)
510 {
511 case TEXTOID:
512 case NAMEOID:
513 opfamily = TEXT_BTREE_FAM_OID;
514 break;
515 case BPCHAROID:
516 opfamily = BPCHAR_BTREE_FAM_OID;
517 break;
518 case BYTEAOID:
519 opfamily = BYTEA_BTREE_FAM_OID;
520 break;
521 default:
522 ReleaseVariableStats(vardata);
523 return result;
524 }
525
526 /*
527 * Grab the nullfrac for use below.
528 */
529 if (HeapTupleIsValid(vardata.statsTuple))
530 {
531 Form_pg_statistic stats;
532
533 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
534 nullfrac = stats->stanullfrac;
535 }
536
537 /*
538 * Pull out any fixed prefix implied by the pattern, and estimate the
539 * fractional selectivity of the remainder of the pattern. Unlike many
540 * other selectivity estimators, we use the pattern operator's actual
541 * collation for this step. This is not because we expect the collation
542 * to make a big difference in the selectivity estimate (it seldom would),
543 * but because we want to be sure we cache compiled regexps under the
544 * right cache key, so that they can be re-used at runtime.
545 */
546 patt = (Const *) other;
547 pstatus = pattern_fixed_prefix(patt, ptype, collation,
548 &prefix, &rest_selec);
549
550 /*
551 * If necessary, coerce the prefix constant to the right type.
552 */
553 if (prefix && prefix->consttype != vartype)
554 {
555 char *prefixstr;
556
557 switch (prefix->consttype)
558 {
559 case TEXTOID:
560 prefixstr = TextDatumGetCString(prefix->constvalue);
561 break;
562 case BYTEAOID:
563 prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
564 prefix->constvalue));
565 break;
566 default:
567 elog(ERROR, "unrecognized consttype: %u",
568 prefix->consttype);
569 ReleaseVariableStats(vardata);
570 return result;
571 }
572 prefix = string_to_const(prefixstr, vartype);
573 pfree(prefixstr);
574 }
575
576 if (pstatus == Pattern_Prefix_Exact)
577 {
578 /*
579 * Pattern specifies an exact match, so pretend operator is '='
580 */
581 Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
582 BTEqualStrategyNumber);
583
584 if (eqopr == InvalidOid)
585 elog(ERROR, "no = operator for opfamily %u", opfamily);
586 result = var_eq_const(&vardata, eqopr, prefix->constvalue,
587 false, true, false);
588 }
589 else
590 {
591 /*
592 * Not exact-match pattern. If we have a sufficiently large
593 * histogram, estimate selectivity for the histogram part of the
594 * population by counting matches in the histogram. If not, estimate
595 * selectivity of the fixed prefix and remainder of pattern
596 * separately, then combine the two to get an estimate of the
597 * selectivity for the part of the column population represented by
598 * the histogram. (For small histograms, we combine these
599 * approaches.)
600 *
601 * We then add up data for any most-common-values values; these are
602 * not in the histogram population, and we can get exact answers for
603 * them by applying the pattern operator, so there's no reason to
604 * approximate. (If the MCVs cover a significant part of the total
605 * population, this gives us a big leg up in accuracy.)
606 */
607 Selectivity selec;
608 int hist_size;
609 FmgrInfo opproc;
610 double mcv_selec,
611 sumcommon;
612
613 /* Try to use the histogram entries to get selectivity */
614 if (!OidIsValid(opfuncid))
615 opfuncid = get_opcode(oprid);
616 fmgr_info(opfuncid, &opproc);
617
618 selec = histogram_selectivity(&vardata, &opproc, constval, true,
619 10, 1, &hist_size);
620
621 /* If not at least 100 entries, use the heuristic method */
622 if (hist_size < 100)
623 {
624 Selectivity heursel;
625 Selectivity prefixsel;
626
627 if (pstatus == Pattern_Prefix_Partial)
628 prefixsel = prefix_selectivity(root, &vardata, vartype,
629 opfamily, prefix);
630 else
631 prefixsel = 1.0;
632 heursel = prefixsel * rest_selec;
633
634 if (selec < 0) /* fewer than 10 histogram entries? */
635 selec = heursel;
636 else
637 {
638 /*
639 * For histogram sizes from 10 to 100, we combine the
640 * histogram and heuristic selectivities, putting increasingly
641 * more trust in the histogram for larger sizes.
642 */
643 double hist_weight = hist_size / 100.0;
644
645 selec = selec * hist_weight + heursel * (1.0 - hist_weight);
646 }
647 }
648
649 /* In any case, don't believe extremely small or large estimates. */
650 if (selec < 0.0001)
651 selec = 0.0001;
652 else if (selec > 0.9999)
653 selec = 0.9999;
654
655 /*
656 * If we have most-common-values info, add up the fractions of the MCV
657 * entries that satisfy MCV OP PATTERN. These fractions contribute
658 * directly to the result selectivity. Also add up the total fraction
659 * represented by MCV entries.
660 */
661 mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
662 &sumcommon);
663
664 /*
665 * Now merge the results from the MCV and histogram calculations,
666 * realizing that the histogram covers only the non-null values that
667 * are not listed in MCV.
668 */
669 selec *= 1.0 - nullfrac - sumcommon;
670 selec += mcv_selec;
671 result = selec;
672 }
673
674 /* now adjust if we wanted not-match rather than match */
675 if (negate)
676 result = 1.0 - result - nullfrac;
677
678 /* result should be in range, but make sure... */
679 CLAMP_PROBABILITY(result);
680
681 if (prefix)
682 {
683 pfree(DatumGetPointer(prefix->constvalue));
684 pfree(prefix);
685 }
686
687 ReleaseVariableStats(vardata);
688
689 return result;
690}
691
692/*
693 * Fix impedance mismatch between SQL-callable functions and patternsel_common
694 */
695static double
696patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
697{
698 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
699 Oid operator = PG_GETARG_OID(1);
700 List *args = (List *) PG_GETARG_POINTER(2);
701 int varRelid = PG_GETARG_INT32(3);
702 Oid collation = PG_GET_COLLATION();
703
704 /*
705 * If this is for a NOT LIKE or similar operator, get the corresponding
706 * positive-match operator and work with that.
707 */
708 if (negate)
709 {
710 operator = get_negator(operator);
711 if (!OidIsValid(operator))
712 elog(ERROR, "patternsel called for operator without a negator");
713 }
714
715 return patternsel_common(root,
716 operator,
717 InvalidOid,
718 args,
719 varRelid,
720 collation,
721 ptype,
722 negate);
723}
724
725/*
726 * regexeqsel - Selectivity of regular-expression pattern match.
727 */
728Datum
729regexeqsel(PG_FUNCTION_ARGS)
730{
731 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
732}
733
734/*
735 * icregexeqsel - Selectivity of case-insensitive regex match.
736 */
737Datum
738icregexeqsel(PG_FUNCTION_ARGS)
739{
740 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
741}
742
743/*
744 * likesel - Selectivity of LIKE pattern match.
745 */
746Datum
747likesel(PG_FUNCTION_ARGS)
748{
749 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
750}
751
752/*
753 * prefixsel - selectivity of prefix operator
754 */
755Datum
756prefixsel(PG_FUNCTION_ARGS)
757{
758 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
759}
760
761/*
762 *
763 * iclikesel - Selectivity of ILIKE pattern match.
764 */
765Datum
766iclikesel(PG_FUNCTION_ARGS)
767{
768 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
769}
770
771/*
772 * regexnesel - Selectivity of regular-expression pattern non-match.
773 */
774Datum
775regexnesel(PG_FUNCTION_ARGS)
776{
777 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
778}
779
780/*
781 * icregexnesel - Selectivity of case-insensitive regex non-match.
782 */
783Datum
784icregexnesel(PG_FUNCTION_ARGS)
785{
786 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
787}
788
789/*
790 * nlikesel - Selectivity of LIKE pattern non-match.
791 */
792Datum
793nlikesel(PG_FUNCTION_ARGS)
794{
795 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
796}
797
798/*
799 * icnlikesel - Selectivity of ILIKE pattern non-match.
800 */
801Datum
802icnlikesel(PG_FUNCTION_ARGS)
803{
804 PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
805}
806
807/*
808 * patternjoinsel - Generic code for pattern-match join selectivity.
809 */
810static double
811patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
812{
813 /* For the moment we just punt. */
814 return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
815}
816
817/*
818 * regexeqjoinsel - Join selectivity of regular-expression pattern match.
819 */
820Datum
821regexeqjoinsel(PG_FUNCTION_ARGS)
822{
823 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
824}
825
826/*
827 * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
828 */
829Datum
830icregexeqjoinsel(PG_FUNCTION_ARGS)
831{
832 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
833}
834
835/*
836 * likejoinsel - Join selectivity of LIKE pattern match.
837 */
838Datum
839likejoinsel(PG_FUNCTION_ARGS)
840{
841 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
842}
843
844/*
845 * prefixjoinsel - Join selectivity of prefix operator
846 */
847Datum
848prefixjoinsel(PG_FUNCTION_ARGS)
849{
850 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
851}
852
853/*
854 * iclikejoinsel - Join selectivity of ILIKE pattern match.
855 */
856Datum
857iclikejoinsel(PG_FUNCTION_ARGS)
858{
859 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
860}
861
862/*
863 * regexnejoinsel - Join selectivity of regex non-match.
864 */
865Datum
866regexnejoinsel(PG_FUNCTION_ARGS)
867{
868 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
869}
870
871/*
872 * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
873 */
874Datum
875icregexnejoinsel(PG_FUNCTION_ARGS)
876{
877 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
878}
879
880/*
881 * nlikejoinsel - Join selectivity of LIKE pattern non-match.
882 */
883Datum
884nlikejoinsel(PG_FUNCTION_ARGS)
885{
886 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
887}
888
889/*
890 * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
891 */
892Datum
893icnlikejoinsel(PG_FUNCTION_ARGS)
894{
895 PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
896}
897
898
899/*-------------------------------------------------------------------------
900 *
901 * Pattern analysis functions
902 *
903 * These routines support analysis of LIKE and regular-expression patterns
904 * by the planner/optimizer. It's important that they agree with the
905 * regular-expression code in backend/regex/ and the LIKE code in
906 * backend/utils/adt/like.c. Also, the computation of the fixed prefix
907 * must be conservative: if we report a string longer than the true fixed
908 * prefix, the query may produce actually wrong answers, rather than just
909 * getting a bad selectivity estimate!
910 *
911 *-------------------------------------------------------------------------
912 */
913
914/*
915 * Extract the fixed prefix, if any, for a pattern.
916 *
917 * *prefix is set to a palloc'd prefix string (in the form of a Const node),
918 * or to NULL if no fixed prefix exists for the pattern.
919 * If rest_selec is not NULL, *rest_selec is set to an estimate of the
920 * selectivity of the remainder of the pattern (without any fixed prefix).
921 * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
922 *
923 * The return value distinguishes no fixed prefix, a partial prefix,
924 * or an exact-match-only pattern.
925 */
926
927static Pattern_Prefix_Status
928like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
929 Const **prefix_const, Selectivity *rest_selec)
930{
931 char *match;
932 char *patt;
933 int pattlen;
934 Oid typeid = patt_const->consttype;
935 int pos,
936 match_pos;
937 bool is_multibyte = (pg_database_encoding_max_length() > 1);
938 pg_locale_t locale = 0;
939 bool locale_is_c = false;
940
941 /* the right-hand const is type text or bytea */
942 Assert(typeid == BYTEAOID || typeid == TEXTOID);
943
944 if (case_insensitive)
945 {
946 if (typeid == BYTEAOID)
947 ereport(ERROR,
948 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
949 errmsg("case insensitive matching not supported on type bytea")));
950
951 /* If case-insensitive, we need locale info */
952 if (lc_ctype_is_c(collation))
953 locale_is_c = true;
954 else if (collation != DEFAULT_COLLATION_OID)
955 {
956 if (!OidIsValid(collation))
957 {
958 /*
959 * This typically means that the parser could not resolve a
960 * conflict of implicit collations, so report it that way.
961 */
962 ereport(ERROR,
963 (errcode(ERRCODE_INDETERMINATE_COLLATION),
964 errmsg("could not determine which collation to use for ILIKE"),
965 errhint("Use the COLLATE clause to set the collation explicitly.")));
966 }
967 locale = pg_newlocale_from_collation(collation);
968 }
969 }
970
971 if (typeid != BYTEAOID)
972 {
973 patt = TextDatumGetCString(patt_const->constvalue);
974 pattlen = strlen(patt);
975 }
976 else
977 {
978 bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
979
980 pattlen = VARSIZE_ANY_EXHDR(bstr);
981 patt = (char *) palloc(pattlen);
982 memcpy(patt, VARDATA_ANY(bstr), pattlen);
983 Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
984 }
985
986 match = palloc(pattlen + 1);
987 match_pos = 0;
988 for (pos = 0; pos < pattlen; pos++)
989 {
990 /* % and _ are wildcard characters in LIKE */
991 if (patt[pos] == '%' ||
992 patt[pos] == '_')
993 break;
994
995 /* Backslash escapes the next character */
996 if (patt[pos] == '\\')
997 {
998 pos++;
999 if (pos >= pattlen)
1000 break;
1001 }
1002
1003 /* Stop if case-varying character (it's sort of a wildcard) */
1004 if (case_insensitive &&
1005 pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1006 break;
1007
1008 match[match_pos++] = patt[pos];
1009 }
1010
1011 match[match_pos] = '\0';
1012
1013 if (typeid != BYTEAOID)
1014 *prefix_const = string_to_const(match, typeid);
1015 else
1016 *prefix_const = string_to_bytea_const(match, match_pos);
1017
1018 if (rest_selec != NULL)
1019 *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1020 case_insensitive);
1021
1022 pfree(patt);
1023 pfree(match);
1024
1025 /* in LIKE, an empty pattern is an exact match! */
1026 if (pos == pattlen)
1027 return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1028
1029 if (match_pos > 0)
1030 return Pattern_Prefix_Partial;
1031
1032 return Pattern_Prefix_None;
1033}
1034
1035static Pattern_Prefix_Status
1036regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1037 Const **prefix_const, Selectivity *rest_selec)
1038{
1039 Oid typeid = patt_const->consttype;
1040 char *prefix;
1041 bool exact;
1042
1043 /*
1044 * Should be unnecessary, there are no bytea regex operators defined. As
1045 * such, it should be noted that the rest of this function has *not* been
1046 * made safe for binary (possibly NULL containing) strings.
1047 */
1048 if (typeid == BYTEAOID)
1049 ereport(ERROR,
1050 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1051 errmsg("regular-expression matching not supported on type bytea")));
1052
1053 /* Use the regexp machinery to extract the prefix, if any */
1054 prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1055 case_insensitive, collation,
1056 &exact);
1057
1058 if (prefix == NULL)
1059 {
1060 *prefix_const = NULL;
1061
1062 if (rest_selec != NULL)
1063 {
1064 char *patt = TextDatumGetCString(patt_const->constvalue);
1065
1066 *rest_selec = regex_selectivity(patt, strlen(patt),
1067 case_insensitive,
1068 0);
1069 pfree(patt);
1070 }
1071
1072 return Pattern_Prefix_None;
1073 }
1074
1075 *prefix_const = string_to_const(prefix, typeid);
1076
1077 if (rest_selec != NULL)
1078 {
1079 if (exact)
1080 {
1081 /* Exact match, so there's no additional selectivity */
1082 *rest_selec = 1.0;
1083 }
1084 else
1085 {
1086 char *patt = TextDatumGetCString(patt_const->constvalue);
1087
1088 *rest_selec = regex_selectivity(patt, strlen(patt),
1089 case_insensitive,
1090 strlen(prefix));
1091 pfree(patt);
1092 }
1093 }
1094
1095 pfree(prefix);
1096
1097 if (exact)
1098 return Pattern_Prefix_Exact; /* pattern specifies exact match */
1099 else
1100 return Pattern_Prefix_Partial;
1101}
1102
1103static Pattern_Prefix_Status
1104pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1105 Const **prefix, Selectivity *rest_selec)
1106{
1107 Pattern_Prefix_Status result;
1108
1109 switch (ptype)
1110 {
1111 case Pattern_Type_Like:
1112 result = like_fixed_prefix(patt, false, collation,
1113 prefix, rest_selec);
1114 break;
1115 case Pattern_Type_Like_IC:
1116 result = like_fixed_prefix(patt, true, collation,
1117 prefix, rest_selec);
1118 break;
1119 case Pattern_Type_Regex:
1120 result = regex_fixed_prefix(patt, false, collation,
1121 prefix, rest_selec);
1122 break;
1123 case Pattern_Type_Regex_IC:
1124 result = regex_fixed_prefix(patt, true, collation,
1125 prefix, rest_selec);
1126 break;
1127 case Pattern_Type_Prefix:
1128 /* Prefix type work is trivial. */
1129 result = Pattern_Prefix_Partial;
1130 *rest_selec = 1.0; /* all */
1131 *prefix = makeConst(patt->consttype,
1132 patt->consttypmod,
1133 patt->constcollid,
1134 patt->constlen,
1135 datumCopy(patt->constvalue,
1136 patt->constbyval,
1137 patt->constlen),
1138 patt->constisnull,
1139 patt->constbyval);
1140 break;
1141 default:
1142 elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1143 result = Pattern_Prefix_None; /* keep compiler quiet */
1144 break;
1145 }
1146 return result;
1147}
1148
1149/*
1150 * Estimate the selectivity of a fixed prefix for a pattern match.
1151 *
1152 * A fixed prefix "foo" is estimated as the selectivity of the expression
1153 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
1154 *
1155 * The selectivity estimate is with respect to the portion of the column
1156 * population represented by the histogram --- the caller must fold this
1157 * together with info about MCVs and NULLs.
1158 *
1159 * We use the >= and < operators from the specified btree opfamily to do the
1160 * estimation. The given variable and Const must be of the associated
1161 * datatype.
1162 *
1163 * XXX Note: we make use of the upper bound to estimate operator selectivity
1164 * even if the locale is such that we cannot rely on the upper-bound string.
1165 * The selectivity only needs to be approximately right anyway, so it seems
1166 * more useful to use the upper-bound code than not.
1167 */
1168static Selectivity
1169prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1170 Oid vartype, Oid opfamily, Const *prefixcon)
1171{
1172 Selectivity prefixsel;
1173 Oid cmpopr;
1174 FmgrInfo opproc;
1175 AttStatsSlot sslot;
1176 Const *greaterstrcon;
1177 Selectivity eq_sel;
1178
1179 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
1180 BTGreaterEqualStrategyNumber);
1181 if (cmpopr == InvalidOid)
1182 elog(ERROR, "no >= operator for opfamily %u", opfamily);
1183 fmgr_info(get_opcode(cmpopr), &opproc);
1184
1185 prefixsel = ineq_histogram_selectivity(root, vardata,
1186 &opproc, true, true,
1187 prefixcon->constvalue,
1188 prefixcon->consttype);
1189
1190 if (prefixsel < 0.0)
1191 {
1192 /* No histogram is present ... return a suitable default estimate */
1193 return DEFAULT_MATCH_SEL;
1194 }
1195
1196 /*-------
1197 * If we can create a string larger than the prefix, say
1198 * "x < greaterstr". We try to generate the string referencing the
1199 * collation of the var's statistics, but if that's not available,
1200 * use DEFAULT_COLLATION_OID.
1201 *-------
1202 */
1203 if (HeapTupleIsValid(vardata->statsTuple) &&
1204 get_attstatsslot(&sslot, vardata->statsTuple,
1205 STATISTIC_KIND_HISTOGRAM, InvalidOid, 0))
1206 /* sslot.stacoll is set up */ ;
1207 else
1208 sslot.stacoll = DEFAULT_COLLATION_OID;
1209 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
1210 BTLessStrategyNumber);
1211 if (cmpopr == InvalidOid)
1212 elog(ERROR, "no < operator for opfamily %u", opfamily);
1213 fmgr_info(get_opcode(cmpopr), &opproc);
1214 greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll);
1215 if (greaterstrcon)
1216 {
1217 Selectivity topsel;
1218
1219 topsel = ineq_histogram_selectivity(root, vardata,
1220 &opproc, false, false,
1221 greaterstrcon->constvalue,
1222 greaterstrcon->consttype);
1223
1224 /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1225 Assert(topsel >= 0.0);
1226
1227 /*
1228 * Merge the two selectivities in the same way as for a range query
1229 * (see clauselist_selectivity()). Note that we don't need to worry
1230 * about double-exclusion of nulls, since ineq_histogram_selectivity
1231 * doesn't count those anyway.
1232 */
1233 prefixsel = topsel + prefixsel - 1.0;
1234 }
1235
1236 /*
1237 * If the prefix is long then the two bounding values might be too close
1238 * together for the histogram to distinguish them usefully, resulting in a
1239 * zero estimate (plus or minus roundoff error). To avoid returning a
1240 * ridiculously small estimate, compute the estimated selectivity for
1241 * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1242 * estimate should be at least that.)
1243 *
1244 * We apply this even if we couldn't make a greater string. That case
1245 * suggests that the prefix is near the maximum possible, and thus
1246 * probably off the end of the histogram, and thus we probably got a very
1247 * small estimate from the >= condition; so we still need to clamp.
1248 */
1249 cmpopr = get_opfamily_member(opfamily, vartype, vartype,
1250 BTEqualStrategyNumber);
1251 if (cmpopr == InvalidOid)
1252 elog(ERROR, "no = operator for opfamily %u", opfamily);
1253 eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
1254 false, true, false);
1255
1256 prefixsel = Max(prefixsel, eq_sel);
1257
1258 return prefixsel;
1259}
1260
1261
1262/*
1263 * Estimate the selectivity of a pattern of the specified type.
1264 * Note that any fixed prefix of the pattern will have been removed already,
1265 * so actually we may be looking at just a fragment of the pattern.
1266 *
1267 * For now, we use a very simplistic approach: fixed characters reduce the
1268 * selectivity a good deal, character ranges reduce it a little,
1269 * wildcards (such as % for LIKE or .* for regex) increase it.
1270 */
1271
1272#define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1273#define CHAR_RANGE_SEL 0.25
1274#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1275#define FULL_WILDCARD_SEL 5.0
1276#define PARTIAL_WILDCARD_SEL 2.0
1277
1278static Selectivity
1279like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1280{
1281 Selectivity sel = 1.0;
1282 int pos;
1283
1284 /* Skip any leading wildcard; it's already factored into initial sel */
1285 for (pos = 0; pos < pattlen; pos++)
1286 {
1287 if (patt[pos] != '%' && patt[pos] != '_')
1288 break;
1289 }
1290
1291 for (; pos < pattlen; pos++)
1292 {
1293 /* % and _ are wildcard characters in LIKE */
1294 if (patt[pos] == '%')
1295 sel *= FULL_WILDCARD_SEL;
1296 else if (patt[pos] == '_')
1297 sel *= ANY_CHAR_SEL;
1298 else if (patt[pos] == '\\')
1299 {
1300 /* Backslash quotes the next character */
1301 pos++;
1302 if (pos >= pattlen)
1303 break;
1304 sel *= FIXED_CHAR_SEL;
1305 }
1306 else
1307 sel *= FIXED_CHAR_SEL;
1308 }
1309 /* Could get sel > 1 if multiple wildcards */
1310 if (sel > 1.0)
1311 sel = 1.0;
1312 return sel;
1313}
1314
1315static Selectivity
1316regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1317{
1318 Selectivity sel = 1.0;
1319 int paren_depth = 0;
1320 int paren_pos = 0; /* dummy init to keep compiler quiet */
1321 int pos;
1322
1323 for (pos = 0; pos < pattlen; pos++)
1324 {
1325 if (patt[pos] == '(')
1326 {
1327 if (paren_depth == 0)
1328 paren_pos = pos; /* remember start of parenthesized item */
1329 paren_depth++;
1330 }
1331 else if (patt[pos] == ')' && paren_depth > 0)
1332 {
1333 paren_depth--;
1334 if (paren_depth == 0)
1335 sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1336 pos - (paren_pos + 1),
1337 case_insensitive);
1338 }
1339 else if (patt[pos] == '|' && paren_depth == 0)
1340 {
1341 /*
1342 * If unquoted | is present at paren level 0 in pattern, we have
1343 * multiple alternatives; sum their probabilities.
1344 */
1345 sel += regex_selectivity_sub(patt + (pos + 1),
1346 pattlen - (pos + 1),
1347 case_insensitive);
1348 break; /* rest of pattern is now processed */
1349 }
1350 else if (patt[pos] == '[')
1351 {
1352 bool negclass = false;
1353
1354 if (patt[++pos] == '^')
1355 {
1356 negclass = true;
1357 pos++;
1358 }
1359 if (patt[pos] == ']') /* ']' at start of class is not special */
1360 pos++;
1361 while (pos < pattlen && patt[pos] != ']')
1362 pos++;
1363 if (paren_depth == 0)
1364 sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1365 }
1366 else if (patt[pos] == '.')
1367 {
1368 if (paren_depth == 0)
1369 sel *= ANY_CHAR_SEL;
1370 }
1371 else if (patt[pos] == '*' ||
1372 patt[pos] == '?' ||
1373 patt[pos] == '+')
1374 {
1375 /* Ought to be smarter about quantifiers... */
1376 if (paren_depth == 0)
1377 sel *= PARTIAL_WILDCARD_SEL;
1378 }
1379 else if (patt[pos] == '{')
1380 {
1381 while (pos < pattlen && patt[pos] != '}')
1382 pos++;
1383 if (paren_depth == 0)
1384 sel *= PARTIAL_WILDCARD_SEL;
1385 }
1386 else if (patt[pos] == '\\')
1387 {
1388 /* backslash quotes the next character */
1389 pos++;
1390 if (pos >= pattlen)
1391 break;
1392 if (paren_depth == 0)
1393 sel *= FIXED_CHAR_SEL;
1394 }
1395 else
1396 {
1397 if (paren_depth == 0)
1398 sel *= FIXED_CHAR_SEL;
1399 }
1400 }
1401 /* Could get sel > 1 if multiple wildcards */
1402 if (sel > 1.0)
1403 sel = 1.0;
1404 return sel;
1405}
1406
1407static Selectivity
1408regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1409 int fixed_prefix_len)
1410{
1411 Selectivity sel;
1412
1413 /* If patt doesn't end with $, consider it to have a trailing wildcard */
1414 if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1415 (pattlen == 1 || patt[pattlen - 2] != '\\'))
1416 {
1417 /* has trailing $ */
1418 sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1419 }
1420 else
1421 {
1422 /* no trailing $ */
1423 sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1424 sel *= FULL_WILDCARD_SEL;
1425 }
1426
1427 /* If there's a fixed prefix, discount its selectivity */
1428 if (fixed_prefix_len > 0)
1429 sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
1430
1431 /* Make sure result stays in range */
1432 CLAMP_PROBABILITY(sel);
1433 return sel;
1434}
1435
1436/*
1437 * Check whether char is a letter (and, hence, subject to case-folding)
1438 *
1439 * In multibyte character sets or with ICU, we can't use isalpha, and it does
1440 * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1441 * Instead, just assume any non-ASCII char is potentially case-varying, and
1442 * hard-wire knowledge of which ASCII chars are letters.
1443 */
1444static int
1445pattern_char_isalpha(char c, bool is_multibyte,
1446 pg_locale_t locale, bool locale_is_c)
1447{
1448 if (locale_is_c)
1449 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1450 else if (is_multibyte && IS_HIGHBIT_SET(c))
1451 return true;
1452 else if (locale && locale->provider == COLLPROVIDER_ICU)
1453 return IS_HIGHBIT_SET(c) ||
1454 (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1455#ifdef HAVE_LOCALE_T
1456 else if (locale && locale->provider == COLLPROVIDER_LIBC)
1457 return isalpha_l((unsigned char) c, locale->info.lt);
1458#endif
1459 else
1460 return isalpha((unsigned char) c);
1461}
1462
1463
1464/*
1465 * For bytea, the increment function need only increment the current byte
1466 * (there are no multibyte characters to worry about).
1467 */
1468static bool
1469byte_increment(unsigned char *ptr, int len)
1470{
1471 if (*ptr >= 255)
1472 return false;
1473 (*ptr)++;
1474 return true;
1475}
1476
1477/*
1478 * Try to generate a string greater than the given string or any
1479 * string it is a prefix of. If successful, return a palloc'd string
1480 * in the form of a Const node; else return NULL.
1481 *
1482 * The caller must provide the appropriate "less than" comparison function
1483 * for testing the strings, along with the collation to use.
1484 *
1485 * The key requirement here is that given a prefix string, say "foo",
1486 * we must be able to generate another string "fop" that is greater than
1487 * all strings "foobar" starting with "foo". We can test that we have
1488 * generated a string greater than the prefix string, but in non-C collations
1489 * that is not a bulletproof guarantee that an extension of the string might
1490 * not sort after it; an example is that "foo " is less than "foo!", but it
1491 * is not clear that a "dictionary" sort ordering will consider "foo!" less
1492 * than "foo bar". CAUTION: Therefore, this function should be used only for
1493 * estimation purposes when working in a non-C collation.
1494 *
1495 * To try to catch most cases where an extended string might otherwise sort
1496 * before the result value, we determine which of the strings "Z", "z", "y",
1497 * and "9" is seen as largest by the collation, and append that to the given
1498 * prefix before trying to find a string that compares as larger.
1499 *
1500 * To search for a greater string, we repeatedly "increment" the rightmost
1501 * character, using an encoding-specific character incrementer function.
1502 * When it's no longer possible to increment the last character, we truncate
1503 * off that character and start incrementing the next-to-rightmost.
1504 * For example, if "z" were the last character in the sort order, then we
1505 * could produce "foo" as a string greater than "fonz".
1506 *
1507 * This could be rather slow in the worst case, but in most cases we
1508 * won't have to try more than one or two strings before succeeding.
1509 *
1510 * Note that it's important for the character incrementer not to be too anal
1511 * about producing every possible character code, since in some cases the only
1512 * way to get a larger string is to increment a previous character position.
1513 * So we don't want to spend too much time trying every possible character
1514 * code at the last position. A good rule of thumb is to be sure that we
1515 * don't try more than 256*K values for a K-byte character (and definitely
1516 * not 256^K, which is what an exhaustive search would approach).
1517 */
1518static Const *
1519make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1520{
1521 Oid datatype = str_const->consttype;
1522 char *workstr;
1523 int len;
1524 Datum cmpstr;
1525 char *cmptxt = NULL;
1526 mbcharacter_incrementer charinc;
1527
1528 /*
1529 * Get a modifiable copy of the prefix string in C-string format, and set
1530 * up the string we will compare to as a Datum. In C locale this can just
1531 * be the given prefix string, otherwise we need to add a suffix. Type
1532 * BYTEA sorts bytewise so it never needs a suffix either.
1533 */
1534 if (datatype == BYTEAOID)
1535 {
1536 bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1537
1538 len = VARSIZE_ANY_EXHDR(bstr);
1539 workstr = (char *) palloc(len);
1540 memcpy(workstr, VARDATA_ANY(bstr), len);
1541 Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1542 cmpstr = str_const->constvalue;
1543 }
1544 else
1545 {
1546 if (datatype == NAMEOID)
1547 workstr = DatumGetCString(DirectFunctionCall1(nameout,
1548 str_const->constvalue));
1549 else
1550 workstr = TextDatumGetCString(str_const->constvalue);
1551 len = strlen(workstr);
1552 if (lc_collate_is_c(collation) || len == 0)
1553 cmpstr = str_const->constvalue;
1554 else
1555 {
1556 /* If first time through, determine the suffix to use */
1557 static char suffixchar = 0;
1558 static Oid suffixcollation = 0;
1559
1560 if (!suffixchar || suffixcollation != collation)
1561 {
1562 char *best;
1563
1564 best = "Z";
1565 if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1566 best = "z";
1567 if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1568 best = "y";
1569 if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1570 best = "9";
1571 suffixchar = *best;
1572 suffixcollation = collation;
1573 }
1574
1575 /* And build the string to compare to */
1576 if (datatype == NAMEOID)
1577 {
1578 cmptxt = palloc(len + 2);
1579 memcpy(cmptxt, workstr, len);
1580 cmptxt[len] = suffixchar;
1581 cmptxt[len + 1] = '\0';
1582 cmpstr = PointerGetDatum(cmptxt);
1583 }
1584 else
1585 {
1586 cmptxt = palloc(VARHDRSZ + len + 1);
1587 SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1588 memcpy(VARDATA(cmptxt), workstr, len);
1589 *(VARDATA(cmptxt) + len) = suffixchar;
1590 cmpstr = PointerGetDatum(cmptxt);
1591 }
1592 }
1593 }
1594
1595 /* Select appropriate character-incrementer function */
1596 if (datatype == BYTEAOID)
1597 charinc = byte_increment;
1598 else
1599 charinc = pg_database_encoding_character_incrementer();
1600
1601 /* And search ... */
1602 while (len > 0)
1603 {
1604 int charlen;
1605 unsigned char *lastchar;
1606
1607 /* Identify the last character --- for bytea, just the last byte */
1608 if (datatype == BYTEAOID)
1609 charlen = 1;
1610 else
1611 charlen = len - pg_mbcliplen(workstr, len, len - 1);
1612 lastchar = (unsigned char *) (workstr + len - charlen);
1613
1614 /*
1615 * Try to generate a larger string by incrementing the last character
1616 * (for BYTEA, we treat each byte as a character).
1617 *
1618 * Note: the incrementer function is expected to return true if it's
1619 * generated a valid-per-the-encoding new character, otherwise false.
1620 * The contents of the character on false return are unspecified.
1621 */
1622 while (charinc(lastchar, charlen))
1623 {
1624 Const *workstr_const;
1625
1626 if (datatype == BYTEAOID)
1627 workstr_const = string_to_bytea_const(workstr, len);
1628 else
1629 workstr_const = string_to_const(workstr, datatype);
1630
1631 if (DatumGetBool(FunctionCall2Coll(ltproc,
1632 collation,
1633 cmpstr,
1634 workstr_const->constvalue)))
1635 {
1636 /* Successfully made a string larger than cmpstr */
1637 if (cmptxt)
1638 pfree(cmptxt);
1639 pfree(workstr);
1640 return workstr_const;
1641 }
1642
1643 /* No good, release unusable value and try again */
1644 pfree(DatumGetPointer(workstr_const->constvalue));
1645 pfree(workstr_const);
1646 }
1647
1648 /*
1649 * No luck here, so truncate off the last character and try to
1650 * increment the next one.
1651 */
1652 len -= charlen;
1653 workstr[len] = '\0';
1654 }
1655
1656 /* Failed... */
1657 if (cmptxt)
1658 pfree(cmptxt);
1659 pfree(workstr);
1660
1661 return NULL;
1662}
1663
1664/*
1665 * Generate a Datum of the appropriate type from a C string.
1666 * Note that all of the supported types are pass-by-ref, so the
1667 * returned value should be pfree'd if no longer needed.
1668 */
1669static Datum
1670string_to_datum(const char *str, Oid datatype)
1671{
1672 Assert(str != NULL);
1673
1674 /*
1675 * We cheat a little by assuming that CStringGetTextDatum() will do for
1676 * bpchar and varchar constants too...
1677 */
1678 if (datatype == NAMEOID)
1679 return DirectFunctionCall1(namein, CStringGetDatum(str));
1680 else if (datatype == BYTEAOID)
1681 return DirectFunctionCall1(byteain, CStringGetDatum(str));
1682 else
1683 return CStringGetTextDatum(str);
1684}
1685
1686/*
1687 * Generate a Const node of the appropriate type from a C string.
1688 */
1689static Const *
1690string_to_const(const char *str, Oid datatype)
1691{
1692 Datum conval = string_to_datum(str, datatype);
1693 Oid collation;
1694 int constlen;
1695
1696 /*
1697 * We only need to support a few datatypes here, so hard-wire properties
1698 * instead of incurring the expense of catalog lookups.
1699 */
1700 switch (datatype)
1701 {
1702 case TEXTOID:
1703 case VARCHAROID:
1704 case BPCHAROID:
1705 collation = DEFAULT_COLLATION_OID;
1706 constlen = -1;
1707 break;
1708
1709 case NAMEOID:
1710 collation = C_COLLATION_OID;
1711 constlen = NAMEDATALEN;
1712 break;
1713
1714 case BYTEAOID:
1715 collation = InvalidOid;
1716 constlen = -1;
1717 break;
1718
1719 default:
1720 elog(ERROR, "unexpected datatype in string_to_const: %u",
1721 datatype);
1722 return NULL;
1723 }
1724
1725 return makeConst(datatype, -1, collation, constlen,
1726 conval, false, false);
1727}
1728
1729/*
1730 * Generate a Const node of bytea type from a binary C string and a length.
1731 */
1732static Const *
1733string_to_bytea_const(const char *str, size_t str_len)
1734{
1735 bytea *bstr = palloc(VARHDRSZ + str_len);
1736 Datum conval;
1737
1738 memcpy(VARDATA(bstr), str, str_len);
1739 SET_VARSIZE(bstr, VARHDRSZ + str_len);
1740 conval = PointerGetDatum(bstr);
1741
1742 return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1743}
1744