1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * like_support.c |
4 | * Planner support functions for LIKE, regex, and related operators. |
5 | * |
6 | * These routines handle special optimization of operators that can be |
7 | * used with index scans even though they are not known to the executor's |
8 | * indexscan machinery. The key idea is that these operators allow us |
9 | * to derive approximate indexscan qual clauses, such that any tuples |
10 | * that pass the operator clause itself must also satisfy the simpler |
11 | * indexscan condition(s). Then we can use the indexscan machinery |
12 | * to avoid scanning as much of the table as we'd otherwise have to, |
13 | * while applying the original operator as a qpqual condition to ensure |
14 | * we deliver only the tuples we want. (In essence, we're using a regular |
15 | * index as if it were a lossy index.) |
16 | * |
17 | * An example of what we're doing is |
18 | * textfield LIKE 'abc%def' |
19 | * from which we can generate the indexscanable conditions |
20 | * textfield >= 'abc' AND textfield < 'abd' |
21 | * which allow efficient scanning of an index on textfield. |
22 | * (In reality, character set and collation issues make the transformation |
23 | * from LIKE to indexscan limits rather harder than one might think ... |
24 | * but that's the basic idea.) |
25 | * |
26 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
27 | * Portions Copyright (c) 1994, Regents of the University of California |
28 | * |
29 | * |
30 | * IDENTIFICATION |
31 | * src/backend/utils/adt/like_support.c |
32 | * |
33 | *------------------------------------------------------------------------- |
34 | */ |
35 | #include "postgres.h" |
36 | |
37 | #include <math.h> |
38 | |
39 | #include "access/htup_details.h" |
40 | #include "access/stratnum.h" |
41 | #include "catalog/pg_collation.h" |
42 | #include "catalog/pg_opfamily.h" |
43 | #include "catalog/pg_statistic.h" |
44 | #include "catalog/pg_type.h" |
45 | #include "mb/pg_wchar.h" |
46 | #include "nodes/makefuncs.h" |
47 | #include "nodes/nodeFuncs.h" |
48 | #include "nodes/supportnodes.h" |
49 | #include "utils/builtins.h" |
50 | #include "utils/datum.h" |
51 | #include "utils/lsyscache.h" |
52 | #include "utils/pg_locale.h" |
53 | #include "utils/selfuncs.h" |
54 | #include "utils/varlena.h" |
55 | |
56 | |
57 | typedef enum |
58 | { |
59 | Pattern_Type_Like, |
60 | Pattern_Type_Like_IC, |
61 | Pattern_Type_Regex, |
62 | Pattern_Type_Regex_IC, |
63 | Pattern_Type_Prefix |
64 | } Pattern_Type; |
65 | |
66 | typedef enum |
67 | { |
68 | Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact |
69 | } Pattern_Prefix_Status; |
70 | |
71 | static Node *like_regex_support(Node *rawreq, Pattern_Type ptype); |
72 | static List *match_pattern_prefix(Node *leftop, |
73 | Node *rightop, |
74 | Pattern_Type ptype, |
75 | Oid expr_coll, |
76 | Oid opfamily, |
77 | Oid indexcollation); |
78 | static double patternsel_common(PlannerInfo *root, |
79 | Oid oprid, |
80 | Oid opfuncid, |
81 | List *args, |
82 | int varRelid, |
83 | Oid collation, |
84 | Pattern_Type ptype, |
85 | bool negate); |
86 | static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, |
87 | Pattern_Type ptype, |
88 | Oid collation, |
89 | Const **prefix, |
90 | Selectivity *rest_selec); |
91 | static Selectivity prefix_selectivity(PlannerInfo *root, |
92 | VariableStatData *vardata, |
93 | Oid vartype, Oid opfamily, Const *prefixcon); |
94 | static Selectivity like_selectivity(const char *patt, int pattlen, |
95 | bool case_insensitive); |
96 | static Selectivity regex_selectivity(const char *patt, int pattlen, |
97 | bool case_insensitive, |
98 | int fixed_prefix_len); |
99 | static int pattern_char_isalpha(char c, bool is_multibyte, |
100 | pg_locale_t locale, bool locale_is_c); |
101 | static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc, |
102 | Oid collation); |
103 | static Datum string_to_datum(const char *str, Oid datatype); |
104 | static Const *string_to_const(const char *str, Oid datatype); |
105 | static Const *string_to_bytea_const(const char *str, size_t str_len); |
106 | |
107 | |
108 | /* |
109 | * Planner support functions for LIKE, regex, and related operators |
110 | */ |
111 | Datum |
112 | textlike_support(PG_FUNCTION_ARGS) |
113 | { |
114 | Node *rawreq = (Node *) PG_GETARG_POINTER(0); |
115 | |
116 | PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like)); |
117 | } |
118 | |
119 | Datum |
120 | texticlike_support(PG_FUNCTION_ARGS) |
121 | { |
122 | Node *rawreq = (Node *) PG_GETARG_POINTER(0); |
123 | |
124 | PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC)); |
125 | } |
126 | |
127 | Datum |
128 | textregexeq_support(PG_FUNCTION_ARGS) |
129 | { |
130 | Node *rawreq = (Node *) PG_GETARG_POINTER(0); |
131 | |
132 | PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex)); |
133 | } |
134 | |
135 | Datum |
136 | texticregexeq_support(PG_FUNCTION_ARGS) |
137 | { |
138 | Node *rawreq = (Node *) PG_GETARG_POINTER(0); |
139 | |
140 | PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC)); |
141 | } |
142 | |
143 | /* Common code for the above */ |
144 | static Node * |
145 | like_regex_support(Node *rawreq, Pattern_Type ptype) |
146 | { |
147 | Node *ret = NULL; |
148 | |
149 | if (IsA(rawreq, SupportRequestSelectivity)) |
150 | { |
151 | /* |
152 | * Make a selectivity estimate for a function call, just as we'd do if |
153 | * the call was via the corresponding operator. |
154 | */ |
155 | SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq; |
156 | Selectivity s1; |
157 | |
158 | if (req->is_join) |
159 | { |
160 | /* |
161 | * For the moment we just punt. If patternjoinsel is ever |
162 | * improved to do better, this should be made to call it. |
163 | */ |
164 | s1 = DEFAULT_MATCH_SEL; |
165 | } |
166 | else |
167 | { |
168 | /* Share code with operator restriction selectivity functions */ |
169 | s1 = patternsel_common(req->root, |
170 | InvalidOid, |
171 | req->funcid, |
172 | req->args, |
173 | req->varRelid, |
174 | req->inputcollid, |
175 | ptype, |
176 | false); |
177 | } |
178 | req->selectivity = s1; |
179 | ret = (Node *) req; |
180 | } |
181 | else if (IsA(rawreq, SupportRequestIndexCondition)) |
182 | { |
183 | /* Try to convert operator/function call to index conditions */ |
184 | SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq; |
185 | |
186 | /* |
187 | * Currently we have no "reverse" match operators with the pattern on |
188 | * the left, so we only need consider cases with the indexkey on the |
189 | * left. |
190 | */ |
191 | if (req->indexarg != 0) |
192 | return NULL; |
193 | |
194 | if (is_opclause(req->node)) |
195 | { |
196 | OpExpr *clause = (OpExpr *) req->node; |
197 | |
198 | Assert(list_length(clause->args) == 2); |
199 | ret = (Node *) |
200 | match_pattern_prefix((Node *) linitial(clause->args), |
201 | (Node *) lsecond(clause->args), |
202 | ptype, |
203 | clause->inputcollid, |
204 | req->opfamily, |
205 | req->indexcollation); |
206 | } |
207 | else if (is_funcclause(req->node)) /* be paranoid */ |
208 | { |
209 | FuncExpr *clause = (FuncExpr *) req->node; |
210 | |
211 | Assert(list_length(clause->args) == 2); |
212 | ret = (Node *) |
213 | match_pattern_prefix((Node *) linitial(clause->args), |
214 | (Node *) lsecond(clause->args), |
215 | ptype, |
216 | clause->inputcollid, |
217 | req->opfamily, |
218 | req->indexcollation); |
219 | } |
220 | } |
221 | |
222 | return ret; |
223 | } |
224 | |
225 | /* |
226 | * match_pattern_prefix |
227 | * Try to generate an indexqual for a LIKE or regex operator. |
228 | */ |
229 | static List * |
230 | match_pattern_prefix(Node *leftop, |
231 | Node *rightop, |
232 | Pattern_Type ptype, |
233 | Oid expr_coll, |
234 | Oid opfamily, |
235 | Oid indexcollation) |
236 | { |
237 | List *result; |
238 | Const *patt; |
239 | Const *prefix; |
240 | Pattern_Prefix_Status pstatus; |
241 | Oid ldatatype; |
242 | Oid rdatatype; |
243 | Oid oproid; |
244 | Expr *expr; |
245 | FmgrInfo ltproc; |
246 | Const *greaterstr; |
247 | |
248 | /* |
249 | * Can't do anything with a non-constant or NULL pattern argument. |
250 | * |
251 | * Note that since we restrict ourselves to cases with a hard constant on |
252 | * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry |
253 | * about verifying that. |
254 | */ |
255 | if (!IsA(rightop, Const) || |
256 | ((Const *) rightop)->constisnull) |
257 | return NIL; |
258 | patt = (Const *) rightop; |
259 | |
260 | /* |
261 | * Not supported if the expression collation is nondeterministic. The |
262 | * optimized equality or prefix tests use bytewise comparisons, which is |
263 | * not consistent with nondeterministic collations. The actual |
264 | * pattern-matching implementation functions will later error out that |
265 | * pattern-matching is not supported with nondeterministic collations. (We |
266 | * could also error out here, but by doing it later we get more precise |
267 | * error messages.) (It should be possible to support at least |
268 | * Pattern_Prefix_Exact, but no point as along as the actual |
269 | * pattern-matching implementations don't support it.) |
270 | * |
271 | * expr_coll is not set for a non-collation-aware data type such as bytea. |
272 | */ |
273 | if (expr_coll && !get_collation_isdeterministic(expr_coll)) |
274 | return NIL; |
275 | |
276 | /* |
277 | * Try to extract a fixed prefix from the pattern. |
278 | */ |
279 | pstatus = pattern_fixed_prefix(patt, ptype, expr_coll, |
280 | &prefix, NULL); |
281 | |
282 | /* fail if no fixed prefix */ |
283 | if (pstatus == Pattern_Prefix_None) |
284 | return NIL; |
285 | |
286 | /* |
287 | * Must also check that index's opfamily supports the operators we will |
288 | * want to apply. (A hash index, for example, will not support ">=".) |
289 | * Currently, only btree and spgist support the operators we need. |
290 | * |
291 | * Note: actually, in the Pattern_Prefix_Exact case, we only need "=" so a |
292 | * hash index would work. Currently it doesn't seem worth checking for |
293 | * that, however. |
294 | * |
295 | * We insist on the opfamily being one of the specific ones we expect, |
296 | * else we'd do the wrong thing if someone were to make a reverse-sort |
297 | * opfamily with the same operators. |
298 | * |
299 | * The non-pattern opclasses will not sort the way we need in most non-C |
300 | * locales. We can use such an index anyway for an exact match (simple |
301 | * equality), but not for prefix-match cases. Note that here we are |
302 | * looking at the index's collation, not the expression's collation -- |
303 | * this test is *not* dependent on the LIKE/regex operator's collation. |
304 | * |
305 | * While we're at it, identify the type the comparison constant(s) should |
306 | * have, based on the opfamily. |
307 | */ |
308 | switch (opfamily) |
309 | { |
310 | case TEXT_BTREE_FAM_OID: |
311 | if (!(pstatus == Pattern_Prefix_Exact || |
312 | lc_collate_is_c(indexcollation))) |
313 | return NIL; |
314 | rdatatype = TEXTOID; |
315 | break; |
316 | |
317 | case TEXT_PATTERN_BTREE_FAM_OID: |
318 | case TEXT_SPGIST_FAM_OID: |
319 | rdatatype = TEXTOID; |
320 | break; |
321 | |
322 | case BPCHAR_BTREE_FAM_OID: |
323 | if (!(pstatus == Pattern_Prefix_Exact || |
324 | lc_collate_is_c(indexcollation))) |
325 | return NIL; |
326 | rdatatype = BPCHAROID; |
327 | break; |
328 | |
329 | case BPCHAR_PATTERN_BTREE_FAM_OID: |
330 | rdatatype = BPCHAROID; |
331 | break; |
332 | |
333 | case BYTEA_BTREE_FAM_OID: |
334 | rdatatype = BYTEAOID; |
335 | break; |
336 | |
337 | default: |
338 | return NIL; |
339 | } |
340 | |
341 | /* OK, prepare to create the indexqual(s) */ |
342 | ldatatype = exprType(leftop); |
343 | |
344 | /* |
345 | * If necessary, coerce the prefix constant to the right type. The given |
346 | * prefix constant is either text or bytea type, therefore the only case |
347 | * where we need to do anything is when converting text to bpchar. Those |
348 | * two types are binary-compatible, so relabeling the Const node is |
349 | * sufficient. |
350 | */ |
351 | if (prefix->consttype != rdatatype) |
352 | { |
353 | Assert(prefix->consttype == TEXTOID && |
354 | rdatatype == BPCHAROID); |
355 | prefix->consttype = rdatatype; |
356 | } |
357 | |
358 | /* |
359 | * If we found an exact-match pattern, generate an "=" indexqual. |
360 | */ |
361 | if (pstatus == Pattern_Prefix_Exact) |
362 | { |
363 | oproid = get_opfamily_member(opfamily, ldatatype, rdatatype, |
364 | BTEqualStrategyNumber); |
365 | if (oproid == InvalidOid) |
366 | elog(ERROR, "no = operator for opfamily %u" , opfamily); |
367 | expr = make_opclause(oproid, BOOLOID, false, |
368 | (Expr *) leftop, (Expr *) prefix, |
369 | InvalidOid, indexcollation); |
370 | result = list_make1(expr); |
371 | return result; |
372 | } |
373 | |
374 | /* |
375 | * Otherwise, we have a nonempty required prefix of the values. |
376 | * |
377 | * We can always say "x >= prefix". |
378 | */ |
379 | oproid = get_opfamily_member(opfamily, ldatatype, rdatatype, |
380 | BTGreaterEqualStrategyNumber); |
381 | if (oproid == InvalidOid) |
382 | elog(ERROR, "no >= operator for opfamily %u" , opfamily); |
383 | expr = make_opclause(oproid, BOOLOID, false, |
384 | (Expr *) leftop, (Expr *) prefix, |
385 | InvalidOid, indexcollation); |
386 | result = list_make1(expr); |
387 | |
388 | /*------- |
389 | * If we can create a string larger than the prefix, we can say |
390 | * "x < greaterstr". NB: we rely on make_greater_string() to generate |
391 | * a guaranteed-greater string, not just a probably-greater string. |
392 | * In general this is only guaranteed in C locale, so we'd better be |
393 | * using a C-locale index collation. |
394 | *------- |
395 | */ |
396 | oproid = get_opfamily_member(opfamily, ldatatype, rdatatype, |
397 | BTLessStrategyNumber); |
398 | if (oproid == InvalidOid) |
399 | elog(ERROR, "no < operator for opfamily %u" , opfamily); |
400 | fmgr_info(get_opcode(oproid), <proc); |
401 | greaterstr = make_greater_string(prefix, <proc, indexcollation); |
402 | if (greaterstr) |
403 | { |
404 | expr = make_opclause(oproid, BOOLOID, false, |
405 | (Expr *) leftop, (Expr *) greaterstr, |
406 | InvalidOid, indexcollation); |
407 | result = lappend(result, expr); |
408 | } |
409 | |
410 | return result; |
411 | } |
412 | |
413 | |
414 | /* |
415 | * patternsel_common - generic code for pattern-match restriction selectivity. |
416 | * |
417 | * To support using this from either the operator or function paths, caller |
418 | * may pass either operator OID or underlying function OID; we look up the |
419 | * latter from the former if needed. (We could just have patternsel() call |
420 | * get_opcode(), but the work would be wasted if we don't have a need to |
421 | * compare a fixed prefix to the pg_statistic data.) |
422 | * |
423 | * Note that oprid and/or opfuncid should be for the positive-match operator |
424 | * even when negate is true. |
425 | */ |
426 | static double |
427 | patternsel_common(PlannerInfo *root, |
428 | Oid oprid, |
429 | Oid opfuncid, |
430 | List *args, |
431 | int varRelid, |
432 | Oid collation, |
433 | Pattern_Type ptype, |
434 | bool negate) |
435 | { |
436 | VariableStatData vardata; |
437 | Node *other; |
438 | bool varonleft; |
439 | Datum constval; |
440 | Oid consttype; |
441 | Oid vartype; |
442 | Oid opfamily; |
443 | Pattern_Prefix_Status pstatus; |
444 | Const *patt; |
445 | Const *prefix = NULL; |
446 | Selectivity rest_selec = 0; |
447 | double nullfrac = 0.0; |
448 | double result; |
449 | |
450 | /* |
451 | * Initialize result to the appropriate default estimate depending on |
452 | * whether it's a match or not-match operator. |
453 | */ |
454 | if (negate) |
455 | result = 1.0 - DEFAULT_MATCH_SEL; |
456 | else |
457 | result = DEFAULT_MATCH_SEL; |
458 | |
459 | /* |
460 | * If expression is not variable op constant, then punt and return the |
461 | * default estimate. |
462 | */ |
463 | if (!get_restriction_variable(root, args, varRelid, |
464 | &vardata, &other, &varonleft)) |
465 | return result; |
466 | if (!varonleft || !IsA(other, Const)) |
467 | { |
468 | ReleaseVariableStats(vardata); |
469 | return result; |
470 | } |
471 | |
472 | /* |
473 | * If the constant is NULL, assume operator is strict and return zero, ie, |
474 | * operator will never return TRUE. (It's zero even for a negator op.) |
475 | */ |
476 | if (((Const *) other)->constisnull) |
477 | { |
478 | ReleaseVariableStats(vardata); |
479 | return 0.0; |
480 | } |
481 | constval = ((Const *) other)->constvalue; |
482 | consttype = ((Const *) other)->consttype; |
483 | |
484 | /* |
485 | * The right-hand const is type text or bytea for all supported operators. |
486 | * We do not expect to see binary-compatible types here, since |
487 | * const-folding should have relabeled the const to exactly match the |
488 | * operator's declared type. |
489 | */ |
490 | if (consttype != TEXTOID && consttype != BYTEAOID) |
491 | { |
492 | ReleaseVariableStats(vardata); |
493 | return result; |
494 | } |
495 | |
496 | /* |
497 | * Similarly, the exposed type of the left-hand side should be one of |
498 | * those we know. (Do not look at vardata.atttype, which might be |
499 | * something binary-compatible but different.) We can use it to choose |
500 | * the index opfamily from which we must draw the comparison operators. |
501 | * |
502 | * NOTE: It would be more correct to use the PATTERN opfamilies than the |
503 | * simple ones, but at the moment ANALYZE will not generate statistics for |
504 | * the PATTERN operators. But our results are so approximate anyway that |
505 | * it probably hardly matters. |
506 | */ |
507 | vartype = vardata.vartype; |
508 | |
509 | switch (vartype) |
510 | { |
511 | case TEXTOID: |
512 | case NAMEOID: |
513 | opfamily = TEXT_BTREE_FAM_OID; |
514 | break; |
515 | case BPCHAROID: |
516 | opfamily = BPCHAR_BTREE_FAM_OID; |
517 | break; |
518 | case BYTEAOID: |
519 | opfamily = BYTEA_BTREE_FAM_OID; |
520 | break; |
521 | default: |
522 | ReleaseVariableStats(vardata); |
523 | return result; |
524 | } |
525 | |
526 | /* |
527 | * Grab the nullfrac for use below. |
528 | */ |
529 | if (HeapTupleIsValid(vardata.statsTuple)) |
530 | { |
531 | Form_pg_statistic stats; |
532 | |
533 | stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); |
534 | nullfrac = stats->stanullfrac; |
535 | } |
536 | |
537 | /* |
538 | * Pull out any fixed prefix implied by the pattern, and estimate the |
539 | * fractional selectivity of the remainder of the pattern. Unlike many |
540 | * other selectivity estimators, we use the pattern operator's actual |
541 | * collation for this step. This is not because we expect the collation |
542 | * to make a big difference in the selectivity estimate (it seldom would), |
543 | * but because we want to be sure we cache compiled regexps under the |
544 | * right cache key, so that they can be re-used at runtime. |
545 | */ |
546 | patt = (Const *) other; |
547 | pstatus = pattern_fixed_prefix(patt, ptype, collation, |
548 | &prefix, &rest_selec); |
549 | |
550 | /* |
551 | * If necessary, coerce the prefix constant to the right type. |
552 | */ |
553 | if (prefix && prefix->consttype != vartype) |
554 | { |
555 | char *prefixstr; |
556 | |
557 | switch (prefix->consttype) |
558 | { |
559 | case TEXTOID: |
560 | prefixstr = TextDatumGetCString(prefix->constvalue); |
561 | break; |
562 | case BYTEAOID: |
563 | prefixstr = DatumGetCString(DirectFunctionCall1(byteaout, |
564 | prefix->constvalue)); |
565 | break; |
566 | default: |
567 | elog(ERROR, "unrecognized consttype: %u" , |
568 | prefix->consttype); |
569 | ReleaseVariableStats(vardata); |
570 | return result; |
571 | } |
572 | prefix = string_to_const(prefixstr, vartype); |
573 | pfree(prefixstr); |
574 | } |
575 | |
576 | if (pstatus == Pattern_Prefix_Exact) |
577 | { |
578 | /* |
579 | * Pattern specifies an exact match, so pretend operator is '=' |
580 | */ |
581 | Oid eqopr = get_opfamily_member(opfamily, vartype, vartype, |
582 | BTEqualStrategyNumber); |
583 | |
584 | if (eqopr == InvalidOid) |
585 | elog(ERROR, "no = operator for opfamily %u" , opfamily); |
586 | result = var_eq_const(&vardata, eqopr, prefix->constvalue, |
587 | false, true, false); |
588 | } |
589 | else |
590 | { |
591 | /* |
592 | * Not exact-match pattern. If we have a sufficiently large |
593 | * histogram, estimate selectivity for the histogram part of the |
594 | * population by counting matches in the histogram. If not, estimate |
595 | * selectivity of the fixed prefix and remainder of pattern |
596 | * separately, then combine the two to get an estimate of the |
597 | * selectivity for the part of the column population represented by |
598 | * the histogram. (For small histograms, we combine these |
599 | * approaches.) |
600 | * |
601 | * We then add up data for any most-common-values values; these are |
602 | * not in the histogram population, and we can get exact answers for |
603 | * them by applying the pattern operator, so there's no reason to |
604 | * approximate. (If the MCVs cover a significant part of the total |
605 | * population, this gives us a big leg up in accuracy.) |
606 | */ |
607 | Selectivity selec; |
608 | int hist_size; |
609 | FmgrInfo opproc; |
610 | double mcv_selec, |
611 | sumcommon; |
612 | |
613 | /* Try to use the histogram entries to get selectivity */ |
614 | if (!OidIsValid(opfuncid)) |
615 | opfuncid = get_opcode(oprid); |
616 | fmgr_info(opfuncid, &opproc); |
617 | |
618 | selec = histogram_selectivity(&vardata, &opproc, constval, true, |
619 | 10, 1, &hist_size); |
620 | |
621 | /* If not at least 100 entries, use the heuristic method */ |
622 | if (hist_size < 100) |
623 | { |
624 | Selectivity heursel; |
625 | Selectivity prefixsel; |
626 | |
627 | if (pstatus == Pattern_Prefix_Partial) |
628 | prefixsel = prefix_selectivity(root, &vardata, vartype, |
629 | opfamily, prefix); |
630 | else |
631 | prefixsel = 1.0; |
632 | heursel = prefixsel * rest_selec; |
633 | |
634 | if (selec < 0) /* fewer than 10 histogram entries? */ |
635 | selec = heursel; |
636 | else |
637 | { |
638 | /* |
639 | * For histogram sizes from 10 to 100, we combine the |
640 | * histogram and heuristic selectivities, putting increasingly |
641 | * more trust in the histogram for larger sizes. |
642 | */ |
643 | double hist_weight = hist_size / 100.0; |
644 | |
645 | selec = selec * hist_weight + heursel * (1.0 - hist_weight); |
646 | } |
647 | } |
648 | |
649 | /* In any case, don't believe extremely small or large estimates. */ |
650 | if (selec < 0.0001) |
651 | selec = 0.0001; |
652 | else if (selec > 0.9999) |
653 | selec = 0.9999; |
654 | |
655 | /* |
656 | * If we have most-common-values info, add up the fractions of the MCV |
657 | * entries that satisfy MCV OP PATTERN. These fractions contribute |
658 | * directly to the result selectivity. Also add up the total fraction |
659 | * represented by MCV entries. |
660 | */ |
661 | mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true, |
662 | &sumcommon); |
663 | |
664 | /* |
665 | * Now merge the results from the MCV and histogram calculations, |
666 | * realizing that the histogram covers only the non-null values that |
667 | * are not listed in MCV. |
668 | */ |
669 | selec *= 1.0 - nullfrac - sumcommon; |
670 | selec += mcv_selec; |
671 | result = selec; |
672 | } |
673 | |
674 | /* now adjust if we wanted not-match rather than match */ |
675 | if (negate) |
676 | result = 1.0 - result - nullfrac; |
677 | |
678 | /* result should be in range, but make sure... */ |
679 | CLAMP_PROBABILITY(result); |
680 | |
681 | if (prefix) |
682 | { |
683 | pfree(DatumGetPointer(prefix->constvalue)); |
684 | pfree(prefix); |
685 | } |
686 | |
687 | ReleaseVariableStats(vardata); |
688 | |
689 | return result; |
690 | } |
691 | |
692 | /* |
693 | * Fix impedance mismatch between SQL-callable functions and patternsel_common |
694 | */ |
695 | static double |
696 | patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) |
697 | { |
698 | PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); |
699 | Oid operator = PG_GETARG_OID(1); |
700 | List *args = (List *) PG_GETARG_POINTER(2); |
701 | int varRelid = PG_GETARG_INT32(3); |
702 | Oid collation = PG_GET_COLLATION(); |
703 | |
704 | /* |
705 | * If this is for a NOT LIKE or similar operator, get the corresponding |
706 | * positive-match operator and work with that. |
707 | */ |
708 | if (negate) |
709 | { |
710 | operator = get_negator(operator); |
711 | if (!OidIsValid(operator)) |
712 | elog(ERROR, "patternsel called for operator without a negator" ); |
713 | } |
714 | |
715 | return patternsel_common(root, |
716 | operator, |
717 | InvalidOid, |
718 | args, |
719 | varRelid, |
720 | collation, |
721 | ptype, |
722 | negate); |
723 | } |
724 | |
725 | /* |
726 | * regexeqsel - Selectivity of regular-expression pattern match. |
727 | */ |
728 | Datum |
729 | regexeqsel(PG_FUNCTION_ARGS) |
730 | { |
731 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false)); |
732 | } |
733 | |
734 | /* |
735 | * icregexeqsel - Selectivity of case-insensitive regex match. |
736 | */ |
737 | Datum |
738 | icregexeqsel(PG_FUNCTION_ARGS) |
739 | { |
740 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false)); |
741 | } |
742 | |
743 | /* |
744 | * likesel - Selectivity of LIKE pattern match. |
745 | */ |
746 | Datum |
747 | likesel(PG_FUNCTION_ARGS) |
748 | { |
749 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false)); |
750 | } |
751 | |
752 | /* |
753 | * prefixsel - selectivity of prefix operator |
754 | */ |
755 | Datum |
756 | prefixsel(PG_FUNCTION_ARGS) |
757 | { |
758 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false)); |
759 | } |
760 | |
761 | /* |
762 | * |
763 | * iclikesel - Selectivity of ILIKE pattern match. |
764 | */ |
765 | Datum |
766 | iclikesel(PG_FUNCTION_ARGS) |
767 | { |
768 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false)); |
769 | } |
770 | |
771 | /* |
772 | * regexnesel - Selectivity of regular-expression pattern non-match. |
773 | */ |
774 | Datum |
775 | regexnesel(PG_FUNCTION_ARGS) |
776 | { |
777 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true)); |
778 | } |
779 | |
780 | /* |
781 | * icregexnesel - Selectivity of case-insensitive regex non-match. |
782 | */ |
783 | Datum |
784 | icregexnesel(PG_FUNCTION_ARGS) |
785 | { |
786 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true)); |
787 | } |
788 | |
789 | /* |
790 | * nlikesel - Selectivity of LIKE pattern non-match. |
791 | */ |
792 | Datum |
793 | nlikesel(PG_FUNCTION_ARGS) |
794 | { |
795 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true)); |
796 | } |
797 | |
798 | /* |
799 | * icnlikesel - Selectivity of ILIKE pattern non-match. |
800 | */ |
801 | Datum |
802 | icnlikesel(PG_FUNCTION_ARGS) |
803 | { |
804 | PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true)); |
805 | } |
806 | |
807 | /* |
808 | * patternjoinsel - Generic code for pattern-match join selectivity. |
809 | */ |
810 | static double |
811 | patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) |
812 | { |
813 | /* For the moment we just punt. */ |
814 | return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL; |
815 | } |
816 | |
817 | /* |
818 | * regexeqjoinsel - Join selectivity of regular-expression pattern match. |
819 | */ |
820 | Datum |
821 | regexeqjoinsel(PG_FUNCTION_ARGS) |
822 | { |
823 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false)); |
824 | } |
825 | |
826 | /* |
827 | * icregexeqjoinsel - Join selectivity of case-insensitive regex match. |
828 | */ |
829 | Datum |
830 | icregexeqjoinsel(PG_FUNCTION_ARGS) |
831 | { |
832 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false)); |
833 | } |
834 | |
835 | /* |
836 | * likejoinsel - Join selectivity of LIKE pattern match. |
837 | */ |
838 | Datum |
839 | likejoinsel(PG_FUNCTION_ARGS) |
840 | { |
841 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false)); |
842 | } |
843 | |
844 | /* |
845 | * prefixjoinsel - Join selectivity of prefix operator |
846 | */ |
847 | Datum |
848 | prefixjoinsel(PG_FUNCTION_ARGS) |
849 | { |
850 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false)); |
851 | } |
852 | |
853 | /* |
854 | * iclikejoinsel - Join selectivity of ILIKE pattern match. |
855 | */ |
856 | Datum |
857 | iclikejoinsel(PG_FUNCTION_ARGS) |
858 | { |
859 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false)); |
860 | } |
861 | |
862 | /* |
863 | * regexnejoinsel - Join selectivity of regex non-match. |
864 | */ |
865 | Datum |
866 | regexnejoinsel(PG_FUNCTION_ARGS) |
867 | { |
868 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true)); |
869 | } |
870 | |
871 | /* |
872 | * icregexnejoinsel - Join selectivity of case-insensitive regex non-match. |
873 | */ |
874 | Datum |
875 | icregexnejoinsel(PG_FUNCTION_ARGS) |
876 | { |
877 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true)); |
878 | } |
879 | |
880 | /* |
881 | * nlikejoinsel - Join selectivity of LIKE pattern non-match. |
882 | */ |
883 | Datum |
884 | nlikejoinsel(PG_FUNCTION_ARGS) |
885 | { |
886 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true)); |
887 | } |
888 | |
889 | /* |
890 | * icnlikejoinsel - Join selectivity of ILIKE pattern non-match. |
891 | */ |
892 | Datum |
893 | icnlikejoinsel(PG_FUNCTION_ARGS) |
894 | { |
895 | PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true)); |
896 | } |
897 | |
898 | |
899 | /*------------------------------------------------------------------------- |
900 | * |
901 | * Pattern analysis functions |
902 | * |
903 | * These routines support analysis of LIKE and regular-expression patterns |
904 | * by the planner/optimizer. It's important that they agree with the |
905 | * regular-expression code in backend/regex/ and the LIKE code in |
906 | * backend/utils/adt/like.c. Also, the computation of the fixed prefix |
907 | * must be conservative: if we report a string longer than the true fixed |
908 | * prefix, the query may produce actually wrong answers, rather than just |
909 | * getting a bad selectivity estimate! |
910 | * |
911 | *------------------------------------------------------------------------- |
912 | */ |
913 | |
914 | /* |
915 | * Extract the fixed prefix, if any, for a pattern. |
916 | * |
917 | * *prefix is set to a palloc'd prefix string (in the form of a Const node), |
918 | * or to NULL if no fixed prefix exists for the pattern. |
919 | * If rest_selec is not NULL, *rest_selec is set to an estimate of the |
920 | * selectivity of the remainder of the pattern (without any fixed prefix). |
921 | * The prefix Const has the same type (TEXT or BYTEA) as the input pattern. |
922 | * |
923 | * The return value distinguishes no fixed prefix, a partial prefix, |
924 | * or an exact-match-only pattern. |
925 | */ |
926 | |
927 | static Pattern_Prefix_Status |
928 | like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, |
929 | Const **prefix_const, Selectivity *rest_selec) |
930 | { |
931 | char *match; |
932 | char *patt; |
933 | int pattlen; |
934 | Oid typeid = patt_const->consttype; |
935 | int pos, |
936 | match_pos; |
937 | bool is_multibyte = (pg_database_encoding_max_length() > 1); |
938 | pg_locale_t locale = 0; |
939 | bool locale_is_c = false; |
940 | |
941 | /* the right-hand const is type text or bytea */ |
942 | Assert(typeid == BYTEAOID || typeid == TEXTOID); |
943 | |
944 | if (case_insensitive) |
945 | { |
946 | if (typeid == BYTEAOID) |
947 | ereport(ERROR, |
948 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
949 | errmsg("case insensitive matching not supported on type bytea" ))); |
950 | |
951 | /* If case-insensitive, we need locale info */ |
952 | if (lc_ctype_is_c(collation)) |
953 | locale_is_c = true; |
954 | else if (collation != DEFAULT_COLLATION_OID) |
955 | { |
956 | if (!OidIsValid(collation)) |
957 | { |
958 | /* |
959 | * This typically means that the parser could not resolve a |
960 | * conflict of implicit collations, so report it that way. |
961 | */ |
962 | ereport(ERROR, |
963 | (errcode(ERRCODE_INDETERMINATE_COLLATION), |
964 | errmsg("could not determine which collation to use for ILIKE" ), |
965 | errhint("Use the COLLATE clause to set the collation explicitly." ))); |
966 | } |
967 | locale = pg_newlocale_from_collation(collation); |
968 | } |
969 | } |
970 | |
971 | if (typeid != BYTEAOID) |
972 | { |
973 | patt = TextDatumGetCString(patt_const->constvalue); |
974 | pattlen = strlen(patt); |
975 | } |
976 | else |
977 | { |
978 | bytea *bstr = DatumGetByteaPP(patt_const->constvalue); |
979 | |
980 | pattlen = VARSIZE_ANY_EXHDR(bstr); |
981 | patt = (char *) palloc(pattlen); |
982 | memcpy(patt, VARDATA_ANY(bstr), pattlen); |
983 | Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue)); |
984 | } |
985 | |
986 | match = palloc(pattlen + 1); |
987 | match_pos = 0; |
988 | for (pos = 0; pos < pattlen; pos++) |
989 | { |
990 | /* % and _ are wildcard characters in LIKE */ |
991 | if (patt[pos] == '%' || |
992 | patt[pos] == '_') |
993 | break; |
994 | |
995 | /* Backslash escapes the next character */ |
996 | if (patt[pos] == '\\') |
997 | { |
998 | pos++; |
999 | if (pos >= pattlen) |
1000 | break; |
1001 | } |
1002 | |
1003 | /* Stop if case-varying character (it's sort of a wildcard) */ |
1004 | if (case_insensitive && |
1005 | pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) |
1006 | break; |
1007 | |
1008 | match[match_pos++] = patt[pos]; |
1009 | } |
1010 | |
1011 | match[match_pos] = '\0'; |
1012 | |
1013 | if (typeid != BYTEAOID) |
1014 | *prefix_const = string_to_const(match, typeid); |
1015 | else |
1016 | *prefix_const = string_to_bytea_const(match, match_pos); |
1017 | |
1018 | if (rest_selec != NULL) |
1019 | *rest_selec = like_selectivity(&patt[pos], pattlen - pos, |
1020 | case_insensitive); |
1021 | |
1022 | pfree(patt); |
1023 | pfree(match); |
1024 | |
1025 | /* in LIKE, an empty pattern is an exact match! */ |
1026 | if (pos == pattlen) |
1027 | return Pattern_Prefix_Exact; /* reached end of pattern, so exact */ |
1028 | |
1029 | if (match_pos > 0) |
1030 | return Pattern_Prefix_Partial; |
1031 | |
1032 | return Pattern_Prefix_None; |
1033 | } |
1034 | |
1035 | static Pattern_Prefix_Status |
1036 | regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, |
1037 | Const **prefix_const, Selectivity *rest_selec) |
1038 | { |
1039 | Oid typeid = patt_const->consttype; |
1040 | char *prefix; |
1041 | bool exact; |
1042 | |
1043 | /* |
1044 | * Should be unnecessary, there are no bytea regex operators defined. As |
1045 | * such, it should be noted that the rest of this function has *not* been |
1046 | * made safe for binary (possibly NULL containing) strings. |
1047 | */ |
1048 | if (typeid == BYTEAOID) |
1049 | ereport(ERROR, |
1050 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
1051 | errmsg("regular-expression matching not supported on type bytea" ))); |
1052 | |
1053 | /* Use the regexp machinery to extract the prefix, if any */ |
1054 | prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), |
1055 | case_insensitive, collation, |
1056 | &exact); |
1057 | |
1058 | if (prefix == NULL) |
1059 | { |
1060 | *prefix_const = NULL; |
1061 | |
1062 | if (rest_selec != NULL) |
1063 | { |
1064 | char *patt = TextDatumGetCString(patt_const->constvalue); |
1065 | |
1066 | *rest_selec = regex_selectivity(patt, strlen(patt), |
1067 | case_insensitive, |
1068 | 0); |
1069 | pfree(patt); |
1070 | } |
1071 | |
1072 | return Pattern_Prefix_None; |
1073 | } |
1074 | |
1075 | *prefix_const = string_to_const(prefix, typeid); |
1076 | |
1077 | if (rest_selec != NULL) |
1078 | { |
1079 | if (exact) |
1080 | { |
1081 | /* Exact match, so there's no additional selectivity */ |
1082 | *rest_selec = 1.0; |
1083 | } |
1084 | else |
1085 | { |
1086 | char *patt = TextDatumGetCString(patt_const->constvalue); |
1087 | |
1088 | *rest_selec = regex_selectivity(patt, strlen(patt), |
1089 | case_insensitive, |
1090 | strlen(prefix)); |
1091 | pfree(patt); |
1092 | } |
1093 | } |
1094 | |
1095 | pfree(prefix); |
1096 | |
1097 | if (exact) |
1098 | return Pattern_Prefix_Exact; /* pattern specifies exact match */ |
1099 | else |
1100 | return Pattern_Prefix_Partial; |
1101 | } |
1102 | |
1103 | static Pattern_Prefix_Status |
1104 | pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation, |
1105 | Const **prefix, Selectivity *rest_selec) |
1106 | { |
1107 | Pattern_Prefix_Status result; |
1108 | |
1109 | switch (ptype) |
1110 | { |
1111 | case Pattern_Type_Like: |
1112 | result = like_fixed_prefix(patt, false, collation, |
1113 | prefix, rest_selec); |
1114 | break; |
1115 | case Pattern_Type_Like_IC: |
1116 | result = like_fixed_prefix(patt, true, collation, |
1117 | prefix, rest_selec); |
1118 | break; |
1119 | case Pattern_Type_Regex: |
1120 | result = regex_fixed_prefix(patt, false, collation, |
1121 | prefix, rest_selec); |
1122 | break; |
1123 | case Pattern_Type_Regex_IC: |
1124 | result = regex_fixed_prefix(patt, true, collation, |
1125 | prefix, rest_selec); |
1126 | break; |
1127 | case Pattern_Type_Prefix: |
1128 | /* Prefix type work is trivial. */ |
1129 | result = Pattern_Prefix_Partial; |
1130 | *rest_selec = 1.0; /* all */ |
1131 | *prefix = makeConst(patt->consttype, |
1132 | patt->consttypmod, |
1133 | patt->constcollid, |
1134 | patt->constlen, |
1135 | datumCopy(patt->constvalue, |
1136 | patt->constbyval, |
1137 | patt->constlen), |
1138 | patt->constisnull, |
1139 | patt->constbyval); |
1140 | break; |
1141 | default: |
1142 | elog(ERROR, "unrecognized ptype: %d" , (int) ptype); |
1143 | result = Pattern_Prefix_None; /* keep compiler quiet */ |
1144 | break; |
1145 | } |
1146 | return result; |
1147 | } |
1148 | |
1149 | /* |
1150 | * Estimate the selectivity of a fixed prefix for a pattern match. |
1151 | * |
1152 | * A fixed prefix "foo" is estimated as the selectivity of the expression |
1153 | * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c). |
1154 | * |
1155 | * The selectivity estimate is with respect to the portion of the column |
1156 | * population represented by the histogram --- the caller must fold this |
1157 | * together with info about MCVs and NULLs. |
1158 | * |
1159 | * We use the >= and < operators from the specified btree opfamily to do the |
1160 | * estimation. The given variable and Const must be of the associated |
1161 | * datatype. |
1162 | * |
1163 | * XXX Note: we make use of the upper bound to estimate operator selectivity |
1164 | * even if the locale is such that we cannot rely on the upper-bound string. |
1165 | * The selectivity only needs to be approximately right anyway, so it seems |
1166 | * more useful to use the upper-bound code than not. |
1167 | */ |
1168 | static Selectivity |
1169 | prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, |
1170 | Oid vartype, Oid opfamily, Const *prefixcon) |
1171 | { |
1172 | Selectivity prefixsel; |
1173 | Oid cmpopr; |
1174 | FmgrInfo opproc; |
1175 | AttStatsSlot sslot; |
1176 | Const *greaterstrcon; |
1177 | Selectivity eq_sel; |
1178 | |
1179 | cmpopr = get_opfamily_member(opfamily, vartype, vartype, |
1180 | BTGreaterEqualStrategyNumber); |
1181 | if (cmpopr == InvalidOid) |
1182 | elog(ERROR, "no >= operator for opfamily %u" , opfamily); |
1183 | fmgr_info(get_opcode(cmpopr), &opproc); |
1184 | |
1185 | prefixsel = ineq_histogram_selectivity(root, vardata, |
1186 | &opproc, true, true, |
1187 | prefixcon->constvalue, |
1188 | prefixcon->consttype); |
1189 | |
1190 | if (prefixsel < 0.0) |
1191 | { |
1192 | /* No histogram is present ... return a suitable default estimate */ |
1193 | return DEFAULT_MATCH_SEL; |
1194 | } |
1195 | |
1196 | /*------- |
1197 | * If we can create a string larger than the prefix, say |
1198 | * "x < greaterstr". We try to generate the string referencing the |
1199 | * collation of the var's statistics, but if that's not available, |
1200 | * use DEFAULT_COLLATION_OID. |
1201 | *------- |
1202 | */ |
1203 | if (HeapTupleIsValid(vardata->statsTuple) && |
1204 | get_attstatsslot(&sslot, vardata->statsTuple, |
1205 | STATISTIC_KIND_HISTOGRAM, InvalidOid, 0)) |
1206 | /* sslot.stacoll is set up */ ; |
1207 | else |
1208 | sslot.stacoll = DEFAULT_COLLATION_OID; |
1209 | cmpopr = get_opfamily_member(opfamily, vartype, vartype, |
1210 | BTLessStrategyNumber); |
1211 | if (cmpopr == InvalidOid) |
1212 | elog(ERROR, "no < operator for opfamily %u" , opfamily); |
1213 | fmgr_info(get_opcode(cmpopr), &opproc); |
1214 | greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll); |
1215 | if (greaterstrcon) |
1216 | { |
1217 | Selectivity topsel; |
1218 | |
1219 | topsel = ineq_histogram_selectivity(root, vardata, |
1220 | &opproc, false, false, |
1221 | greaterstrcon->constvalue, |
1222 | greaterstrcon->consttype); |
1223 | |
1224 | /* ineq_histogram_selectivity worked before, it shouldn't fail now */ |
1225 | Assert(topsel >= 0.0); |
1226 | |
1227 | /* |
1228 | * Merge the two selectivities in the same way as for a range query |
1229 | * (see clauselist_selectivity()). Note that we don't need to worry |
1230 | * about double-exclusion of nulls, since ineq_histogram_selectivity |
1231 | * doesn't count those anyway. |
1232 | */ |
1233 | prefixsel = topsel + prefixsel - 1.0; |
1234 | } |
1235 | |
1236 | /* |
1237 | * If the prefix is long then the two bounding values might be too close |
1238 | * together for the histogram to distinguish them usefully, resulting in a |
1239 | * zero estimate (plus or minus roundoff error). To avoid returning a |
1240 | * ridiculously small estimate, compute the estimated selectivity for |
1241 | * "variable = 'foo'", and clamp to that. (Obviously, the resultant |
1242 | * estimate should be at least that.) |
1243 | * |
1244 | * We apply this even if we couldn't make a greater string. That case |
1245 | * suggests that the prefix is near the maximum possible, and thus |
1246 | * probably off the end of the histogram, and thus we probably got a very |
1247 | * small estimate from the >= condition; so we still need to clamp. |
1248 | */ |
1249 | cmpopr = get_opfamily_member(opfamily, vartype, vartype, |
1250 | BTEqualStrategyNumber); |
1251 | if (cmpopr == InvalidOid) |
1252 | elog(ERROR, "no = operator for opfamily %u" , opfamily); |
1253 | eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue, |
1254 | false, true, false); |
1255 | |
1256 | prefixsel = Max(prefixsel, eq_sel); |
1257 | |
1258 | return prefixsel; |
1259 | } |
1260 | |
1261 | |
1262 | /* |
1263 | * Estimate the selectivity of a pattern of the specified type. |
1264 | * Note that any fixed prefix of the pattern will have been removed already, |
1265 | * so actually we may be looking at just a fragment of the pattern. |
1266 | * |
1267 | * For now, we use a very simplistic approach: fixed characters reduce the |
1268 | * selectivity a good deal, character ranges reduce it a little, |
1269 | * wildcards (such as % for LIKE or .* for regex) increase it. |
1270 | */ |
1271 | |
1272 | #define FIXED_CHAR_SEL 0.20 /* about 1/5 */ |
1273 | #define CHAR_RANGE_SEL 0.25 |
1274 | #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */ |
1275 | #define FULL_WILDCARD_SEL 5.0 |
1276 | #define PARTIAL_WILDCARD_SEL 2.0 |
1277 | |
1278 | static Selectivity |
1279 | like_selectivity(const char *patt, int pattlen, bool case_insensitive) |
1280 | { |
1281 | Selectivity sel = 1.0; |
1282 | int pos; |
1283 | |
1284 | /* Skip any leading wildcard; it's already factored into initial sel */ |
1285 | for (pos = 0; pos < pattlen; pos++) |
1286 | { |
1287 | if (patt[pos] != '%' && patt[pos] != '_') |
1288 | break; |
1289 | } |
1290 | |
1291 | for (; pos < pattlen; pos++) |
1292 | { |
1293 | /* % and _ are wildcard characters in LIKE */ |
1294 | if (patt[pos] == '%') |
1295 | sel *= FULL_WILDCARD_SEL; |
1296 | else if (patt[pos] == '_') |
1297 | sel *= ANY_CHAR_SEL; |
1298 | else if (patt[pos] == '\\') |
1299 | { |
1300 | /* Backslash quotes the next character */ |
1301 | pos++; |
1302 | if (pos >= pattlen) |
1303 | break; |
1304 | sel *= FIXED_CHAR_SEL; |
1305 | } |
1306 | else |
1307 | sel *= FIXED_CHAR_SEL; |
1308 | } |
1309 | /* Could get sel > 1 if multiple wildcards */ |
1310 | if (sel > 1.0) |
1311 | sel = 1.0; |
1312 | return sel; |
1313 | } |
1314 | |
1315 | static Selectivity |
1316 | regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) |
1317 | { |
1318 | Selectivity sel = 1.0; |
1319 | int paren_depth = 0; |
1320 | int paren_pos = 0; /* dummy init to keep compiler quiet */ |
1321 | int pos; |
1322 | |
1323 | for (pos = 0; pos < pattlen; pos++) |
1324 | { |
1325 | if (patt[pos] == '(') |
1326 | { |
1327 | if (paren_depth == 0) |
1328 | paren_pos = pos; /* remember start of parenthesized item */ |
1329 | paren_depth++; |
1330 | } |
1331 | else if (patt[pos] == ')' && paren_depth > 0) |
1332 | { |
1333 | paren_depth--; |
1334 | if (paren_depth == 0) |
1335 | sel *= regex_selectivity_sub(patt + (paren_pos + 1), |
1336 | pos - (paren_pos + 1), |
1337 | case_insensitive); |
1338 | } |
1339 | else if (patt[pos] == '|' && paren_depth == 0) |
1340 | { |
1341 | /* |
1342 | * If unquoted | is present at paren level 0 in pattern, we have |
1343 | * multiple alternatives; sum their probabilities. |
1344 | */ |
1345 | sel += regex_selectivity_sub(patt + (pos + 1), |
1346 | pattlen - (pos + 1), |
1347 | case_insensitive); |
1348 | break; /* rest of pattern is now processed */ |
1349 | } |
1350 | else if (patt[pos] == '[') |
1351 | { |
1352 | bool negclass = false; |
1353 | |
1354 | if (patt[++pos] == '^') |
1355 | { |
1356 | negclass = true; |
1357 | pos++; |
1358 | } |
1359 | if (patt[pos] == ']') /* ']' at start of class is not special */ |
1360 | pos++; |
1361 | while (pos < pattlen && patt[pos] != ']') |
1362 | pos++; |
1363 | if (paren_depth == 0) |
1364 | sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL); |
1365 | } |
1366 | else if (patt[pos] == '.') |
1367 | { |
1368 | if (paren_depth == 0) |
1369 | sel *= ANY_CHAR_SEL; |
1370 | } |
1371 | else if (patt[pos] == '*' || |
1372 | patt[pos] == '?' || |
1373 | patt[pos] == '+') |
1374 | { |
1375 | /* Ought to be smarter about quantifiers... */ |
1376 | if (paren_depth == 0) |
1377 | sel *= PARTIAL_WILDCARD_SEL; |
1378 | } |
1379 | else if (patt[pos] == '{') |
1380 | { |
1381 | while (pos < pattlen && patt[pos] != '}') |
1382 | pos++; |
1383 | if (paren_depth == 0) |
1384 | sel *= PARTIAL_WILDCARD_SEL; |
1385 | } |
1386 | else if (patt[pos] == '\\') |
1387 | { |
1388 | /* backslash quotes the next character */ |
1389 | pos++; |
1390 | if (pos >= pattlen) |
1391 | break; |
1392 | if (paren_depth == 0) |
1393 | sel *= FIXED_CHAR_SEL; |
1394 | } |
1395 | else |
1396 | { |
1397 | if (paren_depth == 0) |
1398 | sel *= FIXED_CHAR_SEL; |
1399 | } |
1400 | } |
1401 | /* Could get sel > 1 if multiple wildcards */ |
1402 | if (sel > 1.0) |
1403 | sel = 1.0; |
1404 | return sel; |
1405 | } |
1406 | |
1407 | static Selectivity |
1408 | regex_selectivity(const char *patt, int pattlen, bool case_insensitive, |
1409 | int fixed_prefix_len) |
1410 | { |
1411 | Selectivity sel; |
1412 | |
1413 | /* If patt doesn't end with $, consider it to have a trailing wildcard */ |
1414 | if (pattlen > 0 && patt[pattlen - 1] == '$' && |
1415 | (pattlen == 1 || patt[pattlen - 2] != '\\')) |
1416 | { |
1417 | /* has trailing $ */ |
1418 | sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive); |
1419 | } |
1420 | else |
1421 | { |
1422 | /* no trailing $ */ |
1423 | sel = regex_selectivity_sub(patt, pattlen, case_insensitive); |
1424 | sel *= FULL_WILDCARD_SEL; |
1425 | } |
1426 | |
1427 | /* If there's a fixed prefix, discount its selectivity */ |
1428 | if (fixed_prefix_len > 0) |
1429 | sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len); |
1430 | |
1431 | /* Make sure result stays in range */ |
1432 | CLAMP_PROBABILITY(sel); |
1433 | return sel; |
1434 | } |
1435 | |
1436 | /* |
1437 | * Check whether char is a letter (and, hence, subject to case-folding) |
1438 | * |
1439 | * In multibyte character sets or with ICU, we can't use isalpha, and it does |
1440 | * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha. |
1441 | * Instead, just assume any non-ASCII char is potentially case-varying, and |
1442 | * hard-wire knowledge of which ASCII chars are letters. |
1443 | */ |
1444 | static int |
1445 | pattern_char_isalpha(char c, bool is_multibyte, |
1446 | pg_locale_t locale, bool locale_is_c) |
1447 | { |
1448 | if (locale_is_c) |
1449 | return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); |
1450 | else if (is_multibyte && IS_HIGHBIT_SET(c)) |
1451 | return true; |
1452 | else if (locale && locale->provider == COLLPROVIDER_ICU) |
1453 | return IS_HIGHBIT_SET(c) || |
1454 | (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); |
1455 | #ifdef HAVE_LOCALE_T |
1456 | else if (locale && locale->provider == COLLPROVIDER_LIBC) |
1457 | return isalpha_l((unsigned char) c, locale->info.lt); |
1458 | #endif |
1459 | else |
1460 | return isalpha((unsigned char) c); |
1461 | } |
1462 | |
1463 | |
1464 | /* |
1465 | * For bytea, the increment function need only increment the current byte |
1466 | * (there are no multibyte characters to worry about). |
1467 | */ |
1468 | static bool |
1469 | byte_increment(unsigned char *ptr, int len) |
1470 | { |
1471 | if (*ptr >= 255) |
1472 | return false; |
1473 | (*ptr)++; |
1474 | return true; |
1475 | } |
1476 | |
1477 | /* |
1478 | * Try to generate a string greater than the given string or any |
1479 | * string it is a prefix of. If successful, return a palloc'd string |
1480 | * in the form of a Const node; else return NULL. |
1481 | * |
1482 | * The caller must provide the appropriate "less than" comparison function |
1483 | * for testing the strings, along with the collation to use. |
1484 | * |
1485 | * The key requirement here is that given a prefix string, say "foo", |
1486 | * we must be able to generate another string "fop" that is greater than |
1487 | * all strings "foobar" starting with "foo". We can test that we have |
1488 | * generated a string greater than the prefix string, but in non-C collations |
1489 | * that is not a bulletproof guarantee that an extension of the string might |
1490 | * not sort after it; an example is that "foo " is less than "foo!", but it |
1491 | * is not clear that a "dictionary" sort ordering will consider "foo!" less |
1492 | * than "foo bar". CAUTION: Therefore, this function should be used only for |
1493 | * estimation purposes when working in a non-C collation. |
1494 | * |
1495 | * To try to catch most cases where an extended string might otherwise sort |
1496 | * before the result value, we determine which of the strings "Z", "z", "y", |
1497 | * and "9" is seen as largest by the collation, and append that to the given |
1498 | * prefix before trying to find a string that compares as larger. |
1499 | * |
1500 | * To search for a greater string, we repeatedly "increment" the rightmost |
1501 | * character, using an encoding-specific character incrementer function. |
1502 | * When it's no longer possible to increment the last character, we truncate |
1503 | * off that character and start incrementing the next-to-rightmost. |
1504 | * For example, if "z" were the last character in the sort order, then we |
1505 | * could produce "foo" as a string greater than "fonz". |
1506 | * |
1507 | * This could be rather slow in the worst case, but in most cases we |
1508 | * won't have to try more than one or two strings before succeeding. |
1509 | * |
1510 | * Note that it's important for the character incrementer not to be too anal |
1511 | * about producing every possible character code, since in some cases the only |
1512 | * way to get a larger string is to increment a previous character position. |
1513 | * So we don't want to spend too much time trying every possible character |
1514 | * code at the last position. A good rule of thumb is to be sure that we |
1515 | * don't try more than 256*K values for a K-byte character (and definitely |
1516 | * not 256^K, which is what an exhaustive search would approach). |
1517 | */ |
1518 | static Const * |
1519 | make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) |
1520 | { |
1521 | Oid datatype = str_const->consttype; |
1522 | char *workstr; |
1523 | int len; |
1524 | Datum cmpstr; |
1525 | char *cmptxt = NULL; |
1526 | mbcharacter_incrementer charinc; |
1527 | |
1528 | /* |
1529 | * Get a modifiable copy of the prefix string in C-string format, and set |
1530 | * up the string we will compare to as a Datum. In C locale this can just |
1531 | * be the given prefix string, otherwise we need to add a suffix. Type |
1532 | * BYTEA sorts bytewise so it never needs a suffix either. |
1533 | */ |
1534 | if (datatype == BYTEAOID) |
1535 | { |
1536 | bytea *bstr = DatumGetByteaPP(str_const->constvalue); |
1537 | |
1538 | len = VARSIZE_ANY_EXHDR(bstr); |
1539 | workstr = (char *) palloc(len); |
1540 | memcpy(workstr, VARDATA_ANY(bstr), len); |
1541 | Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue)); |
1542 | cmpstr = str_const->constvalue; |
1543 | } |
1544 | else |
1545 | { |
1546 | if (datatype == NAMEOID) |
1547 | workstr = DatumGetCString(DirectFunctionCall1(nameout, |
1548 | str_const->constvalue)); |
1549 | else |
1550 | workstr = TextDatumGetCString(str_const->constvalue); |
1551 | len = strlen(workstr); |
1552 | if (lc_collate_is_c(collation) || len == 0) |
1553 | cmpstr = str_const->constvalue; |
1554 | else |
1555 | { |
1556 | /* If first time through, determine the suffix to use */ |
1557 | static char suffixchar = 0; |
1558 | static Oid suffixcollation = 0; |
1559 | |
1560 | if (!suffixchar || suffixcollation != collation) |
1561 | { |
1562 | char *best; |
1563 | |
1564 | best = "Z" ; |
1565 | if (varstr_cmp(best, 1, "z" , 1, collation) < 0) |
1566 | best = "z" ; |
1567 | if (varstr_cmp(best, 1, "y" , 1, collation) < 0) |
1568 | best = "y" ; |
1569 | if (varstr_cmp(best, 1, "9" , 1, collation) < 0) |
1570 | best = "9" ; |
1571 | suffixchar = *best; |
1572 | suffixcollation = collation; |
1573 | } |
1574 | |
1575 | /* And build the string to compare to */ |
1576 | if (datatype == NAMEOID) |
1577 | { |
1578 | cmptxt = palloc(len + 2); |
1579 | memcpy(cmptxt, workstr, len); |
1580 | cmptxt[len] = suffixchar; |
1581 | cmptxt[len + 1] = '\0'; |
1582 | cmpstr = PointerGetDatum(cmptxt); |
1583 | } |
1584 | else |
1585 | { |
1586 | cmptxt = palloc(VARHDRSZ + len + 1); |
1587 | SET_VARSIZE(cmptxt, VARHDRSZ + len + 1); |
1588 | memcpy(VARDATA(cmptxt), workstr, len); |
1589 | *(VARDATA(cmptxt) + len) = suffixchar; |
1590 | cmpstr = PointerGetDatum(cmptxt); |
1591 | } |
1592 | } |
1593 | } |
1594 | |
1595 | /* Select appropriate character-incrementer function */ |
1596 | if (datatype == BYTEAOID) |
1597 | charinc = byte_increment; |
1598 | else |
1599 | charinc = pg_database_encoding_character_incrementer(); |
1600 | |
1601 | /* And search ... */ |
1602 | while (len > 0) |
1603 | { |
1604 | int charlen; |
1605 | unsigned char *lastchar; |
1606 | |
1607 | /* Identify the last character --- for bytea, just the last byte */ |
1608 | if (datatype == BYTEAOID) |
1609 | charlen = 1; |
1610 | else |
1611 | charlen = len - pg_mbcliplen(workstr, len, len - 1); |
1612 | lastchar = (unsigned char *) (workstr + len - charlen); |
1613 | |
1614 | /* |
1615 | * Try to generate a larger string by incrementing the last character |
1616 | * (for BYTEA, we treat each byte as a character). |
1617 | * |
1618 | * Note: the incrementer function is expected to return true if it's |
1619 | * generated a valid-per-the-encoding new character, otherwise false. |
1620 | * The contents of the character on false return are unspecified. |
1621 | */ |
1622 | while (charinc(lastchar, charlen)) |
1623 | { |
1624 | Const *workstr_const; |
1625 | |
1626 | if (datatype == BYTEAOID) |
1627 | workstr_const = string_to_bytea_const(workstr, len); |
1628 | else |
1629 | workstr_const = string_to_const(workstr, datatype); |
1630 | |
1631 | if (DatumGetBool(FunctionCall2Coll(ltproc, |
1632 | collation, |
1633 | cmpstr, |
1634 | workstr_const->constvalue))) |
1635 | { |
1636 | /* Successfully made a string larger than cmpstr */ |
1637 | if (cmptxt) |
1638 | pfree(cmptxt); |
1639 | pfree(workstr); |
1640 | return workstr_const; |
1641 | } |
1642 | |
1643 | /* No good, release unusable value and try again */ |
1644 | pfree(DatumGetPointer(workstr_const->constvalue)); |
1645 | pfree(workstr_const); |
1646 | } |
1647 | |
1648 | /* |
1649 | * No luck here, so truncate off the last character and try to |
1650 | * increment the next one. |
1651 | */ |
1652 | len -= charlen; |
1653 | workstr[len] = '\0'; |
1654 | } |
1655 | |
1656 | /* Failed... */ |
1657 | if (cmptxt) |
1658 | pfree(cmptxt); |
1659 | pfree(workstr); |
1660 | |
1661 | return NULL; |
1662 | } |
1663 | |
1664 | /* |
1665 | * Generate a Datum of the appropriate type from a C string. |
1666 | * Note that all of the supported types are pass-by-ref, so the |
1667 | * returned value should be pfree'd if no longer needed. |
1668 | */ |
1669 | static Datum |
1670 | string_to_datum(const char *str, Oid datatype) |
1671 | { |
1672 | Assert(str != NULL); |
1673 | |
1674 | /* |
1675 | * We cheat a little by assuming that CStringGetTextDatum() will do for |
1676 | * bpchar and varchar constants too... |
1677 | */ |
1678 | if (datatype == NAMEOID) |
1679 | return DirectFunctionCall1(namein, CStringGetDatum(str)); |
1680 | else if (datatype == BYTEAOID) |
1681 | return DirectFunctionCall1(byteain, CStringGetDatum(str)); |
1682 | else |
1683 | return CStringGetTextDatum(str); |
1684 | } |
1685 | |
1686 | /* |
1687 | * Generate a Const node of the appropriate type from a C string. |
1688 | */ |
1689 | static Const * |
1690 | string_to_const(const char *str, Oid datatype) |
1691 | { |
1692 | Datum conval = string_to_datum(str, datatype); |
1693 | Oid collation; |
1694 | int constlen; |
1695 | |
1696 | /* |
1697 | * We only need to support a few datatypes here, so hard-wire properties |
1698 | * instead of incurring the expense of catalog lookups. |
1699 | */ |
1700 | switch (datatype) |
1701 | { |
1702 | case TEXTOID: |
1703 | case VARCHAROID: |
1704 | case BPCHAROID: |
1705 | collation = DEFAULT_COLLATION_OID; |
1706 | constlen = -1; |
1707 | break; |
1708 | |
1709 | case NAMEOID: |
1710 | collation = C_COLLATION_OID; |
1711 | constlen = NAMEDATALEN; |
1712 | break; |
1713 | |
1714 | case BYTEAOID: |
1715 | collation = InvalidOid; |
1716 | constlen = -1; |
1717 | break; |
1718 | |
1719 | default: |
1720 | elog(ERROR, "unexpected datatype in string_to_const: %u" , |
1721 | datatype); |
1722 | return NULL; |
1723 | } |
1724 | |
1725 | return makeConst(datatype, -1, collation, constlen, |
1726 | conval, false, false); |
1727 | } |
1728 | |
1729 | /* |
1730 | * Generate a Const node of bytea type from a binary C string and a length. |
1731 | */ |
1732 | static Const * |
1733 | string_to_bytea_const(const char *str, size_t str_len) |
1734 | { |
1735 | bytea *bstr = palloc(VARHDRSZ + str_len); |
1736 | Datum conval; |
1737 | |
1738 | memcpy(VARDATA(bstr), str, str_len); |
1739 | SET_VARSIZE(bstr, VARHDRSZ + str_len); |
1740 | conval = PointerGetDatum(bstr); |
1741 | |
1742 | return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false); |
1743 | } |
1744 | |