1/*-------------------------------------------------------------------------
2 *
3 * rangetypes_selfuncs.c
4 * Functions for selectivity estimation of range operators
5 *
6 * Estimates are based on histograms of lower and upper bounds, and the
7 * fraction of empty ranges.
8 *
9 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
11 *
12 *
13 * IDENTIFICATION
14 * src/backend/utils/adt/rangetypes_selfuncs.c
15 *
16 *-------------------------------------------------------------------------
17 */
18#include "postgres.h"
19
20#include <math.h>
21
22#include "access/htup_details.h"
23#include "catalog/pg_operator.h"
24#include "catalog/pg_statistic.h"
25#include "catalog/pg_type.h"
26#include "utils/float.h"
27#include "utils/fmgrprotos.h"
28#include "utils/lsyscache.h"
29#include "utils/rangetypes.h"
30#include "utils/selfuncs.h"
31#include "utils/typcache.h"
32
33static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
34 RangeType *constval, Oid operator);
35static double default_range_selectivity(Oid operator);
36static double calc_hist_selectivity(TypeCacheEntry *typcache,
37 VariableStatData *vardata, RangeType *constval,
38 Oid operator);
39static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
40 RangeBound *constbound,
41 RangeBound *hist, int hist_nvalues,
42 bool equal);
43static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
44 RangeBound *hist, int hist_length, bool equal);
45static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
46 RangeBound *hist1, RangeBound *hist2);
47static float8 get_len_position(double value, double hist1, double hist2);
48static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
49 RangeBound *bound2);
50static int length_hist_bsearch(Datum *length_hist_values,
51 int length_hist_nvalues, double value, bool equal);
52static double calc_length_hist_frac(Datum *length_hist_values,
53 int length_hist_nvalues, double length1, double length2, bool equal);
54static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
55 RangeBound *lower, RangeBound *upper,
56 RangeBound *hist_lower, int hist_nvalues,
57 Datum *length_hist_values, int length_hist_nvalues);
58static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
59 RangeBound *lower, RangeBound *upper,
60 RangeBound *hist_lower, int hist_nvalues,
61 Datum *length_hist_values, int length_hist_nvalues);
62
63/*
64 * Returns a default selectivity estimate for given operator, when we don't
65 * have statistics or cannot use them for some reason.
66 */
67static double
68default_range_selectivity(Oid operator)
69{
70 switch (operator)
71 {
72 case OID_RANGE_OVERLAP_OP:
73 return 0.01;
74
75 case OID_RANGE_CONTAINS_OP:
76 case OID_RANGE_CONTAINED_OP:
77 return 0.005;
78
79 case OID_RANGE_CONTAINS_ELEM_OP:
80 case OID_RANGE_ELEM_CONTAINED_OP:
81
82 /*
83 * "range @> elem" is more or less identical to a scalar
84 * inequality "A >= b AND A <= c".
85 */
86 return DEFAULT_RANGE_INEQ_SEL;
87
88 case OID_RANGE_LESS_OP:
89 case OID_RANGE_LESS_EQUAL_OP:
90 case OID_RANGE_GREATER_OP:
91 case OID_RANGE_GREATER_EQUAL_OP:
92 case OID_RANGE_LEFT_OP:
93 case OID_RANGE_RIGHT_OP:
94 case OID_RANGE_OVERLAPS_LEFT_OP:
95 case OID_RANGE_OVERLAPS_RIGHT_OP:
96 /* these are similar to regular scalar inequalities */
97 return DEFAULT_INEQ_SEL;
98
99 default:
100 /* all range operators should be handled above, but just in case */
101 return 0.01;
102 }
103}
104
105/*
106 * rangesel -- restriction selectivity for range operators
107 */
108Datum
109rangesel(PG_FUNCTION_ARGS)
110{
111 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
112 Oid operator = PG_GETARG_OID(1);
113 List *args = (List *) PG_GETARG_POINTER(2);
114 int varRelid = PG_GETARG_INT32(3);
115 VariableStatData vardata;
116 Node *other;
117 bool varonleft;
118 Selectivity selec;
119 TypeCacheEntry *typcache = NULL;
120 RangeType *constrange = NULL;
121
122 /*
123 * If expression is not (variable op something) or (something op
124 * variable), then punt and return a default estimate.
125 */
126 if (!get_restriction_variable(root, args, varRelid,
127 &vardata, &other, &varonleft))
128 PG_RETURN_FLOAT8(default_range_selectivity(operator));
129
130 /*
131 * Can't do anything useful if the something is not a constant, either.
132 */
133 if (!IsA(other, Const))
134 {
135 ReleaseVariableStats(vardata);
136 PG_RETURN_FLOAT8(default_range_selectivity(operator));
137 }
138
139 /*
140 * All the range operators are strict, so we can cope with a NULL constant
141 * right away.
142 */
143 if (((Const *) other)->constisnull)
144 {
145 ReleaseVariableStats(vardata);
146 PG_RETURN_FLOAT8(0.0);
147 }
148
149 /*
150 * If var is on the right, commute the operator, so that we can assume the
151 * var is on the left in what follows.
152 */
153 if (!varonleft)
154 {
155 /* we have other Op var, commute to make var Op other */
156 operator = get_commutator(operator);
157 if (!operator)
158 {
159 /* Use default selectivity (should we raise an error instead?) */
160 ReleaseVariableStats(vardata);
161 PG_RETURN_FLOAT8(default_range_selectivity(operator));
162 }
163 }
164
165 /*
166 * OK, there's a Var and a Const we're dealing with here. We need the
167 * Const to be of same range type as the column, else we can't do anything
168 * useful. (Such cases will likely fail at runtime, but here we'd rather
169 * just return a default estimate.)
170 *
171 * If the operator is "range @> element", the constant should be of the
172 * element type of the range column. Convert it to a range that includes
173 * only that single point, so that we don't need special handling for that
174 * in what follows.
175 */
176 if (operator == OID_RANGE_CONTAINS_ELEM_OP)
177 {
178 typcache = range_get_typcache(fcinfo, vardata.vartype);
179
180 if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
181 {
182 RangeBound lower,
183 upper;
184
185 lower.inclusive = true;
186 lower.val = ((Const *) other)->constvalue;
187 lower.infinite = false;
188 lower.lower = true;
189 upper.inclusive = true;
190 upper.val = ((Const *) other)->constvalue;
191 upper.infinite = false;
192 upper.lower = false;
193 constrange = range_serialize(typcache, &lower, &upper, false);
194 }
195 }
196 else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
197 {
198 /*
199 * Here, the Var is the elem, not the range. For now we just punt and
200 * return the default estimate. In future we could disassemble the
201 * range constant and apply scalarineqsel ...
202 */
203 }
204 else if (((Const *) other)->consttype == vardata.vartype)
205 {
206 /* Both sides are the same range type */
207 typcache = range_get_typcache(fcinfo, vardata.vartype);
208
209 constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
210 }
211
212 /*
213 * If we got a valid constant on one side of the operator, proceed to
214 * estimate using statistics. Otherwise punt and return a default constant
215 * estimate. Note that calc_rangesel need not handle
216 * OID_RANGE_ELEM_CONTAINED_OP.
217 */
218 if (constrange)
219 selec = calc_rangesel(typcache, &vardata, constrange, operator);
220 else
221 selec = default_range_selectivity(operator);
222
223 ReleaseVariableStats(vardata);
224
225 CLAMP_PROBABILITY(selec);
226
227 PG_RETURN_FLOAT8((float8) selec);
228}
229
230static double
231calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
232 RangeType *constval, Oid operator)
233{
234 double hist_selec;
235 double selec;
236 float4 empty_frac,
237 null_frac;
238
239 /*
240 * First look up the fraction of NULLs and empty ranges from pg_statistic.
241 */
242 if (HeapTupleIsValid(vardata->statsTuple))
243 {
244 Form_pg_statistic stats;
245 AttStatsSlot sslot;
246
247 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
248 null_frac = stats->stanullfrac;
249
250 /* Try to get fraction of empty ranges */
251 if (get_attstatsslot(&sslot, vardata->statsTuple,
252 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
253 InvalidOid,
254 ATTSTATSSLOT_NUMBERS))
255 {
256 if (sslot.nnumbers != 1)
257 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
258 empty_frac = sslot.numbers[0];
259 free_attstatsslot(&sslot);
260 }
261 else
262 {
263 /* No empty fraction statistic. Assume no empty ranges. */
264 empty_frac = 0.0;
265 }
266 }
267 else
268 {
269 /*
270 * No stats are available. Follow through the calculations below
271 * anyway, assuming no NULLs and no empty ranges. This still allows us
272 * to give a better-than-nothing estimate based on whether the
273 * constant is an empty range or not.
274 */
275 null_frac = 0.0;
276 empty_frac = 0.0;
277 }
278
279 if (RangeIsEmpty(constval))
280 {
281 /*
282 * An empty range matches all ranges, all empty ranges, or nothing,
283 * depending on the operator
284 */
285 switch (operator)
286 {
287 /* these return false if either argument is empty */
288 case OID_RANGE_OVERLAP_OP:
289 case OID_RANGE_OVERLAPS_LEFT_OP:
290 case OID_RANGE_OVERLAPS_RIGHT_OP:
291 case OID_RANGE_LEFT_OP:
292 case OID_RANGE_RIGHT_OP:
293 /* nothing is less than an empty range */
294 case OID_RANGE_LESS_OP:
295 selec = 0.0;
296 break;
297
298 /* only empty ranges can be contained by an empty range */
299 case OID_RANGE_CONTAINED_OP:
300 /* only empty ranges are <= an empty range */
301 case OID_RANGE_LESS_EQUAL_OP:
302 selec = empty_frac;
303 break;
304
305 /* everything contains an empty range */
306 case OID_RANGE_CONTAINS_OP:
307 /* everything is >= an empty range */
308 case OID_RANGE_GREATER_EQUAL_OP:
309 selec = 1.0;
310 break;
311
312 /* all non-empty ranges are > an empty range */
313 case OID_RANGE_GREATER_OP:
314 selec = 1.0 - empty_frac;
315 break;
316
317 /* an element cannot be empty */
318 case OID_RANGE_CONTAINS_ELEM_OP:
319 default:
320 elog(ERROR, "unexpected operator %u", operator);
321 selec = 0.0; /* keep compiler quiet */
322 break;
323 }
324 }
325 else
326 {
327 /*
328 * Calculate selectivity using bound histograms. If that fails for
329 * some reason, e.g no histogram in pg_statistic, use the default
330 * constant estimate for the fraction of non-empty values. This is
331 * still somewhat better than just returning the default estimate,
332 * because this still takes into account the fraction of empty and
333 * NULL tuples, if we had statistics for them.
334 */
335 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
336 operator);
337 if (hist_selec < 0.0)
338 hist_selec = default_range_selectivity(operator);
339
340 /*
341 * Now merge the results for the empty ranges and histogram
342 * calculations, realizing that the histogram covers only the
343 * non-null, non-empty values.
344 */
345 if (operator == OID_RANGE_CONTAINED_OP)
346 {
347 /* empty is contained by anything non-empty */
348 selec = (1.0 - empty_frac) * hist_selec + empty_frac;
349 }
350 else
351 {
352 /* with any other operator, empty Op non-empty matches nothing */
353 selec = (1.0 - empty_frac) * hist_selec;
354 }
355 }
356
357 /* all range operators are strict */
358 selec *= (1.0 - null_frac);
359
360 /* result should be in range, but make sure... */
361 CLAMP_PROBABILITY(selec);
362
363 return selec;
364}
365
366/*
367 * Calculate range operator selectivity using histograms of range bounds.
368 *
369 * This estimate is for the portion of values that are not empty and not
370 * NULL.
371 */
372static double
373calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
374 RangeType *constval, Oid operator)
375{
376 AttStatsSlot hslot;
377 AttStatsSlot lslot;
378 int nhist;
379 RangeBound *hist_lower;
380 RangeBound *hist_upper;
381 int i;
382 RangeBound const_lower;
383 RangeBound const_upper;
384 bool empty;
385 double hist_selec;
386
387 /* Can't use the histogram with insecure range support functions */
388 if (!statistic_proc_security_check(vardata,
389 typcache->rng_cmp_proc_finfo.fn_oid))
390 return -1;
391 if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
392 !statistic_proc_security_check(vardata,
393 typcache->rng_subdiff_finfo.fn_oid))
394 return -1;
395
396 /* Try to get histogram of ranges */
397 if (!(HeapTupleIsValid(vardata->statsTuple) &&
398 get_attstatsslot(&hslot, vardata->statsTuple,
399 STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
400 ATTSTATSSLOT_VALUES)))
401 return -1.0;
402
403 /*
404 * Convert histogram of ranges into histograms of its lower and upper
405 * bounds.
406 */
407 nhist = hslot.nvalues;
408 hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
409 hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
410 for (i = 0; i < nhist; i++)
411 {
412 range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]),
413 &hist_lower[i], &hist_upper[i], &empty);
414 /* The histogram should not contain any empty ranges */
415 if (empty)
416 elog(ERROR, "bounds histogram contains an empty range");
417 }
418
419 /* @> and @< also need a histogram of range lengths */
420 if (operator == OID_RANGE_CONTAINS_OP ||
421 operator == OID_RANGE_CONTAINED_OP)
422 {
423 if (!(HeapTupleIsValid(vardata->statsTuple) &&
424 get_attstatsslot(&lslot, vardata->statsTuple,
425 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
426 InvalidOid,
427 ATTSTATSSLOT_VALUES)))
428 {
429 free_attstatsslot(&hslot);
430 return -1.0;
431 }
432
433 /* check that it's a histogram, not just a dummy entry */
434 if (lslot.nvalues < 2)
435 {
436 free_attstatsslot(&lslot);
437 free_attstatsslot(&hslot);
438 return -1.0;
439 }
440 }
441 else
442 memset(&lslot, 0, sizeof(lslot));
443
444 /* Extract the bounds of the constant value. */
445 range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
446 Assert(!empty);
447
448 /*
449 * Calculate selectivity comparing the lower or upper bound of the
450 * constant with the histogram of lower or upper bounds.
451 */
452 switch (operator)
453 {
454 case OID_RANGE_LESS_OP:
455
456 /*
457 * The regular b-tree comparison operators (<, <=, >, >=) compare
458 * the lower bounds first, and the upper bounds for values with
459 * equal lower bounds. Estimate that by comparing the lower bounds
460 * only. This gives a fairly accurate estimate assuming there
461 * aren't many rows with a lower bound equal to the constant's
462 * lower bound.
463 */
464 hist_selec =
465 calc_hist_selectivity_scalar(typcache, &const_lower,
466 hist_lower, nhist, false);
467 break;
468
469 case OID_RANGE_LESS_EQUAL_OP:
470 hist_selec =
471 calc_hist_selectivity_scalar(typcache, &const_lower,
472 hist_lower, nhist, true);
473 break;
474
475 case OID_RANGE_GREATER_OP:
476 hist_selec =
477 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
478 hist_lower, nhist, false);
479 break;
480
481 case OID_RANGE_GREATER_EQUAL_OP:
482 hist_selec =
483 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
484 hist_lower, nhist, true);
485 break;
486
487 case OID_RANGE_LEFT_OP:
488 /* var << const when upper(var) < lower(const) */
489 hist_selec =
490 calc_hist_selectivity_scalar(typcache, &const_lower,
491 hist_upper, nhist, false);
492 break;
493
494 case OID_RANGE_RIGHT_OP:
495 /* var >> const when lower(var) > upper(const) */
496 hist_selec =
497 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
498 hist_lower, nhist, true);
499 break;
500
501 case OID_RANGE_OVERLAPS_RIGHT_OP:
502 /* compare lower bounds */
503 hist_selec =
504 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
505 hist_lower, nhist, false);
506 break;
507
508 case OID_RANGE_OVERLAPS_LEFT_OP:
509 /* compare upper bounds */
510 hist_selec =
511 calc_hist_selectivity_scalar(typcache, &const_upper,
512 hist_upper, nhist, true);
513 break;
514
515 case OID_RANGE_OVERLAP_OP:
516 case OID_RANGE_CONTAINS_ELEM_OP:
517
518 /*
519 * A && B <=> NOT (A << B OR A >> B).
520 *
521 * Since A << B and A >> B are mutually exclusive events we can
522 * sum their probabilities to find probability of (A << B OR A >>
523 * B).
524 *
525 * "range @> elem" is equivalent to "range && [elem,elem]". The
526 * caller already constructed the singular range from the element
527 * constant, so just treat it the same as &&.
528 */
529 hist_selec =
530 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
531 nhist, false);
532 hist_selec +=
533 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
534 nhist, true));
535 hist_selec = 1.0 - hist_selec;
536 break;
537
538 case OID_RANGE_CONTAINS_OP:
539 hist_selec =
540 calc_hist_selectivity_contains(typcache, &const_lower,
541 &const_upper, hist_lower, nhist,
542 lslot.values, lslot.nvalues);
543 break;
544
545 case OID_RANGE_CONTAINED_OP:
546 if (const_lower.infinite)
547 {
548 /*
549 * Lower bound no longer matters. Just estimate the fraction
550 * with an upper bound <= const upper bound
551 */
552 hist_selec =
553 calc_hist_selectivity_scalar(typcache, &const_upper,
554 hist_upper, nhist, true);
555 }
556 else if (const_upper.infinite)
557 {
558 hist_selec =
559 1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
560 hist_lower, nhist, false);
561 }
562 else
563 {
564 hist_selec =
565 calc_hist_selectivity_contained(typcache, &const_lower,
566 &const_upper, hist_lower, nhist,
567 lslot.values, lslot.nvalues);
568 }
569 break;
570
571 default:
572 elog(ERROR, "unknown range operator %u", operator);
573 hist_selec = -1.0; /* keep compiler quiet */
574 break;
575 }
576
577 free_attstatsslot(&lslot);
578 free_attstatsslot(&hslot);
579
580 return hist_selec;
581}
582
583
584/*
585 * Look up the fraction of values less than (or equal, if 'equal' argument
586 * is true) a given const in a histogram of range bounds.
587 */
588static double
589calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
590 RangeBound *hist, int hist_nvalues, bool equal)
591{
592 Selectivity selec;
593 int index;
594
595 /*
596 * Find the histogram bin the given constant falls into. Estimate
597 * selectivity as the number of preceding whole bins.
598 */
599 index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
600 selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
601
602 /* Adjust using linear interpolation within the bin */
603 if (index >= 0 && index < hist_nvalues - 1)
604 selec += get_position(typcache, constbound, &hist[index],
605 &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
606
607 return selec;
608}
609
610/*
611 * Binary search on an array of range bounds. Returns greatest index of range
612 * bound in array which is less(less or equal) than given range bound. If all
613 * range bounds in array are greater or equal(greater) than given range bound,
614 * return -1. When "equal" flag is set conditions in brackets are used.
615 *
616 * This function is used in scalar operator selectivity estimation. Another
617 * goal of this function is to find a histogram bin where to stop
618 * interpolation of portion of bounds which are less or equal to given bound.
619 */
620static int
621rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
622 int hist_length, bool equal)
623{
624 int lower = -1,
625 upper = hist_length - 1,
626 cmp,
627 middle;
628
629 while (lower < upper)
630 {
631 middle = (lower + upper + 1) / 2;
632 cmp = range_cmp_bounds(typcache, &hist[middle], value);
633
634 if (cmp < 0 || (equal && cmp == 0))
635 lower = middle;
636 else
637 upper = middle - 1;
638 }
639 return lower;
640}
641
642
643/*
644 * Binary search on length histogram. Returns greatest index of range length in
645 * histogram which is less than (less than or equal) the given length value. If
646 * all lengths in the histogram are greater than (greater than or equal) the
647 * given length, returns -1.
648 */
649static int
650length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
651 double value, bool equal)
652{
653 int lower = -1,
654 upper = length_hist_nvalues - 1,
655 middle;
656
657 while (lower < upper)
658 {
659 double middleval;
660
661 middle = (lower + upper + 1) / 2;
662
663 middleval = DatumGetFloat8(length_hist_values[middle]);
664 if (middleval < value || (equal && middleval <= value))
665 lower = middle;
666 else
667 upper = middle - 1;
668 }
669 return lower;
670}
671
672/*
673 * Get relative position of value in histogram bin in [0,1] range.
674 */
675static float8
676get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
677 RangeBound *hist2)
678{
679 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
680 float8 position;
681
682 if (!hist1->infinite && !hist2->infinite)
683 {
684 float8 bin_width;
685
686 /*
687 * Both bounds are finite. Assuming the subtype's comparison function
688 * works sanely, the value must be finite, too, because it lies
689 * somewhere between the bounds. If it doesn't, just return something.
690 */
691 if (value->infinite)
692 return 0.5;
693
694 /* Can't interpolate without subdiff function */
695 if (!has_subdiff)
696 return 0.5;
697
698 /* Calculate relative position using subdiff function. */
699 bin_width = DatumGetFloat8(FunctionCall2Coll(
700 &typcache->rng_subdiff_finfo,
701 typcache->rng_collation,
702 hist2->val,
703 hist1->val));
704 if (bin_width <= 0.0)
705 return 0.5; /* zero width bin */
706
707 position = DatumGetFloat8(FunctionCall2Coll(
708 &typcache->rng_subdiff_finfo,
709 typcache->rng_collation,
710 value->val,
711 hist1->val))
712 / bin_width;
713
714 /* Relative position must be in [0,1] range */
715 position = Max(position, 0.0);
716 position = Min(position, 1.0);
717 return position;
718 }
719 else if (hist1->infinite && !hist2->infinite)
720 {
721 /*
722 * Lower bin boundary is -infinite, upper is finite. If the value is
723 * -infinite, return 0.0 to indicate it's equal to the lower bound.
724 * Otherwise return 1.0 to indicate it's infinitely far from the lower
725 * bound.
726 */
727 return ((value->infinite && value->lower) ? 0.0 : 1.0);
728 }
729 else if (!hist1->infinite && hist2->infinite)
730 {
731 /* same as above, but in reverse */
732 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
733 }
734 else
735 {
736 /*
737 * If both bin boundaries are infinite, they should be equal to each
738 * other, and the value should also be infinite and equal to both
739 * bounds. (But don't Assert that, to avoid crashing if a user creates
740 * a datatype with a broken comparison function).
741 *
742 * Assume the value to lie in the middle of the infinite bounds.
743 */
744 return 0.5;
745 }
746}
747
748
749/*
750 * Get relative position of value in a length histogram bin in [0,1] range.
751 */
752static double
753get_len_position(double value, double hist1, double hist2)
754{
755 if (!isinf(hist1) && !isinf(hist2))
756 {
757 /*
758 * Both bounds are finite. The value should be finite too, because it
759 * lies somewhere between the bounds. If it doesn't, just return
760 * something.
761 */
762 if (isinf(value))
763 return 0.5;
764
765 return 1.0 - (hist2 - value) / (hist2 - hist1);
766 }
767 else if (isinf(hist1) && !isinf(hist2))
768 {
769 /*
770 * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
771 * indicate the value is infinitely far from the lower bound.
772 */
773 return 1.0;
774 }
775 else if (isinf(hist1) && isinf(hist2))
776 {
777 /* same as above, but in reverse */
778 return 0.0;
779 }
780 else
781 {
782 /*
783 * If both bin boundaries are infinite, they should be equal to each
784 * other, and the value should also be infinite and equal to both
785 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
786 * the caller messes up)
787 *
788 * Assume the value to lie in the middle of the infinite bounds.
789 */
790 return 0.5;
791 }
792}
793
794/*
795 * Measure distance between two range bounds.
796 */
797static float8
798get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
799{
800 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
801
802 if (!bound1->infinite && !bound2->infinite)
803 {
804 /*
805 * No bounds are infinite, use subdiff function or return default
806 * value of 1.0 if no subdiff is available.
807 */
808 if (has_subdiff)
809 return
810 DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
811 typcache->rng_collation,
812 bound2->val,
813 bound1->val));
814 else
815 return 1.0;
816 }
817 else if (bound1->infinite && bound2->infinite)
818 {
819 /* Both bounds are infinite */
820 if (bound1->lower == bound2->lower)
821 return 0.0;
822 else
823 return get_float8_infinity();
824 }
825 else
826 {
827 /* One bound is infinite, another is not */
828 return get_float8_infinity();
829 }
830}
831
832/*
833 * Calculate the average of function P(x), in the interval [length1, length2],
834 * where P(x) is the fraction of tuples with length < x (or length <= x if
835 * 'equal' is true).
836 */
837static double
838calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
839 double length1, double length2, bool equal)
840{
841 double frac;
842 double A,
843 B,
844 PA,
845 PB;
846 double pos;
847 int i;
848 double area;
849
850 Assert(length2 >= length1);
851
852 if (length2 < 0.0)
853 return 0.0; /* shouldn't happen, but doesn't hurt to check */
854
855 /* All lengths in the table are <= infinite. */
856 if (isinf(length2) && equal)
857 return 1.0;
858
859 /*----------
860 * The average of a function between A and B can be calculated by the
861 * formula:
862 *
863 * B
864 * 1 /
865 * ------- | P(x)dx
866 * B - A /
867 * A
868 *
869 * The geometrical interpretation of the integral is the area under the
870 * graph of P(x). P(x) is defined by the length histogram. We calculate
871 * the area in a piecewise fashion, iterating through the length histogram
872 * bins. Each bin is a trapezoid:
873 *
874 * P(x2)
875 * /|
876 * / |
877 * P(x1)/ |
878 * | |
879 * | |
880 * ---+---+--
881 * x1 x2
882 *
883 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
884 * and P(x1) are the cumulative fraction of tuples at the boundaries.
885 *
886 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
887 *
888 * The first bin contains the lower bound passed by the caller, so we
889 * use linear interpolation between the previous and next histogram bin
890 * boundary to calculate P(x1). Likewise for the last bin: we use linear
891 * interpolation to calculate P(x2). For the bins in between, x1 and x2
892 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
893 * P(x1) = (bin index) / (number of bins)
894 * P(x2) = (bin index + 1 / (number of bins)
895 */
896
897 /* First bin, the one that contains lower bound */
898 i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
899 if (i >= length_hist_nvalues - 1)
900 return 1.0;
901
902 if (i < 0)
903 {
904 i = 0;
905 pos = 0.0;
906 }
907 else
908 {
909 /* interpolate length1's position in the bin */
910 pos = get_len_position(length1,
911 DatumGetFloat8(length_hist_values[i]),
912 DatumGetFloat8(length_hist_values[i + 1]));
913 }
914 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
915 B = length1;
916
917 /*
918 * In the degenerate case that length1 == length2, simply return
919 * P(length1). This is not merely an optimization: if length1 == length2,
920 * we'd divide by zero later on.
921 */
922 if (length2 == length1)
923 return PB;
924
925 /*
926 * Loop through all the bins, until we hit the last bin, the one that
927 * contains the upper bound. (if lower and upper bounds are in the same
928 * bin, this falls out immediately)
929 */
930 area = 0.0;
931 for (; i < length_hist_nvalues - 1; i++)
932 {
933 double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
934
935 /* check if we've reached the last bin */
936 if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
937 break;
938
939 /* the upper bound of previous bin is the lower bound of this bin */
940 A = B;
941 PA = PB;
942
943 B = bin_upper;
944 PB = (double) i / (double) (length_hist_nvalues - 1);
945
946 /*
947 * Add the area of this trapezoid to the total. The point of the
948 * if-check is to avoid NaN, in the corner case that PA == PB == 0,
949 * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
950 * 0) is zero, regardless of the width (B - A).
951 */
952 if (PA > 0 || PB > 0)
953 area += 0.5 * (PB + PA) * (B - A);
954 }
955
956 /* Last bin */
957 A = B;
958 PA = PB;
959
960 B = length2; /* last bin ends at the query upper bound */
961 if (i >= length_hist_nvalues - 1)
962 pos = 0.0;
963 else
964 {
965 if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
966 pos = 0.0;
967 else
968 pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
969 }
970 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
971
972 if (PA > 0 || PB > 0)
973 area += 0.5 * (PB + PA) * (B - A);
974
975 /*
976 * Ok, we have calculated the area, ie. the integral. Divide by width to
977 * get the requested average.
978 *
979 * Avoid NaN arising from infinite / infinite. This happens at least if
980 * length2 is infinite. It's not clear what the correct value would be in
981 * that case, so 0.5 seems as good as any value.
982 */
983 if (isinf(area) && isinf(length2))
984 frac = 0.5;
985 else
986 frac = area / (length2 - length1);
987
988 return frac;
989}
990
991/*
992 * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
993 * of ranges that fall within the constant lower and upper bounds. This uses
994 * the histograms of range lower bounds and range lengths, on the assumption
995 * that the range lengths are independent of the lower bounds.
996 *
997 * The caller has already checked that constant lower and upper bounds are
998 * finite.
999 */
1000static double
1001calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1002 RangeBound *lower, RangeBound *upper,
1003 RangeBound *hist_lower, int hist_nvalues,
1004 Datum *length_hist_values, int length_hist_nvalues)
1005{
1006 int i,
1007 upper_index;
1008 float8 prev_dist;
1009 double bin_width;
1010 double upper_bin_width;
1011 double sum_frac;
1012
1013 /*
1014 * Begin by finding the bin containing the upper bound, in the lower bound
1015 * histogram. Any range with a lower bound > constant upper bound can't
1016 * match, ie. there are no matches in bins greater than upper_index.
1017 */
1018 upper->inclusive = !upper->inclusive;
1019 upper->lower = true;
1020 upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1021 false);
1022
1023 /*
1024 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1025 * upper_index + 1) bin which is greater than upper bound of query range
1026 * using linear interpolation of subdiff function.
1027 */
1028 if (upper_index >= 0 && upper_index < hist_nvalues - 1)
1029 upper_bin_width = get_position(typcache, upper,
1030 &hist_lower[upper_index],
1031 &hist_lower[upper_index + 1]);
1032 else
1033 upper_bin_width = 0.0;
1034
1035 /*
1036 * In the loop, dist and prev_dist are the distance of the "current" bin's
1037 * lower and upper bounds from the constant upper bound.
1038 *
1039 * bin_width represents the width of the current bin. Normally it is 1.0,
1040 * meaning a full width bin, but can be less in the corner cases: start
1041 * and end of the loop. We start with bin_width = upper_bin_width, because
1042 * we begin at the bin containing the upper bound.
1043 */
1044 prev_dist = 0.0;
1045 bin_width = upper_bin_width;
1046
1047 sum_frac = 0.0;
1048 for (i = upper_index; i >= 0; i--)
1049 {
1050 double dist;
1051 double length_hist_frac;
1052 bool final_bin = false;
1053
1054 /*
1055 * dist -- distance from upper bound of query range to lower bound of
1056 * the current bin in the lower bound histogram. Or to the lower bound
1057 * of the constant range, if this is the final bin, containing the
1058 * constant lower bound.
1059 */
1060 if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1061 {
1062 dist = get_distance(typcache, lower, upper);
1063
1064 /*
1065 * Subtract from bin_width the portion of this bin that we want to
1066 * ignore.
1067 */
1068 bin_width -= get_position(typcache, lower, &hist_lower[i],
1069 &hist_lower[i + 1]);
1070 if (bin_width < 0.0)
1071 bin_width = 0.0;
1072 final_bin = true;
1073 }
1074 else
1075 dist = get_distance(typcache, &hist_lower[i], upper);
1076
1077 /*
1078 * Estimate the fraction of tuples in this bin that are narrow enough
1079 * to not exceed the distance to the upper bound of the query range.
1080 */
1081 length_hist_frac = calc_length_hist_frac(length_hist_values,
1082 length_hist_nvalues,
1083 prev_dist, dist, true);
1084
1085 /*
1086 * Add the fraction of tuples in this bin, with a suitable length, to
1087 * the total.
1088 */
1089 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1090
1091 if (final_bin)
1092 break;
1093
1094 bin_width = 1.0;
1095 prev_dist = dist;
1096 }
1097
1098 return sum_frac;
1099}
1100
1101/*
1102 * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1103 * of ranges that contain the constant lower and upper bounds. This uses
1104 * the histograms of range lower bounds and range lengths, on the assumption
1105 * that the range lengths are independent of the lower bounds.
1106 *
1107 * Note, this is "var @> const", ie. estimate the fraction of ranges that
1108 * contain the constant lower and upper bounds.
1109 */
1110static double
1111calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1112 RangeBound *lower, RangeBound *upper,
1113 RangeBound *hist_lower, int hist_nvalues,
1114 Datum *length_hist_values, int length_hist_nvalues)
1115{
1116 int i,
1117 lower_index;
1118 double bin_width,
1119 lower_bin_width;
1120 double sum_frac;
1121 float8 prev_dist;
1122
1123 /* Find the bin containing the lower bound of query range. */
1124 lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1125 true);
1126
1127 /*
1128 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1129 * lower_index + 1) bin which is greater than lower bound of query range
1130 * using linear interpolation of subdiff function.
1131 */
1132 if (lower_index >= 0 && lower_index < hist_nvalues - 1)
1133 lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1134 &hist_lower[lower_index + 1]);
1135 else
1136 lower_bin_width = 0.0;
1137
1138 /*
1139 * Loop through all the lower bound bins, smaller than the query lower
1140 * bound. In the loop, dist and prev_dist are the distance of the
1141 * "current" bin's lower and upper bounds from the constant upper bound.
1142 * We begin from query lower bound, and walk backwards, so the first bin's
1143 * upper bound is the query lower bound, and its distance to the query
1144 * upper bound is the length of the query range.
1145 *
1146 * bin_width represents the width of the current bin. Normally it is 1.0,
1147 * meaning a full width bin, except for the first bin, which is only
1148 * counted up to the constant lower bound.
1149 */
1150 prev_dist = get_distance(typcache, lower, upper);
1151 sum_frac = 0.0;
1152 bin_width = lower_bin_width;
1153 for (i = lower_index; i >= 0; i--)
1154 {
1155 float8 dist;
1156 double length_hist_frac;
1157
1158 /*
1159 * dist -- distance from upper bound of query range to current value
1160 * of lower bound histogram or lower bound of query range (if we've
1161 * reach it).
1162 */
1163 dist = get_distance(typcache, &hist_lower[i], upper);
1164
1165 /*
1166 * Get average fraction of length histogram which covers intervals
1167 * longer than (or equal to) distance to upper bound of query range.
1168 */
1169 length_hist_frac =
1170 1.0 - calc_length_hist_frac(length_hist_values,
1171 length_hist_nvalues,
1172 prev_dist, dist, false);
1173
1174 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1175
1176 bin_width = 1.0;
1177 prev_dist = dist;
1178 }
1179
1180 return sum_frac;
1181}
1182