rangetypes_selfuncs.c source code [PostgreSQL/src/backend/utils/adt/rangetypes_selfuncs.c]

1	/-------------------------------------------------------------------------*
2	*
3	* rangetypes_selfuncs.c
4	* Functions for selectivity estimation of range operators
5	*
6	* Estimates are based on histograms of lower and upper bounds, and the
7	* fraction of empty ranges.
8	*
9	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10	* Portions Copyright (c) 1994, Regents of the University of California
11	*
12	*
13	* IDENTIFICATION
14	* src/backend/utils/adt/rangetypes_selfuncs.c
15	*
16	*-------------------------------------------------------------------------
17	*/
18	#include "postgres.h"
19
20	#include <math.h>
21
22	#include "access/htup_details.h"
23	#include "catalog/pg_operator.h"
24	#include "catalog/pg_statistic.h"
25	#include "catalog/pg_type.h"
26	#include "utils/float.h"
27	#include "utils/fmgrprotos.h"
28	#include "utils/lsyscache.h"
29	#include "utils/rangetypes.h"
30	#include "utils/selfuncs.h"
31	#include "utils/typcache.h"
32
33	static double calc_rangesel(TypeCacheEntry typcache, VariableStatData vardata,
34	RangeType *constval, Oid operator);
35	static double default_range_selectivity(Oid operator);
36	static double calc_hist_selectivity(TypeCacheEntry *typcache,
37	VariableStatData vardata, RangeType constval,
38	Oid operator);
39	static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
40	RangeBound *constbound,
41	RangeBound hist, int* hist_nvalues,
42	bool equal);
43	static int rbound_bsearch(TypeCacheEntry typcache, RangeBound value,
44	RangeBound hist, int* hist_length, bool equal);
45	static float8 get_position(TypeCacheEntry typcache, RangeBound value,
46	RangeBound hist1, RangeBound hist2);
47	static float8 get_len_position(double value, double hist1, double hist2);
48	static float8 get_distance(TypeCacheEntry typcache, RangeBound bound1,
49	RangeBound *bound2);
50	static int length_hist_bsearch(Datum *length_hist_values,
51	int length_hist_nvalues, double value, bool equal);
52	static double calc_length_hist_frac(Datum *length_hist_values,
53	int length_hist_nvalues, double length1, double length2, bool equal);
54	static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
55	RangeBound lower, RangeBound upper,
56	RangeBound hist_lower, int* hist_nvalues,
57	Datum length_hist_values, int* length_hist_nvalues);
58	static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
59	RangeBound lower, RangeBound upper,
60	RangeBound hist_lower, int* hist_nvalues,
61	Datum length_hist_values, int* length_hist_nvalues);
62
63	/*
64	* Returns a default selectivity estimate for given operator, when we don't
65	* have statistics or cannot use them for some reason.
66	*/
67	static double
68	default_range_selectivity(Oid operator)
69	{
70	switch (operator)
71	{
72	case OID_RANGE_OVERLAP_OP:
73	return `0.01`;
74
75	case OID_RANGE_CONTAINS_OP:
76	case OID_RANGE_CONTAINED_OP:
77	return `0.005`;
78
79	case OID_RANGE_CONTAINS_ELEM_OP:
80	case OID_RANGE_ELEM_CONTAINED_OP:
81
82	/*
83	* "range @> elem" is more or less identical to a scalar
84	* inequality "A >= b AND A <= c".
85	*/
86	return DEFAULT_RANGE_INEQ_SEL;
87
88	case OID_RANGE_LESS_OP:
89	case OID_RANGE_LESS_EQUAL_OP:
90	case OID_RANGE_GREATER_OP:
91	case OID_RANGE_GREATER_EQUAL_OP:
92	case OID_RANGE_LEFT_OP:
93	case OID_RANGE_RIGHT_OP:
94	case OID_RANGE_OVERLAPS_LEFT_OP:
95	case OID_RANGE_OVERLAPS_RIGHT_OP:
96	/ these are similar to regular scalar inequalities /
97	return DEFAULT_INEQ_SEL;
98
99	default:
100	/ all range operators should be handled above, but just in case /
101	return `0.01`;
102	}
103	}
104
105	/*
106	* rangesel -- restriction selectivity for range operators
107	*/
108	Datum
109	rangesel(PG_FUNCTION_ARGS)
110	{
111	PlannerInfo root = (PlannerInfo ) PG_GETARG_POINTER(`0`);
112	Oid operator = PG_GETARG_OID(`1`);
113	List args = (List ) PG_GETARG_POINTER(`2`);
114	int varRelid = PG_GETARG_INT32(`3`);
115	VariableStatData vardata;
116	Node *other;
117	bool varonleft;
118	Selectivity selec;
119	TypeCacheEntry *typcache = NULL;
120	RangeType *constrange = NULL;
121
122	/*
123	* If expression is not (variable op something) or (something op
124	* variable), then punt and return a default estimate.
125	*/
126	if (!get_restriction_variable(root, args, varRelid,
127	&vardata, &other, &varonleft))
128	PG_RETURN_FLOAT8(default_range_selectivity(operator));
129
130	/*
131	* Can't do anything useful if the something is not a constant, either.
132	*/
133	if (!IsA(other, Const))
134	{
135	ReleaseVariableStats(vardata);
136	PG_RETURN_FLOAT8(default_range_selectivity(operator));
137	}
138
139	/*
140	* All the range operators are strict, so we can cope with a NULL constant
141	* right away.
142	*/
143	if (((Const *) other)->constisnull)
144	{
145	ReleaseVariableStats(vardata);
146	PG_RETURN_FLOAT8(`0.0`);
147	}
148
149	/*
150	* If var is on the right, commute the operator, so that we can assume the
151	* var is on the left in what follows.
152	*/
153	if (!varonleft)
154	{
155	/ we have other Op var, commute to make var Op other /
156	operator = get_commutator(operator);
157	if (!operator)
158	{
159	/ Use default selectivity (should we raise an error instead?) /
160	ReleaseVariableStats(vardata);
161	PG_RETURN_FLOAT8(default_range_selectivity(operator));
162	}
163	}
164
165	/*
166	* OK, there's a Var and a Const we're dealing with here. We need the
167	* Const to be of same range type as the column, else we can't do anything
168	* useful. (Such cases will likely fail at runtime, but here we'd rather
169	* just return a default estimate.)
170	*
171	* If the operator is "range @> element", the constant should be of the
172	* element type of the range column. Convert it to a range that includes
173	* only that single point, so that we don't need special handling for that
174	* in what follows.
175	*/
176	if (operator == OID_RANGE_CONTAINS_ELEM_OP)
177	{
178	typcache = range_get_typcache(fcinfo, vardata.vartype);
179
180	if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
181	{
182	RangeBound lower,
183	upper;
184
185	lower.inclusive = true;
186	lower.val = ((Const *) other)->constvalue;
187	lower.infinite = false;
188	lower.lower = true;
189	upper.inclusive = true;
190	upper.val = ((Const *) other)->constvalue;
191	upper.infinite = false;
192	upper.lower = false;
193	constrange = range_serialize(typcache, &lower, &upper, false);
194	}
195	}
196	else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
197	{
198	/*
199	* Here, the Var is the elem, not the range. For now we just punt and
200	* return the default estimate. In future we could disassemble the
201	* range constant and apply scalarineqsel ...
202	*/
203	}
204	else if (((Const *) other)->consttype == vardata.vartype)
205	{
206	/ Both sides are the same range type /
207	typcache = range_get_typcache(fcinfo, vardata.vartype);
208
209	constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
210	}
211
212	/*
213	* If we got a valid constant on one side of the operator, proceed to
214	* estimate using statistics. Otherwise punt and return a default constant
215	* estimate. Note that calc_rangesel need not handle
216	* OID_RANGE_ELEM_CONTAINED_OP.
217	*/
218	if (constrange)
219	selec = calc_rangesel(typcache, &vardata, constrange, operator);
220	else
221	selec = default_range_selectivity(operator);
222
223	ReleaseVariableStats(vardata);
224
225	CLAMP_PROBABILITY(selec);
226
227	PG_RETURN_FLOAT8((float8) selec);
228	}
229
230	static double
231	calc_rangesel(TypeCacheEntry typcache, VariableStatData vardata,
232	RangeType *constval, Oid operator)
233	{
234	double hist_selec;
235	double selec;
236	float4 empty_frac,
237	null_frac;
238
239	/*
240	* First look up the fraction of NULLs and empty ranges from pg_statistic.
241	*/
242	if (HeapTupleIsValid(vardata->statsTuple))
243	{
244	Form_pg_statistic stats;
245	AttStatsSlot sslot;
246
247	stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
248	null_frac = stats->stanullfrac;
249
250	/ Try to get fraction of empty ranges /
251	if (get_attstatsslot(&sslot, vardata->statsTuple,
252	STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
253	InvalidOid,
254	ATTSTATSSLOT_NUMBERS))
255	{
256	if (sslot.nnumbers != `1`)
257	elog(ERROR, "invalid empty fraction statistic"); / shouldn't happen /
258	empty_frac = sslot.numbers[`0`];
259	free_attstatsslot(&sslot);
260	}
261	else
262	{
263	/ No empty fraction statistic. Assume no empty ranges. /
264	empty_frac = `0.0`;
265	}
266	}
267	else
268	{
269	/*
270	* No stats are available. Follow through the calculations below
271	* anyway, assuming no NULLs and no empty ranges. This still allows us
272	* to give a better-than-nothing estimate based on whether the
273	* constant is an empty range or not.
274	*/
275	null_frac = `0.0`;
276	empty_frac = `0.0`;
277	}
278
279	if (RangeIsEmpty(constval))
280	{
281	/*
282	* An empty range matches all ranges, all empty ranges, or nothing,
283	* depending on the operator
284	*/
285	switch (operator)
286	{
287	/ these return false if either argument is empty /
288	case OID_RANGE_OVERLAP_OP:
289	case OID_RANGE_OVERLAPS_LEFT_OP:
290	case OID_RANGE_OVERLAPS_RIGHT_OP:
291	case OID_RANGE_LEFT_OP:
292	case OID_RANGE_RIGHT_OP:
293	/ nothing is less than an empty range /
294	case OID_RANGE_LESS_OP:
295	selec = `0.0`;
296	break;
297
298	/ only empty ranges can be contained by an empty range /
299	case OID_RANGE_CONTAINED_OP:
300	/ only empty ranges are <= an empty range /
301	case OID_RANGE_LESS_EQUAL_OP:
302	selec = empty_frac;
303	break;
304
305	/ everything contains an empty range /
306	case OID_RANGE_CONTAINS_OP:
307	/ everything is >= an empty range /
308	case OID_RANGE_GREATER_EQUAL_OP:
309	selec = `1.0`;
310	break;
311
312	/ all non-empty ranges are > an empty range /
313	case OID_RANGE_GREATER_OP:
314	selec = `1.0` - empty_frac;
315	break;
316
317	/ an element cannot be empty /
318	case OID_RANGE_CONTAINS_ELEM_OP:
319	default:
320	elog(ERROR, "unexpected operator %u", operator);
321	selec = `0.0`; / keep compiler quiet /
322	break;
323	}
324	}
325	else
326	{
327	/*
328	* Calculate selectivity using bound histograms. If that fails for
329	* some reason, e.g no histogram in pg_statistic, use the default
330	* constant estimate for the fraction of non-empty values. This is
331	* still somewhat better than just returning the default estimate,
332	* because this still takes into account the fraction of empty and
333	* NULL tuples, if we had statistics for them.
334	*/
335	hist_selec = calc_hist_selectivity(typcache, vardata, constval,
336	operator);
337	if (hist_selec < `0.0`)
338	hist_selec = default_range_selectivity(operator);
339
340	/*
341	* Now merge the results for the empty ranges and histogram
342	* calculations, realizing that the histogram covers only the
343	* non-null, non-empty values.
344	*/
345	if (operator == OID_RANGE_CONTAINED_OP)
346	{
347	/ empty is contained by anything non-empty /
348	selec = (`1.0` - empty_frac) * hist_selec + empty_frac;
349	}
350	else
351	{
352	/ with any other operator, empty Op non-empty matches nothing /
353	selec = (`1.0` - empty_frac) * hist_selec;
354	}
355	}
356
357	/ all range operators are strict /
358	selec *= (`1.0` - null_frac);
359
360	/ result should be in range, but make sure... /
361	CLAMP_PROBABILITY(selec);
362
363	return selec;
364	}
365
366	/*
367	* Calculate range operator selectivity using histograms of range bounds.
368	*
369	* This estimate is for the portion of values that are not empty and not
370	* NULL.
371	*/
372	static double
373	calc_hist_selectivity(TypeCacheEntry typcache, VariableStatData vardata,
374	RangeType *constval, Oid operator)
375	{
376	AttStatsSlot hslot;
377	AttStatsSlot lslot;
378	int nhist;
379	RangeBound *hist_lower;
380	RangeBound *hist_upper;
381	int i;
382	RangeBound const_lower;
383	RangeBound const_upper;
384	bool empty;
385	double hist_selec;
386
387	/ Can't use the histogram with insecure range support functions /
388	if (!statistic_proc_security_check(vardata,
389	typcache->rng_cmp_proc_finfo.fn_oid))
390	return -`1`;
391	if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
392	!statistic_proc_security_check(vardata,
393	typcache->rng_subdiff_finfo.fn_oid))
394	return -`1`;
395
396	/ Try to get histogram of ranges /
397	if (!(HeapTupleIsValid(vardata->statsTuple) &&
398	get_attstatsslot(&hslot, vardata->statsTuple,
399	STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
400	ATTSTATSSLOT_VALUES)))
401	return -`1.0`;
402
403	/*
404	* Convert histogram of ranges into histograms of its lower and upper
405	* bounds.
406	*/
407	nhist = hslot.nvalues;
408	hist_lower = (RangeBound ) palloc(sizeof(RangeBound) nhist);
409	hist_upper = (RangeBound ) palloc(sizeof(RangeBound) nhist);
410	for (i = `0`; i < nhist; i++)
411	{
412	range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]),
413	&hist_lower[i], &hist_upper[i], &empty);
414	/ The histogram should not contain any empty ranges /
415	if (empty)
416	elog(ERROR, "bounds histogram contains an empty range");
417	}
418
419	/ @> and @< also need a histogram of range lengths /
420	if (operator == OID_RANGE_CONTAINS_OP \|\|
421	operator == OID_RANGE_CONTAINED_OP)
422	{
423	if (!(HeapTupleIsValid(vardata->statsTuple) &&
424	get_attstatsslot(&lslot, vardata->statsTuple,
425	STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
426	InvalidOid,
427	ATTSTATSSLOT_VALUES)))
428	{
429	free_attstatsslot(&hslot);
430	return -`1.0`;
431	}
432
433	/ check that it's a histogram, not just a dummy entry /
434	if (lslot.nvalues < `2`)
435	{
436	free_attstatsslot(&lslot);
437	free_attstatsslot(&hslot);
438	return -`1.0`;
439	}
440	}
441	else
442	memset(&lslot, `0`, sizeof(lslot));
443
444	/ Extract the bounds of the constant value. /
445	range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
446	Assert(!empty);
447
448	/*
449	* Calculate selectivity comparing the lower or upper bound of the
450	* constant with the histogram of lower or upper bounds.
451	*/
452	switch (operator)
453	{
454	case OID_RANGE_LESS_OP:
455
456	/*
457	* The regular b-tree comparison operators (<, <=, >, >=) compare
458	* the lower bounds first, and the upper bounds for values with
459	* equal lower bounds. Estimate that by comparing the lower bounds
460	* only. This gives a fairly accurate estimate assuming there
461	* aren't many rows with a lower bound equal to the constant's
462	* lower bound.
463	*/
464	hist_selec =
465	calc_hist_selectivity_scalar(typcache, &const_lower,
466	hist_lower, nhist, false);
467	break;
468
469	case OID_RANGE_LESS_EQUAL_OP:
470	hist_selec =
471	calc_hist_selectivity_scalar(typcache, &const_lower,
472	hist_lower, nhist, true);
473	break;
474
475	case OID_RANGE_GREATER_OP:
476	hist_selec =
477	`1` - calc_hist_selectivity_scalar(typcache, &const_lower,
478	hist_lower, nhist, false);
479	break;
480
481	case OID_RANGE_GREATER_EQUAL_OP:
482	hist_selec =
483	`1` - calc_hist_selectivity_scalar(typcache, &const_lower,
484	hist_lower, nhist, true);
485	break;
486
487	case OID_RANGE_LEFT_OP:
488	/ var << const when upper(var) < lower(const) /
489	hist_selec =
490	calc_hist_selectivity_scalar(typcache, &const_lower,
491	hist_upper, nhist, false);
492	break;
493
494	case OID_RANGE_RIGHT_OP:
495	/ var >> const when lower(var) > upper(const) /
496	hist_selec =
497	`1` - calc_hist_selectivity_scalar(typcache, &const_upper,
498	hist_lower, nhist, true);
499	break;
500
501	case OID_RANGE_OVERLAPS_RIGHT_OP:
502	/ compare lower bounds /
503	hist_selec =
504	`1` - calc_hist_selectivity_scalar(typcache, &const_lower,
505	hist_lower, nhist, false);
506	break;
507
508	case OID_RANGE_OVERLAPS_LEFT_OP:
509	/ compare upper bounds /
510	hist_selec =
511	calc_hist_selectivity_scalar(typcache, &const_upper,
512	hist_upper, nhist, true);
513	break;
514
515	case OID_RANGE_OVERLAP_OP:
516	case OID_RANGE_CONTAINS_ELEM_OP:
517
518	/*
519	* A && B <=> NOT (A << B OR A >> B).
520	*
521	* Since A << B and A >> B are mutually exclusive events we can
522	* sum their probabilities to find probability of (A << B OR A >>
523	* B).
524	*
525	* "range @> elem" is equivalent to "range && [elem,elem]". The
526	* caller already constructed the singular range from the element
527	* constant, so just treat it the same as &&.
528	*/
529	hist_selec =
530	calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
531	nhist, false);
532	hist_selec +=
533	(`1.0` - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
534	nhist, true));
535	hist_selec = `1.0` - hist_selec;
536	break;
537
538	case OID_RANGE_CONTAINS_OP:
539	hist_selec =
540	calc_hist_selectivity_contains(typcache, &const_lower,
541	&const_upper, hist_lower, nhist,
542	lslot.values, lslot.nvalues);
543	break;
544
545	case OID_RANGE_CONTAINED_OP:
546	if (const_lower.infinite)
547	{
548	/*
549	* Lower bound no longer matters. Just estimate the fraction
550	* with an upper bound <= const upper bound
551	*/
552	hist_selec =
553	calc_hist_selectivity_scalar(typcache, &const_upper,
554	hist_upper, nhist, true);
555	}
556	else if (const_upper.infinite)
557	{
558	hist_selec =
559	`1.0` - calc_hist_selectivity_scalar(typcache, &const_lower,
560	hist_lower, nhist, false);
561	}
562	else
563	{
564	hist_selec =
565	calc_hist_selectivity_contained(typcache, &const_lower,
566	&const_upper, hist_lower, nhist,
567	lslot.values, lslot.nvalues);
568	}
569	break;
570
571	default:
572	elog(ERROR, "unknown range operator %u", operator);
573	hist_selec = -`1.0`; / keep compiler quiet /
574	break;
575	}
576
577	free_attstatsslot(&lslot);
578	free_attstatsslot(&hslot);
579
580	return hist_selec;
581	}
582
583
584	/*
585	* Look up the fraction of values less than (or equal, if 'equal' argument
586	* is true) a given const in a histogram of range bounds.
587	*/
588	static double
589	calc_hist_selectivity_scalar(TypeCacheEntry typcache, RangeBound constbound,
590	RangeBound hist, int* hist_nvalues, bool equal)
591	{
592	Selectivity selec;
593	int index;
594
595	/*
596	* Find the histogram bin the given constant falls into. Estimate
597	* selectivity as the number of preceding whole bins.
598	*/
599	index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
600	selec = (Selectivity) (Max(index, `0`)) / (Selectivity) (hist_nvalues - `1`);
601
602	/ Adjust using linear interpolation within the bin /
603	if (index >= `0` && index < hist_nvalues - `1`)
604	selec += get_position(typcache, constbound, &hist[index],
605	&hist[index + `1`]) / (Selectivity) (hist_nvalues - `1`);
606
607	return selec;
608	}
609
610	/*
611	* Binary search on an array of range bounds. Returns greatest index of range
612	* bound in array which is less(less or equal) than given range bound. If all
613	* range bounds in array are greater or equal(greater) than given range bound,
614	* return -1. When "equal" flag is set conditions in brackets are used.
615	*
616	* This function is used in scalar operator selectivity estimation. Another
617	* goal of this function is to find a histogram bin where to stop
618	* interpolation of portion of bounds which are less or equal to given bound.
619	*/
620	static int
621	rbound_bsearch(TypeCacheEntry typcache, RangeBound value, RangeBound *hist,
622	int hist_length, bool equal)
623	{
624	int lower = -`1`,
625	upper = hist_length - `1`,
626	cmp,
627	middle;
628
629	while (lower < upper)
630	{
631	middle = (lower + upper + `1`) / `2`;
632	cmp = range_cmp_bounds(typcache, &hist[middle], value);
633
634	if (cmp < `0` \|\| (equal && cmp == `0`))
635	lower = middle;
636	else
637	upper = middle - `1`;
638	}
639	return lower;
640	}
641
642
643	/*
644	* Binary search on length histogram. Returns greatest index of range length in
645	* histogram which is less than (less than or equal) the given length value. If
646	* all lengths in the histogram are greater than (greater than or equal) the
647	* given length, returns -1.
648	*/
649	static int
650	length_hist_bsearch(Datum length_hist_values, int* length_hist_nvalues,
651	double value, bool equal)
652	{
653	int lower = -`1`,
654	upper = length_hist_nvalues - `1`,
655	middle;
656
657	while (lower < upper)
658	{
659	double middleval;
660
661	middle = (lower + upper + `1`) / `2`;
662
663	middleval = DatumGetFloat8(length_hist_values[middle]);
664	if (middleval < value \|\| (equal && middleval <= value))
665	lower = middle;
666	else
667	upper = middle - `1`;
668	}
669	return lower;
670	}
671
672	/*
673	* Get relative position of value in histogram bin in [0,1] range.
674	*/
675	static float8
676	get_position(TypeCacheEntry typcache, RangeBound value, RangeBound *hist1,
677	RangeBound *hist2)
678	{
679	bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
680	float8 position;
681
682	if (!hist1->infinite && !hist2->infinite)
683	{
684	float8 bin_width;
685
686	/*
687	* Both bounds are finite. Assuming the subtype's comparison function
688	* works sanely, the value must be finite, too, because it lies
689	* somewhere between the bounds. If it doesn't, just return something.
690	*/
691	if (value->infinite)
692	return `0.5`;
693
694	/ Can't interpolate without subdiff function /
695	if (!has_subdiff)
696	return `0.5`;
697
698	/ Calculate relative position using subdiff function. /
699	bin_width = DatumGetFloat8(FunctionCall2Coll(
700	&typcache->rng_subdiff_finfo,
701	typcache->rng_collation,
702	hist2->val,
703	hist1->val));
704	if (bin_width <= `0.0`)
705	return `0.5`; / zero width bin /
706
707	position = DatumGetFloat8(FunctionCall2Coll(
708	&typcache->rng_subdiff_finfo,
709	typcache->rng_collation,
710	value->val,
711	hist1->val))
712	/ bin_width;
713
714	/ Relative position must be in [0,1] range /
715	position = Max(position, `0.0`);
716	position = Min(position, `1.0`);
717	return position;
718	}
719	else if (hist1->infinite && !hist2->infinite)
720	{
721	/*
722	* Lower bin boundary is -infinite, upper is finite. If the value is
723	* -infinite, return 0.0 to indicate it's equal to the lower bound.
724	* Otherwise return 1.0 to indicate it's infinitely far from the lower
725	* bound.
726	*/
727	return ((value->infinite && value->lower) ? `0.0` : `1.0`);
728	}
729	else if (!hist1->infinite && hist2->infinite)
730	{
731	/ same as above, but in reverse /
732	return ((value->infinite && !value->lower) ? `1.0` : `0.0`);
733	}
734	else
735	{
736	/*
737	* If both bin boundaries are infinite, they should be equal to each
738	* other, and the value should also be infinite and equal to both
739	* bounds. (But don't Assert that, to avoid crashing if a user creates
740	* a datatype with a broken comparison function).
741	*
742	* Assume the value to lie in the middle of the infinite bounds.
743	*/
744	return `0.5`;
745	}
746	}
747
748
749	/*
750	* Get relative position of value in a length histogram bin in [0,1] range.
751	*/
752	static double
753	get_len_position(double value, double hist1, double hist2)
754	{
755	if (!isinf(hist1) && !isinf(hist2))
756	{
757	/*
758	* Both bounds are finite. The value should be finite too, because it
759	* lies somewhere between the bounds. If it doesn't, just return
760	* something.
761	*/
762	if (isinf(value))
763	return `0.5`;
764
765	return `1.0` - (hist2 - value) / (hist2 - hist1);
766	}
767	else if (isinf(hist1) && !isinf(hist2))
768	{
769	/*
770	* Lower bin boundary is -infinite, upper is finite. Return 1.0 to
771	* indicate the value is infinitely far from the lower bound.
772	*/
773	return `1.0`;
774	}
775	else if (isinf(hist1) && isinf(hist2))
776	{
777	/ same as above, but in reverse /
778	return `0.0`;
779	}
780	else
781	{
782	/*
783	* If both bin boundaries are infinite, they should be equal to each
784	* other, and the value should also be infinite and equal to both
785	* bounds. (But don't Assert that, to avoid crashing unnecessarily if
786	* the caller messes up)
787	*
788	* Assume the value to lie in the middle of the infinite bounds.
789	*/
790	return `0.5`;
791	}
792	}
793
794	/*
795	* Measure distance between two range bounds.
796	*/
797	static float8
798	get_distance(TypeCacheEntry typcache, RangeBound bound1, RangeBound *bound2)
799	{
800	bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
801
802	if (!bound1->infinite && !bound2->infinite)
803	{
804	/*
805	* No bounds are infinite, use subdiff function or return default
806	* value of 1.0 if no subdiff is available.
807	*/
808	if (has_subdiff)
809	return
810	DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
811	typcache->rng_collation,
812	bound2->val,
813	bound1->val));
814	else
815	return `1.0`;
816	}
817	else if (bound1->infinite && bound2->infinite)
818	{
819	/ Both bounds are infinite /
820	if (bound1->lower == bound2->lower)
821	return `0.0`;
822	else
823	return get_float8_infinity();
824	}
825	else
826	{
827	/ One bound is infinite, another is not /
828	return get_float8_infinity();
829	}
830	}
831
832	/*
833	* Calculate the average of function P(x), in the interval [length1, length2],
834	* where P(x) is the fraction of tuples with length < x (or length <= x if
835	* 'equal' is true).
836	*/
837	static double
838	calc_length_hist_frac(Datum length_hist_values, int* length_hist_nvalues,
839	double length1, double length2, bool equal)
840	{
841	double frac;
842	double A,
843	B,
844	PA,
845	PB;
846	double pos;
847	int i;
848	double area;
849
850	Assert(length2 >= length1);
851
852	if (length2 < `0.0`)
853	return `0.0`; / shouldn't happen, but doesn't hurt to check /
854
855	/ All lengths in the table are <= infinite. /
856	if (isinf(length2) && equal)
857	return `1.0`;
858
859	/----------*
860	* The average of a function between A and B can be calculated by the
861	* formula:
862	*
863	* B
864	* 1 /
865	* ------- \| P(x)dx
866	* B - A /
867	* A
868	*
869	* The geometrical interpretation of the integral is the area under the
870	* graph of P(x). P(x) is defined by the length histogram. We calculate
871	* the area in a piecewise fashion, iterating through the length histogram
872	* bins. Each bin is a trapezoid:
873	*
874	* P(x2)
875	* /\|
876	* / \|
877	* P(x1)/ \|
878	* \| \|
879	* \| \|
880	* ---+---+--
881	* x1 x2
882	*
883	* where x1 and x2 are the boundaries of the current histogram, and P(x1)
884	* and P(x1) are the cumulative fraction of tuples at the boundaries.
885	*
886	* The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
887	*
888	* The first bin contains the lower bound passed by the caller, so we
889	* use linear interpolation between the previous and next histogram bin
890	* boundary to calculate P(x1). Likewise for the last bin: we use linear
891	* interpolation to calculate P(x2). For the bins in between, x1 and x2
892	* lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
893	* P(x1) = (bin index) / (number of bins)
894	* P(x2) = (bin index + 1 / (number of bins)
895	*/
896
897	/ First bin, the one that contains lower bound /
898	i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
899	if (i >= length_hist_nvalues - `1`)
900	return `1.0`;
901
902	if (i < `0`)
903	{
904	i = `0`;
905	pos = `0.0`;
906	}
907	else
908	{
909	/ interpolate length1's position in the bin /
910	pos = get_len_position(length1,
911	DatumGetFloat8(length_hist_values[i]),
912	DatumGetFloat8(length_hist_values[i + `1`]));
913	}
914	PB = (((double) i) + pos) / (double) (length_hist_nvalues - `1`);
915	B = length1;
916
917	/*
918	* In the degenerate case that length1 == length2, simply return
919	* P(length1). This is not merely an optimization: if length1 == length2,
920	* we'd divide by zero later on.
921	*/
922	if (length2 == length1)
923	return PB;
924
925	/*
926	* Loop through all the bins, until we hit the last bin, the one that
927	* contains the upper bound. (if lower and upper bounds are in the same
928	* bin, this falls out immediately)
929	*/
930	area = `0.0`;
931	for (; i < length_hist_nvalues - `1`; i++)
932	{
933	double bin_upper = DatumGetFloat8(length_hist_values[i + `1`]);
934
935	/ check if we've reached the last bin /
936	if (!(bin_upper < length2 \|\| (equal && bin_upper <= length2)))
937	break;
938
939	/ the upper bound of previous bin is the lower bound of this bin /
940	A = B;
941	PA = PB;
942
943	B = bin_upper;
944	PB = (double) i / (double) (length_hist_nvalues - `1`);
945
946	/*
947	* Add the area of this trapezoid to the total. The point of the
948	* if-check is to avoid NaN, in the corner case that PA == PB == 0,
949	* and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
950	* 0) is zero, regardless of the width (B - A).
951	*/
952	if (PA > `0` \|\| PB > `0`)
953	area += `0.5` * (PB + PA) * (B - A);
954	}
955
956	/ Last bin /
957	A = B;
958	PA = PB;
959
960	B = length2; / last bin ends at the query upper bound /
961	if (i >= length_hist_nvalues - `1`)
962	pos = `0.0`;
963	else
964	{
965	if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + `1`]))
966	pos = `0.0`;
967	else
968	pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + `1`]));
969	}
970	PB = (((double) i) + pos) / (double) (length_hist_nvalues - `1`);
971
972	if (PA > `0` \|\| PB > `0`)
973	area += `0.5` * (PB + PA) * (B - A);
974
975	/*
976	* Ok, we have calculated the area, ie. the integral. Divide by width to
977	* get the requested average.
978	*
979	* Avoid NaN arising from infinite / infinite. This happens at least if
980	* length2 is infinite. It's not clear what the correct value would be in
981	* that case, so 0.5 seems as good as any value.
982	*/
983	if (isinf(area) && isinf(length2))
984	frac = `0.5`;
985	else
986	frac = area / (length2 - length1);
987
988	return frac;
989	}
990
991	/*
992	* Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
993	* of ranges that fall within the constant lower and upper bounds. This uses
994	* the histograms of range lower bounds and range lengths, on the assumption
995	* that the range lengths are independent of the lower bounds.
996	*
997	* The caller has already checked that constant lower and upper bounds are
998	* finite.
999	*/
1000	static double
1001	calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1002	RangeBound lower, RangeBound upper,
1003	RangeBound hist_lower, int* hist_nvalues,
1004	Datum length_hist_values, int* length_hist_nvalues)
1005	{
1006	int i,
1007	upper_index;
1008	float8 prev_dist;
1009	double bin_width;
1010	double upper_bin_width;
1011	double sum_frac;
1012
1013	/*
1014	* Begin by finding the bin containing the upper bound, in the lower bound
1015	* histogram. Any range with a lower bound > constant upper bound can't
1016	* match, ie. there are no matches in bins greater than upper_index.
1017	*/
1018	upper->inclusive = !upper->inclusive;
1019	upper->lower = true;
1020	upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1021	false);
1022
1023	/*
1024	* Calculate upper_bin_width, ie. the fraction of the (upper_index,
1025	* upper_index + 1) bin which is greater than upper bound of query range
1026	* using linear interpolation of subdiff function.
1027	*/
1028	if (upper_index >= `0` && upper_index < hist_nvalues - `1`)
1029	upper_bin_width = get_position(typcache, upper,
1030	&hist_lower[upper_index],
1031	&hist_lower[upper_index + `1`]);
1032	else
1033	upper_bin_width = `0.0`;
1034
1035	/*
1036	* In the loop, dist and prev_dist are the distance of the "current" bin's
1037	* lower and upper bounds from the constant upper bound.
1038	*
1039	* bin_width represents the width of the current bin. Normally it is 1.0,
1040	* meaning a full width bin, but can be less in the corner cases: start
1041	* and end of the loop. We start with bin_width = upper_bin_width, because
1042	* we begin at the bin containing the upper bound.
1043	*/
1044	prev_dist = `0.0`;
1045	bin_width = upper_bin_width;
1046
1047	sum_frac = `0.0`;
1048	for (i = upper_index; i >= `0`; i--)
1049	{
1050	double dist;
1051	double length_hist_frac;
1052	bool final_bin = false;
1053
1054	/*
1055	* dist -- distance from upper bound of query range to lower bound of
1056	* the current bin in the lower bound histogram. Or to the lower bound
1057	* of the constant range, if this is the final bin, containing the
1058	* constant lower bound.
1059	*/
1060	if (range_cmp_bounds(typcache, &hist_lower[i], lower) < `0`)
1061	{
1062	dist = get_distance(typcache, lower, upper);
1063
1064	/*
1065	* Subtract from bin_width the portion of this bin that we want to
1066	* ignore.
1067	*/
1068	bin_width -= get_position(typcache, lower, &hist_lower[i],
1069	&hist_lower[i + `1`]);
1070	if (bin_width < `0.0`)
1071	bin_width = `0.0`;
1072	final_bin = true;
1073	}
1074	else
1075	dist = get_distance(typcache, &hist_lower[i], upper);
1076
1077	/*
1078	* Estimate the fraction of tuples in this bin that are narrow enough
1079	* to not exceed the distance to the upper bound of the query range.
1080	*/
1081	length_hist_frac = calc_length_hist_frac(length_hist_values,
1082	length_hist_nvalues,
1083	prev_dist, dist, true);
1084
1085	/*
1086	* Add the fraction of tuples in this bin, with a suitable length, to
1087	* the total.
1088	*/
1089	sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - `1`);
1090
1091	if (final_bin)
1092	break;
1093
1094	bin_width = `1.0`;
1095	prev_dist = dist;
1096	}
1097
1098	return sum_frac;
1099	}
1100
1101	/*
1102	* Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1103	* of ranges that contain the constant lower and upper bounds. This uses
1104	* the histograms of range lower bounds and range lengths, on the assumption
1105	* that the range lengths are independent of the lower bounds.
1106	*
1107	* Note, this is "var @> const", ie. estimate the fraction of ranges that
1108	* contain the constant lower and upper bounds.
1109	*/
1110	static double
1111	calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1112	RangeBound lower, RangeBound upper,
1113	RangeBound hist_lower, int* hist_nvalues,
1114	Datum length_hist_values, int* length_hist_nvalues)
1115	{
1116	int i,
1117	lower_index;
1118	double bin_width,
1119	lower_bin_width;
1120	double sum_frac;
1121	float8 prev_dist;
1122
1123	/ Find the bin containing the lower bound of query range. /
1124	lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1125	true);
1126
1127	/*
1128	* Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1129	* lower_index + 1) bin which is greater than lower bound of query range
1130	* using linear interpolation of subdiff function.
1131	*/
1132	if (lower_index >= `0` && lower_index < hist_nvalues - `1`)
1133	lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1134	&hist_lower[lower_index + `1`]);
1135	else
1136	lower_bin_width = `0.0`;
1137
1138	/*
1139	* Loop through all the lower bound bins, smaller than the query lower
1140	* bound. In the loop, dist and prev_dist are the distance of the
1141	* "current" bin's lower and upper bounds from the constant upper bound.
1142	* We begin from query lower bound, and walk backwards, so the first bin's
1143	* upper bound is the query lower bound, and its distance to the query
1144	* upper bound is the length of the query range.
1145	*
1146	* bin_width represents the width of the current bin. Normally it is 1.0,
1147	* meaning a full width bin, except for the first bin, which is only
1148	* counted up to the constant lower bound.
1149	*/
1150	prev_dist = get_distance(typcache, lower, upper);
1151	sum_frac = `0.0`;
1152	bin_width = lower_bin_width;
1153	for (i = lower_index; i >= `0`; i--)
1154	{
1155	float8 dist;
1156	double length_hist_frac;
1157
1158	/*
1159	* dist -- distance from upper bound of query range to current value
1160	* of lower bound histogram or lower bound of query range (if we've
1161	* reach it).
1162	*/
1163	dist = get_distance(typcache, &hist_lower[i], upper);
1164
1165	/*
1166	* Get average fraction of length histogram which covers intervals
1167	* longer than (or equal to) distance to upper bound of query range.
1168	*/
1169	length_hist_frac =
1170	`1.0` - calc_length_hist_frac(length_hist_values,
1171	length_hist_nvalues,
1172	prev_dist, dist, false);
1173
1174	sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - `1`);
1175
1176	bin_width = `1.0`;
1177	prev_dist = dist;
1178	}
1179
1180	return sum_frac;
1181	}
1182

Browse the source code of PostgreSQL/src/backend/utils/adt/rangetypes_selfuncs.c