array_selfuncs.c source code [PostgreSQL/src/backend/utils/adt/array_selfuncs.c]

/*-------------------------------------------------------------------------
 *
 * array_selfuncs.c
 *	  Functions for selectivity estimation of array operators
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/array_selfuncs.c
 *
 *-------------------------------------------------------------------------
 */
15	#include "postgres.h"
16
17	#include <math.h>
18
19	#include "access/htup_details.h"
20	#include "catalog/pg_collation.h"
21	#include "catalog/pg_operator.h"
22	#include "catalog/pg_statistic.h"
23	#include "utils/array.h"
24	#include "utils/builtins.h"
25	#include "utils/lsyscache.h"
26	#include "utils/selfuncs.h"
27	#include "utils/typcache.h"
28
29
/* Default selectivity constant for "@>" and "<@" operators */
#define DEFAULT_CONTAIN_SEL 0.005

/* Default selectivity constant for "&&" operator */
#define DEFAULT_OVERLAP_SEL 0.01

/*
 * Default selectivity for given operator: the overlap default for "&&",
 * the containment default for anything else.
 */
#define DEFAULT_SEL(operator) \
	((operator) == OID_ARRAY_OVERLAP_OP ? \
	 DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
40
41	static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
42	Oid elemtype, Oid operator);
43	static Selectivity mcelem_array_selec(ArrayType *array,
44	TypeCacheEntry *typentry,
45	Datum mcelem, int* nmcelem,
46	float4 numbers, int* nnumbers,
47	float4 hist, int* nhist,
48	Oid operator);
49	static Selectivity mcelem_array_contain_overlap_selec(Datum mcelem, int* nmcelem,
50	float4 numbers, int* nnumbers,
51	Datum array_data, int* nitems,
52	Oid operator, TypeCacheEntry *typentry);
53	static Selectivity mcelem_array_contained_selec(Datum mcelem, int* nmcelem,
54	float4 numbers, int* nnumbers,
55	Datum array_data, int* nitems,
56	float4 hist, int* nhist,
57	Oid operator, TypeCacheEntry *typentry);
58	static float calc_hist(const* float4 hist, int* nhist, int n);
59	static float calc_distr(const* float p, int* n, int m, float rest);
60	static int floor_log2(uint32 n);
61	static bool find_next_mcelem(Datum mcelem, int* nmcelem, Datum value,
62	int index, TypeCacheEntry typentry);
63	static int element_compare(const void key1, const* void key2, void* *arg);
64	static int float_compare_desc(const void key1, const* void *key2);
65
66
67	/*
68	* scalararraysel_containment
69	* Estimate selectivity of ScalarArrayOpExpr via array containment.
70	*
71	* If we have const =/<> ANY/ALL (array_var) then we can estimate the
72	* selectivity as though this were an array containment operator,
73	* array_var op ARRAY[const].
74	*
75	* scalararraysel() has already verified that the ScalarArrayOpExpr's operator
76	* is the array element type's default equality or inequality operator, and
77	* has aggressively simplified both inputs to constants.
78	*
79	* Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
80	*/
81	Selectivity
82	scalararraysel_containment(PlannerInfo *root,
83	Node leftop, Node rightop,
84	Oid elemtype, bool isEquality, bool useOr,
85	int varRelid)
86	{
87	Selectivity selec;
88	VariableStatData vardata;
89	Datum constval;
90	TypeCacheEntry *typentry;
91	FmgrInfo *cmpfunc;
92
93	/*
94	* rightop must be a variable, else punt.
95	*/
96	examine_variable(root, rightop, varRelid, &vardata);
97	if (!vardata.rel)
98	{
99	ReleaseVariableStats(vardata);
100	return -`1.0`;
101	}
102
103	/*
104	* leftop must be a constant, else punt.
105	*/
106	if (!IsA(leftop, Const))
107	{
108	ReleaseVariableStats(vardata);
109	return -`1.0`;
110	}
111	if (((Const *) leftop)->constisnull)
112	{
113	/ qual can't succeed if null on left /
114	ReleaseVariableStats(vardata);
115	return (Selectivity) `0.0`;
116	}
117	constval = ((Const *) leftop)->constvalue;
118
119	/ Get element type's default comparison function /
120	typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
121	if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
122	{
123	ReleaseVariableStats(vardata);
124	return -`1.0`;
125	}
126	cmpfunc = &typentry->cmp_proc_finfo;
127
128	/*
129	* If the operator is <>, swap ANY/ALL, then invert the result later.
130	*/
131	if (!isEquality)
132	useOr = !useOr;
133
134	/ Get array element stats for var, if available /
135	if (HeapTupleIsValid(vardata.statsTuple) &&
136	statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
137	{
138	Form_pg_statistic stats;
139	AttStatsSlot sslot;
140	AttStatsSlot hslot;
141
142	stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
143
144	/ MCELEM will be an array of same type as element /
145	if (get_attstatsslot(&sslot, vardata.statsTuple,
146	STATISTIC_KIND_MCELEM, InvalidOid,
147	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS))
148	{
149	/ For ALL case, also get histogram of distinct-element counts /
150	if (useOr \|\|
151	!get_attstatsslot(&hslot, vardata.statsTuple,
152	STATISTIC_KIND_DECHIST, InvalidOid,
153	ATTSTATSSLOT_NUMBERS))
154	memset(&hslot, `0`, sizeof(hslot));
155
156	/*
157	* For = ANY, estimate as var @> ARRAY[const].
158	*
159	* For = ALL, estimate as var <@ ARRAY[const].
160	*/
161	if (useOr)
162	selec = mcelem_array_contain_overlap_selec(sslot.values,
163	sslot.nvalues,
164	sslot.numbers,
165	sslot.nnumbers,
166	&constval, `1`,
167	OID_ARRAY_CONTAINS_OP,
168	typentry);
169	else
170	selec = mcelem_array_contained_selec(sslot.values,
171	sslot.nvalues,
172	sslot.numbers,
173	sslot.nnumbers,
174	&constval, `1`,
175	hslot.numbers,
176	hslot.nnumbers,
177	OID_ARRAY_CONTAINED_OP,
178	typentry);
179
180	free_attstatsslot(&hslot);
181	free_attstatsslot(&sslot);
182	}
183	else
184	{
185	/ No most-common-elements info, so do without /
186	if (useOr)
187	selec = mcelem_array_contain_overlap_selec(NULL, `0`,
188	NULL, `0`,
189	&constval, `1`,
190	OID_ARRAY_CONTAINS_OP,
191	typentry);
192	else
193	selec = mcelem_array_contained_selec(NULL, `0`,
194	NULL, `0`,
195	&constval, `1`,
196	NULL, `0`,
197	OID_ARRAY_CONTAINED_OP,
198	typentry);
199	}
200
201	/*
202	* MCE stats count only non-null rows, so adjust for null rows.
203	*/
204	selec *= (`1.0` - stats->stanullfrac);
205	}
206	else
207	{
208	/ No stats at all, so do without /
209	if (useOr)
210	selec = mcelem_array_contain_overlap_selec(NULL, `0`,
211	NULL, `0`,
212	&constval, `1`,
213	OID_ARRAY_CONTAINS_OP,
214	typentry);
215	else
216	selec = mcelem_array_contained_selec(NULL, `0`,
217	NULL, `0`,
218	&constval, `1`,
219	NULL, `0`,
220	OID_ARRAY_CONTAINED_OP,
221	typentry);
222	/ we assume no nulls here, so no stanullfrac correction /
223	}
224
225	ReleaseVariableStats(vardata);
226
227	/*
228	* If the operator is <>, invert the results.
229	*/
230	if (!isEquality)
231	selec = `1.0` - selec;
232
233	CLAMP_PROBABILITY(selec);
234
235	return selec;
236	}
237
238	/*
239	* arraycontsel -- restriction selectivity for array @>, &&, <@ operators
240	*/
241	Datum
242	arraycontsel(PG_FUNCTION_ARGS)
243	{
244	PlannerInfo root = (PlannerInfo ) PG_GETARG_POINTER(`0`);
245	Oid operator = PG_GETARG_OID(`1`);
246	List args = (List ) PG_GETARG_POINTER(`2`);
247	int varRelid = PG_GETARG_INT32(`3`);
248	VariableStatData vardata;
249	Node *other;
250	bool varonleft;
251	Selectivity selec;
252	Oid element_typeid;
253
254	/*
255	* If expression is not (variable op something) or (something op
256	* variable), then punt and return a default estimate.
257	*/
258	if (!get_restriction_variable(root, args, varRelid,
259	&vardata, &other, &varonleft))
260	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
261
262	/*
263	* Can't do anything useful if the something is not a constant, either.
264	*/
265	if (!IsA(other, Const))
266	{
267	ReleaseVariableStats(vardata);
268	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
269	}
270
271	/*
272	* The "&&", "@>" and "<@" operators are strict, so we can cope with a
273	* NULL constant right away.
274	*/
275	if (((Const *) other)->constisnull)
276	{
277	ReleaseVariableStats(vardata);
278	PG_RETURN_FLOAT8(`0.0`);
279	}
280
281	/*
282	* If var is on the right, commute the operator, so that we can assume the
283	* var is on the left in what follows.
284	*/
285	if (!varonleft)
286	{
287	if (operator == OID_ARRAY_CONTAINS_OP)
288	operator = OID_ARRAY_CONTAINED_OP;
289	else if (operator == OID_ARRAY_CONTAINED_OP)
290	operator = OID_ARRAY_CONTAINS_OP;
291	}
292
293	/*
294	* OK, there's a Var and a Const we're dealing with here. We need the
295	* Const to be an array with same element type as column, else we can't do
296	* anything useful. (Such cases will likely fail at runtime, but here
297	* we'd rather just return a default estimate.)
298	*/
299	element_typeid = get_base_element_type(((Const *) other)->consttype);
300	if (element_typeid != InvalidOid &&
301	element_typeid == get_base_element_type(vardata.vartype))
302	{
303	selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
304	element_typeid, operator);
305	}
306	else
307	{
308	selec = DEFAULT_SEL(operator);
309	}
310
311	ReleaseVariableStats(vardata);
312
313	CLAMP_PROBABILITY(selec);
314
315	PG_RETURN_FLOAT8((float8) selec);
316	}
317
318	/*
319	* arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
320	*/
321	Datum
322	arraycontjoinsel(PG_FUNCTION_ARGS)
323	{
324	/ For the moment this is just a stub /
325	Oid operator = PG_GETARG_OID(`1`);
326
327	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
328	}
329
330	/*
331	* Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
332	* or "arraycolumn <@ const" based on the statistics
333	*
334	* This function is mainly responsible for extracting the pg_statistic data
335	* to be used; we then pass the problem on to mcelem_array_selec().
336	*/
337	static Selectivity
338	calc_arraycontsel(VariableStatData *vardata, Datum constval,
339	Oid elemtype, Oid operator)
340	{
341	Selectivity selec;
342	TypeCacheEntry *typentry;
343	FmgrInfo *cmpfunc;
344	ArrayType *array;
345
346	/ Get element type's default comparison function /
347	typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
348	if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
349	return DEFAULT_SEL(operator);
350	cmpfunc = &typentry->cmp_proc_finfo;
351
352	/*
353	* The caller made sure the const is an array with same element type, so
354	* get it now
355	*/
356	array = DatumGetArrayTypeP(constval);
357
358	if (HeapTupleIsValid(vardata->statsTuple) &&
359	statistic_proc_security_check(vardata, cmpfunc->fn_oid))
360	{
361	Form_pg_statistic stats;
362	AttStatsSlot sslot;
363	AttStatsSlot hslot;
364
365	stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
366
367	/ MCELEM will be an array of same type as column /
368	if (get_attstatsslot(&sslot, vardata->statsTuple,
369	STATISTIC_KIND_MCELEM, InvalidOid,
370	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS))
371	{
372	/*
373	* For "array <@ const" case we also need histogram of distinct
374	* element counts.
375	*/
376	if (operator != OID_ARRAY_CONTAINED_OP \|\|
377	!get_attstatsslot(&hslot, vardata->statsTuple,
378	STATISTIC_KIND_DECHIST, InvalidOid,
379	ATTSTATSSLOT_NUMBERS))
380	memset(&hslot, `0`, sizeof(hslot));
381
382	/ Use the most-common-elements slot for the array Var. /
383	selec = mcelem_array_selec(array, typentry,
384	sslot.values, sslot.nvalues,
385	sslot.numbers, sslot.nnumbers,
386	hslot.numbers, hslot.nnumbers,
387	operator);
388
389	free_attstatsslot(&hslot);
390	free_attstatsslot(&sslot);
391	}
392	else
393	{
394	/ No most-common-elements info, so do without /
395	selec = mcelem_array_selec(array, typentry,
396	NULL, `0`, NULL, `0`, NULL, `0`,
397	operator);
398	}
399
400	/*
401	* MCE stats count only non-null rows, so adjust for null rows.
402	*/
403	selec *= (`1.0` - stats->stanullfrac);
404	}
405	else
406	{
407	/ No stats at all, so do without /
408	selec = mcelem_array_selec(array, typentry,
409	NULL, `0`, NULL, `0`, NULL, `0`,
410	operator);
411	/ we assume no nulls here, so no stanullfrac correction /
412	}
413
414	/ If constant was toasted, release the copy we made /
415	if (PointerGetDatum(array) != constval)
416	pfree(array);
417
418	return selec;
419	}
420
421	/*
422	* Array selectivity estimation based on most common elements statistics
423	*
424	* This function just deconstructs and sorts the array constant's contents,
425	* and then passes the problem on to mcelem_array_contain_overlap_selec or
426	* mcelem_array_contained_selec depending on the operator.
427	*/
428	static Selectivity
429	mcelem_array_selec(ArrayType array, TypeCacheEntry typentry,
430	Datum mcelem, int* nmcelem,
431	float4 numbers, int* nnumbers,
432	float4 hist, int* nhist,
433	Oid operator)
434	{
435	Selectivity selec;
436	int num_elems;
437	Datum *elem_values;
438	bool *elem_nulls;
439	bool null_present;
440	int nonnull_nitems;
441	int i;
442
443	/*
444	* Prepare constant array data for sorting. Sorting lets us find unique
445	* elements and efficiently merge with the MCELEM array.
446	*/
447	deconstruct_array(array,
448	typentry->type_id,
449	typentry->typlen,
450	typentry->typbyval,
451	typentry->typalign,
452	&elem_values, &elem_nulls, &num_elems);
453
454	/ Collapse out any null elements /
455	nonnull_nitems = `0`;
456	null_present = false;
457	for (i = `0`; i < num_elems; i++)
458	{
459	if (elem_nulls[i])
460	null_present = true;
461	else
462	elem_values[nonnull_nitems++] = elem_values[i];
463	}
464
465	/*
466	* Query "column @> '{anything, null}'" matches nothing. For the other
467	* two operators, presence of a null in the constant can be ignored.
468	*/
469	if (null_present && operator == OID_ARRAY_CONTAINS_OP)
470	{
471	pfree(elem_values);
472	pfree(elem_nulls);
473	return (Selectivity) `0.0`;
474	}
475
476	/ Sort extracted elements using their default comparison function. /
477	qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
478	element_compare, typentry);
479
480	/ Separate cases according to operator /
481	if (operator == OID_ARRAY_CONTAINS_OP \|\| operator == OID_ARRAY_OVERLAP_OP)
482	selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
483	numbers, nnumbers,
484	elem_values, nonnull_nitems,
485	operator, typentry);
486	else if (operator == OID_ARRAY_CONTAINED_OP)
487	selec = mcelem_array_contained_selec(mcelem, nmcelem,
488	numbers, nnumbers,
489	elem_values, nonnull_nitems,
490	hist, nhist,
491	operator, typentry);
492	else
493	{
494	elog(ERROR, "arraycontsel called for unrecognized operator %u",
495	operator);
496	selec = `0.0`; / keep compiler quiet /
497	}
498
499	pfree(elem_values);
500	pfree(elem_nulls);
501	return selec;
502	}
503
504	/*
505	* Estimate selectivity of "column @> const" and "column && const" based on
506	* most common element statistics. This estimation assumes element
507	* occurrences are independent.
508	*
509	* mcelem (of length nmcelem) and numbers (of length nnumbers) are from
510	* the array column's MCELEM statistics slot, or are NULL/0 if stats are
511	* not available. array_data (of length nitems) is the constant's elements.
512	*
513	* Both the mcelem and array_data arrays are assumed presorted according
514	* to the element type's cmpfunc. Null elements are not present.
515	*
516	* TODO: this estimate probably could be improved by using the distinct
517	* elements count histogram. For example, excepting the special case of
518	* "column @> '{}'", we can multiply the calculated selectivity by the
519	* fraction of nonempty arrays in the column.
520	*/
521	static Selectivity
522	mcelem_array_contain_overlap_selec(Datum mcelem, int* nmcelem,
523	float4 numbers, int* nnumbers,
524	Datum array_data, int* nitems,
525	Oid operator, TypeCacheEntry *typentry)
526	{
527	Selectivity selec,
528	elem_selec;
529	int mcelem_index,
530	i;
531	bool use_bsearch;
532	float4 minfreq;
533
534	/*
535	* There should be three more Numbers than Values, because the last three
536	* cells should hold minimal and maximal frequency among the non-null
537	* elements, and then the frequency of null elements. Ignore the Numbers
538	* if not right.
539	*/
540	if (nnumbers != nmcelem + `3`)
541	{
542	numbers = NULL;
543	nnumbers = `0`;
544	}
545
546	if (numbers)
547	{
548	/ Grab the lowest observed frequency /
549	minfreq = numbers[nmcelem];
550	}
551	else
552	{
553	/ Without statistics make some default assumptions /
554	minfreq = `2` * (float4) DEFAULT_CONTAIN_SEL;
555	}
556
557	/ Decide whether it is faster to use binary search or not. /
558	if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
559	use_bsearch = true;
560	else
561	use_bsearch = false;
562
563	if (operator == OID_ARRAY_CONTAINS_OP)
564	{
565	/*
566	* Initial selectivity for "column @> const" query is 1.0, and it will
567	* be decreased with each element of constant array.
568	*/
569	selec = `1.0`;
570	}
571	else
572	{
573	/*
574	* Initial selectivity for "column && const" query is 0.0, and it will
575	* be increased with each element of constant array.
576	*/
577	selec = `0.0`;
578	}
579
580	/ Scan mcelem and array in parallel. /
581	mcelem_index = `0`;
582	for (i = `0`; i < nitems; i++)
583	{
584	bool match = false;
585
586	/ Ignore any duplicates in the array data. /
587	if (i > `0` &&
588	element_compare(&array_data[i - `1`], &array_data[i], typentry) == `0`)
589	continue;
590
591	/ Find the smallest MCELEM >= this array item. /
592	if (use_bsearch)
593	{
594	match = find_next_mcelem(mcelem, nmcelem, array_data[i],
595	&mcelem_index, typentry);
596	}
597	else
598	{
599	while (mcelem_index < nmcelem)
600	{
601	int cmp = element_compare(&mcelem[mcelem_index],
602	&array_data[i],
603	typentry);
604
605	if (cmp < `0`)
606	mcelem_index++;
607	else
608	{
609	if (cmp == `0`)
610	match = true; / mcelem is found /
611	break;
612	}
613	}
614	}
615
616	if (match && numbers)
617	{
618	/ MCELEM matches the array item; use its frequency. /
619	elem_selec = numbers[mcelem_index];
620	mcelem_index++;
621	}
622	else
623	{
624	/*
625	* The element is not in MCELEM. Punt, but assume that the
626	* selectivity cannot be more than minfreq / 2.
627	*/
628	elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / `2`);
629	}
630
631	/*
632	* Update overall selectivity using the current element's selectivity
633	* and an assumption of element occurrence independence.
634	*/
635	if (operator == OID_ARRAY_CONTAINS_OP)
636	selec *= elem_selec;
637	else
638	selec = selec + elem_selec - selec * elem_selec;
639
640	/ Clamp intermediate results to stay sane despite roundoff error /
641	CLAMP_PROBABILITY(selec);
642	}
643
644	return selec;
645	}
646
647	/*
648	* Estimate selectivity of "column <@ const" based on most common element
649	* statistics.
650	*
651	* mcelem (of length nmcelem) and numbers (of length nnumbers) are from
652	* the array column's MCELEM statistics slot, or are NULL/0 if stats are
653	* not available. array_data (of length nitems) is the constant's elements.
654	* hist (of length nhist) is from the array column's DECHIST statistics slot,
655	* or is NULL/0 if those stats are not available.
656	*
657	* Both the mcelem and array_data arrays are assumed presorted according
658	* to the element type's cmpfunc. Null elements are not present.
659	*
660	* Independent element occurrence would imply a particular distribution of
661	* distinct element counts among matching rows. Real data usually falsifies
662	* that assumption. For example, in a set of 11-element integer arrays having
663	* elements in the range [0..10], element occurrences are typically not
664	* independent. If they were, a sufficiently-large set would include all
665	* distinct element counts 0 through 11. We correct for this using the
666	* histogram of distinct element counts.
667	*
668	* In the "column @> const" and "column && const" cases, we usually have a
669	* "const" with low number of elements (otherwise we have selectivity close
670	* to 0 or 1 respectively). That's why the effect of dependence related
671	* to distinct element count distribution is negligible there. In the
672	* "column <@ const" case, number of elements is usually high (otherwise we
673	* have selectivity close to 0). That's why we should do a correction with
674	* the array distinct element count distribution here.
675	*
676	* Using the histogram of distinct element counts produces a different
677	* distribution law than independent occurrences of elements. This
678	* distribution law can be described as follows:
679	*
680	* P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
681	* (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
682	*
683	* where:
684	* o1, o2, ..., on - occurrences of elements 1, 2, ..., n
685	* (1 - occurrence, 0 - no occurrence) in row
686	* f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
687	* (scalar values in [0..1]) according to collected statistics
688	* m = o1 + o2 + ... + on = total number of distinct elements in row
689	* hist[m] - histogram data for occurrence of m elements.
690	* ind[m] - probability of m occurrences from n events assuming their
691	* probabilities to be equal to frequencies of array elements.
692	*
693	* ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
694	* ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) \| o1 + o2 + .. on = m
695	*/
696	static Selectivity
697	mcelem_array_contained_selec(Datum mcelem, int* nmcelem,
698	float4 numbers, int* nnumbers,
699	Datum array_data, int* nitems,
700	float4 hist, int* nhist,
701	Oid operator, TypeCacheEntry *typentry)
702	{
703	int mcelem_index,
704	i,
705	unique_nitems = `0`;
706	float selec,
707	minfreq,
708	nullelem_freq;
709	float *dist,
710	*mcelem_dist,
711	*hist_part;
712	float avg_count,
713	mult,
714	rest;
715	float *elem_selec;
716
717	/*
718	* There should be three more Numbers than Values in the MCELEM slot,
719	* because the last three cells should hold minimal and maximal frequency
720	* among the non-null elements, and then the frequency of null elements.
721	* Punt if not right, because we can't do much without the element freqs.
722	*/
723	if (numbers == NULL \|\| nnumbers != nmcelem + `3`)
724	return DEFAULT_CONTAIN_SEL;
725
726	/ Can't do much without a count histogram, either /
727	if (hist == NULL \|\| nhist < `3`)
728	return DEFAULT_CONTAIN_SEL;
729
730	/*
731	* Grab some of the summary statistics that compute_array_stats() stores:
732	* lowest frequency, frequency of null elements, and average distinct
733	* element count.
734	*/
735	minfreq = numbers[nmcelem];
736	nullelem_freq = numbers[nmcelem + `2`];
737	avg_count = hist[nhist - `1`];
738
739	/*
740	* "rest" will be the sum of the frequencies of all elements not
741	* represented in MCELEM. The average distinct element count is the sum
742	* of the frequencies of all elements. Begin with that; we will proceed
743	* to subtract the MCELEM frequencies.
744	*/
745	rest = avg_count;
746
747	/*
748	* mult is a multiplier representing estimate of probability that each
749	* mcelem that is not present in constant doesn't occur.
750	*/
751	mult = `1.0f`;
752
753	/*
754	* elem_selec is array of estimated frequencies for elements in the
755	* constant.
756	*/
757	elem_selec = (float ) palloc(sizeof(float) nitems);
758
759	/ Scan mcelem and array in parallel. /
760	mcelem_index = `0`;
761	for (i = `0`; i < nitems; i++)
762	{
763	bool match = false;
764
765	/ Ignore any duplicates in the array data. /
766	if (i > `0` &&
767	element_compare(&array_data[i - `1`], &array_data[i], typentry) == `0`)
768	continue;
769
770	/*
771	* Iterate over MCELEM until we find an entry greater than or equal to
772	* this element of the constant. Update "rest" and "mult" for mcelem
773	* entries skipped over.
774	*/
775	while (mcelem_index < nmcelem)
776	{
777	int cmp = element_compare(&mcelem[mcelem_index],
778	&array_data[i],
779	typentry);
780
781	if (cmp < `0`)
782	{
783	mult *= (`1.0f` - numbers[mcelem_index]);
784	rest -= numbers[mcelem_index];
785	mcelem_index++;
786	}
787	else
788	{
789	if (cmp == `0`)
790	match = true; / mcelem is found /
791	break;
792	}
793	}
794
795	if (match)
796	{
797	/ MCELEM matches the array item. /
798	elem_selec[unique_nitems] = numbers[mcelem_index];
799	/ "rest" is decremented for all mcelems, matched or not /
800	rest -= numbers[mcelem_index];
801	mcelem_index++;
802	}
803	else
804	{
805	/*
806	* The element is not in MCELEM. Punt, but assume that the
807	* selectivity cannot be more than minfreq / 2.
808	*/
809	elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
810	minfreq / `2`);
811	}
812
813	unique_nitems++;
814	}
815
816	/*
817	* If we handled all constant elements without exhausting the MCELEM
818	* array, finish walking it to complete calculation of "rest" and "mult".
819	*/
820	while (mcelem_index < nmcelem)
821	{
822	mult *= (`1.0f` - numbers[mcelem_index]);
823	rest -= numbers[mcelem_index];
824	mcelem_index++;
825	}
826
827	/*
828	* The presence of many distinct rare elements materially decreases
829	* selectivity. Use the Poisson distribution to estimate the probability
830	* of a column value having zero occurrences of such elements. See above
831	* for the definition of "rest".
832	*/
833	mult *= exp(-rest);
834
835	/----------*
836	* Using the distinct element count histogram requires
837	* O(unique_nitems * (nmcelem + unique_nitems))
838	* operations. Beyond a certain computational cost threshold, it's
839	* reasonable to sacrifice accuracy for decreased planning time. We limit
840	* the number of operations to EFFORT * nmcelem; since nmcelem is limited
841	* by the column's statistics target, the work done is user-controllable.
842	*
843	* If the number of operations would be too large, we can reduce it
844	* without losing all accuracy by reducing unique_nitems and considering
845	* only the most-common elements of the constant array. To make the
846	* results exactly match what we would have gotten with only those
847	* elements to start with, we'd have to remove any discarded elements'
848	* frequencies from "mult", but since this is only an approximation
849	* anyway, we don't bother with that. Therefore it's sufficient to qsort
850	* elem_selec[] and take the largest elements. (They will no longer match
851	* up with the elements of array_data[], but we don't care.)
852	*----------
853	*/
854	#define EFFORT 100
855
856	if ((nmcelem + unique_nitems) > `0` &&
857	unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
858	{
859	/*
860	* Use the quadratic formula to solve for largest allowable N. We
861	* have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
862	*/
863	double b = (double) nmcelem;
864	int n;
865
866	n = (int) ((sqrt(b * b + `4` * EFFORT * b) - b) / `2`);
867
868	/ Sort, then take just the first n elements /
869	qsort(elem_selec, unique_nitems, sizeof(float),
870	float_compare_desc);
871	unique_nitems = n;
872	}
873
874	/*
875	* Calculate probabilities of each distinct element count for both mcelems
876	* and constant elements. At this point, assume independent element
877	* occurrence.
878	*/
879	dist = calc_distr(elem_selec, unique_nitems, unique_nitems, `0.0f`);
880	mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
881
882	/ ignore hist[nhist-1], which is the average not a histogram member /
883	hist_part = calc_hist(hist, nhist - `1`, unique_nitems);
884
885	selec = `0.0f`;
886	for (i = `0`; i <= unique_nitems; i++)
887	{
888	/*
889	* mult * dist[i] / mcelem_dist[i] gives us probability of qual
890	* matching from assumption of independent element occurrence with the
891	* condition that distinct element count = i.
892	*/
893	if (mcelem_dist[i] > `0`)
894	selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
895	}
896
897	pfree(dist);
898	pfree(mcelem_dist);
899	pfree(hist_part);
900	pfree(elem_selec);
901
902	/ Take into account occurrence of NULL element. /
903	selec *= (`1.0f` - nullelem_freq);
904
905	CLAMP_PROBABILITY(selec);
906
907	return selec;
908	}
909
910	/*
911	* Calculate the first n distinct element count probabilities from a
912	* histogram of distinct element counts.
913	*
914	* Returns a palloc'd array of n+1 entries, with array[k] being the
915	* probability of element count k, k in [0..n].
916	*
917	* We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
918	* (nhist - 1)) probability to each value in (a,b) and an additional half of
919	* that to a and b themselves.
920	*/
921	static float *
922	calc_hist(const float4 hist, int* nhist, int n)
923	{
924	float *hist_part;
925	int k,
926	i = `0`;
927	float prev_interval = `0`,
928	next_interval;
929	float frac;
930
931	hist_part = (float ) palloc((n + `1`) sizeof(float));
932
933	/*
934	* frac is a probability contribution for each interval between histogram
935	* values. We have nhist - 1 intervals, so contribution of each one will
936	* be 1 / (nhist - 1).
937	*/
938	frac = `1.0f` / ((float) (nhist - `1`));
939
940	for (k = `0`; k <= n; k++)
941	{
942	int count = `0`;
943
944	/*
945	* Count the histogram boundaries equal to k. (Although the histogram
946	* should theoretically contain only exact integers, entries are
947	* floats so there could be roundoff error in large values. Treat any
948	* fractional value as equal to the next larger k.)
949	*/
950	while (i < nhist && hist[i] <= k)
951	{
952	count++;
953	i++;
954	}
955
956	if (count > `0`)
957	{
958	/ k is an exact bound for at least one histogram box. /
959	float val;
960
961	/ Find length between current histogram value and the next one /
962	if (i < nhist)
963	next_interval = hist[i] - hist[i - `1`];
964	else
965	next_interval = `0`;
966
967	/*
968	* count - 1 histogram boxes contain k exclusively. They
969	* contribute a total of (count - 1) * frac probability. Also
970	* factor in the partial histogram boxes on either side.
971	*/
972	val = (float) (count - `1`);
973	if (next_interval > `0`)
974	val += `0.5f` / next_interval;
975	if (prev_interval > `0`)
976	val += `0.5f` / prev_interval;
977	hist_part[k] = frac * val;
978
979	prev_interval = next_interval;
980	}
981	else
982	{
983	/ k does not appear as an exact histogram bound. /
984	if (prev_interval > `0`)
985	hist_part[k] = frac / prev_interval;
986	else
987	hist_part[k] = `0.0f`;
988	}
989	}
990
991	return hist_part;
992	}
993
994	/*
995	* Consider n independent events with probabilities p[]. This function
996	* calculates probabilities of exact k of events occurrence for k in [0..m].
997	* Returns a palloc'd array of size m+1.
998	*
999	* "rest" is the sum of the probabilities of all low-probability events not
1000	* included in p.
1001	*
1002	* Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1003	* probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1004	* For any constant j, each increment of i increases the probability iff the
1005	* event occurs. So, by the law of total probability:
1006	* M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1007	* for i > 0, j > 0.
1008	* M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1009	*/
1010	static float *
1011	calc_distr(const float p, int* n, int m, float rest)
1012	{
1013	float *row,
1014	*prev_row,
1015	*tmp;
1016	int i,
1017	j;
1018
1019	/*
1020	* Since we return only the last row of the matrix and need only the
1021	* current and previous row for calculations, allocate two rows.
1022	*/
1023	row = (float ) palloc((m + `1`) sizeof(float));
1024	prev_row = (float ) palloc((m + `1`) sizeof(float));
1025
1026	/ M[0,0] = 1 /
1027	row[`0`] = `1.0f`;
1028	for (i = `1`; i <= n; i++)
1029	{
1030	float t = p[i - `1`];
1031
1032	/ Swap rows /
1033	tmp = row;
1034	row = prev_row;
1035	prev_row = tmp;
1036
1037	/ Calculate next row /
1038	for (j = `0`; j <= i && j <= m; j++)
1039	{
1040	float val = `0.0f`;
1041
1042	if (j < i)
1043	val += prev_row[j] * (`1.0f` - t);
1044	if (j > `0`)
1045	val += prev_row[j - `1`] * t;
1046	row[j] = val;
1047	}
1048	}
1049
1050	/*
1051	* The presence of many distinct rare (not in "p") elements materially
1052	* decreases selectivity. Model their collective occurrence with the
1053	* Poisson distribution.
1054	*/
1055	if (rest > DEFAULT_CONTAIN_SEL)
1056	{
1057	float t;
1058
1059	/ Swap rows /
1060	tmp = row;
1061	row = prev_row;
1062	prev_row = tmp;
1063
1064	for (i = `0`; i <= m; i++)
1065	row[i] = `0.0f`;
1066
1067	/ Value of Poisson distribution for 0 occurrences /
1068	t = exp(-rest);
1069
1070	/*
1071	* Calculate convolution of previously computed distribution and the
1072	* Poisson distribution.
1073	*/
1074	for (i = `0`; i <= m; i++)
1075	{
1076	for (j = `0`; j <= m - i; j++)
1077	row[j + i] += prev_row[j] * t;
1078
1079	/ Get Poisson distribution value for (i + 1) occurrences /
1080	t = rest / (float*) (i + `1`);
1081	}
1082	}
1083
1084	pfree(prev_row);
1085	return row;
1086	}
1087
1088	/ Fast function for floor value of 2 based logarithm calculation. /
1089	static int
1090	floor_log2(uint32 n)
1091	{
1092	int logval = `0`;
1093
1094	if (n == `0`)
1095	return -`1`;
1096	if (n >= (`1` << `16`))
1097	{
1098	n >>= `16`;
1099	logval += `16`;
1100	}
1101	if (n >= (`1` << `8`))
1102	{
1103	n >>= `8`;
1104	logval += `8`;
1105	}
1106	if (n >= (`1` << `4`))
1107	{
1108	n >>= `4`;
1109	logval += `4`;
1110	}
1111	if (n >= (`1` << `2`))
1112	{
1113	n >>= `2`;
1114	logval += `2`;
1115	}
1116	if (n >= (`1` << `1`))
1117	{
1118	logval += `1`;
1119	}
1120	return logval;
1121	}
1122
1123	/*
1124	* find_next_mcelem binary-searches a most common elements array, starting
1125	* from *index, for the first member >= value. It saves the position of the
1126	* match into *index and returns true if it's an exact match. (Note: we
1127	* assume the mcelem elements are distinct so there can't be more than one
1128	* exact match.)
1129	*/
1130	static bool
1131	find_next_mcelem(Datum mcelem, int* nmcelem, Datum value, int *index,
1132	TypeCacheEntry *typentry)
1133	{
1134	int l = *index,
1135	r = nmcelem - `1`,
1136	i,
1137	res;
1138
1139	while (l <= r)
1140	{
1141	i = (l + r) / `2`;
1142	res = element_compare(&mcelem[i], &value, typentry);
1143	if (res == `0`)
1144	{
1145	*index = i;
1146	return true;
1147	}
1148	else if (res < `0`)
1149	l = i + `1`;
1150	else
1151	r = i - `1`;
1152	}
1153	*index = l;
1154	return false;
1155	}
1156
1157	/*
1158	* Comparison function for elements.
1159	*
1160	* We use the element type's default btree opclass, and its default collation
1161	* if the type is collation-sensitive.
1162	*
1163	* XXX consider using SortSupport infrastructure
1164	*/
1165	static int
1166	element_compare(const void key1, const* void key2, void* *arg)
1167	{
1168	Datum d1 = ((const* Datum *) key1);
1169	Datum d2 = ((const* Datum *) key2);
1170	TypeCacheEntry typentry = (TypeCacheEntry ) arg;
1171	FmgrInfo *cmpfunc = &typentry->cmp_proc_finfo;
1172	Datum c;
1173
1174	c = FunctionCall2Coll(cmpfunc, typentry->typcollation, d1, d2);
1175	return DatumGetInt32(c);
1176	}
1177
/*
 * Comparison function for sorting floats into descending order.
 *
 * key1 and key2 point to the floats to compare.  Returns -1 if *key1 is
 * larger, 1 if it is smaller, and 0 otherwise.
 */
static int
float_compare_desc(const void *key1, const void *key2)
{
	float		d1 = *((const float *) key1);
	float		d2 = *((const float *) key2);

	if (d1 > d2)
		return -1;
	else if (d1 < d2)
		return 1;
	else
		return 0;
}
1194

Browse the source code of PostgreSQL/src/backend/utils/adt/array_selfuncs.c