extended_stats.c source code [PostgreSQL/src/backend/statistics/extended_stats.c]

1	/-------------------------------------------------------------------------*
2	*
3	* extended_stats.c
4	* POSTGRES extended statistics
5	*
6	* Generic code supporting statistics objects created via CREATE STATISTICS.
7	*
8	*
9	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10	* Portions Copyright (c) 1994, Regents of the University of California
11	*
12	* IDENTIFICATION
13	* src/backend/statistics/extended_stats.c
14	*
15	*-------------------------------------------------------------------------
16	*/
17	#include "postgres.h"
18
19	#include "access/genam.h"
20	#include "access/htup_details.h"
21	#include "access/table.h"
22	#include "access/tuptoaster.h"
23	#include "catalog/indexing.h"
24	#include "catalog/pg_collation.h"
25	#include "catalog/pg_statistic_ext.h"
26	#include "catalog/pg_statistic_ext_data.h"
27	#include "miscadmin.h"
28	#include "nodes/nodeFuncs.h"
29	#include "optimizer/clauses.h"
30	#include "optimizer/optimizer.h"
31	#include "postmaster/autovacuum.h"
32	#include "statistics/extended_stats_internal.h"
33	#include "statistics/statistics.h"
34	#include "utils/builtins.h"
35	#include "utils/fmgroids.h"
36	#include "utils/lsyscache.h"
37	#include "utils/memutils.h"
38	#include "utils/rel.h"
39	#include "utils/selfuncs.h"
40	#include "utils/syscache.h"
41
42	/*
43	* To avoid consuming too much memory during analysis and/or too much space
44	* in the resulting pg_statistic rows, we ignore varlena datums that are wider
45	* than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV
46	* and distinct-value calculations since a wide value is unlikely to be
47	* duplicated at all, much less be a most-common value. For the same reason,
48	* ignoring wide values will not affect our estimates of histogram bin
49	* boundaries very much.
50	*/
51	#define WIDTH_THRESHOLD 1024
52
53	/*
54	* Used internally to refer to an individual statistics object, i.e.,
55	* a pg_statistic_ext entry.
56	*/
57	typedef struct StatExtEntry
58	{
59	Oid statOid; / OID of pg_statistic_ext entry /
60	char schema; /* statistics object's schema /
61	char name; /* statistics object's name /
62	Bitmapset columns; /* attribute numbers covered by the object /
63	List types; /* 'char' list of enabled statistic kinds /
64	} StatExtEntry;
65
66
67	static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
68	static VacAttrStats *lookup_var_attr_stats(Relation rel, Bitmapset attrs,
69	int nvacatts, VacAttrStats **vacatts);
70	static void statext_store(Oid relid,
71	MVNDistinct ndistinct, MVDependencies dependencies,
72	MCVList mcv, VacAttrStats *stats);
73
74
75	/*
76	* Compute requested extended stats, using the rows sampled for the plain
77	* (single-column) stats.
78	*
79	* This fetches a list of stats types from pg_statistic_ext, computes the
80	* requested stats, and serializes them back into the catalog.
81	*/
82	void
83	BuildRelationExtStatistics(Relation onerel, double totalrows,
84	int numrows, HeapTuple *rows,
85	int natts, VacAttrStats **vacattrstats)
86	{
87	Relation pg_stext;
88	ListCell *lc;
89	List *stats;
90	MemoryContext cxt;
91	MemoryContext oldcxt;
92
93	cxt = AllocSetContextCreate(CurrentMemoryContext,
94	"BuildRelationExtStatistics",
95	ALLOCSET_DEFAULT_SIZES);
96	oldcxt = MemoryContextSwitchTo(cxt);
97
98	pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
99	stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
100
101	foreach(lc, stats)
102	{
103	StatExtEntry stat = (StatExtEntry ) lfirst(lc);
104	MVNDistinct *ndistinct = NULL;
105	MVDependencies *dependencies = NULL;
106	MCVList *mcv = NULL;
107	VacAttrStats **stats;
108	ListCell *lc2;
109
110	/*
111	* Check if we can build these stats based on the column analyzed. If
112	* not, report this fact (except in autovacuum) and move on.
113	*/
114	stats = lookup_var_attr_stats(onerel, stat->columns,
115	natts, vacattrstats);
116	if (!stats)
117	{
118	if (!IsAutoVacuumWorkerProcess())
119	ereport(WARNING,
120	(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
121	errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"",
122	stat->schema, stat->name,
123	get_namespace_name(onerel->rd_rel->relnamespace),
124	RelationGetRelationName(onerel)),
125	errtable(onerel)));
126	continue;
127	}
128
129	/ check allowed number of dimensions /
130	Assert(bms_num_members(stat->columns) >= `2` &&
131	bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
132
133	/ compute statistic of each requested type /
134	foreach(lc2, stat->types)
135	{
136	char t = (char) lfirst_int(lc2);
137
138	if (t == STATS_EXT_NDISTINCT)
139	ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
140	stat->columns, stats);
141	else if (t == STATS_EXT_DEPENDENCIES)
142	dependencies = statext_dependencies_build(numrows, rows,
143	stat->columns, stats);
144	else if (t == STATS_EXT_MCV)
145	mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
146	totalrows);
147	}
148
149	/ store the statistics in the catalog /
150	statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
151	}
152
153	table_close(pg_stext, RowExclusiveLock);
154
155	MemoryContextSwitchTo(oldcxt);
156	MemoryContextDelete(cxt);
157	}
158
159	/*
160	* statext_is_kind_built
161	* Is this stat kind built in the given pg_statistic_ext_data tuple?
162	*/
163	bool
164	statext_is_kind_built(HeapTuple htup, char type)
165	{
166	AttrNumber attnum;
167
168	switch (type)
169	{
170	case STATS_EXT_NDISTINCT:
171	attnum = Anum_pg_statistic_ext_data_stxdndistinct;
172	break;
173
174	case STATS_EXT_DEPENDENCIES:
175	attnum = Anum_pg_statistic_ext_data_stxddependencies;
176	break;
177
178	case STATS_EXT_MCV:
179	attnum = Anum_pg_statistic_ext_data_stxdmcv;
180	break;
181
182	default:
183	elog(ERROR, "unexpected statistics type requested: %d", type);
184	}
185
186	return !heap_attisnull(htup, attnum, NULL);
187	}
188
189	/*
190	* Return a list (of StatExtEntry) of statistics objects for the given relation.
191	*/
192	static List *
193	fetch_statentries_for_relation(Relation pg_statext, Oid relid)
194	{
195	SysScanDesc scan;
196	ScanKeyData skey;
197	HeapTuple htup;
198	List *result = NIL;
199
200	/*
201	* Prepare to scan pg_statistic_ext for entries having stxrelid = this
202	* rel.
203	*/
204	ScanKeyInit(&skey,
205	Anum_pg_statistic_ext_stxrelid,
206	BTEqualStrategyNumber, F_OIDEQ,
207	ObjectIdGetDatum(relid));
208
209	scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true,
210	NULL, `1`, &skey);
211
212	while (HeapTupleIsValid(htup = systable_getnext(scan)))
213	{
214	StatExtEntry *entry;
215	Datum datum;
216	bool isnull;
217	int i;
218	ArrayType *arr;
219	char *enabled;
220	Form_pg_statistic_ext staForm;
221
222	entry = palloc0(sizeof(StatExtEntry));
223	staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
224	entry->statOid = staForm->oid;
225	entry->schema = get_namespace_name(staForm->stxnamespace);
226	entry->name = pstrdup(NameStr(staForm->stxname));
227	for (i = `0`; i < staForm->stxkeys.dim1; i++)
228	{
229	entry->columns = bms_add_member(entry->columns,
230	staForm->stxkeys.values[i]);
231	}
232
233	/ decode the stxkind char array into a list of chars /
234	datum = SysCacheGetAttr(STATEXTOID, htup,
235	Anum_pg_statistic_ext_stxkind, &isnull);
236	Assert(!isnull);
237	arr = DatumGetArrayTypeP(datum);
238	if (ARR_NDIM(arr) != `1` \|\|
239	ARR_HASNULL(arr) \|\|
240	ARR_ELEMTYPE(arr) != CHAROID)
241	elog(ERROR, "stxkind is not a 1-D char array");
242	enabled = (char *) ARR_DATA_PTR(arr);
243	for (i = `0`; i < ARR_DIMS(arr)[`0`]; i++)
244	{
245	Assert((enabled[i] == STATS_EXT_NDISTINCT) \|\|
246	(enabled[i] == STATS_EXT_DEPENDENCIES) \|\|
247	(enabled[i] == STATS_EXT_MCV));
248	entry->types = lappend_int(entry->types, (int) enabled[i]);
249	}
250
251	result = lappend(result, entry);
252	}
253
254	systable_endscan(scan);
255
256	return result;
257	}
258
259	/*
260	* Using 'vacatts' of size 'nvacatts' as input data, return a newly built
261	* VacAttrStats array which includes only the items corresponding to
262	* attributes indicated by 'stxkeys'. If we don't have all of the per column
263	* stats available to compute the extended stats, then we return NULL to indicate
264	* to the caller that the stats should not be built.
265	*/
266	static VacAttrStats **
267	lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
268	int nvacatts, VacAttrStats **vacatts)
269	{
270	int i = `0`;
271	int x = -`1`;
272	VacAttrStats **stats;
273
274	stats = (VacAttrStats **)
275	palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
276
277	/ lookup VacAttrStats info for the requested columns (same attnum) /
278	while ((x = bms_next_member(attrs, x)) >= `0`)
279	{
280	int j;
281
282	stats[i] = NULL;
283	for (j = `0`; j < nvacatts; j++)
284	{
285	if (x == vacatts[j]->tupattnum)
286	{
287	stats[i] = vacatts[j];
288	break;
289	}
290	}
291
292	if (!stats[i])
293	{
294	/*
295	* Looks like stats were not gathered for one of the columns
296	* required. We'll be unable to build the extended stats without
297	* this column.
298	*/
299	pfree(stats);
300	return NULL;
301	}
302
303	/*
304	* Sanity check that the column is not dropped - stats should have
305	* been removed in this case.
306	*/
307	Assert(!stats[i]->attr->attisdropped);
308
309	i++;
310	}
311
312	return stats;
313	}
314
315	/*
316	* statext_store
317	* Serializes the statistics and stores them into the pg_statistic_ext_data
318	* tuple.
319	*/
320	static void
321	statext_store(Oid statOid,
322	MVNDistinct ndistinct, MVDependencies dependencies,
323	MCVList mcv, VacAttrStats *stats)
324	{
325	HeapTuple stup,
326	oldtup;
327	Datum values[Natts_pg_statistic_ext_data];
328	bool nulls[Natts_pg_statistic_ext_data];
329	bool replaces[Natts_pg_statistic_ext_data];
330	Relation pg_stextdata;
331
332	memset(nulls, true, sizeof(nulls));
333	memset(replaces, false, sizeof(replaces));
334	memset(values, `0`, sizeof(values));
335
336	/*
337	* Construct a new pg_statistic_ext_data tuple, replacing the calculated
338	* stats.
339	*/
340	if (ndistinct != NULL)
341	{
342	bytea *data = statext_ndistinct_serialize(ndistinct);
343
344	nulls[Anum_pg_statistic_ext_data_stxdndistinct - `1`] = (data == NULL);
345	values[Anum_pg_statistic_ext_data_stxdndistinct - `1`] = PointerGetDatum(data);
346	}
347
348	if (dependencies != NULL)
349	{
350	bytea *data = statext_dependencies_serialize(dependencies);
351
352	nulls[Anum_pg_statistic_ext_data_stxddependencies - `1`] = (data == NULL);
353	values[Anum_pg_statistic_ext_data_stxddependencies - `1`] = PointerGetDatum(data);
354	}
355	if (mcv != NULL)
356	{
357	bytea *data = statext_mcv_serialize(mcv, stats);
358
359	nulls[Anum_pg_statistic_ext_data_stxdmcv - `1`] = (data == NULL);
360	values[Anum_pg_statistic_ext_data_stxdmcv - `1`] = PointerGetDatum(data);
361	}
362
363	/ always replace the value (either by bytea or NULL) /
364	replaces[Anum_pg_statistic_ext_data_stxdndistinct - `1`] = true;
365	replaces[Anum_pg_statistic_ext_data_stxddependencies - `1`] = true;
366	replaces[Anum_pg_statistic_ext_data_stxdmcv - `1`] = true;
367
368	/ there should already be a pg_statistic_ext_data tuple /
369	oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
370	if (!HeapTupleIsValid(oldtup))
371	elog(ERROR, "cache lookup failed for statistics object %u", statOid);
372
373	/ replace it /
374	pg_stextdata = table_open(StatisticExtDataRelationId, RowExclusiveLock);
375
376	stup = heap_modify_tuple(oldtup,
377	RelationGetDescr(pg_stextdata),
378	values,
379	nulls,
380	replaces);
381	ReleaseSysCache(oldtup);
382	CatalogTupleUpdate(pg_stextdata, &stup->t_self, stup);
383
384	heap_freetuple(stup);
385
386	table_close(pg_stextdata, RowExclusiveLock);
387	}
388
389	/ initialize multi-dimensional sort /
390	MultiSortSupport
391	multi_sort_init(int ndims)
392	{
393	MultiSortSupport mss;
394
395	Assert(ndims >= `2`);
396
397	mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup)
398	+ sizeof(SortSupportData) * ndims);
399
400	mss->ndims = ndims;
401
402	return mss;
403	}
404
405	/*
406	* Prepare sort support info using the given sort operator and collation
407	* at the position 'sortdim'
408	*/
409	void
410	multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
411	Oid oper, Oid collation)
412	{
413	SortSupport ssup = &mss->ssup[sortdim];
414
415	ssup->ssup_cxt = CurrentMemoryContext;
416	ssup->ssup_collation = collation;
417	ssup->ssup_nulls_first = false;
418
419	PrepareSortSupportFromOrderingOp(oper, ssup);
420	}
421
422	/ compare all the dimensions in the selected order /
423	int
424	multi_sort_compare(const void a, const* void b, void* *arg)
425	{
426	MultiSortSupport mss = (MultiSortSupport) arg;
427	SortItem ia = (SortItem ) a;
428	SortItem ib = (SortItem ) b;
429	int i;
430
431	for (i = `0`; i < mss->ndims; i++)
432	{
433	int compare;
434
435	compare = ApplySortComparator(ia->values[i], ia->isnull[i],
436	ib->values[i], ib->isnull[i],
437	&mss->ssup[i]);
438
439	if (compare != `0`)
440	return compare;
441	}
442
443	/ equal by default /
444	return `0`;
445	}
446
447	/ compare selected dimension /
448	int
449	multi_sort_compare_dim(int dim, const SortItem a, const* SortItem *b,
450	MultiSortSupport mss)
451	{
452	return ApplySortComparator(a->values[dim], a->isnull[dim],
453	b->values[dim], b->isnull[dim],
454	&mss->ssup[dim]);
455	}
456
457	int
458	multi_sort_compare_dims(int start, int end,
459	const SortItem a, const* SortItem *b,
460	MultiSortSupport mss)
461	{
462	int dim;
463
464	for (dim = start; dim <= end; dim++)
465	{
466	int r = ApplySortComparator(a->values[dim], a->isnull[dim],
467	b->values[dim], b->isnull[dim],
468	&mss->ssup[dim]);
469
470	if (r != `0`)
471	return r;
472	}
473
474	return `0`;
475	}
476
477	int
478	compare_scalars_simple(const void a, const* void b, void* *arg)
479	{
480	return compare_datums_simple((Datum ) a,
481	(Datum ) b,
482	(SortSupport) arg);
483	}
484
485	int
486	compare_datums_simple(Datum a, Datum b, SortSupport ssup)
487	{
488	return ApplySortComparator(a, false, b, false, ssup);
489	}
490
491	/ simple counterpart to qsort_arg /
492	void *
493	bsearch_arg(const void key, const* void *base, size_t nmemb, size_t size,
494	int (compar) (const* void , const* void , void* *),
495	void *arg)
496	{
497	size_t l,
498	u,
499	idx;
500	const void *p;
501	int comparison;
502
503	l = `0`;
504	u = nmemb;
505	while (l < u)
506	{
507	idx = (l + u) / `2`;
508	p = (void ) (((const* char ) base) + (idx size));
509	comparison = (*compar) (key, p, arg);
510
511	if (comparison < `0`)
512	u = idx;
513	else if (comparison > `0`)
514	l = idx + `1`;
515	else
516	return (void *) p;
517	}
518
519	return NULL;
520	}
521
522	/*
523	* build_attnums_array
524	* Transforms a bitmap into an array of AttrNumber values.
525	*
526	* This is used for extended statistics only, so all the attribute must be
527	* user-defined. That means offsetting by FirstLowInvalidHeapAttributeNumber
528	* is not necessary here (and when querying the bitmap).
529	*/
530	AttrNumber *
531	build_attnums_array(Bitmapset attrs, int* *numattrs)
532	{
533	int i,
534	j;
535	AttrNumber *attnums;
536	int num = bms_num_members(attrs);
537
538	if (numattrs)
539	*numattrs = num;
540
541	/ build attnums from the bitmapset /
542	attnums = (AttrNumber ) palloc(sizeof(AttrNumber) num);
543	i = `0`;
544	j = -`1`;
545	while ((j = bms_next_member(attrs, j)) >= `0`)
546	{
547	/*
548	* Make sure the bitmap contains only user-defined attributes. As
549	* bitmaps can't contain negative values, this can be violated in two
550	* ways. Firstly, the bitmap might contain 0 as a member, and secondly
551	* the integer value might be larger than MaxAttrNumber.
552	*/
553	Assert(AttrNumberIsForUserDefinedAttr(j));
554	Assert(j <= MaxAttrNumber);
555
556	attnums[i++] = (AttrNumber) j;
557
558	/ protect against overflows /
559	Assert(i <= num);
560	}
561
562	return attnums;
563	}
564
565	/*
566	* build_sorted_items
567	* build a sorted array of SortItem with values from rows
568	*
569	* Note: All the memory is allocated in a single chunk, so that the caller
570	* can simply pfree the return value to release all of it.
571	*/
572	SortItem *
573	build_sorted_items(int numrows, int nitems, HeapTuple rows, TupleDesc tdesc,
574	MultiSortSupport mss, int numattrs, AttrNumber *attnums)
575	{
576	int i,
577	j,
578	len,
579	idx;
580	int nvalues = numrows * numattrs;
581
582	SortItem *items;
583	Datum *values;
584	bool *isnull;
585	char *ptr;
586
587	/ Compute the total amount of memory we need (both items and values). /
588	len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
589
590	/ Allocate the memory and split it into the pieces. /
591	ptr = palloc0(len);
592
593	/ items to sort /
594	items = (SortItem *) ptr;
595	ptr += numrows * sizeof(SortItem);
596
597	/ values and null flags /
598	values = (Datum *) ptr;
599	ptr += nvalues * sizeof(Datum);
600
601	isnull = (bool *) ptr;
602	ptr += nvalues * sizeof(bool);
603
604	/ make sure we consumed the whole buffer exactly /
605	Assert((ptr - (char *) items) == len);
606
607	/ fix the pointers to Datum and bool arrays /
608	idx = `0`;
609	for (i = `0`; i < numrows; i++)
610	{
611	bool toowide = false;
612
613	items[idx].values = &values[idx * numattrs];
614	items[idx].isnull = &isnull[idx * numattrs];
615
616	/ load the values/null flags from sample rows /
617	for (j = `0`; j < numattrs; j++)
618	{
619	Datum value;
620	bool isnull;
621
622	value = heap_getattr(rows[i], attnums[j], tdesc, &isnull);
623
624	/*
625	* If this is a varlena value, check if it's too wide and if yes
626	* then skip the whole item. Otherwise detoast the value.
627	*
628	* XXX It may happen that we've already detoasted some preceding
629	* values for the current item. We don't bother to cleanup those
630	* on the assumption that those are small (below WIDTH_THRESHOLD)
631	* and will be discarded at the end of analyze.
632	*/
633	if ((!isnull) &&
634	(TupleDescAttr(tdesc, attnums[j] - `1`)->attlen == -`1`))
635	{
636	if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
637	{
638	toowide = true;
639	break;
640	}
641
642	value = PointerGetDatum(PG_DETOAST_DATUM(value));
643	}
644
645	items[idx].values[j] = value;
646	items[idx].isnull[j] = isnull;
647	}
648
649	if (toowide)
650	continue;
651
652	idx++;
653	}
654
655	/ store the actual number of items (ignoring the too-wide ones) /
656	*nitems = idx;
657
658	/ all items were too wide /
659	if (idx == `0`)
660	{
661	/ everything is allocated as a single chunk /
662	pfree(items);
663	return NULL;
664	}
665
666	/ do the sort, using the multi-sort /
667	qsort_arg((void ) items, idx, sizeof*(SortItem),
668	multi_sort_compare, mss);
669
670	return items;
671	}
672
673	/*
674	* has_stats_of_kind
675	* Check whether the list contains statistic of a given kind
676	*/
677	bool
678	has_stats_of_kind(List stats, char* requiredkind)
679	{
680	ListCell *l;
681
682	foreach(l, stats)
683	{
684	StatisticExtInfo stat = (StatisticExtInfo ) lfirst(l);
685
686	if (stat->kind == requiredkind)
687	return true;
688	}
689
690	return false;
691	}
692
693	/*
694	* choose_best_statistics
695	* Look for and return statistics with the specified 'requiredkind' which
696	* have keys that match at least two of the given attnums. Return NULL if
697	* there's no match.
698	*
699	* The current selection criteria is very simple - we choose the statistics
700	* object referencing the most of the requested attributes, breaking ties
701	* in favor of objects with fewer keys overall.
702	*
703	* XXX If multiple statistics objects tie on both criteria, then which object
704	* is chosen depends on the order that they appear in the stats list. Perhaps
705	* further tiebreakers are needed.
706	*/
707	StatisticExtInfo *
708	choose_best_statistics(List stats, Bitmapset attnums, char requiredkind)
709	{
710	ListCell *lc;
711	StatisticExtInfo *best_match = NULL;
712	int best_num_matched = `2`; / goal #1: maximize /
713	int best_match_keys = (STATS_MAX_DIMENSIONS + `1`); / goal #2: minimize /
714
715	foreach(lc, stats)
716	{
717	StatisticExtInfo info = (StatisticExtInfo ) lfirst(lc);
718	int num_matched;
719	int numkeys;
720	Bitmapset *matched;
721
722	/ skip statistics that are not of the correct type /
723	if (info->kind != requiredkind)
724	continue;
725
726	/ determine how many attributes of these stats can be matched to /
727	matched = bms_intersect(attnums, info->keys);
728	num_matched = bms_num_members(matched);
729	bms_free(matched);
730
731	/*
732	* save the actual number of keys in the stats so that we can choose
733	* the narrowest stats with the most matching keys.
734	*/
735	numkeys = bms_num_members(info->keys);
736
737	/*
738	* Use this object when it increases the number of matched clauses or
739	* when it matches the same number of attributes but these stats have
740	* fewer keys than any previous match.
741	*/
742	if (num_matched > best_num_matched \|\|
743	(num_matched == best_num_matched && numkeys < best_match_keys))
744	{
745	best_match = info;
746	best_num_matched = num_matched;
747	best_match_keys = numkeys;
748	}
749	}
750
751	return best_match;
752	}
753
754	/*
755	* statext_is_compatible_clause_internal
756	* Determines if the clause is compatible with MCV lists.
757	*
758	* Does the heavy lifting of actually inspecting the clauses for
759	* statext_is_compatible_clause. It needs to be split like this because
760	* of recursion. The attnums bitmap is an input/output parameter collecting
761	* attribute numbers from all compatible clauses (recursively).
762	*/
763	static bool
764	statext_is_compatible_clause_internal(PlannerInfo root, Node clause,
765	Index relid, Bitmapset **attnums)
766	{
767	/ Look inside any binary-compatible relabeling (as in examine_variable) /
768	if (IsA(clause, RelabelType))
769	clause = (Node ) ((RelabelType ) clause)->arg;
770
771	/ plain Var references (boolean Vars or recursive checks) /
772	if (IsA(clause, Var))
773	{
774	Var var = (Var ) clause;
775
776	/ Ensure var is from the correct relation /
777	if (var->varno != relid)
778	return false;
779
780	/ we also better ensure the Var is from the current level /
781	if (var->varlevelsup > `0`)
782	return false;
783
784	/ Also skip system attributes (we don't allow stats on those). /
785	if (!AttrNumberIsForUserDefinedAttr(var->varattno))
786	return false;
787
788	attnums = bms_add_member(attnums, var->varattno);
789
790	return true;
791	}
792
793	/ (Var op Const) or (Const op Var) /
794	if (is_opclause(clause))
795	{
796	RangeTblEntry *rte = root->simple_rte_array[relid];
797	OpExpr expr = (OpExpr ) clause;
798	Var *var;
799
800	/ Only expressions with two arguments are considered compatible. /
801	if (list_length(expr->args) != `2`)
802	return false;
803
804	/ Check if the expression the right shape (one Var, one Const) /
805	if (!examine_opclause_expression(expr, &var, NULL, NULL))
806	return false;
807
808	/*
809	* If it's not one of the supported operators ("=", "<", ">", etc.),
810	* just ignore the clause, as it's not compatible with MCV lists.
811	*
812	* This uses the function for estimating selectivity, not the operator
813	* directly (a bit awkward, but well ...).
814	*/
815	switch (get_oprrest(expr->opno))
816	{
817	case F_EQSEL:
818	case F_NEQSEL:
819	case F_SCALARLTSEL:
820	case F_SCALARLESEL:
821	case F_SCALARGTSEL:
822	case F_SCALARGESEL:
823	/ supported, will continue with inspection of the Var /
824	break;
825
826	default:
827	/ other estimators are considered unknown/unsupported /
828	return false;
829	}
830
831	/*
832	* If there are any securityQuals on the RTE from security barrier
833	* views or RLS policies, then the user may not have access to all the
834	* table's data, and we must check that the operator is leak-proof.
835	*
836	* If the operator is leaky, then we must ignore this clause for the
837	* purposes of estimating with MCV lists, otherwise the operator might
838	* reveal values from the MCV list that the user doesn't have
839	* permission to see.
840	*/
841	if (rte->securityQuals != NIL &&
842	!get_func_leakproof(get_opcode(expr->opno)))
843	return false;
844
845	return statext_is_compatible_clause_internal(root, (Node *) var,
846	relid, attnums);
847	}
848
849	/ AND/OR/NOT clause /
850	if (is_andclause(clause) \|\|
851	is_orclause(clause) \|\|
852	is_notclause(clause))
853	{
854	/*
855	* AND/OR/NOT-clauses are supported if all sub-clauses are supported
856	*
857	* Perhaps we could improve this by handling mixed cases, when some of
858	* the clauses are supported and some are not. Selectivity for the
859	* supported subclauses would be computed using extended statistics,
860	* and the remaining clauses would be estimated using the traditional
861	* algorithm (product of selectivities).
862	*
863	* It however seems overly complex, and in a way we already do that
864	* because if we reject the whole clause as unsupported here, it will
865	* be eventually passed to clauselist_selectivity() which does exactly
866	* this (split into supported/unsupported clauses etc).
867	*/
868	BoolExpr expr = (BoolExpr ) clause;
869	ListCell *lc;
870
871	foreach(lc, expr->args)
872	{
873	/*
874	* Had we found incompatible clause in the arguments, treat the
875	* whole clause as incompatible.
876	*/
877	if (!statext_is_compatible_clause_internal(root,
878	(Node *) lfirst(lc),
879	relid, attnums))
880	return false;
881	}
882
883	return true;
884	}
885
886	/ Var IS NULL /
887	if (IsA(clause, NullTest))
888	{
889	NullTest nt = (NullTest ) clause;
890
891	/*
892	* Only simple (Var IS NULL) expressions supported for now. Maybe we
893	* could use examine_variable to fix this?
894	*/
895	if (!IsA(nt->arg, Var))
896	return false;
897
898	return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
899	relid, attnums);
900	}
901
902	return false;
903	}
904
905	/*
906	* statext_is_compatible_clause
907	* Determines if the clause is compatible with MCV lists.
908	*
909	* Currently, we only support three types of clauses:
910	*
911	* (a) OpExprs of the form (Var op Const), or (Const op Var), where the op
912	* is one of ("=", "<", ">", ">=", "<=")
913	*
914	* (b) (Var IS [NOT] NULL)
915	*
916	* (c) combinations using AND/OR/NOT
917	*
918	* In the future, the range of supported clauses may be expanded to more
919	* complex cases, for example (Var op Var).
920	*/
921	static bool
922	statext_is_compatible_clause(PlannerInfo root, Node clause, Index relid,
923	Bitmapset **attnums)
924	{
925	RangeTblEntry *rte = root->simple_rte_array[relid];
926	RestrictInfo rinfo = (RestrictInfo ) clause;
927	Oid userid;
928
929	if (!IsA(rinfo, RestrictInfo))
930	return false;
931
932	/ Pseudoconstants are not really interesting here. /
933	if (rinfo->pseudoconstant)
934	return false;
935
936	/ clauses referencing multiple varnos are incompatible /
937	if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
938	return false;
939
940	/ Check the clause and determine what attributes it references. /
941	if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause,
942	relid, attnums))
943	return false;
944
945	/*
946	* Check that the user has permission to read all these attributes. Use
947	* checkAsUser if it's set, in case we're accessing the table via a view.
948	*/
949	userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
950
951	if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK)
952	{
953	/ Don't have table privilege, must check individual columns /
954	if (bms_is_member(InvalidAttrNumber, *attnums))
955	{
956	/ Have a whole-row reference, must have access to all columns /
957	if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT,
958	ACLMASK_ALL) != ACLCHECK_OK)
959	return false;
960	}
961	else
962	{
963	/ Check the columns referenced by the clause /
964	int attnum = -`1`;
965
966	while ((attnum = bms_next_member(*attnums, attnum)) >= `0`)
967	{
968	if (pg_attribute_aclcheck(rte->relid, attnum, userid,
969	ACL_SELECT) != ACLCHECK_OK)
970	return false;
971	}
972	}
973	}
974
975	/ If we reach here, the clause is OK /
976	return true;
977	}
978
979	/*
980	* statext_mcv_clauselist_selectivity
981	* Estimate clauses using the best multi-column statistics.
982	*
983	* Selects the best extended (multi-column) statistic on a table (measured by
984	* the number of attributes extracted from the clauses and covered by it), and
985	* computes the selectivity for the supplied clauses.
986	*
987	* One of the main challenges with using MCV lists is how to extrapolate the
988	* estimate to the data not covered by the MCV list. To do that, we compute
989	* not only the "MCV selectivity" (selectivities for MCV items matching the
990	* supplied clauses), but also a couple of derived selectivities:
991	*
992	* - simple selectivity: Computed without extended statistic, i.e. as if the
993	* columns/clauses were independent
994	*
995	* - base selectivity: Similar to simple selectivity, but is computed using
996	* the extended statistic by adding up the base frequencies (that we compute
997	* and store for each MCV item) of matching MCV items.
998	*
999	* - total selectivity: Selectivity covered by the whole MCV list.
1000	*
1001	* - other selectivity: A selectivity estimate for data not covered by the MCV
1002	* list (i.e. satisfying the clauses, but not common enough to make it into
1003	* the MCV list)
1004	*
1005	* Note: While simple and base selectivities are defined in a quite similar
1006	* way, the values are computed differently and are not therefore equal. The
1007	* simple selectivity is computed as a product of per-clause estimates, while
1008	* the base selectivity is computed by adding up base frequencies of matching
1009	* items of the multi-column MCV list. So the values may differ for two main
1010	* reasons - (a) the MCV list may not cover 100% of the data and (b) some of
1011	* the MCV items did not match the estimated clauses.
1012	*
1013	* As both (a) and (b) reduce the base selectivity value, it generally holds
1014	* that (simple_selectivity >= base_selectivity). If the MCV list covers all
1015	* the data, the values may be equal.
1016	*
1017	* So, (simple_selectivity - base_selectivity) is an estimate for the part
1018	* not covered by the MCV list, and (mcv_selectivity - base_selectivity) may
1019	* be seen as a correction for the part covered by the MCV list. Those two
1020	* statements are actually equivalent.
1021	*
1022	* Note: Due to rounding errors and minor differences in how the estimates
1023	* are computed, the inequality may not always hold. Which is why we clamp
1024	* the selectivities to prevent strange estimate (negative etc.).
1025	*
1026	* 'estimatedclauses' is an input/output parameter. We set bits for the
1027	* 0-based 'clauses' indexes we estimate for and also skip clause items that
1028	* already have a bit set.
1029	*
1030	* XXX If we were to use multiple statistics, this is where it would happen.
1031	* We would simply repeat this on a loop on the "remaining" clauses, possibly
1032	* using the already estimated clauses as conditions (and combining the values
1033	* using conditional probability formula).
1034	*/
1035	static Selectivity
1036	statext_mcv_clauselist_selectivity(PlannerInfo root, List clauses, int varRelid,
1037	JoinType jointype, SpecialJoinInfo *sjinfo,
1038	RelOptInfo rel, Bitmapset *estimatedclauses)
1039	{
1040	ListCell *l;
1041	Bitmapset *clauses_attnums = NULL;
1042	Bitmapset **list_attnums;
1043	int listidx;
1044	StatisticExtInfo *stat;
1045	List *stat_clauses;
1046	Selectivity simple_sel,
1047	mcv_sel,
1048	mcv_basesel,
1049	mcv_totalsel,
1050	other_sel,
1051	sel;
1052
1053	/ check if there's any stats that might be useful for us. /
1054	if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
1055	return `1.0`;
1056
1057	list_attnums = (Bitmapset ) palloc(sizeof*(Bitmapset ) *
1058	list_length(clauses));
1059
1060	/*
1061	* Pre-process the clauses list to extract the attnums seen in each item.
1062	* We need to determine if there's any clauses which will be useful for
1063	* selectivity estimations with extended stats. Along the way we'll record
1064	* all of the attnums for each clause in a list which we'll reference
1065	* later so we don't need to repeat the same work again. We'll also keep
1066	* track of all attnums seen.
1067	*
1068	* We also skip clauses that we already estimated using different types of
1069	* statistics (we treat them as incompatible).
1070	*/
1071	listidx = `0`;
1072	foreach(l, clauses)
1073	{
1074	Node clause = (Node ) lfirst(l);
1075	Bitmapset *attnums = NULL;
1076
1077	if (!bms_is_member(listidx, *estimatedclauses) &&
1078	statext_is_compatible_clause(root, clause, rel->relid, &attnums))
1079	{
1080	list_attnums[listidx] = attnums;
1081	clauses_attnums = bms_add_members(clauses_attnums, attnums);
1082	}
1083	else
1084	list_attnums[listidx] = NULL;
1085
1086	listidx++;
1087	}
1088
1089	/ We need at least two attributes for multivariate statistics. /
1090	if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
1091	return `1.0`;
1092
1093	/ find the best suited statistics object for these attnums /
1094	stat = choose_best_statistics(rel->statlist, clauses_attnums, STATS_EXT_MCV);
1095
1096	/ if no matching stats could be found then we've nothing to do /
1097	if (!stat)
1098	return `1.0`;
1099
1100	/ Ensure choose_best_statistics produced an expected stats type. /
1101	Assert(stat->kind == STATS_EXT_MCV);
1102
1103	/ now filter the clauses to be estimated using the selected MCV /
1104	stat_clauses = NIL;
1105
1106	listidx = `0`;
1107	foreach(l, clauses)
1108	{
1109	/*
1110	* If the clause is compatible with the selected statistics, mark it
1111	* as estimated and add it to the list to estimate.
1112	*/
1113	if (list_attnums[listidx] != NULL &&
1114	bms_is_subset(list_attnums[listidx], stat->keys))
1115	{
1116	stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
1117	estimatedclauses = bms_add_member(estimatedclauses, listidx);
1118	}
1119
1120	listidx++;
1121	}
1122
1123	/*
1124	* First compute "simple" selectivity, i.e. without the extended
1125	* statistics, and essentially assuming independence of the
1126	* columns/clauses. We'll then use the various selectivities computed from
1127	* MCV list to improve it.
1128	*/
1129	simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
1130	jointype, sjinfo, NULL);
1131
1132	/*
1133	* Now compute the multi-column estimate from the MCV list, along with the
1134	* other selectivities (base & total selectivity).
1135	*/
1136	mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
1137	jointype, sjinfo, rel,
1138	&mcv_basesel, &mcv_totalsel);
1139
1140	/ Estimated selectivity of values not covered by MCV matches /
1141	other_sel = simple_sel - mcv_basesel;
1142	CLAMP_PROBABILITY(other_sel);
1143
1144	/ The non-MCV selectivity can't exceed the 1 - mcv_totalsel. /
1145	if (other_sel > `1.0` - mcv_totalsel)
1146	other_sel = `1.0` - mcv_totalsel;
1147
1148	/ Overall selectivity is the combination of MCV and non-MCV estimates. /
1149	sel = mcv_sel + other_sel;
1150	CLAMP_PROBABILITY(sel);
1151
1152	return sel;
1153	}
1154
1155	/*
1156	* statext_clauselist_selectivity
1157	* Estimate clauses using the best multi-column statistics.
1158	*/
1159	Selectivity
1160	statext_clauselist_selectivity(PlannerInfo root, List clauses, int varRelid,
1161	JoinType jointype, SpecialJoinInfo *sjinfo,
1162	RelOptInfo rel, Bitmapset *estimatedclauses)
1163	{
1164	Selectivity sel;
1165
1166	/ First, try estimating clauses using a multivariate MCV list. /
1167	sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype,
1168	sjinfo, rel, estimatedclauses);
1169
1170	/*
1171	* Then, apply functional dependencies on the remaining clauses by calling
1172	* dependencies_clauselist_selectivity. Pass 'estimatedclauses' so the
1173	* function can properly skip clauses already estimated above.
1174	*
1175	* The reasoning for applying dependencies last is that the more complex
1176	* stats can track more complex correlations between the attributes, and
1177	* so may be considered more reliable.
1178	*
1179	* For example, MCV list can give us an exact selectivity for values in
1180	* two columns, while functional dependencies can only provide information
1181	* about the overall strength of the dependency.
1182	*/
1183	sel *= dependencies_clauselist_selectivity(root, clauses, varRelid,
1184	jointype, sjinfo, rel,
1185	estimatedclauses);
1186
1187	return sel;
1188	}
1189
1190	/*
1191	* examine_operator_expression
1192	* Split expression into Var and Const parts.
1193	*
1194	* Attempts to match the arguments to either (Var op Const) or (Const op Var),
1195	* possibly with a RelabelType on top. When the expression matches this form,
1196	* returns true, otherwise returns false.
1197	*
1198	* Optionally returns pointers to the extracted Var/Const nodes, when passed
1199	* non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies
1200	* on which side of the operator we found the Var node.
1201	*/
1202	bool
1203	examine_opclause_expression(OpExpr expr, Var varp, Const cstp, bool varonleftp)
1204	{
1205	Var *var;
1206	Const *cst;
1207	bool varonleft;
1208	Node *leftop,
1209	*rightop;
1210
1211	/ enforced by statext_is_compatible_clause_internal /
1212	Assert(list_length(expr->args) == `2`);
1213
1214	leftop = linitial(expr->args);
1215	rightop = lsecond(expr->args);
1216
1217	/ strip RelabelType from either side of the expression /
1218	if (IsA(leftop, RelabelType))
1219	leftop = (Node ) ((RelabelType ) leftop)->arg;
1220
1221	if (IsA(rightop, RelabelType))
1222	rightop = (Node ) ((RelabelType ) rightop)->arg;
1223
1224	if (IsA(leftop, Var) && IsA(rightop, Const))
1225	{
1226	var = (Var *) leftop;
1227	cst = (Const *) rightop;
1228	varonleft = true;
1229	}
1230	else if (IsA(leftop, Const) && IsA(rightop, Var))
1231	{
1232	var = (Var *) rightop;
1233	cst = (Const *) leftop;
1234	varonleft = false;
1235	}
1236	else
1237	return false;
1238
1239	/ return pointers to the extracted parts if requested /
1240	if (varp)
1241	*varp = var;
1242
1243	if (cstp)
1244	*cstp = cst;
1245
1246	if (varonleftp)
1247	*varonleftp = varonleft;
1248
1249	return true;
1250	}
1251

Browse the source code of PostgreSQL/src/backend/statistics/extended_stats.c