1/*-------------------------------------------------------------------------
2 *
3 * extended_stats.c
4 * POSTGRES extended statistics
5 *
6 * Generic code supporting statistics objects created via CREATE STATISTICS.
7 *
8 *
9 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
11 *
12 * IDENTIFICATION
13 * src/backend/statistics/extended_stats.c
14 *
15 *-------------------------------------------------------------------------
16 */
17#include "postgres.h"
18
19#include "access/genam.h"
20#include "access/htup_details.h"
21#include "access/table.h"
22#include "access/tuptoaster.h"
23#include "catalog/indexing.h"
24#include "catalog/pg_collation.h"
25#include "catalog/pg_statistic_ext.h"
26#include "catalog/pg_statistic_ext_data.h"
27#include "miscadmin.h"
28#include "nodes/nodeFuncs.h"
29#include "optimizer/clauses.h"
30#include "optimizer/optimizer.h"
31#include "postmaster/autovacuum.h"
32#include "statistics/extended_stats_internal.h"
33#include "statistics/statistics.h"
34#include "utils/builtins.h"
35#include "utils/fmgroids.h"
36#include "utils/lsyscache.h"
37#include "utils/memutils.h"
38#include "utils/rel.h"
39#include "utils/selfuncs.h"
40#include "utils/syscache.h"
41
42/*
43 * To avoid consuming too much memory during analysis and/or too much space
44 * in the resulting pg_statistic rows, we ignore varlena datums that are wider
45 * than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV
46 * and distinct-value calculations since a wide value is unlikely to be
47 * duplicated at all, much less be a most-common value. For the same reason,
48 * ignoring wide values will not affect our estimates of histogram bin
49 * boundaries very much.
50 */
51#define WIDTH_THRESHOLD 1024
52
53/*
54 * Used internally to refer to an individual statistics object, i.e.,
55 * a pg_statistic_ext entry.
56 */
57typedef struct StatExtEntry
58{
59 Oid statOid; /* OID of pg_statistic_ext entry */
60 char *schema; /* statistics object's schema */
61 char *name; /* statistics object's name */
62 Bitmapset *columns; /* attribute numbers covered by the object */
63 List *types; /* 'char' list of enabled statistic kinds */
64} StatExtEntry;
65
66
67static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
68static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
69 int nvacatts, VacAttrStats **vacatts);
70static void statext_store(Oid relid,
71 MVNDistinct *ndistinct, MVDependencies *dependencies,
72 MCVList *mcv, VacAttrStats **stats);
73
74
75/*
76 * Compute requested extended stats, using the rows sampled for the plain
77 * (single-column) stats.
78 *
79 * This fetches a list of stats types from pg_statistic_ext, computes the
80 * requested stats, and serializes them back into the catalog.
81 */
82void
83BuildRelationExtStatistics(Relation onerel, double totalrows,
84 int numrows, HeapTuple *rows,
85 int natts, VacAttrStats **vacattrstats)
86{
87 Relation pg_stext;
88 ListCell *lc;
89 List *stats;
90 MemoryContext cxt;
91 MemoryContext oldcxt;
92
93 cxt = AllocSetContextCreate(CurrentMemoryContext,
94 "BuildRelationExtStatistics",
95 ALLOCSET_DEFAULT_SIZES);
96 oldcxt = MemoryContextSwitchTo(cxt);
97
98 pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
99 stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
100
101 foreach(lc, stats)
102 {
103 StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
104 MVNDistinct *ndistinct = NULL;
105 MVDependencies *dependencies = NULL;
106 MCVList *mcv = NULL;
107 VacAttrStats **stats;
108 ListCell *lc2;
109
110 /*
111 * Check if we can build these stats based on the column analyzed. If
112 * not, report this fact (except in autovacuum) and move on.
113 */
114 stats = lookup_var_attr_stats(onerel, stat->columns,
115 natts, vacattrstats);
116 if (!stats)
117 {
118 if (!IsAutoVacuumWorkerProcess())
119 ereport(WARNING,
120 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
121 errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"",
122 stat->schema, stat->name,
123 get_namespace_name(onerel->rd_rel->relnamespace),
124 RelationGetRelationName(onerel)),
125 errtable(onerel)));
126 continue;
127 }
128
129 /* check allowed number of dimensions */
130 Assert(bms_num_members(stat->columns) >= 2 &&
131 bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
132
133 /* compute statistic of each requested type */
134 foreach(lc2, stat->types)
135 {
136 char t = (char) lfirst_int(lc2);
137
138 if (t == STATS_EXT_NDISTINCT)
139 ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
140 stat->columns, stats);
141 else if (t == STATS_EXT_DEPENDENCIES)
142 dependencies = statext_dependencies_build(numrows, rows,
143 stat->columns, stats);
144 else if (t == STATS_EXT_MCV)
145 mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
146 totalrows);
147 }
148
149 /* store the statistics in the catalog */
150 statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
151 }
152
153 table_close(pg_stext, RowExclusiveLock);
154
155 MemoryContextSwitchTo(oldcxt);
156 MemoryContextDelete(cxt);
157}
158
159/*
160 * statext_is_kind_built
161 * Is this stat kind built in the given pg_statistic_ext_data tuple?
162 */
163bool
164statext_is_kind_built(HeapTuple htup, char type)
165{
166 AttrNumber attnum;
167
168 switch (type)
169 {
170 case STATS_EXT_NDISTINCT:
171 attnum = Anum_pg_statistic_ext_data_stxdndistinct;
172 break;
173
174 case STATS_EXT_DEPENDENCIES:
175 attnum = Anum_pg_statistic_ext_data_stxddependencies;
176 break;
177
178 case STATS_EXT_MCV:
179 attnum = Anum_pg_statistic_ext_data_stxdmcv;
180 break;
181
182 default:
183 elog(ERROR, "unexpected statistics type requested: %d", type);
184 }
185
186 return !heap_attisnull(htup, attnum, NULL);
187}
188
189/*
190 * Return a list (of StatExtEntry) of statistics objects for the given relation.
191 */
192static List *
193fetch_statentries_for_relation(Relation pg_statext, Oid relid)
194{
195 SysScanDesc scan;
196 ScanKeyData skey;
197 HeapTuple htup;
198 List *result = NIL;
199
200 /*
201 * Prepare to scan pg_statistic_ext for entries having stxrelid = this
202 * rel.
203 */
204 ScanKeyInit(&skey,
205 Anum_pg_statistic_ext_stxrelid,
206 BTEqualStrategyNumber, F_OIDEQ,
207 ObjectIdGetDatum(relid));
208
209 scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true,
210 NULL, 1, &skey);
211
212 while (HeapTupleIsValid(htup = systable_getnext(scan)))
213 {
214 StatExtEntry *entry;
215 Datum datum;
216 bool isnull;
217 int i;
218 ArrayType *arr;
219 char *enabled;
220 Form_pg_statistic_ext staForm;
221
222 entry = palloc0(sizeof(StatExtEntry));
223 staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
224 entry->statOid = staForm->oid;
225 entry->schema = get_namespace_name(staForm->stxnamespace);
226 entry->name = pstrdup(NameStr(staForm->stxname));
227 for (i = 0; i < staForm->stxkeys.dim1; i++)
228 {
229 entry->columns = bms_add_member(entry->columns,
230 staForm->stxkeys.values[i]);
231 }
232
233 /* decode the stxkind char array into a list of chars */
234 datum = SysCacheGetAttr(STATEXTOID, htup,
235 Anum_pg_statistic_ext_stxkind, &isnull);
236 Assert(!isnull);
237 arr = DatumGetArrayTypeP(datum);
238 if (ARR_NDIM(arr) != 1 ||
239 ARR_HASNULL(arr) ||
240 ARR_ELEMTYPE(arr) != CHAROID)
241 elog(ERROR, "stxkind is not a 1-D char array");
242 enabled = (char *) ARR_DATA_PTR(arr);
243 for (i = 0; i < ARR_DIMS(arr)[0]; i++)
244 {
245 Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
246 (enabled[i] == STATS_EXT_DEPENDENCIES) ||
247 (enabled[i] == STATS_EXT_MCV));
248 entry->types = lappend_int(entry->types, (int) enabled[i]);
249 }
250
251 result = lappend(result, entry);
252 }
253
254 systable_endscan(scan);
255
256 return result;
257}
258
259/*
260 * Using 'vacatts' of size 'nvacatts' as input data, return a newly built
261 * VacAttrStats array which includes only the items corresponding to
262 * attributes indicated by 'stxkeys'. If we don't have all of the per column
263 * stats available to compute the extended stats, then we return NULL to indicate
264 * to the caller that the stats should not be built.
265 */
266static VacAttrStats **
267lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
268 int nvacatts, VacAttrStats **vacatts)
269{
270 int i = 0;
271 int x = -1;
272 VacAttrStats **stats;
273
274 stats = (VacAttrStats **)
275 palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
276
277 /* lookup VacAttrStats info for the requested columns (same attnum) */
278 while ((x = bms_next_member(attrs, x)) >= 0)
279 {
280 int j;
281
282 stats[i] = NULL;
283 for (j = 0; j < nvacatts; j++)
284 {
285 if (x == vacatts[j]->tupattnum)
286 {
287 stats[i] = vacatts[j];
288 break;
289 }
290 }
291
292 if (!stats[i])
293 {
294 /*
295 * Looks like stats were not gathered for one of the columns
296 * required. We'll be unable to build the extended stats without
297 * this column.
298 */
299 pfree(stats);
300 return NULL;
301 }
302
303 /*
304 * Sanity check that the column is not dropped - stats should have
305 * been removed in this case.
306 */
307 Assert(!stats[i]->attr->attisdropped);
308
309 i++;
310 }
311
312 return stats;
313}
314
315/*
316 * statext_store
317 * Serializes the statistics and stores them into the pg_statistic_ext_data
318 * tuple.
319 */
320static void
321statext_store(Oid statOid,
322 MVNDistinct *ndistinct, MVDependencies *dependencies,
323 MCVList *mcv, VacAttrStats **stats)
324{
325 HeapTuple stup,
326 oldtup;
327 Datum values[Natts_pg_statistic_ext_data];
328 bool nulls[Natts_pg_statistic_ext_data];
329 bool replaces[Natts_pg_statistic_ext_data];
330 Relation pg_stextdata;
331
332 memset(nulls, true, sizeof(nulls));
333 memset(replaces, false, sizeof(replaces));
334 memset(values, 0, sizeof(values));
335
336 /*
337 * Construct a new pg_statistic_ext_data tuple, replacing the calculated
338 * stats.
339 */
340 if (ndistinct != NULL)
341 {
342 bytea *data = statext_ndistinct_serialize(ndistinct);
343
344 nulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = (data == NULL);
345 values[Anum_pg_statistic_ext_data_stxdndistinct - 1] = PointerGetDatum(data);
346 }
347
348 if (dependencies != NULL)
349 {
350 bytea *data = statext_dependencies_serialize(dependencies);
351
352 nulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = (data == NULL);
353 values[Anum_pg_statistic_ext_data_stxddependencies - 1] = PointerGetDatum(data);
354 }
355 if (mcv != NULL)
356 {
357 bytea *data = statext_mcv_serialize(mcv, stats);
358
359 nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL);
360 values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data);
361 }
362
363 /* always replace the value (either by bytea or NULL) */
364 replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
365 replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
366 replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
367
368 /* there should already be a pg_statistic_ext_data tuple */
369 oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
370 if (!HeapTupleIsValid(oldtup))
371 elog(ERROR, "cache lookup failed for statistics object %u", statOid);
372
373 /* replace it */
374 pg_stextdata = table_open(StatisticExtDataRelationId, RowExclusiveLock);
375
376 stup = heap_modify_tuple(oldtup,
377 RelationGetDescr(pg_stextdata),
378 values,
379 nulls,
380 replaces);
381 ReleaseSysCache(oldtup);
382 CatalogTupleUpdate(pg_stextdata, &stup->t_self, stup);
383
384 heap_freetuple(stup);
385
386 table_close(pg_stextdata, RowExclusiveLock);
387}
388
389/* initialize multi-dimensional sort */
390MultiSortSupport
391multi_sort_init(int ndims)
392{
393 MultiSortSupport mss;
394
395 Assert(ndims >= 2);
396
397 mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup)
398 + sizeof(SortSupportData) * ndims);
399
400 mss->ndims = ndims;
401
402 return mss;
403}
404
405/*
406 * Prepare sort support info using the given sort operator and collation
407 * at the position 'sortdim'
408 */
409void
410multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
411 Oid oper, Oid collation)
412{
413 SortSupport ssup = &mss->ssup[sortdim];
414
415 ssup->ssup_cxt = CurrentMemoryContext;
416 ssup->ssup_collation = collation;
417 ssup->ssup_nulls_first = false;
418
419 PrepareSortSupportFromOrderingOp(oper, ssup);
420}
421
422/* compare all the dimensions in the selected order */
423int
424multi_sort_compare(const void *a, const void *b, void *arg)
425{
426 MultiSortSupport mss = (MultiSortSupport) arg;
427 SortItem *ia = (SortItem *) a;
428 SortItem *ib = (SortItem *) b;
429 int i;
430
431 for (i = 0; i < mss->ndims; i++)
432 {
433 int compare;
434
435 compare = ApplySortComparator(ia->values[i], ia->isnull[i],
436 ib->values[i], ib->isnull[i],
437 &mss->ssup[i]);
438
439 if (compare != 0)
440 return compare;
441 }
442
443 /* equal by default */
444 return 0;
445}
446
447/* compare selected dimension */
448int
449multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
450 MultiSortSupport mss)
451{
452 return ApplySortComparator(a->values[dim], a->isnull[dim],
453 b->values[dim], b->isnull[dim],
454 &mss->ssup[dim]);
455}
456
457int
458multi_sort_compare_dims(int start, int end,
459 const SortItem *a, const SortItem *b,
460 MultiSortSupport mss)
461{
462 int dim;
463
464 for (dim = start; dim <= end; dim++)
465 {
466 int r = ApplySortComparator(a->values[dim], a->isnull[dim],
467 b->values[dim], b->isnull[dim],
468 &mss->ssup[dim]);
469
470 if (r != 0)
471 return r;
472 }
473
474 return 0;
475}
476
477int
478compare_scalars_simple(const void *a, const void *b, void *arg)
479{
480 return compare_datums_simple(*(Datum *) a,
481 *(Datum *) b,
482 (SortSupport) arg);
483}
484
485int
486compare_datums_simple(Datum a, Datum b, SortSupport ssup)
487{
488 return ApplySortComparator(a, false, b, false, ssup);
489}
490
491/* simple counterpart to qsort_arg */
492void *
493bsearch_arg(const void *key, const void *base, size_t nmemb, size_t size,
494 int (*compar) (const void *, const void *, void *),
495 void *arg)
496{
497 size_t l,
498 u,
499 idx;
500 const void *p;
501 int comparison;
502
503 l = 0;
504 u = nmemb;
505 while (l < u)
506 {
507 idx = (l + u) / 2;
508 p = (void *) (((const char *) base) + (idx * size));
509 comparison = (*compar) (key, p, arg);
510
511 if (comparison < 0)
512 u = idx;
513 else if (comparison > 0)
514 l = idx + 1;
515 else
516 return (void *) p;
517 }
518
519 return NULL;
520}
521
522/*
523 * build_attnums_array
524 * Transforms a bitmap into an array of AttrNumber values.
525 *
526 * This is used for extended statistics only, so all the attribute must be
527 * user-defined. That means offsetting by FirstLowInvalidHeapAttributeNumber
528 * is not necessary here (and when querying the bitmap).
529 */
530AttrNumber *
531build_attnums_array(Bitmapset *attrs, int *numattrs)
532{
533 int i,
534 j;
535 AttrNumber *attnums;
536 int num = bms_num_members(attrs);
537
538 if (numattrs)
539 *numattrs = num;
540
541 /* build attnums from the bitmapset */
542 attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * num);
543 i = 0;
544 j = -1;
545 while ((j = bms_next_member(attrs, j)) >= 0)
546 {
547 /*
548 * Make sure the bitmap contains only user-defined attributes. As
549 * bitmaps can't contain negative values, this can be violated in two
550 * ways. Firstly, the bitmap might contain 0 as a member, and secondly
551 * the integer value might be larger than MaxAttrNumber.
552 */
553 Assert(AttrNumberIsForUserDefinedAttr(j));
554 Assert(j <= MaxAttrNumber);
555
556 attnums[i++] = (AttrNumber) j;
557
558 /* protect against overflows */
559 Assert(i <= num);
560 }
561
562 return attnums;
563}
564
565/*
566 * build_sorted_items
567 * build a sorted array of SortItem with values from rows
568 *
569 * Note: All the memory is allocated in a single chunk, so that the caller
570 * can simply pfree the return value to release all of it.
571 */
572SortItem *
573build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
574 MultiSortSupport mss, int numattrs, AttrNumber *attnums)
575{
576 int i,
577 j,
578 len,
579 idx;
580 int nvalues = numrows * numattrs;
581
582 SortItem *items;
583 Datum *values;
584 bool *isnull;
585 char *ptr;
586
587 /* Compute the total amount of memory we need (both items and values). */
588 len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
589
590 /* Allocate the memory and split it into the pieces. */
591 ptr = palloc0(len);
592
593 /* items to sort */
594 items = (SortItem *) ptr;
595 ptr += numrows * sizeof(SortItem);
596
597 /* values and null flags */
598 values = (Datum *) ptr;
599 ptr += nvalues * sizeof(Datum);
600
601 isnull = (bool *) ptr;
602 ptr += nvalues * sizeof(bool);
603
604 /* make sure we consumed the whole buffer exactly */
605 Assert((ptr - (char *) items) == len);
606
607 /* fix the pointers to Datum and bool arrays */
608 idx = 0;
609 for (i = 0; i < numrows; i++)
610 {
611 bool toowide = false;
612
613 items[idx].values = &values[idx * numattrs];
614 items[idx].isnull = &isnull[idx * numattrs];
615
616 /* load the values/null flags from sample rows */
617 for (j = 0; j < numattrs; j++)
618 {
619 Datum value;
620 bool isnull;
621
622 value = heap_getattr(rows[i], attnums[j], tdesc, &isnull);
623
624 /*
625 * If this is a varlena value, check if it's too wide and if yes
626 * then skip the whole item. Otherwise detoast the value.
627 *
628 * XXX It may happen that we've already detoasted some preceding
629 * values for the current item. We don't bother to cleanup those
630 * on the assumption that those are small (below WIDTH_THRESHOLD)
631 * and will be discarded at the end of analyze.
632 */
633 if ((!isnull) &&
634 (TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1))
635 {
636 if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
637 {
638 toowide = true;
639 break;
640 }
641
642 value = PointerGetDatum(PG_DETOAST_DATUM(value));
643 }
644
645 items[idx].values[j] = value;
646 items[idx].isnull[j] = isnull;
647 }
648
649 if (toowide)
650 continue;
651
652 idx++;
653 }
654
655 /* store the actual number of items (ignoring the too-wide ones) */
656 *nitems = idx;
657
658 /* all items were too wide */
659 if (idx == 0)
660 {
661 /* everything is allocated as a single chunk */
662 pfree(items);
663 return NULL;
664 }
665
666 /* do the sort, using the multi-sort */
667 qsort_arg((void *) items, idx, sizeof(SortItem),
668 multi_sort_compare, mss);
669
670 return items;
671}
672
673/*
674 * has_stats_of_kind
675 * Check whether the list contains statistic of a given kind
676 */
677bool
678has_stats_of_kind(List *stats, char requiredkind)
679{
680 ListCell *l;
681
682 foreach(l, stats)
683 {
684 StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
685
686 if (stat->kind == requiredkind)
687 return true;
688 }
689
690 return false;
691}
692
693/*
694 * choose_best_statistics
695 * Look for and return statistics with the specified 'requiredkind' which
696 * have keys that match at least two of the given attnums. Return NULL if
697 * there's no match.
698 *
699 * The current selection criteria is very simple - we choose the statistics
700 * object referencing the most of the requested attributes, breaking ties
701 * in favor of objects with fewer keys overall.
702 *
703 * XXX If multiple statistics objects tie on both criteria, then which object
704 * is chosen depends on the order that they appear in the stats list. Perhaps
705 * further tiebreakers are needed.
706 */
707StatisticExtInfo *
708choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
709{
710 ListCell *lc;
711 StatisticExtInfo *best_match = NULL;
712 int best_num_matched = 2; /* goal #1: maximize */
713 int best_match_keys = (STATS_MAX_DIMENSIONS + 1); /* goal #2: minimize */
714
715 foreach(lc, stats)
716 {
717 StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
718 int num_matched;
719 int numkeys;
720 Bitmapset *matched;
721
722 /* skip statistics that are not of the correct type */
723 if (info->kind != requiredkind)
724 continue;
725
726 /* determine how many attributes of these stats can be matched to */
727 matched = bms_intersect(attnums, info->keys);
728 num_matched = bms_num_members(matched);
729 bms_free(matched);
730
731 /*
732 * save the actual number of keys in the stats so that we can choose
733 * the narrowest stats with the most matching keys.
734 */
735 numkeys = bms_num_members(info->keys);
736
737 /*
738 * Use this object when it increases the number of matched clauses or
739 * when it matches the same number of attributes but these stats have
740 * fewer keys than any previous match.
741 */
742 if (num_matched > best_num_matched ||
743 (num_matched == best_num_matched && numkeys < best_match_keys))
744 {
745 best_match = info;
746 best_num_matched = num_matched;
747 best_match_keys = numkeys;
748 }
749 }
750
751 return best_match;
752}
753
754/*
755 * statext_is_compatible_clause_internal
756 * Determines if the clause is compatible with MCV lists.
757 *
758 * Does the heavy lifting of actually inspecting the clauses for
759 * statext_is_compatible_clause. It needs to be split like this because
760 * of recursion. The attnums bitmap is an input/output parameter collecting
761 * attribute numbers from all compatible clauses (recursively).
762 */
763static bool
764statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
765 Index relid, Bitmapset **attnums)
766{
767 /* Look inside any binary-compatible relabeling (as in examine_variable) */
768 if (IsA(clause, RelabelType))
769 clause = (Node *) ((RelabelType *) clause)->arg;
770
771 /* plain Var references (boolean Vars or recursive checks) */
772 if (IsA(clause, Var))
773 {
774 Var *var = (Var *) clause;
775
776 /* Ensure var is from the correct relation */
777 if (var->varno != relid)
778 return false;
779
780 /* we also better ensure the Var is from the current level */
781 if (var->varlevelsup > 0)
782 return false;
783
784 /* Also skip system attributes (we don't allow stats on those). */
785 if (!AttrNumberIsForUserDefinedAttr(var->varattno))
786 return false;
787
788 *attnums = bms_add_member(*attnums, var->varattno);
789
790 return true;
791 }
792
793 /* (Var op Const) or (Const op Var) */
794 if (is_opclause(clause))
795 {
796 RangeTblEntry *rte = root->simple_rte_array[relid];
797 OpExpr *expr = (OpExpr *) clause;
798 Var *var;
799
800 /* Only expressions with two arguments are considered compatible. */
801 if (list_length(expr->args) != 2)
802 return false;
803
804 /* Check if the expression the right shape (one Var, one Const) */
805 if (!examine_opclause_expression(expr, &var, NULL, NULL))
806 return false;
807
808 /*
809 * If it's not one of the supported operators ("=", "<", ">", etc.),
810 * just ignore the clause, as it's not compatible with MCV lists.
811 *
812 * This uses the function for estimating selectivity, not the operator
813 * directly (a bit awkward, but well ...).
814 */
815 switch (get_oprrest(expr->opno))
816 {
817 case F_EQSEL:
818 case F_NEQSEL:
819 case F_SCALARLTSEL:
820 case F_SCALARLESEL:
821 case F_SCALARGTSEL:
822 case F_SCALARGESEL:
823 /* supported, will continue with inspection of the Var */
824 break;
825
826 default:
827 /* other estimators are considered unknown/unsupported */
828 return false;
829 }
830
831 /*
832 * If there are any securityQuals on the RTE from security barrier
833 * views or RLS policies, then the user may not have access to all the
834 * table's data, and we must check that the operator is leak-proof.
835 *
836 * If the operator is leaky, then we must ignore this clause for the
837 * purposes of estimating with MCV lists, otherwise the operator might
838 * reveal values from the MCV list that the user doesn't have
839 * permission to see.
840 */
841 if (rte->securityQuals != NIL &&
842 !get_func_leakproof(get_opcode(expr->opno)))
843 return false;
844
845 return statext_is_compatible_clause_internal(root, (Node *) var,
846 relid, attnums);
847 }
848
849 /* AND/OR/NOT clause */
850 if (is_andclause(clause) ||
851 is_orclause(clause) ||
852 is_notclause(clause))
853 {
854 /*
855 * AND/OR/NOT-clauses are supported if all sub-clauses are supported
856 *
857 * Perhaps we could improve this by handling mixed cases, when some of
858 * the clauses are supported and some are not. Selectivity for the
859 * supported subclauses would be computed using extended statistics,
860 * and the remaining clauses would be estimated using the traditional
861 * algorithm (product of selectivities).
862 *
863 * It however seems overly complex, and in a way we already do that
864 * because if we reject the whole clause as unsupported here, it will
865 * be eventually passed to clauselist_selectivity() which does exactly
866 * this (split into supported/unsupported clauses etc).
867 */
868 BoolExpr *expr = (BoolExpr *) clause;
869 ListCell *lc;
870
871 foreach(lc, expr->args)
872 {
873 /*
874 * Had we found incompatible clause in the arguments, treat the
875 * whole clause as incompatible.
876 */
877 if (!statext_is_compatible_clause_internal(root,
878 (Node *) lfirst(lc),
879 relid, attnums))
880 return false;
881 }
882
883 return true;
884 }
885
886 /* Var IS NULL */
887 if (IsA(clause, NullTest))
888 {
889 NullTest *nt = (NullTest *) clause;
890
891 /*
892 * Only simple (Var IS NULL) expressions supported for now. Maybe we
893 * could use examine_variable to fix this?
894 */
895 if (!IsA(nt->arg, Var))
896 return false;
897
898 return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
899 relid, attnums);
900 }
901
902 return false;
903}
904
905/*
906 * statext_is_compatible_clause
907 * Determines if the clause is compatible with MCV lists.
908 *
909 * Currently, we only support three types of clauses:
910 *
911 * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op
912 * is one of ("=", "<", ">", ">=", "<=")
913 *
914 * (b) (Var IS [NOT] NULL)
915 *
916 * (c) combinations using AND/OR/NOT
917 *
918 * In the future, the range of supported clauses may be expanded to more
919 * complex cases, for example (Var op Var).
920 */
921static bool
922statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
923 Bitmapset **attnums)
924{
925 RangeTblEntry *rte = root->simple_rte_array[relid];
926 RestrictInfo *rinfo = (RestrictInfo *) clause;
927 Oid userid;
928
929 if (!IsA(rinfo, RestrictInfo))
930 return false;
931
932 /* Pseudoconstants are not really interesting here. */
933 if (rinfo->pseudoconstant)
934 return false;
935
936 /* clauses referencing multiple varnos are incompatible */
937 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
938 return false;
939
940 /* Check the clause and determine what attributes it references. */
941 if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause,
942 relid, attnums))
943 return false;
944
945 /*
946 * Check that the user has permission to read all these attributes. Use
947 * checkAsUser if it's set, in case we're accessing the table via a view.
948 */
949 userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
950
951 if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK)
952 {
953 /* Don't have table privilege, must check individual columns */
954 if (bms_is_member(InvalidAttrNumber, *attnums))
955 {
956 /* Have a whole-row reference, must have access to all columns */
957 if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT,
958 ACLMASK_ALL) != ACLCHECK_OK)
959 return false;
960 }
961 else
962 {
963 /* Check the columns referenced by the clause */
964 int attnum = -1;
965
966 while ((attnum = bms_next_member(*attnums, attnum)) >= 0)
967 {
968 if (pg_attribute_aclcheck(rte->relid, attnum, userid,
969 ACL_SELECT) != ACLCHECK_OK)
970 return false;
971 }
972 }
973 }
974
975 /* If we reach here, the clause is OK */
976 return true;
977}
978
979/*
980 * statext_mcv_clauselist_selectivity
981 * Estimate clauses using the best multi-column statistics.
982 *
983 * Selects the best extended (multi-column) statistic on a table (measured by
984 * the number of attributes extracted from the clauses and covered by it), and
985 * computes the selectivity for the supplied clauses.
986 *
987 * One of the main challenges with using MCV lists is how to extrapolate the
988 * estimate to the data not covered by the MCV list. To do that, we compute
989 * not only the "MCV selectivity" (selectivities for MCV items matching the
990 * supplied clauses), but also a couple of derived selectivities:
991 *
992 * - simple selectivity: Computed without extended statistic, i.e. as if the
993 * columns/clauses were independent
994 *
995 * - base selectivity: Similar to simple selectivity, but is computed using
996 * the extended statistic by adding up the base frequencies (that we compute
997 * and store for each MCV item) of matching MCV items.
998 *
999 * - total selectivity: Selectivity covered by the whole MCV list.
1000 *
1001 * - other selectivity: A selectivity estimate for data not covered by the MCV
1002 * list (i.e. satisfying the clauses, but not common enough to make it into
1003 * the MCV list)
1004 *
1005 * Note: While simple and base selectivities are defined in a quite similar
1006 * way, the values are computed differently and are not therefore equal. The
1007 * simple selectivity is computed as a product of per-clause estimates, while
1008 * the base selectivity is computed by adding up base frequencies of matching
1009 * items of the multi-column MCV list. So the values may differ for two main
1010 * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
1011 * the MCV items did not match the estimated clauses.
1012 *
1013 * As both (a) and (b) reduce the base selectivity value, it generally holds
1014 * that (simple_selectivity >= base_selectivity). If the MCV list covers all
1015 * the data, the values may be equal.
1016 *
1017 * So, (simple_selectivity - base_selectivity) is an estimate for the part
1018 * not covered by the MCV list, and (mcv_selectivity - base_selectivity) may
1019 * be seen as a correction for the part covered by the MCV list. Those two
1020 * statements are actually equivalent.
1021 *
1022 * Note: Due to rounding errors and minor differences in how the estimates
1023 * are computed, the inequality may not always hold. Which is why we clamp
1024 * the selectivities to prevent strange estimate (negative etc.).
1025 *
1026 * 'estimatedclauses' is an input/output parameter. We set bits for the
1027 * 0-based 'clauses' indexes we estimate for and also skip clause items that
1028 * already have a bit set.
1029 *
1030 * XXX If we were to use multiple statistics, this is where it would happen.
1031 * We would simply repeat this on a loop on the "remaining" clauses, possibly
1032 * using the already estimated clauses as conditions (and combining the values
1033 * using conditional probability formula).
1034 */
1035static Selectivity
1036statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
1037 JoinType jointype, SpecialJoinInfo *sjinfo,
1038 RelOptInfo *rel, Bitmapset **estimatedclauses)
1039{
1040 ListCell *l;
1041 Bitmapset *clauses_attnums = NULL;
1042 Bitmapset **list_attnums;
1043 int listidx;
1044 StatisticExtInfo *stat;
1045 List *stat_clauses;
1046 Selectivity simple_sel,
1047 mcv_sel,
1048 mcv_basesel,
1049 mcv_totalsel,
1050 other_sel,
1051 sel;
1052
1053 /* check if there's any stats that might be useful for us. */
1054 if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
1055 return 1.0;
1056
1057 list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
1058 list_length(clauses));
1059
1060 /*
1061 * Pre-process the clauses list to extract the attnums seen in each item.
1062 * We need to determine if there's any clauses which will be useful for
1063 * selectivity estimations with extended stats. Along the way we'll record
1064 * all of the attnums for each clause in a list which we'll reference
1065 * later so we don't need to repeat the same work again. We'll also keep
1066 * track of all attnums seen.
1067 *
1068 * We also skip clauses that we already estimated using different types of
1069 * statistics (we treat them as incompatible).
1070 */
1071 listidx = 0;
1072 foreach(l, clauses)
1073 {
1074 Node *clause = (Node *) lfirst(l);
1075 Bitmapset *attnums = NULL;
1076
1077 if (!bms_is_member(listidx, *estimatedclauses) &&
1078 statext_is_compatible_clause(root, clause, rel->relid, &attnums))
1079 {
1080 list_attnums[listidx] = attnums;
1081 clauses_attnums = bms_add_members(clauses_attnums, attnums);
1082 }
1083 else
1084 list_attnums[listidx] = NULL;
1085
1086 listidx++;
1087 }
1088
1089 /* We need at least two attributes for multivariate statistics. */
1090 if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
1091 return 1.0;
1092
1093 /* find the best suited statistics object for these attnums */
1094 stat = choose_best_statistics(rel->statlist, clauses_attnums, STATS_EXT_MCV);
1095
1096 /* if no matching stats could be found then we've nothing to do */
1097 if (!stat)
1098 return 1.0;
1099
1100 /* Ensure choose_best_statistics produced an expected stats type. */
1101 Assert(stat->kind == STATS_EXT_MCV);
1102
1103 /* now filter the clauses to be estimated using the selected MCV */
1104 stat_clauses = NIL;
1105
1106 listidx = 0;
1107 foreach(l, clauses)
1108 {
1109 /*
1110 * If the clause is compatible with the selected statistics, mark it
1111 * as estimated and add it to the list to estimate.
1112 */
1113 if (list_attnums[listidx] != NULL &&
1114 bms_is_subset(list_attnums[listidx], stat->keys))
1115 {
1116 stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
1117 *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1118 }
1119
1120 listidx++;
1121 }
1122
1123 /*
1124 * First compute "simple" selectivity, i.e. without the extended
1125 * statistics, and essentially assuming independence of the
1126 * columns/clauses. We'll then use the various selectivities computed from
1127 * MCV list to improve it.
1128 */
1129 simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
1130 jointype, sjinfo, NULL);
1131
1132 /*
1133 * Now compute the multi-column estimate from the MCV list, along with the
1134 * other selectivities (base & total selectivity).
1135 */
1136 mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
1137 jointype, sjinfo, rel,
1138 &mcv_basesel, &mcv_totalsel);
1139
1140 /* Estimated selectivity of values not covered by MCV matches */
1141 other_sel = simple_sel - mcv_basesel;
1142 CLAMP_PROBABILITY(other_sel);
1143
1144 /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */
1145 if (other_sel > 1.0 - mcv_totalsel)
1146 other_sel = 1.0 - mcv_totalsel;
1147
1148 /* Overall selectivity is the combination of MCV and non-MCV estimates. */
1149 sel = mcv_sel + other_sel;
1150 CLAMP_PROBABILITY(sel);
1151
1152 return sel;
1153}
1154
1155/*
1156 * statext_clauselist_selectivity
1157 * Estimate clauses using the best multi-column statistics.
1158 */
1159Selectivity
1160statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
1161 JoinType jointype, SpecialJoinInfo *sjinfo,
1162 RelOptInfo *rel, Bitmapset **estimatedclauses)
1163{
1164 Selectivity sel;
1165
1166 /* First, try estimating clauses using a multivariate MCV list. */
1167 sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype,
1168 sjinfo, rel, estimatedclauses);
1169
1170 /*
1171 * Then, apply functional dependencies on the remaining clauses by calling
1172 * dependencies_clauselist_selectivity. Pass 'estimatedclauses' so the
1173 * function can properly skip clauses already estimated above.
1174 *
1175 * The reasoning for applying dependencies last is that the more complex
1176 * stats can track more complex correlations between the attributes, and
1177 * so may be considered more reliable.
1178 *
1179 * For example, MCV list can give us an exact selectivity for values in
1180 * two columns, while functional dependencies can only provide information
1181 * about the overall strength of the dependency.
1182 */
1183 sel *= dependencies_clauselist_selectivity(root, clauses, varRelid,
1184 jointype, sjinfo, rel,
1185 estimatedclauses);
1186
1187 return sel;
1188}
1189
1190/*
1191 * examine_operator_expression
1192 * Split expression into Var and Const parts.
1193 *
1194 * Attempts to match the arguments to either (Var op Const) or (Const op Var),
1195 * possibly with a RelabelType on top. When the expression matches this form,
1196 * returns true, otherwise returns false.
1197 *
1198 * Optionally returns pointers to the extracted Var/Const nodes, when passed
1199 * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies
1200 * on which side of the operator we found the Var node.
1201 */
1202bool
1203examine_opclause_expression(OpExpr *expr, Var **varp, Const **cstp, bool *varonleftp)
1204{
1205 Var *var;
1206 Const *cst;
1207 bool varonleft;
1208 Node *leftop,
1209 *rightop;
1210
1211 /* enforced by statext_is_compatible_clause_internal */
1212 Assert(list_length(expr->args) == 2);
1213
1214 leftop = linitial(expr->args);
1215 rightop = lsecond(expr->args);
1216
1217 /* strip RelabelType from either side of the expression */
1218 if (IsA(leftop, RelabelType))
1219 leftop = (Node *) ((RelabelType *) leftop)->arg;
1220
1221 if (IsA(rightop, RelabelType))
1222 rightop = (Node *) ((RelabelType *) rightop)->arg;
1223
1224 if (IsA(leftop, Var) && IsA(rightop, Const))
1225 {
1226 var = (Var *) leftop;
1227 cst = (Const *) rightop;
1228 varonleft = true;
1229 }
1230 else if (IsA(leftop, Const) && IsA(rightop, Var))
1231 {
1232 var = (Var *) rightop;
1233 cst = (Const *) leftop;
1234 varonleft = false;
1235 }
1236 else
1237 return false;
1238
1239 /* return pointers to the extracted parts if requested */
1240 if (varp)
1241 *varp = var;
1242
1243 if (cstp)
1244 *cstp = cst;
1245
1246 if (varonleftp)
1247 *varonleftp = varonleft;
1248
1249 return true;
1250}
1251