1/*-------------------------------------------------------------------------
2 *
3 * dependencies.c
4 * POSTGRES functional dependencies
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/statistics/dependencies.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#include "postgres.h"
15
16#include "access/htup_details.h"
17#include "access/sysattr.h"
18#include "catalog/pg_operator.h"
19#include "catalog/pg_statistic_ext.h"
20#include "catalog/pg_statistic_ext_data.h"
21#include "lib/stringinfo.h"
22#include "nodes/nodeFuncs.h"
23#include "optimizer/clauses.h"
24#include "optimizer/optimizer.h"
25#include "nodes/nodes.h"
26#include "nodes/pathnodes.h"
27#include "statistics/extended_stats_internal.h"
28#include "statistics/statistics.h"
29#include "utils/bytea.h"
30#include "utils/fmgroids.h"
31#include "utils/fmgrprotos.h"
32#include "utils/lsyscache.h"
33#include "utils/syscache.h"
34#include "utils/typcache.h"
35
36/* size of the struct header fields (magic, type, ndeps) */
37#define SizeOfHeader (3 * sizeof(uint32))
38
39/* size of a serialized dependency (degree, natts, atts) */
40#define SizeOfItem(natts) \
41 (sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
42
43/* minimal size of a dependency (with two attributes) */
44#define MinSizeOfItem SizeOfItem(2)
45
46/* minimal size of dependencies, when all deps are minimal */
47#define MinSizeOfItems(ndeps) \
48 (SizeOfHeader + (ndeps) * MinSizeOfItem)
49
50/*
51 * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
52 * k-permutations of n elements, except that the order does not matter for the
53 * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
54 */
55typedef struct DependencyGeneratorData
56{
57 int k; /* size of the dependency */
58 int n; /* number of possible attributes */
59 int current; /* next dependency to return (index) */
60 AttrNumber ndependencies; /* number of dependencies generated */
61 AttrNumber *dependencies; /* array of pre-generated dependencies */
62} DependencyGeneratorData;
63
64typedef DependencyGeneratorData *DependencyGenerator;
65
66static void generate_dependencies_recurse(DependencyGenerator state,
67 int index, AttrNumber start, AttrNumber *current);
68static void generate_dependencies(DependencyGenerator state);
69static DependencyGenerator DependencyGenerator_init(int n, int k);
70static void DependencyGenerator_free(DependencyGenerator state);
71static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
72static double dependency_degree(int numrows, HeapTuple *rows, int k,
73 AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
74static bool dependency_is_fully_matched(MVDependency *dependency,
75 Bitmapset *attnums);
76static bool dependency_implies_attribute(MVDependency *dependency,
77 AttrNumber attnum);
78static bool dependency_is_compatible_clause(Node *clause, Index relid,
79 AttrNumber *attnum);
80static MVDependency *find_strongest_dependency(StatisticExtInfo *stats,
81 MVDependencies *dependencies,
82 Bitmapset *attnums);
83
84static void
85generate_dependencies_recurse(DependencyGenerator state, int index,
86 AttrNumber start, AttrNumber *current)
87{
88 /*
89 * The generator handles the first (k-1) elements differently from the
90 * last element.
91 */
92 if (index < (state->k - 1))
93 {
94 AttrNumber i;
95
96 /*
97 * The first (k-1) values have to be in ascending order, which we
98 * generate recursively.
99 */
100
101 for (i = start; i < state->n; i++)
102 {
103 current[index] = i;
104 generate_dependencies_recurse(state, (index + 1), (i + 1), current);
105 }
106 }
107 else
108 {
109 int i;
110
111 /*
112 * the last element is the implied value, which does not respect the
113 * ascending order. We just need to check that the value is not in the
114 * first (k-1) elements.
115 */
116
117 for (i = 0; i < state->n; i++)
118 {
119 int j;
120 bool match = false;
121
122 current[index] = i;
123
124 for (j = 0; j < index; j++)
125 {
126 if (current[j] == i)
127 {
128 match = true;
129 break;
130 }
131 }
132
133 /*
134 * If the value is not found in the first part of the dependency,
135 * we're done.
136 */
137 if (!match)
138 {
139 state->dependencies = (AttrNumber *) repalloc(state->dependencies,
140 state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
141 memcpy(&state->dependencies[(state->k * state->ndependencies)],
142 current, state->k * sizeof(AttrNumber));
143 state->ndependencies++;
144 }
145 }
146 }
147}
148
149/* generate all dependencies (k-permutations of n elements) */
150static void
151generate_dependencies(DependencyGenerator state)
152{
153 AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
154
155 generate_dependencies_recurse(state, 0, 0, current);
156
157 pfree(current);
158}
159
160/*
161 * initialize the DependencyGenerator of variations, and prebuild the variations
162 *
163 * This pre-builds all the variations. We could also generate them in
164 * DependencyGenerator_next(), but this seems simpler.
165 */
166static DependencyGenerator
167DependencyGenerator_init(int n, int k)
168{
169 DependencyGenerator state;
170
171 Assert((n >= k) && (k > 0));
172
173 /* allocate the DependencyGenerator state */
174 state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
175 state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
176
177 state->ndependencies = 0;
178 state->current = 0;
179 state->k = k;
180 state->n = n;
181
182 /* now actually pre-generate all the variations */
183 generate_dependencies(state);
184
185 return state;
186}
187
188/* free the DependencyGenerator state */
189static void
190DependencyGenerator_free(DependencyGenerator state)
191{
192 pfree(state->dependencies);
193 pfree(state);
194
195}
196
197/* generate next combination */
198static AttrNumber *
199DependencyGenerator_next(DependencyGenerator state)
200{
201 if (state->current == state->ndependencies)
202 return NULL;
203
204 return &state->dependencies[state->k * state->current++];
205}
206
207
208/*
209 * validates functional dependency on the data
210 *
211 * An actual work horse of detecting functional dependencies. Given a variation
212 * of k attributes, it checks that the first (k-1) are sufficient to determine
213 * the last one.
214 */
215static double
216dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
217 VacAttrStats **stats, Bitmapset *attrs)
218{
219 int i,
220 nitems;
221 MultiSortSupport mss;
222 SortItem *items;
223 AttrNumber *attnums;
224 AttrNumber *attnums_dep;
225 int numattrs;
226
227 /* counters valid within a group */
228 int group_size = 0;
229 int n_violations = 0;
230
231 /* total number of rows supporting (consistent with) the dependency */
232 int n_supporting_rows = 0;
233
234 /* Make sure we have at least two input attributes. */
235 Assert(k >= 2);
236
237 /* sort info for all attributes columns */
238 mss = multi_sort_init(k);
239
240 /*
241 * Transform the attrs from bitmap to an array to make accessing the i-th
242 * member easier, and then construct a filtered version with only attnums
243 * referenced by the dependency we validate.
244 */
245 attnums = build_attnums_array(attrs, &numattrs);
246
247 attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
248 for (i = 0; i < k; i++)
249 attnums_dep[i] = attnums[dependency[i]];
250
251 /*
252 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
253 *
254 * (a) sort the data lexicographically
255 *
256 * (b) split the data into groups by first (k-1) columns
257 *
258 * (c) for each group count different values in the last column
259 *
260 * We use the column data types' default sort operators and collations;
261 * perhaps at some point it'd be worth using column-specific collations?
262 */
263
264 /* prepare the sort function for the dimensions */
265 for (i = 0; i < k; i++)
266 {
267 VacAttrStats *colstat = stats[dependency[i]];
268 TypeCacheEntry *type;
269
270 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
271 if (type->lt_opr == InvalidOid) /* shouldn't happen */
272 elog(ERROR, "cache lookup failed for ordering operator for type %u",
273 colstat->attrtypid);
274
275 /* prepare the sort function for this dimension */
276 multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
277 }
278
279 /*
280 * build an array of SortItem(s) sorted using the multi-sort support
281 *
282 * XXX This relies on all stats entries pointing to the same tuple
283 * descriptor. For now that assumption holds, but it might change in the
284 * future for example if we support statistics on multiple tables.
285 */
286 items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
287 mss, k, attnums_dep);
288
289 /*
290 * Walk through the sorted array, split it into rows according to the
291 * first (k-1) columns. If there's a single value in the last column, we
292 * count the group as 'supporting' the functional dependency. Otherwise we
293 * count it as contradicting.
294 */
295
296 /* start with the first row forming a group */
297 group_size = 1;
298
299 /* loop 1 beyond the end of the array so that we count the final group */
300 for (i = 1; i <= nitems; i++)
301 {
302 /*
303 * Check if the group ended, which may be either because we processed
304 * all the items (i==nitems), or because the i-th item is not equal to
305 * the preceding one.
306 */
307 if (i == nitems ||
308 multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
309 {
310 /*
311 * If no violations were found in the group then track the rows of
312 * the group as supporting the functional dependency.
313 */
314 if (n_violations == 0)
315 n_supporting_rows += group_size;
316
317 /* Reset counters for the new group */
318 n_violations = 0;
319 group_size = 1;
320 continue;
321 }
322 /* first columns match, but the last one does not (so contradicting) */
323 else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
324 n_violations++;
325
326 group_size++;
327 }
328
329 if (items)
330 pfree(items);
331
332 pfree(mss);
333 pfree(attnums);
334 pfree(attnums_dep);
335
336 /* Compute the 'degree of validity' as (supporting/total). */
337 return (n_supporting_rows * 1.0 / numrows);
338}
339
340/*
341 * detects functional dependencies between groups of columns
342 *
343 * Generates all possible subsets of columns (variations) and computes
344 * the degree of validity for each one. For example when creating statistics
345 * on three columns (a,b,c) there are 9 possible dependencies
346 *
347 * two columns three columns
348 * ----------- -------------
349 * (a) -> b (a,b) -> c
350 * (a) -> c (a,c) -> b
351 * (b) -> a (b,c) -> a
352 * (b) -> c
353 * (c) -> a
354 * (c) -> b
355 */
356MVDependencies *
357statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
358 VacAttrStats **stats)
359{
360 int i,
361 k;
362 int numattrs;
363 AttrNumber *attnums;
364
365 /* result */
366 MVDependencies *dependencies = NULL;
367
368 /*
369 * Transform the bms into an array, to make accessing i-th member easier.
370 */
371 attnums = build_attnums_array(attrs, &numattrs);
372
373 Assert(numattrs >= 2);
374
375 /*
376 * We'll try build functional dependencies starting from the smallest ones
377 * covering just 2 columns, to the largest ones, covering all columns
378 * included in the statistics object. We start from the smallest ones
379 * because we want to be able to skip already implied ones.
380 */
381 for (k = 2; k <= numattrs; k++)
382 {
383 AttrNumber *dependency; /* array with k elements */
384
385 /* prepare a DependencyGenerator of variation */
386 DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
387
388 /* generate all possible variations of k values (out of n) */
389 while ((dependency = DependencyGenerator_next(DependencyGenerator)))
390 {
391 double degree;
392 MVDependency *d;
393
394 /* compute how valid the dependency seems */
395 degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
396
397 /*
398 * if the dependency seems entirely invalid, don't store it
399 */
400 if (degree == 0.0)
401 continue;
402
403 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
404 + k * sizeof(AttrNumber));
405
406 /* copy the dependency (and keep the indexes into stxkeys) */
407 d->degree = degree;
408 d->nattributes = k;
409 for (i = 0; i < k; i++)
410 d->attributes[i] = attnums[dependency[i]];
411
412 /* initialize the list of dependencies */
413 if (dependencies == NULL)
414 {
415 dependencies
416 = (MVDependencies *) palloc0(sizeof(MVDependencies));
417
418 dependencies->magic = STATS_DEPS_MAGIC;
419 dependencies->type = STATS_DEPS_TYPE_BASIC;
420 dependencies->ndeps = 0;
421 }
422
423 dependencies->ndeps++;
424 dependencies = (MVDependencies *) repalloc(dependencies,
425 offsetof(MVDependencies, deps)
426 + dependencies->ndeps * sizeof(MVDependency *));
427
428 dependencies->deps[dependencies->ndeps - 1] = d;
429 }
430
431 /*
432 * we're done with variations of k elements, so free the
433 * DependencyGenerator
434 */
435 DependencyGenerator_free(DependencyGenerator);
436 }
437
438 return dependencies;
439}
440
441
442/*
443 * Serialize list of dependencies into a bytea value.
444 */
445bytea *
446statext_dependencies_serialize(MVDependencies *dependencies)
447{
448 int i;
449 bytea *output;
450 char *tmp;
451 Size len;
452
453 /* we need to store ndeps, with a number of attributes for each one */
454 len = VARHDRSZ + SizeOfHeader;
455
456 /* and also include space for the actual attribute numbers and degrees */
457 for (i = 0; i < dependencies->ndeps; i++)
458 len += SizeOfItem(dependencies->deps[i]->nattributes);
459
460 output = (bytea *) palloc0(len);
461 SET_VARSIZE(output, len);
462
463 tmp = VARDATA(output);
464
465 /* Store the base struct values (magic, type, ndeps) */
466 memcpy(tmp, &dependencies->magic, sizeof(uint32));
467 tmp += sizeof(uint32);
468 memcpy(tmp, &dependencies->type, sizeof(uint32));
469 tmp += sizeof(uint32);
470 memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
471 tmp += sizeof(uint32);
472
473 /* store number of attributes and attribute numbers for each dependency */
474 for (i = 0; i < dependencies->ndeps; i++)
475 {
476 MVDependency *d = dependencies->deps[i];
477
478 memcpy(tmp, &d->degree, sizeof(double));
479 tmp += sizeof(double);
480
481 memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
482 tmp += sizeof(AttrNumber);
483
484 memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
485 tmp += sizeof(AttrNumber) * d->nattributes;
486
487 /* protect against overflow */
488 Assert(tmp <= ((char *) output + len));
489 }
490
491 /* make sure we've produced exactly the right amount of data */
492 Assert(tmp == ((char *) output + len));
493
494 return output;
495}
496
497/*
498 * Reads serialized dependencies into MVDependencies structure.
499 */
500MVDependencies *
501statext_dependencies_deserialize(bytea *data)
502{
503 int i;
504 Size min_expected_size;
505 MVDependencies *dependencies;
506 char *tmp;
507
508 if (data == NULL)
509 return NULL;
510
511 if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
512 elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
513 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
514
515 /* read the MVDependencies header */
516 dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
517
518 /* initialize pointer to the data part (skip the varlena header) */
519 tmp = VARDATA_ANY(data);
520
521 /* read the header fields and perform basic sanity checks */
522 memcpy(&dependencies->magic, tmp, sizeof(uint32));
523 tmp += sizeof(uint32);
524 memcpy(&dependencies->type, tmp, sizeof(uint32));
525 tmp += sizeof(uint32);
526 memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
527 tmp += sizeof(uint32);
528
529 if (dependencies->magic != STATS_DEPS_MAGIC)
530 elog(ERROR, "invalid dependency magic %d (expected %d)",
531 dependencies->magic, STATS_DEPS_MAGIC);
532
533 if (dependencies->type != STATS_DEPS_TYPE_BASIC)
534 elog(ERROR, "invalid dependency type %d (expected %d)",
535 dependencies->type, STATS_DEPS_TYPE_BASIC);
536
537 if (dependencies->ndeps == 0)
538 elog(ERROR, "invalid zero-length item array in MVDependencies");
539
540 /* what minimum bytea size do we expect for those parameters */
541 min_expected_size = SizeOfItem(dependencies->ndeps);
542
543 if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
544 elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
545 VARSIZE_ANY_EXHDR(data), min_expected_size);
546
547 /* allocate space for the MCV items */
548 dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
549 + (dependencies->ndeps * sizeof(MVDependency *)));
550
551 for (i = 0; i < dependencies->ndeps; i++)
552 {
553 double degree;
554 AttrNumber k;
555 MVDependency *d;
556
557 /* degree of validity */
558 memcpy(&degree, tmp, sizeof(double));
559 tmp += sizeof(double);
560
561 /* number of attributes */
562 memcpy(&k, tmp, sizeof(AttrNumber));
563 tmp += sizeof(AttrNumber);
564
565 /* is the number of attributes valid? */
566 Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
567
568 /* now that we know the number of attributes, allocate the dependency */
569 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
570 + (k * sizeof(AttrNumber)));
571
572 d->degree = degree;
573 d->nattributes = k;
574
575 /* copy attribute numbers */
576 memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
577 tmp += sizeof(AttrNumber) * d->nattributes;
578
579 dependencies->deps[i] = d;
580
581 /* still within the bytea */
582 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
583 }
584
585 /* we should have consumed the whole bytea exactly */
586 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
587
588 return dependencies;
589}
590
591/*
592 * dependency_is_fully_matched
593 * checks that a functional dependency is fully matched given clauses on
594 * attributes (assuming the clauses are suitable equality clauses)
595 */
596static bool
597dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
598{
599 int j;
600
601 /*
602 * Check that the dependency actually is fully covered by clauses. We have
603 * to translate all attribute numbers, as those are referenced
604 */
605 for (j = 0; j < dependency->nattributes; j++)
606 {
607 int attnum = dependency->attributes[j];
608
609 if (!bms_is_member(attnum, attnums))
610 return false;
611 }
612
613 return true;
614}
615
616/*
617 * dependency_implies_attribute
618 * check that the attnum matches is implied by the functional dependency
619 */
620static bool
621dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum)
622{
623 if (attnum == dependency->attributes[dependency->nattributes - 1])
624 return true;
625
626 return false;
627}
628
629/*
630 * statext_dependencies_load
631 * Load the functional dependencies for the indicated pg_statistic_ext tuple
632 */
633MVDependencies *
634statext_dependencies_load(Oid mvoid)
635{
636 MVDependencies *result;
637 bool isnull;
638 Datum deps;
639 HeapTuple htup;
640
641 htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
642 if (!HeapTupleIsValid(htup))
643 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
644
645 deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
646 Anum_pg_statistic_ext_data_stxddependencies, &isnull);
647 if (isnull)
648 elog(ERROR,
649 "requested statistic kind \"%c\" is not yet built for statistics object %u",
650 STATS_EXT_DEPENDENCIES, mvoid);
651
652 result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
653
654 ReleaseSysCache(htup);
655
656 return result;
657}
658
659/*
660 * pg_dependencies_in - input routine for type pg_dependencies.
661 *
662 * pg_dependencies is real enough to be a table column, but it has no operations
663 * of its own, and disallows input too
664 */
665Datum
666pg_dependencies_in(PG_FUNCTION_ARGS)
667{
668 /*
669 * pg_node_list stores the data in binary form and parsing text input is
670 * not needed, so disallow this.
671 */
672 ereport(ERROR,
673 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
674 errmsg("cannot accept a value of type %s", "pg_dependencies")));
675
676 PG_RETURN_VOID(); /* keep compiler quiet */
677}
678
679/*
680 * pg_dependencies - output routine for type pg_dependencies.
681 */
682Datum
683pg_dependencies_out(PG_FUNCTION_ARGS)
684{
685 bytea *data = PG_GETARG_BYTEA_PP(0);
686 MVDependencies *dependencies = statext_dependencies_deserialize(data);
687 int i,
688 j;
689 StringInfoData str;
690
691 initStringInfo(&str);
692 appendStringInfoChar(&str, '{');
693
694 for (i = 0; i < dependencies->ndeps; i++)
695 {
696 MVDependency *dependency = dependencies->deps[i];
697
698 if (i > 0)
699 appendStringInfoString(&str, ", ");
700
701 appendStringInfoChar(&str, '"');
702 for (j = 0; j < dependency->nattributes; j++)
703 {
704 if (j == dependency->nattributes - 1)
705 appendStringInfoString(&str, " => ");
706 else if (j > 0)
707 appendStringInfoString(&str, ", ");
708
709 appendStringInfo(&str, "%d", dependency->attributes[j]);
710 }
711 appendStringInfo(&str, "\": %f", dependency->degree);
712 }
713
714 appendStringInfoChar(&str, '}');
715
716 PG_RETURN_CSTRING(str.data);
717}
718
719/*
720 * pg_dependencies_recv - binary input routine for type pg_dependencies.
721 */
722Datum
723pg_dependencies_recv(PG_FUNCTION_ARGS)
724{
725 ereport(ERROR,
726 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
727 errmsg("cannot accept a value of type %s", "pg_dependencies")));
728
729 PG_RETURN_VOID(); /* keep compiler quiet */
730}
731
732/*
733 * pg_dependencies_send - binary output routine for type pg_dependencies.
734 *
735 * Functional dependencies are serialized in a bytea value (although the type
736 * is named differently), so let's just send that.
737 */
738Datum
739pg_dependencies_send(PG_FUNCTION_ARGS)
740{
741 return byteasend(fcinfo);
742}
743
744/*
745 * dependency_is_compatible_clause
746 * Determines if the clause is compatible with functional dependencies
747 *
748 * Only clauses that have the form of equality to a pseudoconstant, or can be
749 * interpreted that way, are currently accepted. Furthermore the variable
750 * part of the clause must be a simple Var belonging to the specified
751 * relation, whose attribute number we return in *attnum on success.
752 */
753static bool
754dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
755{
756 RestrictInfo *rinfo = (RestrictInfo *) clause;
757 Var *var;
758
759 if (!IsA(rinfo, RestrictInfo))
760 return false;
761
762 /* Pseudoconstants are not interesting (they couldn't contain a Var) */
763 if (rinfo->pseudoconstant)
764 return false;
765
766 /* Clauses referencing multiple, or no, varnos are incompatible */
767 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
768 return false;
769
770 if (is_opclause(rinfo->clause))
771 {
772 /* If it's an opclause, check for Var = Const or Const = Var. */
773 OpExpr *expr = (OpExpr *) rinfo->clause;
774
775 /* Only expressions with two arguments are candidates. */
776 if (list_length(expr->args) != 2)
777 return false;
778
779 /* Make sure non-selected argument is a pseudoconstant. */
780 if (is_pseudo_constant_clause(lsecond(expr->args)))
781 var = linitial(expr->args);
782 else if (is_pseudo_constant_clause(linitial(expr->args)))
783 var = lsecond(expr->args);
784 else
785 return false;
786
787 /*
788 * If it's not an "=" operator, just ignore the clause, as it's not
789 * compatible with functional dependencies.
790 *
791 * This uses the function for estimating selectivity, not the operator
792 * directly (a bit awkward, but well ...).
793 *
794 * XXX this is pretty dubious; probably it'd be better to check btree
795 * or hash opclass membership, so as not to be fooled by custom
796 * selectivity functions, and to be more consistent with decisions
797 * elsewhere in the planner.
798 */
799 if (get_oprrest(expr->opno) != F_EQSEL)
800 return false;
801
802 /* OK to proceed with checking "var" */
803 }
804 else if (is_notclause(rinfo->clause))
805 {
806 /*
807 * "NOT x" can be interpreted as "x = false", so get the argument and
808 * proceed with seeing if it's a suitable Var.
809 */
810 var = (Var *) get_notclausearg(rinfo->clause);
811 }
812 else
813 {
814 /*
815 * A boolean expression "x" can be interpreted as "x = true", so
816 * proceed with seeing if it's a suitable Var.
817 */
818 var = (Var *) rinfo->clause;
819 }
820
821 /*
822 * We may ignore any RelabelType node above the operand. (There won't be
823 * more than one, since eval_const_expressions has been applied already.)
824 */
825 if (IsA(var, RelabelType))
826 var = (Var *) ((RelabelType *) var)->arg;
827
828 /* We only support plain Vars for now */
829 if (!IsA(var, Var))
830 return false;
831
832 /* Ensure Var is from the correct relation */
833 if (var->varno != relid)
834 return false;
835
836 /* We also better ensure the Var is from the current level */
837 if (var->varlevelsup != 0)
838 return false;
839
840 /* Also ignore system attributes (we don't allow stats on those) */
841 if (!AttrNumberIsForUserDefinedAttr(var->varattno))
842 return false;
843
844 *attnum = var->varattno;
845 return true;
846}
847
848/*
849 * find_strongest_dependency
850 * find the strongest dependency on the attributes
851 *
852 * When applying functional dependencies, we start with the strongest
853 * dependencies. That is, we select the dependency that:
854 *
855 * (a) has all attributes covered by equality clauses
856 *
857 * (b) has the most attributes
858 *
859 * (c) has the highest degree of validity
860 *
861 * This guarantees that we eliminate the most redundant conditions first
862 * (see the comment in dependencies_clauselist_selectivity).
863 */
864static MVDependency *
865find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies,
866 Bitmapset *attnums)
867{
868 int i;
869 MVDependency *strongest = NULL;
870
871 /* number of attnums in clauses */
872 int nattnums = bms_num_members(attnums);
873
874 /*
875 * Iterate over the MVDependency items and find the strongest one from the
876 * fully-matched dependencies. We do the cheap checks first, before
877 * matching it against the attnums.
878 */
879 for (i = 0; i < dependencies->ndeps; i++)
880 {
881 MVDependency *dependency = dependencies->deps[i];
882
883 /*
884 * Skip dependencies referencing more attributes than available
885 * clauses, as those can't be fully matched.
886 */
887 if (dependency->nattributes > nattnums)
888 continue;
889
890 if (strongest)
891 {
892 /* skip dependencies on fewer attributes than the strongest. */
893 if (dependency->nattributes < strongest->nattributes)
894 continue;
895
896 /* also skip weaker dependencies when attribute count matches */
897 if (strongest->nattributes == dependency->nattributes &&
898 strongest->degree > dependency->degree)
899 continue;
900 }
901
902 /*
903 * this dependency is stronger, but we must still check that it's
904 * fully matched to these attnums. We perform this check last as it's
905 * slightly more expensive than the previous checks.
906 */
907 if (dependency_is_fully_matched(dependency, attnums))
908 strongest = dependency; /* save new best match */
909 }
910
911 return strongest;
912}
913
914/*
915 * dependencies_clauselist_selectivity
916 * Return the estimated selectivity of (a subset of) the given clauses
917 * using functional dependency statistics, or 1.0 if no useful functional
918 * dependency statistic exists.
919 *
920 * 'estimatedclauses' is an input/output argument that gets a bit set
921 * corresponding to the (zero-based) list index of each clause that is included
922 * in the estimated selectivity.
923 *
924 * Given equality clauses on attributes (a,b) we find the strongest dependency
925 * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
926 * dependency, we then combine the per-clause selectivities using the formula
927 *
928 * P(a,b) = P(a) * [f + (1-f)*P(b)]
929 *
930 * where 'f' is the degree of the dependency.
931 *
932 * With clauses on more than two attributes, the dependencies are applied
933 * recursively, starting with the widest/strongest dependencies. For example
934 * P(a,b,c) is first split like this:
935 *
936 * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)]
937 *
938 * assuming (a,b=>c) is the strongest dependency.
939 */
940Selectivity
941dependencies_clauselist_selectivity(PlannerInfo *root,
942 List *clauses,
943 int varRelid,
944 JoinType jointype,
945 SpecialJoinInfo *sjinfo,
946 RelOptInfo *rel,
947 Bitmapset **estimatedclauses)
948{
949 Selectivity s1 = 1.0;
950 ListCell *l;
951 Bitmapset *clauses_attnums = NULL;
952 StatisticExtInfo *stat;
953 MVDependencies *dependencies;
954 AttrNumber *list_attnums;
955 int listidx;
956
957 /* check if there's any stats that might be useful for us. */
958 if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
959 return 1.0;
960
961 list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
962 list_length(clauses));
963
964 /*
965 * Pre-process the clauses list to extract the attnums seen in each item.
966 * We need to determine if there's any clauses which will be useful for
967 * dependency selectivity estimations. Along the way we'll record all of
968 * the attnums for each clause in a list which we'll reference later so we
969 * don't need to repeat the same work again. We'll also keep track of all
970 * attnums seen.
971 *
972 * We also skip clauses that we already estimated using different types of
973 * statistics (we treat them as incompatible).
974 */
975 listidx = 0;
976 foreach(l, clauses)
977 {
978 Node *clause = (Node *) lfirst(l);
979 AttrNumber attnum;
980
981 if (!bms_is_member(listidx, *estimatedclauses) &&
982 dependency_is_compatible_clause(clause, rel->relid, &attnum))
983 {
984 list_attnums[listidx] = attnum;
985 clauses_attnums = bms_add_member(clauses_attnums, attnum);
986 }
987 else
988 list_attnums[listidx] = InvalidAttrNumber;
989
990 listidx++;
991 }
992
993 /*
994 * If there's not at least two distinct attnums then reject the whole list
995 * of clauses. We must return 1.0 so the calling function's selectivity is
996 * unaffected.
997 */
998 if (bms_num_members(clauses_attnums) < 2)
999 {
1000 pfree(list_attnums);
1001 return 1.0;
1002 }
1003
1004 /* find the best suited statistics object for these attnums */
1005 stat = choose_best_statistics(rel->statlist, clauses_attnums,
1006 STATS_EXT_DEPENDENCIES);
1007
1008 /* if no matching stats could be found then we've nothing to do */
1009 if (!stat)
1010 {
1011 pfree(list_attnums);
1012 return 1.0;
1013 }
1014
1015 /* load the dependency items stored in the statistics object */
1016 dependencies = statext_dependencies_load(stat->statOid);
1017
1018 /*
1019 * Apply the dependencies recursively, starting with the widest/strongest
1020 * ones, and proceeding to the smaller/weaker ones. At the end of each
1021 * round we factor in the selectivity of clauses on the implied attribute,
1022 * and remove the clauses from the list.
1023 */
1024 while (true)
1025 {
1026 Selectivity s2 = 1.0;
1027 MVDependency *dependency;
1028
1029 /* the widest/strongest dependency, fully matched by clauses */
1030 dependency = find_strongest_dependency(stat, dependencies,
1031 clauses_attnums);
1032
1033 /* if no suitable dependency was found, we're done */
1034 if (!dependency)
1035 break;
1036
1037 /*
1038 * We found an applicable dependency, so find all the clauses on the
1039 * implied attribute - with dependency (a,b => c) we look for clauses
1040 * on 'c'.
1041 */
1042 listidx = -1;
1043 foreach(l, clauses)
1044 {
1045 Node *clause;
1046
1047 listidx++;
1048
1049 /*
1050 * Skip incompatible clauses, and ones we've already estimated on.
1051 */
1052 if (list_attnums[listidx] == InvalidAttrNumber)
1053 continue;
1054
1055 /*
1056 * Technically we could find more than one clause for a given
1057 * attnum. Since these clauses must be equality clauses, we choose
1058 * to only take the selectivity estimate from the final clause in
1059 * the list for this attnum. If the attnum happens to be compared
1060 * to a different Const in another clause then no rows will match
1061 * anyway. If it happens to be compared to the same Const, then
1062 * ignoring the additional clause is just the thing to do.
1063 */
1064 if (dependency_implies_attribute(dependency,
1065 list_attnums[listidx]))
1066 {
1067 clause = (Node *) lfirst(l);
1068
1069 s2 = clause_selectivity(root, clause, varRelid, jointype,
1070 sjinfo);
1071
1072 /* mark this one as done, so we don't touch it again. */
1073 *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1074
1075 /*
1076 * Mark that we've got and used the dependency on this clause.
1077 * We'll want to ignore this when looking for the next
1078 * strongest dependency above.
1079 */
1080 clauses_attnums = bms_del_member(clauses_attnums,
1081 list_attnums[listidx]);
1082 }
1083 }
1084
1085 /*
1086 * Now factor in the selectivity for all the "implied" clauses into
1087 * the final one, using this formula:
1088 *
1089 * P(a,b) = P(a) * (f + (1-f) * P(b))
1090 *
1091 * where 'f' is the degree of validity of the dependency.
1092 */
1093 s1 *= (dependency->degree + (1 - dependency->degree) * s2);
1094 }
1095
1096 pfree(dependencies);
1097 pfree(list_attnums);
1098
1099 return s1;
1100}
1101