1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * dependencies.c |
4 | * POSTGRES functional dependencies |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/statistics/dependencies.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | #include "postgres.h" |
15 | |
16 | #include "access/htup_details.h" |
17 | #include "access/sysattr.h" |
18 | #include "catalog/pg_operator.h" |
19 | #include "catalog/pg_statistic_ext.h" |
20 | #include "catalog/pg_statistic_ext_data.h" |
21 | #include "lib/stringinfo.h" |
22 | #include "nodes/nodeFuncs.h" |
23 | #include "optimizer/clauses.h" |
24 | #include "optimizer/optimizer.h" |
25 | #include "nodes/nodes.h" |
26 | #include "nodes/pathnodes.h" |
27 | #include "statistics/extended_stats_internal.h" |
28 | #include "statistics/statistics.h" |
29 | #include "utils/bytea.h" |
30 | #include "utils/fmgroids.h" |
31 | #include "utils/fmgrprotos.h" |
32 | #include "utils/lsyscache.h" |
33 | #include "utils/syscache.h" |
34 | #include "utils/typcache.h" |
35 | |
36 | /* size of the struct header fields (magic, type, ndeps) */ |
37 | #define (3 * sizeof(uint32)) |
38 | |
39 | /* size of a serialized dependency (degree, natts, atts) */ |
40 | #define SizeOfItem(natts) \ |
41 | (sizeof(double) + sizeof(AttrNumber) * (1 + (natts))) |
42 | |
43 | /* minimal size of a dependency (with two attributes) */ |
44 | #define MinSizeOfItem SizeOfItem(2) |
45 | |
46 | /* minimal size of dependencies, when all deps are minimal */ |
47 | #define MinSizeOfItems(ndeps) \ |
48 | (SizeOfHeader + (ndeps) * MinSizeOfItem) |
49 | |
50 | /* |
51 | * Internal state for DependencyGenerator of dependencies. Dependencies are similar to |
52 | * k-permutations of n elements, except that the order does not matter for the |
53 | * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent. |
54 | */ |
55 | typedef struct DependencyGeneratorData |
56 | { |
57 | int k; /* size of the dependency */ |
58 | int n; /* number of possible attributes */ |
59 | int current; /* next dependency to return (index) */ |
60 | AttrNumber ndependencies; /* number of dependencies generated */ |
61 | AttrNumber *dependencies; /* array of pre-generated dependencies */ |
62 | } DependencyGeneratorData; |
63 | |
64 | typedef DependencyGeneratorData *DependencyGenerator; |
65 | |
66 | static void generate_dependencies_recurse(DependencyGenerator state, |
67 | int index, AttrNumber start, AttrNumber *current); |
68 | static void generate_dependencies(DependencyGenerator state); |
69 | static DependencyGenerator DependencyGenerator_init(int n, int k); |
70 | static void DependencyGenerator_free(DependencyGenerator state); |
71 | static AttrNumber *DependencyGenerator_next(DependencyGenerator state); |
72 | static double dependency_degree(int numrows, HeapTuple *rows, int k, |
73 | AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); |
74 | static bool dependency_is_fully_matched(MVDependency *dependency, |
75 | Bitmapset *attnums); |
76 | static bool dependency_implies_attribute(MVDependency *dependency, |
77 | AttrNumber attnum); |
78 | static bool dependency_is_compatible_clause(Node *clause, Index relid, |
79 | AttrNumber *attnum); |
80 | static MVDependency *find_strongest_dependency(StatisticExtInfo *stats, |
81 | MVDependencies *dependencies, |
82 | Bitmapset *attnums); |
83 | |
84 | static void |
85 | generate_dependencies_recurse(DependencyGenerator state, int index, |
86 | AttrNumber start, AttrNumber *current) |
87 | { |
88 | /* |
89 | * The generator handles the first (k-1) elements differently from the |
90 | * last element. |
91 | */ |
92 | if (index < (state->k - 1)) |
93 | { |
94 | AttrNumber i; |
95 | |
96 | /* |
97 | * The first (k-1) values have to be in ascending order, which we |
98 | * generate recursively. |
99 | */ |
100 | |
101 | for (i = start; i < state->n; i++) |
102 | { |
103 | current[index] = i; |
104 | generate_dependencies_recurse(state, (index + 1), (i + 1), current); |
105 | } |
106 | } |
107 | else |
108 | { |
109 | int i; |
110 | |
111 | /* |
112 | * the last element is the implied value, which does not respect the |
113 | * ascending order. We just need to check that the value is not in the |
114 | * first (k-1) elements. |
115 | */ |
116 | |
117 | for (i = 0; i < state->n; i++) |
118 | { |
119 | int j; |
120 | bool match = false; |
121 | |
122 | current[index] = i; |
123 | |
124 | for (j = 0; j < index; j++) |
125 | { |
126 | if (current[j] == i) |
127 | { |
128 | match = true; |
129 | break; |
130 | } |
131 | } |
132 | |
133 | /* |
134 | * If the value is not found in the first part of the dependency, |
135 | * we're done. |
136 | */ |
137 | if (!match) |
138 | { |
139 | state->dependencies = (AttrNumber *) repalloc(state->dependencies, |
140 | state->k * (state->ndependencies + 1) * sizeof(AttrNumber)); |
141 | memcpy(&state->dependencies[(state->k * state->ndependencies)], |
142 | current, state->k * sizeof(AttrNumber)); |
143 | state->ndependencies++; |
144 | } |
145 | } |
146 | } |
147 | } |
148 | |
149 | /* generate all dependencies (k-permutations of n elements) */ |
150 | static void |
151 | generate_dependencies(DependencyGenerator state) |
152 | { |
153 | AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k); |
154 | |
155 | generate_dependencies_recurse(state, 0, 0, current); |
156 | |
157 | pfree(current); |
158 | } |
159 | |
160 | /* |
161 | * initialize the DependencyGenerator of variations, and prebuild the variations |
162 | * |
163 | * This pre-builds all the variations. We could also generate them in |
164 | * DependencyGenerator_next(), but this seems simpler. |
165 | */ |
166 | static DependencyGenerator |
167 | DependencyGenerator_init(int n, int k) |
168 | { |
169 | DependencyGenerator state; |
170 | |
171 | Assert((n >= k) && (k > 0)); |
172 | |
173 | /* allocate the DependencyGenerator state */ |
174 | state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData)); |
175 | state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber)); |
176 | |
177 | state->ndependencies = 0; |
178 | state->current = 0; |
179 | state->k = k; |
180 | state->n = n; |
181 | |
182 | /* now actually pre-generate all the variations */ |
183 | generate_dependencies(state); |
184 | |
185 | return state; |
186 | } |
187 | |
188 | /* free the DependencyGenerator state */ |
189 | static void |
190 | DependencyGenerator_free(DependencyGenerator state) |
191 | { |
192 | pfree(state->dependencies); |
193 | pfree(state); |
194 | |
195 | } |
196 | |
197 | /* generate next combination */ |
198 | static AttrNumber * |
199 | DependencyGenerator_next(DependencyGenerator state) |
200 | { |
201 | if (state->current == state->ndependencies) |
202 | return NULL; |
203 | |
204 | return &state->dependencies[state->k * state->current++]; |
205 | } |
206 | |
207 | |
208 | /* |
209 | * validates functional dependency on the data |
210 | * |
211 | * An actual work horse of detecting functional dependencies. Given a variation |
212 | * of k attributes, it checks that the first (k-1) are sufficient to determine |
213 | * the last one. |
214 | */ |
215 | static double |
216 | dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, |
217 | VacAttrStats **stats, Bitmapset *attrs) |
218 | { |
219 | int i, |
220 | nitems; |
221 | MultiSortSupport mss; |
222 | SortItem *items; |
223 | AttrNumber *attnums; |
224 | AttrNumber *attnums_dep; |
225 | int numattrs; |
226 | |
227 | /* counters valid within a group */ |
228 | int group_size = 0; |
229 | int n_violations = 0; |
230 | |
231 | /* total number of rows supporting (consistent with) the dependency */ |
232 | int n_supporting_rows = 0; |
233 | |
234 | /* Make sure we have at least two input attributes. */ |
235 | Assert(k >= 2); |
236 | |
237 | /* sort info for all attributes columns */ |
238 | mss = multi_sort_init(k); |
239 | |
240 | /* |
241 | * Transform the attrs from bitmap to an array to make accessing the i-th |
242 | * member easier, and then construct a filtered version with only attnums |
243 | * referenced by the dependency we validate. |
244 | */ |
245 | attnums = build_attnums_array(attrs, &numattrs); |
246 | |
247 | attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber)); |
248 | for (i = 0; i < k; i++) |
249 | attnums_dep[i] = attnums[dependency[i]]; |
250 | |
251 | /* |
252 | * Verify the dependency (a,b,...)->z, using a rather simple algorithm: |
253 | * |
254 | * (a) sort the data lexicographically |
255 | * |
256 | * (b) split the data into groups by first (k-1) columns |
257 | * |
258 | * (c) for each group count different values in the last column |
259 | * |
260 | * We use the column data types' default sort operators and collations; |
261 | * perhaps at some point it'd be worth using column-specific collations? |
262 | */ |
263 | |
264 | /* prepare the sort function for the dimensions */ |
265 | for (i = 0; i < k; i++) |
266 | { |
267 | VacAttrStats *colstat = stats[dependency[i]]; |
268 | TypeCacheEntry *type; |
269 | |
270 | type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); |
271 | if (type->lt_opr == InvalidOid) /* shouldn't happen */ |
272 | elog(ERROR, "cache lookup failed for ordering operator for type %u" , |
273 | colstat->attrtypid); |
274 | |
275 | /* prepare the sort function for this dimension */ |
276 | multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid); |
277 | } |
278 | |
279 | /* |
280 | * build an array of SortItem(s) sorted using the multi-sort support |
281 | * |
282 | * XXX This relies on all stats entries pointing to the same tuple |
283 | * descriptor. For now that assumption holds, but it might change in the |
284 | * future for example if we support statistics on multiple tables. |
285 | */ |
286 | items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, |
287 | mss, k, attnums_dep); |
288 | |
289 | /* |
290 | * Walk through the sorted array, split it into rows according to the |
291 | * first (k-1) columns. If there's a single value in the last column, we |
292 | * count the group as 'supporting' the functional dependency. Otherwise we |
293 | * count it as contradicting. |
294 | */ |
295 | |
296 | /* start with the first row forming a group */ |
297 | group_size = 1; |
298 | |
299 | /* loop 1 beyond the end of the array so that we count the final group */ |
300 | for (i = 1; i <= nitems; i++) |
301 | { |
302 | /* |
303 | * Check if the group ended, which may be either because we processed |
304 | * all the items (i==nitems), or because the i-th item is not equal to |
305 | * the preceding one. |
306 | */ |
307 | if (i == nitems || |
308 | multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0) |
309 | { |
310 | /* |
311 | * If no violations were found in the group then track the rows of |
312 | * the group as supporting the functional dependency. |
313 | */ |
314 | if (n_violations == 0) |
315 | n_supporting_rows += group_size; |
316 | |
317 | /* Reset counters for the new group */ |
318 | n_violations = 0; |
319 | group_size = 1; |
320 | continue; |
321 | } |
322 | /* first columns match, but the last one does not (so contradicting) */ |
323 | else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0) |
324 | n_violations++; |
325 | |
326 | group_size++; |
327 | } |
328 | |
329 | if (items) |
330 | pfree(items); |
331 | |
332 | pfree(mss); |
333 | pfree(attnums); |
334 | pfree(attnums_dep); |
335 | |
336 | /* Compute the 'degree of validity' as (supporting/total). */ |
337 | return (n_supporting_rows * 1.0 / numrows); |
338 | } |
339 | |
340 | /* |
341 | * detects functional dependencies between groups of columns |
342 | * |
343 | * Generates all possible subsets of columns (variations) and computes |
344 | * the degree of validity for each one. For example when creating statistics |
345 | * on three columns (a,b,c) there are 9 possible dependencies |
346 | * |
347 | * two columns three columns |
348 | * ----------- ------------- |
349 | * (a) -> b (a,b) -> c |
350 | * (a) -> c (a,c) -> b |
351 | * (b) -> a (b,c) -> a |
352 | * (b) -> c |
353 | * (c) -> a |
354 | * (c) -> b |
355 | */ |
356 | MVDependencies * |
357 | statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, |
358 | VacAttrStats **stats) |
359 | { |
360 | int i, |
361 | k; |
362 | int numattrs; |
363 | AttrNumber *attnums; |
364 | |
365 | /* result */ |
366 | MVDependencies *dependencies = NULL; |
367 | |
368 | /* |
369 | * Transform the bms into an array, to make accessing i-th member easier. |
370 | */ |
371 | attnums = build_attnums_array(attrs, &numattrs); |
372 | |
373 | Assert(numattrs >= 2); |
374 | |
375 | /* |
376 | * We'll try build functional dependencies starting from the smallest ones |
377 | * covering just 2 columns, to the largest ones, covering all columns |
378 | * included in the statistics object. We start from the smallest ones |
379 | * because we want to be able to skip already implied ones. |
380 | */ |
381 | for (k = 2; k <= numattrs; k++) |
382 | { |
383 | AttrNumber *dependency; /* array with k elements */ |
384 | |
385 | /* prepare a DependencyGenerator of variation */ |
386 | DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k); |
387 | |
388 | /* generate all possible variations of k values (out of n) */ |
389 | while ((dependency = DependencyGenerator_next(DependencyGenerator))) |
390 | { |
391 | double degree; |
392 | MVDependency *d; |
393 | |
394 | /* compute how valid the dependency seems */ |
395 | degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); |
396 | |
397 | /* |
398 | * if the dependency seems entirely invalid, don't store it |
399 | */ |
400 | if (degree == 0.0) |
401 | continue; |
402 | |
403 | d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) |
404 | + k * sizeof(AttrNumber)); |
405 | |
406 | /* copy the dependency (and keep the indexes into stxkeys) */ |
407 | d->degree = degree; |
408 | d->nattributes = k; |
409 | for (i = 0; i < k; i++) |
410 | d->attributes[i] = attnums[dependency[i]]; |
411 | |
412 | /* initialize the list of dependencies */ |
413 | if (dependencies == NULL) |
414 | { |
415 | dependencies |
416 | = (MVDependencies *) palloc0(sizeof(MVDependencies)); |
417 | |
418 | dependencies->magic = STATS_DEPS_MAGIC; |
419 | dependencies->type = STATS_DEPS_TYPE_BASIC; |
420 | dependencies->ndeps = 0; |
421 | } |
422 | |
423 | dependencies->ndeps++; |
424 | dependencies = (MVDependencies *) repalloc(dependencies, |
425 | offsetof(MVDependencies, deps) |
426 | + dependencies->ndeps * sizeof(MVDependency *)); |
427 | |
428 | dependencies->deps[dependencies->ndeps - 1] = d; |
429 | } |
430 | |
431 | /* |
432 | * we're done with variations of k elements, so free the |
433 | * DependencyGenerator |
434 | */ |
435 | DependencyGenerator_free(DependencyGenerator); |
436 | } |
437 | |
438 | return dependencies; |
439 | } |
440 | |
441 | |
442 | /* |
443 | * Serialize list of dependencies into a bytea value. |
444 | */ |
445 | bytea * |
446 | statext_dependencies_serialize(MVDependencies *dependencies) |
447 | { |
448 | int i; |
449 | bytea *output; |
450 | char *tmp; |
451 | Size len; |
452 | |
453 | /* we need to store ndeps, with a number of attributes for each one */ |
454 | len = VARHDRSZ + SizeOfHeader; |
455 | |
456 | /* and also include space for the actual attribute numbers and degrees */ |
457 | for (i = 0; i < dependencies->ndeps; i++) |
458 | len += SizeOfItem(dependencies->deps[i]->nattributes); |
459 | |
460 | output = (bytea *) palloc0(len); |
461 | SET_VARSIZE(output, len); |
462 | |
463 | tmp = VARDATA(output); |
464 | |
465 | /* Store the base struct values (magic, type, ndeps) */ |
466 | memcpy(tmp, &dependencies->magic, sizeof(uint32)); |
467 | tmp += sizeof(uint32); |
468 | memcpy(tmp, &dependencies->type, sizeof(uint32)); |
469 | tmp += sizeof(uint32); |
470 | memcpy(tmp, &dependencies->ndeps, sizeof(uint32)); |
471 | tmp += sizeof(uint32); |
472 | |
473 | /* store number of attributes and attribute numbers for each dependency */ |
474 | for (i = 0; i < dependencies->ndeps; i++) |
475 | { |
476 | MVDependency *d = dependencies->deps[i]; |
477 | |
478 | memcpy(tmp, &d->degree, sizeof(double)); |
479 | tmp += sizeof(double); |
480 | |
481 | memcpy(tmp, &d->nattributes, sizeof(AttrNumber)); |
482 | tmp += sizeof(AttrNumber); |
483 | |
484 | memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes); |
485 | tmp += sizeof(AttrNumber) * d->nattributes; |
486 | |
487 | /* protect against overflow */ |
488 | Assert(tmp <= ((char *) output + len)); |
489 | } |
490 | |
491 | /* make sure we've produced exactly the right amount of data */ |
492 | Assert(tmp == ((char *) output + len)); |
493 | |
494 | return output; |
495 | } |
496 | |
497 | /* |
498 | * Reads serialized dependencies into MVDependencies structure. |
499 | */ |
500 | MVDependencies * |
501 | statext_dependencies_deserialize(bytea *data) |
502 | { |
503 | int i; |
504 | Size min_expected_size; |
505 | MVDependencies *dependencies; |
506 | char *tmp; |
507 | |
508 | if (data == NULL) |
509 | return NULL; |
510 | |
511 | if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader) |
512 | elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)" , |
513 | VARSIZE_ANY_EXHDR(data), SizeOfHeader); |
514 | |
515 | /* read the MVDependencies header */ |
516 | dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); |
517 | |
518 | /* initialize pointer to the data part (skip the varlena header) */ |
519 | tmp = VARDATA_ANY(data); |
520 | |
521 | /* read the header fields and perform basic sanity checks */ |
522 | memcpy(&dependencies->magic, tmp, sizeof(uint32)); |
523 | tmp += sizeof(uint32); |
524 | memcpy(&dependencies->type, tmp, sizeof(uint32)); |
525 | tmp += sizeof(uint32); |
526 | memcpy(&dependencies->ndeps, tmp, sizeof(uint32)); |
527 | tmp += sizeof(uint32); |
528 | |
529 | if (dependencies->magic != STATS_DEPS_MAGIC) |
530 | elog(ERROR, "invalid dependency magic %d (expected %d)" , |
531 | dependencies->magic, STATS_DEPS_MAGIC); |
532 | |
533 | if (dependencies->type != STATS_DEPS_TYPE_BASIC) |
534 | elog(ERROR, "invalid dependency type %d (expected %d)" , |
535 | dependencies->type, STATS_DEPS_TYPE_BASIC); |
536 | |
537 | if (dependencies->ndeps == 0) |
538 | elog(ERROR, "invalid zero-length item array in MVDependencies" ); |
539 | |
540 | /* what minimum bytea size do we expect for those parameters */ |
541 | min_expected_size = SizeOfItem(dependencies->ndeps); |
542 | |
543 | if (VARSIZE_ANY_EXHDR(data) < min_expected_size) |
544 | elog(ERROR, "invalid dependencies size %zd (expected at least %zd)" , |
545 | VARSIZE_ANY_EXHDR(data), min_expected_size); |
546 | |
547 | /* allocate space for the MCV items */ |
548 | dependencies = repalloc(dependencies, offsetof(MVDependencies, deps) |
549 | + (dependencies->ndeps * sizeof(MVDependency *))); |
550 | |
551 | for (i = 0; i < dependencies->ndeps; i++) |
552 | { |
553 | double degree; |
554 | AttrNumber k; |
555 | MVDependency *d; |
556 | |
557 | /* degree of validity */ |
558 | memcpy(°ree, tmp, sizeof(double)); |
559 | tmp += sizeof(double); |
560 | |
561 | /* number of attributes */ |
562 | memcpy(&k, tmp, sizeof(AttrNumber)); |
563 | tmp += sizeof(AttrNumber); |
564 | |
565 | /* is the number of attributes valid? */ |
566 | Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS)); |
567 | |
568 | /* now that we know the number of attributes, allocate the dependency */ |
569 | d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) |
570 | + (k * sizeof(AttrNumber))); |
571 | |
572 | d->degree = degree; |
573 | d->nattributes = k; |
574 | |
575 | /* copy attribute numbers */ |
576 | memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes); |
577 | tmp += sizeof(AttrNumber) * d->nattributes; |
578 | |
579 | dependencies->deps[i] = d; |
580 | |
581 | /* still within the bytea */ |
582 | Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); |
583 | } |
584 | |
585 | /* we should have consumed the whole bytea exactly */ |
586 | Assert(tmp == ((char *) data + VARSIZE_ANY(data))); |
587 | |
588 | return dependencies; |
589 | } |
590 | |
591 | /* |
592 | * dependency_is_fully_matched |
593 | * checks that a functional dependency is fully matched given clauses on |
594 | * attributes (assuming the clauses are suitable equality clauses) |
595 | */ |
596 | static bool |
597 | dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums) |
598 | { |
599 | int j; |
600 | |
601 | /* |
602 | * Check that the dependency actually is fully covered by clauses. We have |
603 | * to translate all attribute numbers, as those are referenced |
604 | */ |
605 | for (j = 0; j < dependency->nattributes; j++) |
606 | { |
607 | int attnum = dependency->attributes[j]; |
608 | |
609 | if (!bms_is_member(attnum, attnums)) |
610 | return false; |
611 | } |
612 | |
613 | return true; |
614 | } |
615 | |
616 | /* |
617 | * dependency_implies_attribute |
618 | * check that the attnum matches is implied by the functional dependency |
619 | */ |
620 | static bool |
621 | dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum) |
622 | { |
623 | if (attnum == dependency->attributes[dependency->nattributes - 1]) |
624 | return true; |
625 | |
626 | return false; |
627 | } |
628 | |
629 | /* |
630 | * statext_dependencies_load |
631 | * Load the functional dependencies for the indicated pg_statistic_ext tuple |
632 | */ |
633 | MVDependencies * |
634 | statext_dependencies_load(Oid mvoid) |
635 | { |
636 | MVDependencies *result; |
637 | bool isnull; |
638 | Datum deps; |
639 | HeapTuple htup; |
640 | |
641 | htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid)); |
642 | if (!HeapTupleIsValid(htup)) |
643 | elog(ERROR, "cache lookup failed for statistics object %u" , mvoid); |
644 | |
645 | deps = SysCacheGetAttr(STATEXTDATASTXOID, htup, |
646 | Anum_pg_statistic_ext_data_stxddependencies, &isnull); |
647 | if (isnull) |
648 | elog(ERROR, |
649 | "requested statistic kind \"%c\" is not yet built for statistics object %u" , |
650 | STATS_EXT_DEPENDENCIES, mvoid); |
651 | |
652 | result = statext_dependencies_deserialize(DatumGetByteaPP(deps)); |
653 | |
654 | ReleaseSysCache(htup); |
655 | |
656 | return result; |
657 | } |
658 | |
659 | /* |
660 | * pg_dependencies_in - input routine for type pg_dependencies. |
661 | * |
662 | * pg_dependencies is real enough to be a table column, but it has no operations |
663 | * of its own, and disallows input too |
664 | */ |
665 | Datum |
666 | pg_dependencies_in(PG_FUNCTION_ARGS) |
667 | { |
668 | /* |
669 | * pg_node_list stores the data in binary form and parsing text input is |
670 | * not needed, so disallow this. |
671 | */ |
672 | ereport(ERROR, |
673 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
674 | errmsg("cannot accept a value of type %s" , "pg_dependencies" ))); |
675 | |
676 | PG_RETURN_VOID(); /* keep compiler quiet */ |
677 | } |
678 | |
679 | /* |
680 | * pg_dependencies - output routine for type pg_dependencies. |
681 | */ |
682 | Datum |
683 | pg_dependencies_out(PG_FUNCTION_ARGS) |
684 | { |
685 | bytea *data = PG_GETARG_BYTEA_PP(0); |
686 | MVDependencies *dependencies = statext_dependencies_deserialize(data); |
687 | int i, |
688 | j; |
689 | StringInfoData str; |
690 | |
691 | initStringInfo(&str); |
692 | appendStringInfoChar(&str, '{'); |
693 | |
694 | for (i = 0; i < dependencies->ndeps; i++) |
695 | { |
696 | MVDependency *dependency = dependencies->deps[i]; |
697 | |
698 | if (i > 0) |
699 | appendStringInfoString(&str, ", " ); |
700 | |
701 | appendStringInfoChar(&str, '"'); |
702 | for (j = 0; j < dependency->nattributes; j++) |
703 | { |
704 | if (j == dependency->nattributes - 1) |
705 | appendStringInfoString(&str, " => " ); |
706 | else if (j > 0) |
707 | appendStringInfoString(&str, ", " ); |
708 | |
709 | appendStringInfo(&str, "%d" , dependency->attributes[j]); |
710 | } |
711 | appendStringInfo(&str, "\": %f" , dependency->degree); |
712 | } |
713 | |
714 | appendStringInfoChar(&str, '}'); |
715 | |
716 | PG_RETURN_CSTRING(str.data); |
717 | } |
718 | |
719 | /* |
720 | * pg_dependencies_recv - binary input routine for type pg_dependencies. |
721 | */ |
722 | Datum |
723 | pg_dependencies_recv(PG_FUNCTION_ARGS) |
724 | { |
725 | ereport(ERROR, |
726 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
727 | errmsg("cannot accept a value of type %s" , "pg_dependencies" ))); |
728 | |
729 | PG_RETURN_VOID(); /* keep compiler quiet */ |
730 | } |
731 | |
732 | /* |
733 | * pg_dependencies_send - binary output routine for type pg_dependencies. |
734 | * |
735 | * Functional dependencies are serialized in a bytea value (although the type |
736 | * is named differently), so let's just send that. |
737 | */ |
738 | Datum |
739 | pg_dependencies_send(PG_FUNCTION_ARGS) |
740 | { |
741 | return byteasend(fcinfo); |
742 | } |
743 | |
744 | /* |
745 | * dependency_is_compatible_clause |
746 | * Determines if the clause is compatible with functional dependencies |
747 | * |
748 | * Only clauses that have the form of equality to a pseudoconstant, or can be |
749 | * interpreted that way, are currently accepted. Furthermore the variable |
750 | * part of the clause must be a simple Var belonging to the specified |
751 | * relation, whose attribute number we return in *attnum on success. |
752 | */ |
753 | static bool |
754 | dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) |
755 | { |
756 | RestrictInfo *rinfo = (RestrictInfo *) clause; |
757 | Var *var; |
758 | |
759 | if (!IsA(rinfo, RestrictInfo)) |
760 | return false; |
761 | |
762 | /* Pseudoconstants are not interesting (they couldn't contain a Var) */ |
763 | if (rinfo->pseudoconstant) |
764 | return false; |
765 | |
766 | /* Clauses referencing multiple, or no, varnos are incompatible */ |
767 | if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) |
768 | return false; |
769 | |
770 | if (is_opclause(rinfo->clause)) |
771 | { |
772 | /* If it's an opclause, check for Var = Const or Const = Var. */ |
773 | OpExpr *expr = (OpExpr *) rinfo->clause; |
774 | |
775 | /* Only expressions with two arguments are candidates. */ |
776 | if (list_length(expr->args) != 2) |
777 | return false; |
778 | |
779 | /* Make sure non-selected argument is a pseudoconstant. */ |
780 | if (is_pseudo_constant_clause(lsecond(expr->args))) |
781 | var = linitial(expr->args); |
782 | else if (is_pseudo_constant_clause(linitial(expr->args))) |
783 | var = lsecond(expr->args); |
784 | else |
785 | return false; |
786 | |
787 | /* |
788 | * If it's not an "=" operator, just ignore the clause, as it's not |
789 | * compatible with functional dependencies. |
790 | * |
791 | * This uses the function for estimating selectivity, not the operator |
792 | * directly (a bit awkward, but well ...). |
793 | * |
794 | * XXX this is pretty dubious; probably it'd be better to check btree |
795 | * or hash opclass membership, so as not to be fooled by custom |
796 | * selectivity functions, and to be more consistent with decisions |
797 | * elsewhere in the planner. |
798 | */ |
799 | if (get_oprrest(expr->opno) != F_EQSEL) |
800 | return false; |
801 | |
802 | /* OK to proceed with checking "var" */ |
803 | } |
804 | else if (is_notclause(rinfo->clause)) |
805 | { |
806 | /* |
807 | * "NOT x" can be interpreted as "x = false", so get the argument and |
808 | * proceed with seeing if it's a suitable Var. |
809 | */ |
810 | var = (Var *) get_notclausearg(rinfo->clause); |
811 | } |
812 | else |
813 | { |
814 | /* |
815 | * A boolean expression "x" can be interpreted as "x = true", so |
816 | * proceed with seeing if it's a suitable Var. |
817 | */ |
818 | var = (Var *) rinfo->clause; |
819 | } |
820 | |
821 | /* |
822 | * We may ignore any RelabelType node above the operand. (There won't be |
823 | * more than one, since eval_const_expressions has been applied already.) |
824 | */ |
825 | if (IsA(var, RelabelType)) |
826 | var = (Var *) ((RelabelType *) var)->arg; |
827 | |
828 | /* We only support plain Vars for now */ |
829 | if (!IsA(var, Var)) |
830 | return false; |
831 | |
832 | /* Ensure Var is from the correct relation */ |
833 | if (var->varno != relid) |
834 | return false; |
835 | |
836 | /* We also better ensure the Var is from the current level */ |
837 | if (var->varlevelsup != 0) |
838 | return false; |
839 | |
840 | /* Also ignore system attributes (we don't allow stats on those) */ |
841 | if (!AttrNumberIsForUserDefinedAttr(var->varattno)) |
842 | return false; |
843 | |
844 | *attnum = var->varattno; |
845 | return true; |
846 | } |
847 | |
848 | /* |
849 | * find_strongest_dependency |
850 | * find the strongest dependency on the attributes |
851 | * |
852 | * When applying functional dependencies, we start with the strongest |
853 | * dependencies. That is, we select the dependency that: |
854 | * |
855 | * (a) has all attributes covered by equality clauses |
856 | * |
857 | * (b) has the most attributes |
858 | * |
859 | * (c) has the highest degree of validity |
860 | * |
861 | * This guarantees that we eliminate the most redundant conditions first |
862 | * (see the comment in dependencies_clauselist_selectivity). |
863 | */ |
864 | static MVDependency * |
865 | find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies, |
866 | Bitmapset *attnums) |
867 | { |
868 | int i; |
869 | MVDependency *strongest = NULL; |
870 | |
871 | /* number of attnums in clauses */ |
872 | int nattnums = bms_num_members(attnums); |
873 | |
874 | /* |
875 | * Iterate over the MVDependency items and find the strongest one from the |
876 | * fully-matched dependencies. We do the cheap checks first, before |
877 | * matching it against the attnums. |
878 | */ |
879 | for (i = 0; i < dependencies->ndeps; i++) |
880 | { |
881 | MVDependency *dependency = dependencies->deps[i]; |
882 | |
883 | /* |
884 | * Skip dependencies referencing more attributes than available |
885 | * clauses, as those can't be fully matched. |
886 | */ |
887 | if (dependency->nattributes > nattnums) |
888 | continue; |
889 | |
890 | if (strongest) |
891 | { |
892 | /* skip dependencies on fewer attributes than the strongest. */ |
893 | if (dependency->nattributes < strongest->nattributes) |
894 | continue; |
895 | |
896 | /* also skip weaker dependencies when attribute count matches */ |
897 | if (strongest->nattributes == dependency->nattributes && |
898 | strongest->degree > dependency->degree) |
899 | continue; |
900 | } |
901 | |
902 | /* |
903 | * this dependency is stronger, but we must still check that it's |
904 | * fully matched to these attnums. We perform this check last as it's |
905 | * slightly more expensive than the previous checks. |
906 | */ |
907 | if (dependency_is_fully_matched(dependency, attnums)) |
908 | strongest = dependency; /* save new best match */ |
909 | } |
910 | |
911 | return strongest; |
912 | } |
913 | |
914 | /* |
915 | * dependencies_clauselist_selectivity |
916 | * Return the estimated selectivity of (a subset of) the given clauses |
917 | * using functional dependency statistics, or 1.0 if no useful functional |
918 | * dependency statistic exists. |
919 | * |
920 | * 'estimatedclauses' is an input/output argument that gets a bit set |
921 | * corresponding to the (zero-based) list index of each clause that is included |
922 | * in the estimated selectivity. |
923 | * |
924 | * Given equality clauses on attributes (a,b) we find the strongest dependency |
925 | * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected |
926 | * dependency, we then combine the per-clause selectivities using the formula |
927 | * |
928 | * P(a,b) = P(a) * [f + (1-f)*P(b)] |
929 | * |
930 | * where 'f' is the degree of the dependency. |
931 | * |
932 | * With clauses on more than two attributes, the dependencies are applied |
933 | * recursively, starting with the widest/strongest dependencies. For example |
934 | * P(a,b,c) is first split like this: |
935 | * |
936 | * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] |
937 | * |
938 | * assuming (a,b=>c) is the strongest dependency. |
939 | */ |
940 | Selectivity |
941 | dependencies_clauselist_selectivity(PlannerInfo *root, |
942 | List *clauses, |
943 | int varRelid, |
944 | JoinType jointype, |
945 | SpecialJoinInfo *sjinfo, |
946 | RelOptInfo *rel, |
947 | Bitmapset **estimatedclauses) |
948 | { |
949 | Selectivity s1 = 1.0; |
950 | ListCell *l; |
951 | Bitmapset *clauses_attnums = NULL; |
952 | StatisticExtInfo *stat; |
953 | MVDependencies *dependencies; |
954 | AttrNumber *list_attnums; |
955 | int listidx; |
956 | |
957 | /* check if there's any stats that might be useful for us. */ |
958 | if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) |
959 | return 1.0; |
960 | |
961 | list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * |
962 | list_length(clauses)); |
963 | |
964 | /* |
965 | * Pre-process the clauses list to extract the attnums seen in each item. |
966 | * We need to determine if there's any clauses which will be useful for |
967 | * dependency selectivity estimations. Along the way we'll record all of |
968 | * the attnums for each clause in a list which we'll reference later so we |
969 | * don't need to repeat the same work again. We'll also keep track of all |
970 | * attnums seen. |
971 | * |
972 | * We also skip clauses that we already estimated using different types of |
973 | * statistics (we treat them as incompatible). |
974 | */ |
975 | listidx = 0; |
976 | foreach(l, clauses) |
977 | { |
978 | Node *clause = (Node *) lfirst(l); |
979 | AttrNumber attnum; |
980 | |
981 | if (!bms_is_member(listidx, *estimatedclauses) && |
982 | dependency_is_compatible_clause(clause, rel->relid, &attnum)) |
983 | { |
984 | list_attnums[listidx] = attnum; |
985 | clauses_attnums = bms_add_member(clauses_attnums, attnum); |
986 | } |
987 | else |
988 | list_attnums[listidx] = InvalidAttrNumber; |
989 | |
990 | listidx++; |
991 | } |
992 | |
993 | /* |
994 | * If there's not at least two distinct attnums then reject the whole list |
995 | * of clauses. We must return 1.0 so the calling function's selectivity is |
996 | * unaffected. |
997 | */ |
998 | if (bms_num_members(clauses_attnums) < 2) |
999 | { |
1000 | pfree(list_attnums); |
1001 | return 1.0; |
1002 | } |
1003 | |
1004 | /* find the best suited statistics object for these attnums */ |
1005 | stat = choose_best_statistics(rel->statlist, clauses_attnums, |
1006 | STATS_EXT_DEPENDENCIES); |
1007 | |
1008 | /* if no matching stats could be found then we've nothing to do */ |
1009 | if (!stat) |
1010 | { |
1011 | pfree(list_attnums); |
1012 | return 1.0; |
1013 | } |
1014 | |
1015 | /* load the dependency items stored in the statistics object */ |
1016 | dependencies = statext_dependencies_load(stat->statOid); |
1017 | |
1018 | /* |
1019 | * Apply the dependencies recursively, starting with the widest/strongest |
1020 | * ones, and proceeding to the smaller/weaker ones. At the end of each |
1021 | * round we factor in the selectivity of clauses on the implied attribute, |
1022 | * and remove the clauses from the list. |
1023 | */ |
1024 | while (true) |
1025 | { |
1026 | Selectivity s2 = 1.0; |
1027 | MVDependency *dependency; |
1028 | |
1029 | /* the widest/strongest dependency, fully matched by clauses */ |
1030 | dependency = find_strongest_dependency(stat, dependencies, |
1031 | clauses_attnums); |
1032 | |
1033 | /* if no suitable dependency was found, we're done */ |
1034 | if (!dependency) |
1035 | break; |
1036 | |
1037 | /* |
1038 | * We found an applicable dependency, so find all the clauses on the |
1039 | * implied attribute - with dependency (a,b => c) we look for clauses |
1040 | * on 'c'. |
1041 | */ |
1042 | listidx = -1; |
1043 | foreach(l, clauses) |
1044 | { |
1045 | Node *clause; |
1046 | |
1047 | listidx++; |
1048 | |
1049 | /* |
1050 | * Skip incompatible clauses, and ones we've already estimated on. |
1051 | */ |
1052 | if (list_attnums[listidx] == InvalidAttrNumber) |
1053 | continue; |
1054 | |
1055 | /* |
1056 | * Technically we could find more than one clause for a given |
1057 | * attnum. Since these clauses must be equality clauses, we choose |
1058 | * to only take the selectivity estimate from the final clause in |
1059 | * the list for this attnum. If the attnum happens to be compared |
1060 | * to a different Const in another clause then no rows will match |
1061 | * anyway. If it happens to be compared to the same Const, then |
1062 | * ignoring the additional clause is just the thing to do. |
1063 | */ |
1064 | if (dependency_implies_attribute(dependency, |
1065 | list_attnums[listidx])) |
1066 | { |
1067 | clause = (Node *) lfirst(l); |
1068 | |
1069 | s2 = clause_selectivity(root, clause, varRelid, jointype, |
1070 | sjinfo); |
1071 | |
1072 | /* mark this one as done, so we don't touch it again. */ |
1073 | *estimatedclauses = bms_add_member(*estimatedclauses, listidx); |
1074 | |
1075 | /* |
1076 | * Mark that we've got and used the dependency on this clause. |
1077 | * We'll want to ignore this when looking for the next |
1078 | * strongest dependency above. |
1079 | */ |
1080 | clauses_attnums = bms_del_member(clauses_attnums, |
1081 | list_attnums[listidx]); |
1082 | } |
1083 | } |
1084 | |
1085 | /* |
1086 | * Now factor in the selectivity for all the "implied" clauses into |
1087 | * the final one, using this formula: |
1088 | * |
1089 | * P(a,b) = P(a) * (f + (1-f) * P(b)) |
1090 | * |
1091 | * where 'f' is the degree of validity of the dependency. |
1092 | */ |
1093 | s1 *= (dependency->degree + (1 - dependency->degree) * s2); |
1094 | } |
1095 | |
1096 | pfree(dependencies); |
1097 | pfree(list_attnums); |
1098 | |
1099 | return s1; |
1100 | } |
1101 | |