1/*-------------------------------------------------------------------------
2 *
3 * mvdistinct.c
4 * POSTGRES multivariate ndistinct coefficients
5 *
6 * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
7 * is tricky, and the estimation error is often significant.
8
9 * The multivariate ndistinct coefficients address this by storing ndistinct
10 * estimates for combinations of the user-specified columns. So for example
11 * given a statistics object on three columns (a,b,c), this module estimates
12 * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
13 * estimates are already available in pg_statistic.
14 *
15 *
16 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 * IDENTIFICATION
20 * src/backend/statistics/mvdistinct.c
21 *
22 *-------------------------------------------------------------------------
23 */
24#include "postgres.h"
25
26#include <math.h>
27
28#include "access/htup_details.h"
29#include "catalog/pg_statistic_ext.h"
30#include "catalog/pg_statistic_ext_data.h"
31#include "utils/fmgrprotos.h"
32#include "utils/lsyscache.h"
33#include "lib/stringinfo.h"
34#include "utils/syscache.h"
35#include "utils/typcache.h"
36#include "statistics/extended_stats_internal.h"
37#include "statistics/statistics.h"
38
39
40static double ndistinct_for_combination(double totalrows, int numrows,
41 HeapTuple *rows, VacAttrStats **stats,
42 int k, int *combination);
43static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
44static int n_choose_k(int n, int k);
45static int num_combinations(int n);
46
47/* size of the struct header fields (magic, type, nitems) */
48#define SizeOfHeader (3 * sizeof(uint32))
49
50/* size of a serialized ndistinct item (coefficient, natts, atts) */
51#define SizeOfItem(natts) \
52 (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
53
54/* minimal size of a ndistinct item (with two attributes) */
55#define MinSizeOfItem SizeOfItem(2)
56
57/* minimal size of mvndistinct, when all items are minimal */
58#define MinSizeOfItems(nitems) \
59 (SizeOfHeader + (nitems) * MinSizeOfItem)
60
61/* Combination generator API */
62
63/* internal state for generator of k-combinations of n elements */
64typedef struct CombinationGenerator
65{
66 int k; /* size of the combination */
67 int n; /* total number of elements */
68 int current; /* index of the next combination to return */
69 int ncombinations; /* number of combinations (size of array) */
70 int *combinations; /* array of pre-built combinations */
71} CombinationGenerator;
72
73static CombinationGenerator *generator_init(int n, int k);
74static void generator_free(CombinationGenerator *state);
75static int *generator_next(CombinationGenerator *state);
76static void generate_combinations(CombinationGenerator *state);
77
78
79/*
80 * statext_ndistinct_build
81 * Compute ndistinct coefficient for the combination of attributes.
82 *
83 * This computes the ndistinct estimate using the same estimator used
84 * in analyze.c and then computes the coefficient.
85 */
86MVNDistinct *
87statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
88 Bitmapset *attrs, VacAttrStats **stats)
89{
90 MVNDistinct *result;
91 int k;
92 int itemcnt;
93 int numattrs = bms_num_members(attrs);
94 int numcombs = num_combinations(numattrs);
95
96 result = palloc(offsetof(MVNDistinct, items) +
97 numcombs * sizeof(MVNDistinctItem));
98 result->magic = STATS_NDISTINCT_MAGIC;
99 result->type = STATS_NDISTINCT_TYPE_BASIC;
100 result->nitems = numcombs;
101
102 itemcnt = 0;
103 for (k = 2; k <= numattrs; k++)
104 {
105 int *combination;
106 CombinationGenerator *generator;
107
108 /* generate combinations of K out of N elements */
109 generator = generator_init(numattrs, k);
110
111 while ((combination = generator_next(generator)))
112 {
113 MVNDistinctItem *item = &result->items[itemcnt];
114 int j;
115
116 item->attrs = NULL;
117 for (j = 0; j < k; j++)
118 item->attrs = bms_add_member(item->attrs,
119 stats[combination[j]]->attr->attnum);
120 item->ndistinct =
121 ndistinct_for_combination(totalrows, numrows, rows,
122 stats, k, combination);
123
124 itemcnt++;
125 Assert(itemcnt <= result->nitems);
126 }
127
128 generator_free(generator);
129 }
130
131 /* must consume exactly the whole output array */
132 Assert(itemcnt == result->nitems);
133
134 return result;
135}
136
137/*
138 * statext_ndistinct_load
139 * Load the ndistinct value for the indicated pg_statistic_ext tuple
140 */
141MVNDistinct *
142statext_ndistinct_load(Oid mvoid)
143{
144 MVNDistinct *result;
145 bool isnull;
146 Datum ndist;
147 HeapTuple htup;
148
149 htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
150 if (!HeapTupleIsValid(htup))
151 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
152
153 ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
154 Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
155 if (isnull)
156 elog(ERROR,
157 "requested statistic kind \"%c\" is not yet built for statistics object %u",
158 STATS_EXT_NDISTINCT, mvoid);
159
160 result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist));
161
162 ReleaseSysCache(htup);
163
164 return result;
165}
166
167/*
168 * statext_ndistinct_serialize
169 * serialize ndistinct to the on-disk bytea format
170 */
171bytea *
172statext_ndistinct_serialize(MVNDistinct *ndistinct)
173{
174 int i;
175 bytea *output;
176 char *tmp;
177 Size len;
178
179 Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
180 Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC);
181
182 /*
183 * Base size is size of scalar fields in the struct, plus one base struct
184 * for each item, including number of items for each.
185 */
186 len = VARHDRSZ + SizeOfHeader;
187
188 /* and also include space for the actual attribute numbers */
189 for (i = 0; i < ndistinct->nitems; i++)
190 {
191 int nmembers;
192
193 nmembers = bms_num_members(ndistinct->items[i].attrs);
194 Assert(nmembers >= 2);
195
196 len += SizeOfItem(nmembers);
197 }
198
199 output = (bytea *) palloc(len);
200 SET_VARSIZE(output, len);
201
202 tmp = VARDATA(output);
203
204 /* Store the base struct values (magic, type, nitems) */
205 memcpy(tmp, &ndistinct->magic, sizeof(uint32));
206 tmp += sizeof(uint32);
207 memcpy(tmp, &ndistinct->type, sizeof(uint32));
208 tmp += sizeof(uint32);
209 memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
210 tmp += sizeof(uint32);
211
212 /*
213 * store number of attributes and attribute numbers for each entry
214 */
215 for (i = 0; i < ndistinct->nitems; i++)
216 {
217 MVNDistinctItem item = ndistinct->items[i];
218 int nmembers = bms_num_members(item.attrs);
219 int x;
220
221 memcpy(tmp, &item.ndistinct, sizeof(double));
222 tmp += sizeof(double);
223 memcpy(tmp, &nmembers, sizeof(int));
224 tmp += sizeof(int);
225
226 x = -1;
227 while ((x = bms_next_member(item.attrs, x)) >= 0)
228 {
229 AttrNumber value = (AttrNumber) x;
230
231 memcpy(tmp, &value, sizeof(AttrNumber));
232 tmp += sizeof(AttrNumber);
233 }
234
235 /* protect against overflows */
236 Assert(tmp <= ((char *) output + len));
237 }
238
239 /* check we used exactly the expected space */
240 Assert(tmp == ((char *) output + len));
241
242 return output;
243}
244
245/*
246 * statext_ndistinct_deserialize
247 * Read an on-disk bytea format MVNDistinct to in-memory format
248 */
249MVNDistinct *
250statext_ndistinct_deserialize(bytea *data)
251{
252 int i;
253 Size minimum_size;
254 MVNDistinct ndist;
255 MVNDistinct *ndistinct;
256 char *tmp;
257
258 if (data == NULL)
259 return NULL;
260
261 /* we expect at least the basic fields of MVNDistinct struct */
262 if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
263 elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)",
264 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
265
266 /* initialize pointer to the data part (skip the varlena header) */
267 tmp = VARDATA_ANY(data);
268
269 /* read the header fields and perform basic sanity checks */
270 memcpy(&ndist.magic, tmp, sizeof(uint32));
271 tmp += sizeof(uint32);
272 memcpy(&ndist.type, tmp, sizeof(uint32));
273 tmp += sizeof(uint32);
274 memcpy(&ndist.nitems, tmp, sizeof(uint32));
275 tmp += sizeof(uint32);
276
277 if (ndist.magic != STATS_NDISTINCT_MAGIC)
278 elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
279 ndist.magic, STATS_NDISTINCT_MAGIC);
280 if (ndist.type != STATS_NDISTINCT_TYPE_BASIC)
281 elog(ERROR, "invalid ndistinct type %d (expected %d)",
282 ndist.type, STATS_NDISTINCT_TYPE_BASIC);
283 if (ndist.nitems == 0)
284 elog(ERROR, "invalid zero-length item array in MVNDistinct");
285
286 /* what minimum bytea size do we expect for those parameters */
287 minimum_size = MinSizeOfItems(ndist.nitems);
288 if (VARSIZE_ANY_EXHDR(data) < minimum_size)
289 elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)",
290 VARSIZE_ANY_EXHDR(data), minimum_size);
291
292 /*
293 * Allocate space for the ndistinct items (no space for each item's
294 * attnos: those live in bitmapsets allocated separately)
295 */
296 ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
297 (ndist.nitems * sizeof(MVNDistinctItem)));
298 ndistinct->magic = ndist.magic;
299 ndistinct->type = ndist.type;
300 ndistinct->nitems = ndist.nitems;
301
302 for (i = 0; i < ndistinct->nitems; i++)
303 {
304 MVNDistinctItem *item = &ndistinct->items[i];
305 int nelems;
306
307 item->attrs = NULL;
308
309 /* ndistinct value */
310 memcpy(&item->ndistinct, tmp, sizeof(double));
311 tmp += sizeof(double);
312
313 /* number of attributes */
314 memcpy(&nelems, tmp, sizeof(int));
315 tmp += sizeof(int);
316 Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS));
317
318 while (nelems-- > 0)
319 {
320 AttrNumber attno;
321
322 memcpy(&attno, tmp, sizeof(AttrNumber));
323 tmp += sizeof(AttrNumber);
324 item->attrs = bms_add_member(item->attrs, attno);
325 }
326
327 /* still within the bytea */
328 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
329 }
330
331 /* we should have consumed the whole bytea exactly */
332 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
333
334 return ndistinct;
335}
336
337/*
338 * pg_ndistinct_in
339 * input routine for type pg_ndistinct
340 *
341 * pg_ndistinct is real enough to be a table column, but it has no
342 * operations of its own, and disallows input (jus like pg_node_tree).
343 */
344Datum
345pg_ndistinct_in(PG_FUNCTION_ARGS)
346{
347 ereport(ERROR,
348 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
349 errmsg("cannot accept a value of type %s", "pg_ndistinct")));
350
351 PG_RETURN_VOID(); /* keep compiler quiet */
352}
353
354/*
355 * pg_ndistinct
356 * output routine for type pg_ndistinct
357 *
358 * Produces a human-readable representation of the value.
359 */
360Datum
361pg_ndistinct_out(PG_FUNCTION_ARGS)
362{
363 bytea *data = PG_GETARG_BYTEA_PP(0);
364 MVNDistinct *ndist = statext_ndistinct_deserialize(data);
365 int i;
366 StringInfoData str;
367
368 initStringInfo(&str);
369 appendStringInfoChar(&str, '{');
370
371 for (i = 0; i < ndist->nitems; i++)
372 {
373 MVNDistinctItem item = ndist->items[i];
374 int x = -1;
375 bool first = true;
376
377 if (i > 0)
378 appendStringInfoString(&str, ", ");
379
380 while ((x = bms_next_member(item.attrs, x)) >= 0)
381 {
382 appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x);
383 first = false;
384 }
385 appendStringInfo(&str, "\": %d", (int) item.ndistinct);
386 }
387
388 appendStringInfoChar(&str, '}');
389
390 PG_RETURN_CSTRING(str.data);
391}
392
393/*
394 * pg_ndistinct_recv
395 * binary input routine for type pg_ndistinct
396 */
397Datum
398pg_ndistinct_recv(PG_FUNCTION_ARGS)
399{
400 ereport(ERROR,
401 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
402 errmsg("cannot accept a value of type %s", "pg_ndistinct")));
403
404 PG_RETURN_VOID(); /* keep compiler quiet */
405}
406
407/*
408 * pg_ndistinct_send
409 * binary output routine for type pg_ndistinct
410 *
411 * n-distinct is serialized into a bytea value, so let's send that.
412 */
413Datum
414pg_ndistinct_send(PG_FUNCTION_ARGS)
415{
416 return byteasend(fcinfo);
417}
418
419/*
420 * ndistinct_for_combination
421 * Estimates number of distinct values in a combination of columns.
422 *
423 * This uses the same ndistinct estimator as compute_scalar_stats() in
424 * ANALYZE, i.e.,
425 * n*d / (n - f1 + f1*n/N)
426 *
427 * except that instead of values in a single column we are dealing with
428 * combination of multiple columns.
429 */
430static double
431ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
432 VacAttrStats **stats, int k, int *combination)
433{
434 int i,
435 j;
436 int f1,
437 cnt,
438 d;
439 bool *isnull;
440 Datum *values;
441 SortItem *items;
442 MultiSortSupport mss;
443
444 mss = multi_sort_init(k);
445
446 /*
447 * In order to determine the number of distinct elements, create separate
448 * values[]/isnull[] arrays with all the data we have, then sort them
449 * using the specified column combination as dimensions. We could try to
450 * sort in place, but it'd probably be more complex and bug-prone.
451 */
452 items = (SortItem *) palloc(numrows * sizeof(SortItem));
453 values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
454 isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
455
456 for (i = 0; i < numrows; i++)
457 {
458 items[i].values = &values[i * k];
459 items[i].isnull = &isnull[i * k];
460 }
461
462 /*
463 * For each dimension, set up sort-support and fill in the values from the
464 * sample data.
465 *
466 * We use the column data types' default sort operators and collations;
467 * perhaps at some point it'd be worth using column-specific collations?
468 */
469 for (i = 0; i < k; i++)
470 {
471 VacAttrStats *colstat = stats[combination[i]];
472 TypeCacheEntry *type;
473
474 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
475 if (type->lt_opr == InvalidOid) /* shouldn't happen */
476 elog(ERROR, "cache lookup failed for ordering operator for type %u",
477 colstat->attrtypid);
478
479 /* prepare the sort function for this dimension */
480 multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
481
482 /* accumulate all the data for this dimension into the arrays */
483 for (j = 0; j < numrows; j++)
484 {
485 items[j].values[i] =
486 heap_getattr(rows[j],
487 colstat->attr->attnum,
488 colstat->tupDesc,
489 &items[j].isnull[i]);
490 }
491 }
492
493 /* We can sort the array now ... */
494 qsort_arg((void *) items, numrows, sizeof(SortItem),
495 multi_sort_compare, mss);
496
497 /* ... and count the number of distinct combinations */
498
499 f1 = 0;
500 cnt = 1;
501 d = 1;
502 for (i = 1; i < numrows; i++)
503 {
504 if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
505 {
506 if (cnt == 1)
507 f1 += 1;
508
509 d++;
510 cnt = 0;
511 }
512
513 cnt += 1;
514 }
515
516 if (cnt == 1)
517 f1 += 1;
518
519 return estimate_ndistinct(totalrows, numrows, d, f1);
520}
521
522/* The Duj1 estimator (already used in analyze.c). */
523static double
524estimate_ndistinct(double totalrows, int numrows, int d, int f1)
525{
526 double numer,
527 denom,
528 ndistinct;
529
530 numer = (double) numrows * (double) d;
531
532 denom = (double) (numrows - f1) +
533 (double) f1 * (double) numrows / totalrows;
534
535 ndistinct = numer / denom;
536
537 /* Clamp to sane range in case of roundoff error */
538 if (ndistinct < (double) d)
539 ndistinct = (double) d;
540
541 if (ndistinct > totalrows)
542 ndistinct = totalrows;
543
544 return floor(ndistinct + 0.5);
545}
546
547/*
548 * n_choose_k
549 * computes binomial coefficients using an algorithm that is both
550 * efficient and prevents overflows
551 */
552static int
553n_choose_k(int n, int k)
554{
555 int d,
556 r;
557
558 Assert((k > 0) && (n >= k));
559
560 /* use symmetry of the binomial coefficients */
561 k = Min(k, n - k);
562
563 r = 1;
564 for (d = 1; d <= k; ++d)
565 {
566 r *= n--;
567 r /= d;
568 }
569
570 return r;
571}
572
573/*
574 * num_combinations
575 * number of combinations, excluding single-value combinations
576 */
577static int
578num_combinations(int n)
579{
580 int k;
581 int ncombs = 1;
582
583 for (k = 1; k <= n; k++)
584 ncombs *= 2;
585
586 ncombs -= (n + 1);
587
588 return ncombs;
589}
590
591/*
592 * generator_init
593 * initialize the generator of combinations
594 *
595 * The generator produces combinations of K elements in the interval (0..N).
596 * We prebuild all the combinations in this method, which is simpler than
597 * generating them on the fly.
598 */
599static CombinationGenerator *
600generator_init(int n, int k)
601{
602 CombinationGenerator *state;
603
604 Assert((n >= k) && (k > 0));
605
606 /* allocate the generator state as a single chunk of memory */
607 state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator));
608
609 state->ncombinations = n_choose_k(n, k);
610
611 /* pre-allocate space for all combinations */
612 state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
613
614 state->current = 0;
615 state->k = k;
616 state->n = n;
617
618 /* now actually pre-generate all the combinations of K elements */
619 generate_combinations(state);
620
621 /* make sure we got the expected number of combinations */
622 Assert(state->current == state->ncombinations);
623
624 /* reset the number, so we start with the first one */
625 state->current = 0;
626
627 return state;
628}
629
630/*
631 * generator_next
632 * returns the next combination from the prebuilt list
633 *
634 * Returns a combination of K array indexes (0 .. N), as specified to
635 * generator_init), or NULL when there are no more combination.
636 */
637static int *
638generator_next(CombinationGenerator *state)
639{
640 if (state->current == state->ncombinations)
641 return NULL;
642
643 return &state->combinations[state->k * state->current++];
644}
645
646/*
647 * generator_free
648 * free the internal state of the generator
649 *
650 * Releases the generator internal state (pre-built combinations).
651 */
652static void
653generator_free(CombinationGenerator *state)
654{
655 pfree(state->combinations);
656 pfree(state);
657}
658
659/*
660 * generate_combinations_recurse
661 * given a prefix, generate all possible combinations
662 *
663 * Given a prefix (first few elements of the combination), generate following
664 * elements recursively. We generate the combinations in lexicographic order,
665 * which eliminates permutations of the same combination.
666 */
667static void
668generate_combinations_recurse(CombinationGenerator *state,
669 int index, int start, int *current)
670{
671 /* If we haven't filled all the elements, simply recurse. */
672 if (index < state->k)
673 {
674 int i;
675
676 /*
677 * The values have to be in ascending order, so make sure we start
678 * with the value passed by parameter.
679 */
680
681 for (i = start; i < state->n; i++)
682 {
683 current[index] = i;
684 generate_combinations_recurse(state, (index + 1), (i + 1), current);
685 }
686
687 return;
688 }
689 else
690 {
691 /* we got a valid combination, add it to the array */
692 memcpy(&state->combinations[(state->k * state->current)],
693 current, state->k * sizeof(int));
694 state->current++;
695 }
696}
697
698/*
699 * generate_combinations
700 * generate all k-combinations of N elements
701 */
702static void
703generate_combinations(CombinationGenerator *state)
704{
705 int *current = (int *) palloc0(sizeof(int) * state->k);
706
707 generate_combinations_recurse(state, 0, 0, current);
708
709 pfree(current);
710}
711