mcv.c source code [PostgreSQL/src/backend/statistics/mcv.c]

1	/-------------------------------------------------------------------------*
2	*
3	* mcv.c
4	* POSTGRES multivariate MCV lists
5	*
6	*
7	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8	* Portions Copyright (c) 1994, Regents of the University of California
9	*
10	* IDENTIFICATION
11	* src/backend/statistics/mcv.c
12	*
13	*-------------------------------------------------------------------------
14	*/
15	#include "postgres.h"
16
17	#include <math.h>
18
19	#include "access/htup_details.h"
20	#include "catalog/pg_collation.h"
21	#include "catalog/pg_statistic_ext.h"
22	#include "catalog/pg_statistic_ext_data.h"
23	#include "fmgr.h"
24	#include "funcapi.h"
25	#include "nodes/nodeFuncs.h"
26	#include "optimizer/clauses.h"
27	#include "statistics/extended_stats_internal.h"
28	#include "statistics/statistics.h"
29	#include "utils/builtins.h"
30	#include "utils/bytea.h"
31	#include "utils/fmgroids.h"
32	#include "utils/fmgrprotos.h"
33	#include "utils/lsyscache.h"
34	#include "utils/syscache.h"
35	#include "utils/typcache.h"
36
37	/*
38	* Computes size of a serialized MCV item, depending on the number of
39	* dimensions (columns) the statistic is defined on. The datum values are
40	* stored in a separate array (deduplicated, to minimize the size), and
41	* so the serialized items only store uint16 indexes into that array.
42	*
43	* Each serialized item stores (in this order):
44	*
45	* - indexes to values (ndim * sizeof(uint16))
46	* - null flags (ndim * sizeof(bool))
47	* - frequency (sizeof(double))
48	* - base_frequency (sizeof(double))
49	*
50	* There is no alignment padding within an MCV item.
51	* So in total each MCV item requires this many bytes:
52	*
53	* ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
54	*/
55	#define ITEM_SIZE(ndims) \
56	((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
57
58	/*
59	* Used to compute size of serialized MCV list representation.
60	*/
61	#define MinSizeOfMCVList \
62	(VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
63
64	/*
65	* Size of the serialized MCV list, excluding the space needed for
66	* deduplicated per-dimension values. The macro is meant to be used
67	* when it's not yet safe to access the serialized info about amount
68	* of data for each column.
69	*/
70	#define SizeOfMCVList(ndims,nitems) \
71	((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
72	((ndims) * sizeof(DimensionInfo)) + \
73	((nitems) * ITEM_SIZE(ndims)))
74
75	static MultiSortSupport build_mss(VacAttrStats *stats, int* numattrs);
76
77	static SortItem build_distinct_groups(int* numrows, SortItem *items,
78	MultiSortSupport mss, int *ndistinct);
79
80	static SortItem *build_column_frequencies(SortItem groups, int ngroups,
81	MultiSortSupport mss, int *ncounts);
82
83	static int count_distinct_groups(int numrows, SortItem *items,
84	MultiSortSupport mss);
85
86	/*
87	* Compute new value for bitmap item, considering whether it's used for
88	* clauses connected by AND/OR.
89	*/
90	#define RESULT_MERGE(value, is_or, match) \
91	((is_or) ? ((value) \|\| (match)) : ((value) && (match)))
92
93	/*
94	* When processing a list of clauses, the bitmap item may get set to a value
95	* such that additional clauses can't change it. For example, when processing
96	* a list of clauses connected to AND, as soon as the item gets set to 'false'
97	* then it'll remain like that. Similarly clauses connected by OR and 'true'.
98	*
99	* Returns true when the value in the bitmap can't change no matter how the
100	* remaining clauses are evaluated.
101	*/
102	#define RESULT_IS_FINAL(value, is_or) ((is_or) ? (value) : (!(value)))
103
104	/*
105	* get_mincount_for_mcv_list
106	* Determine the minimum number of times a value needs to appear in
107	* the sample for it to be included in the MCV list.
108	*
109	* We want to keep only values that appear sufficiently often in the
110	* sample that it is reasonable to extrapolate their sample frequencies to
111	* the entire table. We do this by placing an upper bound on the relative
112	* standard error of the sample frequency, so that any estimates the
113	* planner generates from the MCV statistics can be expected to be
114	* reasonably accurate.
115	*
116	* Since we are sampling without replacement, the sample frequency of a
117	* particular value is described by a hypergeometric distribution. A
118	* common rule of thumb when estimating errors in this situation is to
119	* require at least 10 instances of the value in the sample, in which case
120	* the distribution can be approximated by a normal distribution, and
121	* standard error analysis techniques can be applied. Given a sample size
122	* of n, a population size of N, and a sample frequency of p=cnt/n, the
123	* standard error of the proportion p is given by
124	* SE = sqrt(p(1-p)/n) sqrt((N-n)/(N-1))
125	* where the second term is the finite population correction. To get
126	* reasonably accurate planner estimates, we impose an upper bound on the
127	* relative standard error of 20% -- i.e., SE/p < 0.2. This 20% relative
128	* error bound is fairly arbitrary, but has been found empirically to work
129	* well. Rearranging this formula gives a lower bound on the number of
130	* instances of the value seen:
131	* cnt > n(N-n) / (N-n+0.04n*(N-1))
132	* This bound is at most 25, and approaches 0 as n approaches 0 or N. The
133	* case where n approaches 0 cannot happen in practice, since the sample
134	* size is at least 300. The case where n approaches N corresponds to
135	* sampling the whole the table, in which case it is reasonable to keep
136	* the whole MCV list (have no lower bound), so it makes sense to apply
137	* this formula for all inputs, even though the above derivation is
138	* technically only valid when the right hand side is at least around 10.
139	*
140	* An alternative way to look at this formula is as follows -- assume that
141	* the number of instances of the value seen scales up to the entire
142	* table, so that the population count is K=N*cnt/n. Then the distribution
143	* in the sample is a hypergeometric distribution parameterised by N, n
144	* and K, and the bound above is mathematically equivalent to demanding
145	* that the standard deviation of that distribution is less than 20% of
146	* its mean. Thus the relative errors in any planner estimates produced
147	* from the MCV statistics are likely to be not too large.
148	*/
149	static double
150	get_mincount_for_mcv_list(int samplerows, double totalrows)
151	{
152	double n = samplerows;
153	double N = totalrows;
154	double numer,
155	denom;
156
157	numer = n * (N - n);
158	denom = N - n + `0.04` * n * (N - `1`);
159
160	/ Guard against division by zero (possible if n = N = 1) /
161	if (denom == `0.0`)
162	return `0.0`;
163
164	return numer / denom;
165	}
166
167	/*
168	* Builds MCV list from the set of sampled rows.
169	*
170	* The algorithm is quite simple:
171	*
172	* (1) sort the data (default collation, '<' for the data type)
173	*
174	* (2) count distinct groups, decide how many to keep
175	*
176	* (3) build the MCV list using the threshold determined in (2)
177	*
178	* (4) remove rows represented by the MCV from the sample
179	*
180	*/
181	MCVList *
182	statext_mcv_build(int numrows, HeapTuple rows, Bitmapset attrs,
183	VacAttrStats *stats, double* totalrows)
184	{
185	int i,
186	numattrs,
187	ngroups,
188	nitems;
189	AttrNumber *attnums;
190	double mincount;
191	SortItem *items;
192	SortItem *groups;
193	MCVList *mcvlist = NULL;
194	MultiSortSupport mss;
195
196	attnums = build_attnums_array(attrs, &numattrs);
197
198	/ comparator for all the columns /
199	mss = build_mss(stats, numattrs);
200
201	/ sort the rows /
202	items = build_sorted_items(numrows, &nitems, rows, stats[`0`]->tupDesc,
203	mss, numattrs, attnums);
204
205	if (!items)
206	return NULL;
207
208	/ transform the sorted rows into groups (sorted by frequency) /
209	groups = build_distinct_groups(nitems, items, mss, &ngroups);
210
211	/*
212	* Maximum number of MCV items to store, based on the attribute with the
213	* largest stats target (and the number of groups we have available).
214	*/
215	nitems = stats[`0`]->attr->attstattarget;
216	for (i = `1`; i < numattrs; i++)
217	{
218	if (stats[i]->attr->attstattarget > nitems)
219	nitems = stats[i]->attr->attstattarget;
220	}
221	if (nitems > ngroups)
222	nitems = ngroups;
223
224	/*
225	* Decide how many items to keep in the MCV list. We can't use the same
226	* algorithm as per-column MCV lists, because that only considers the
227	* actual group frequency - but we're primarily interested in how the
228	* actual frequency differs from the base frequency (product of simple
229	* per-column frequencies, as if the columns were independent).
230	*
231	* Using the same algorithm might exclude items that are close to the
232	* "average" frequency of the sample. But that does not say whether the
233	* observed frequency is close to the base frequency or not. We also need
234	* to consider unexpectedly uncommon items (again, compared to the base
235	* frequency), and the single-column algorithm does not have to.
236	*
237	* We simply decide how many items to keep by computing minimum count
238	* using get_mincount_for_mcv_list() and then keep all items that seem to
239	* be more common than that.
240	*/
241	mincount = get_mincount_for_mcv_list(numrows, totalrows);
242
243	/*
244	* Walk the groups until we find the first group with a count below the
245	* mincount threshold (the index of that group is the number of groups we
246	* want to keep).
247	*/
248	for (i = `0`; i < nitems; i++)
249	{
250	if (groups[i].count < mincount)
251	{
252	nitems = i;
253	break;
254	}
255	}
256
257	/*
258	* At this point we know the number of items for the MCV list. There might
259	* be none (for uniform distribution with many groups), and in that case
260	* there will be no MCV list. Otherwise construct the MCV list.
261	*/
262	if (nitems > `0`)
263	{
264	int j;
265	SortItem key;
266	MultiSortSupport tmp;
267
268	/ frequencies for values in each attribute /
269	SortItem **freqs;
270	int *nfreqs;
271
272	/ used to search values /
273	tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
274	+ sizeof(SortSupportData));
275
276	/ compute frequencies for values in each column /
277	nfreqs = (int ) palloc0(sizeof(int) numattrs);
278	freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
279
280	/*
281	* Allocate the MCV list structure, set the global parameters.
282	*/
283	mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
284	sizeof(MCVItem) * nitems);
285
286	mcvlist->magic = STATS_MCV_MAGIC;
287	mcvlist->type = STATS_MCV_TYPE_BASIC;
288	mcvlist->ndimensions = numattrs;
289	mcvlist->nitems = nitems;
290
291	/ store info about data type OIDs /
292	for (i = `0`; i < numattrs; i++)
293	mcvlist->types[i] = stats[i]->attrtypid;
294
295	/ Copy the first chunk of groups into the result. /
296	for (i = `0`; i < nitems; i++)
297	{
298	/ just pointer to the proper place in the list /
299	MCVItem *item = &mcvlist->items[i];
300
301	item->values = (Datum ) palloc(sizeof(Datum) numattrs);
302	item->isnull = (bool ) palloc(sizeof(bool) numattrs);
303
304	/ copy values for the group /
305	memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
306	memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
307
308	/ groups should be sorted by frequency in descending order /
309	Assert((i == `0`) \|\| (groups[i - `1`].count >= groups[i].count));
310
311	/ group frequency /
312	item->frequency = (double) groups[i].count / numrows;
313
314	/ base frequency, if the attributes were independent /
315	item->base_frequency = `1.0`;
316	for (j = `0`; j < numattrs; j++)
317	{
318	SortItem *freq;
319
320	/ single dimension /
321	tmp->ndims = `1`;
322	tmp->ssup[`0`] = mss->ssup[j];
323
324	/ fill search key /
325	key.values = &groups[i].values[j];
326	key.isnull = &groups[i].isnull[j];
327
328	freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
329	sizeof(SortItem),
330	multi_sort_compare, tmp);
331
332	item->base_frequency = ((double*) freq->count) / numrows;
333	}
334	}
335
336	pfree(nfreqs);
337	pfree(freqs);
338	}
339
340	pfree(items);
341	pfree(groups);
342
343	return mcvlist;
344	}
345
346	/*
347	* build_mss
348	* build MultiSortSupport for the attributes passed in attrs
349	*/
350	static MultiSortSupport
351	build_mss(VacAttrStats *stats, int* numattrs)
352	{
353	int i;
354
355	/ Sort by multiple columns (using array of SortSupport) /
356	MultiSortSupport mss = multi_sort_init(numattrs);
357
358	/ prepare the sort functions for all the attributes /
359	for (i = `0`; i < numattrs; i++)
360	{
361	VacAttrStats *colstat = stats[i];
362	TypeCacheEntry *type;
363
364	type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
365	if (type->lt_opr == InvalidOid) / shouldn't happen /
366	elog(ERROR, "cache lookup failed for ordering operator for type %u",
367	colstat->attrtypid);
368
369	multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
370	}
371
372	return mss;
373	}
374
375	/*
376	* count_distinct_groups
377	* count distinct combinations of SortItems in the array
378	*
379	* The array is assumed to be sorted according to the MultiSortSupport.
380	*/
381	static int
382	count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
383	{
384	int i;
385	int ndistinct;
386
387	ndistinct = `1`;
388	for (i = `1`; i < numrows; i++)
389	{
390	/ make sure the array really is sorted /
391	Assert(multi_sort_compare(&items[i], &items[i - `1`], mss) >= `0`);
392
393	if (multi_sort_compare(&items[i], &items[i - `1`], mss) != `0`)
394	ndistinct += `1`;
395	}
396
397	return ndistinct;
398	}
399
400	/*
401	* compare_sort_item_count
402	* comparator for sorting items by count (frequencies) in descending order
403	*/
404	static int
405	compare_sort_item_count(const void a, const* void *b)
406	{
407	SortItem ia = (SortItem ) a;
408	SortItem ib = (SortItem ) b;
409
410	if (ia->count == ib->count)
411	return `0`;
412	else if (ia->count > ib->count)
413	return -`1`;
414
415	return `1`;
416	}
417
418	/*
419	* build_distinct_groups
420	* build an array of SortItems for distinct groups and counts matching items
421	*
422	* The input array is assumed to be sorted
423	*/
424	static SortItem *
425	build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
426	int *ndistinct)
427	{
428	int i,
429	j;
430	int ngroups = count_distinct_groups(numrows, items, mss);
431
432	SortItem groups = (SortItem ) palloc(ngroups * sizeof(SortItem));
433
434	j = `0`;
435	groups[`0`] = items[`0`];
436	groups[`0`].count = `1`;
437
438	for (i = `1`; i < numrows; i++)
439	{
440	/ Assume sorted in ascending order. /
441	Assert(multi_sort_compare(&items[i], &items[i - `1`], mss) >= `0`);
442
443	/ New distinct group detected. /
444	if (multi_sort_compare(&items[i], &items[i - `1`], mss) != `0`)
445	{
446	groups[++j] = items[i];
447	groups[j].count = `0`;
448	}
449
450	groups[j].count++;
451	}
452
453	/ ensure we filled the expected number of distinct groups /
454	Assert(j + `1` == ngroups);
455
456	/ Sort the distinct groups by frequency (in descending order). /
457	pg_qsort((void ) groups, ngroups, sizeof*(SortItem),
458	compare_sort_item_count);
459
460	*ndistinct = ngroups;
461	return groups;
462	}
463
464	/ compare sort items (single dimension) /
465	static int
466	sort_item_compare(const void a, const* void b, void* *arg)
467	{
468	SortSupport ssup = (SortSupport) arg;
469	SortItem ia = (SortItem ) a;
470	SortItem ib = (SortItem ) b;
471
472	return ApplySortComparator(ia->values[`0`], ia->isnull[`0`],
473	ib->values[`0`], ib->isnull[`0`],
474	ssup);
475	}
476
477	/*
478	* build_column_frequencies
479	* compute frequencies of values in each column
480	*
481	* This returns an array of SortItems for each attibute the MCV is built
482	* on, with a frequency (number of occurrences) for each value. This is
483	* then used to compute "base" frequency of MCV items.
484	*
485	* All the memory is allocated in a single chunk, so that a single pfree
486	* is enough to release it. We do not allocate space for values/isnull
487	* arrays in the SortItems, because we can simply point into the input
488	* groups directly.
489	*/
490	static SortItem **
491	build_column_frequencies(SortItem groups, int* ngroups,
492	MultiSortSupport mss, int *ncounts)
493	{
494	int i,
495	dim;
496	SortItem **result;
497	char *ptr;
498
499	Assert(groups);
500	Assert(ncounts);
501
502	/ allocate arrays for all columns as a single chunk /
503	ptr = palloc(MAXALIGN(sizeof(SortItem ) mss->ndims) +
504	mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
505
506	/ initial array of pointers /
507	result = (SortItem **) ptr;
508	ptr += MAXALIGN(sizeof(SortItem ) mss->ndims);
509
510	for (dim = `0`; dim < mss->ndims; dim++)
511	{
512	SortSupport ssup = &mss->ssup[dim];
513
514	/ array of values for a single column /
515	result[dim] = (SortItem *) ptr;
516	ptr += MAXALIGN(sizeof(SortItem) * ngroups);
517
518	/ extract data for the dimension /
519	for (i = `0`; i < ngroups; i++)
520	{
521	/ point into the input groups /
522	result[dim][i].values = &groups[i].values[dim];
523	result[dim][i].isnull = &groups[i].isnull[dim];
524	result[dim][i].count = groups[i].count;
525	}
526
527	/ sort the values, deduplicate /
528	qsort_arg((void ) result[dim], ngroups, sizeof*(SortItem),
529	sort_item_compare, ssup);
530
531	/*
532	* Identify distinct values, compute frequency (there might be
533	* multiple MCV items containing this value, so we need to sum
534	* counts from all of them.
535	*/
536	ncounts[dim] = `1`;
537	for (i = `1`; i < ngroups; i++)
538	{
539	if (sort_item_compare(&result[dim][i-`1`], &result[dim][i], ssup) == `0`)
540	{
541	result[dim][ncounts[dim]-`1`].count += result[dim][i].count;
542	continue;
543	}
544
545	result[dim][ncounts[dim]] = result[dim][i];
546
547	ncounts[dim]++;
548	}
549	}
550
551	return result;
552	}
553
554	/*
555	* statext_mcv_load
556	* Load the MCV list for the indicated pg_statistic_ext tuple
557	*/
558	MCVList *
559	statext_mcv_load(Oid mvoid)
560	{
561	MCVList *result;
562	bool isnull;
563	Datum mcvlist;
564	HeapTuple htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
565
566	if (!HeapTupleIsValid(htup))
567	elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
568
569	mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
570	Anum_pg_statistic_ext_data_stxdmcv, &isnull);
571
572	if (isnull)
573	elog(ERROR,
574	"requested statistic kind \"%c\" is not yet built for statistics object %u",
575	STATS_EXT_DEPENDENCIES, mvoid);
576
577	result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
578
579	ReleaseSysCache(htup);
580
581	return result;
582	}
583
584
585	/*
586	* statext_mcv_serialize
587	* Serialize MCV list into a pg_mcv_list value.
588	*
589	* The MCV items may include values of various data types, and it's reasonable
590	* to expect redundancy (values for a given attribute, repeated for multiple
591	* MCV list items). So we deduplicate the values into arrays, and then replace
592	* the values by indexes into those arrays.
593	*
594	* The overall structure of the serialized representation looks like this:
595	*
596	* +---------------+----------------+---------------------+-------+
597	* \| header fields \| dimension info \| deduplicated values \| items \|
598	* +---------------+----------------+---------------------+-------+
599	*
600	* Where dimension info stores information about type of K-th attribute (e.g.
601	* typlen, typbyval and length of deduplicated values). Deduplicated values
602	* store deduplicated values for each attribute. And items store the actual
603	* MCV list items, with values replaced by indexes into the arrays.
604	*
605	* When serializing the items, we use uint16 indexes. The number of MCV items
606	* is limited by the statistics target (which is capped to 10k at the moment).
607	* We might increase this to 65k and still fit into uint16, so there's a bit of
608	* slack. Furthermore, this limit is on the number of distinct values per column,
609	* and we usually have few of those (and various combinations of them for the
610	* those MCV list). So uint16 seems fine for now.
611	*
612	* We don't really expect the serialization to save as much space as for
613	* histograms, as we are not doing any bucket splits (which is the source
614	* of high redundancy in histograms).
615	*
616	* TODO: Consider packing boolean flags (NULL) for each item into a single char
617	* (or a longer type) instead of using an array of bool items.
618	*/
619	bytea *
620	statext_mcv_serialize(MCVList mcvlist, VacAttrStats *stats)
621	{
622	int i;
623	int dim;
624	int ndims = mcvlist->ndimensions;
625
626	SortSupport ssup;
627	DimensionInfo *info;
628
629	Size total_length;
630
631	/ serialized items (indexes into arrays, etc.) /
632	bytea *raw;
633	char *ptr;
634	char *endptr PG_USED_FOR_ASSERTS_ONLY;
635
636	/ values per dimension (and number of non-NULL values) /
637	Datum values = (Datum ) palloc0(sizeof(Datum ) ndims);
638	int counts = (int* ) palloc0(sizeof(int) ndims);
639
640	/*
641	* We'll include some rudimentary information about the attribute types
642	* (length, by-val flag), so that we don't have to look them up while
643	* deserializating the MCV list (we already have the type OID in the
644	* header). This is safe, because when changing type of the attribute the
645	* statistics gets dropped automatically. We need to store the info about
646	* the arrays of deduplicated values anyway.
647	*/
648	info = (DimensionInfo ) palloc0(sizeof(DimensionInfo) ndims);
649
650	/ sort support data for all attributes included in the MCV list /
651	ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
652
653	/ collect and deduplicate values for each dimension (attribute) /
654	for (dim = `0`; dim < ndims; dim++)
655	{
656	int ndistinct;
657	TypeCacheEntry *typentry;
658
659	/*
660	* Lookup the LT operator (can't get it from stats extra_data, as we
661	* don't know how to interpret that - scalar vs. array etc.).
662	*/
663	typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
664
665	/ copy important info about the data type (length, by-value) /
666	info[dim].typlen = stats[dim]->attrtype->typlen;
667	info[dim].typbyval = stats[dim]->attrtype->typbyval;
668
669	/ allocate space for values in the attribute and collect them /
670	values[dim] = (Datum ) palloc0(sizeof(Datum) mcvlist->nitems);
671
672	for (i = `0`; i < mcvlist->nitems; i++)
673	{
674	/ skip NULL values - we don't need to deduplicate those /
675	if (mcvlist->items[i].isnull[dim])
676	continue;
677
678	/ append the value at the end /
679	values[dim][counts[dim]] = mcvlist->items[i].values[dim];
680	counts[dim] += `1`;
681	}
682
683	/ if there are just NULL values in this dimension, we're done /
684	if (counts[dim] == `0`)
685	continue;
686
687	/ sort and deduplicate the data /
688	ssup[dim].ssup_cxt = CurrentMemoryContext;
689	ssup[dim].ssup_collation = stats[dim]->attrcollid;
690	ssup[dim].ssup_nulls_first = false;
691
692	PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
693
694	qsort_arg(values[dim], counts[dim], sizeof(Datum),
695	compare_scalars_simple, &ssup[dim]);
696
697	/*
698	* Walk through the array and eliminate duplicate values, but keep the
699	* ordering (so that we can do bsearch later). We know there's at
700	* least one item as (counts[dim] != 0), so we can skip the first
701	* element.
702	*/
703	ndistinct = `1`; / number of distinct values /
704	for (i = `1`; i < counts[dim]; i++)
705	{
706	/ expect sorted array /
707	Assert(compare_datums_simple(values[dim][i - `1`], values[dim][i], &ssup[dim]) <= `0`);
708
709	/ if the value is the same as the previous one, we can skip it /
710	if (!compare_datums_simple(values[dim][i - `1`], values[dim][i], &ssup[dim]))
711	continue;
712
713	values[dim][ndistinct] = values[dim][i];
714	ndistinct += `1`;
715	}
716
717	/ we must not exceed PG_UINT16_MAX, as we use uint16 indexes /
718	Assert(ndistinct <= PG_UINT16_MAX);
719
720	/*
721	* Store additional info about the attribute - number of deduplicated
722	* values, and also size of the serialized data. For fixed-length data
723	* types this is trivial to compute, for varwidth types we need to
724	* actually walk the array and sum the sizes.
725	*/
726	info[dim].nvalues = ndistinct;
727
728	if (info[dim].typbyval) / by-value data types /
729	{
730	info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
731
732	/*
733	* We copy the data into the MCV item during deserialization, so
734	* we don't need to allocate any extra space.
735	*/
736	info[dim].nbytes_aligned = `0`;
737	}
738	else if (info[dim].typlen > `0`) / fixed-length by-ref /
739	{
740	/*
741	* We don't care about alignment in the serialized data, so we
742	* pack the data as much as possible. But we also track how much
743	* data will be needed after deserialization, and in that case
744	* we need to account for alignment of each item.
745	*
746	* Note: As the items are fixed-length, we could easily compute
747	* this during deserialization, but we do it here anyway.
748	*/
749	info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
750	info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
751	}
752	else if (info[dim].typlen == -`1`) / varlena /
753	{
754	info[dim].nbytes = `0`;
755	info[dim].nbytes_aligned = `0`;
756	for (i = `0`; i < info[dim].nvalues; i++)
757	{
758	Size len;
759
760	/*
761	* For varlena values, we detoast the values and store the
762	* length and data separately. We don't bother with alignment
763	* here, which means that during deserialization we need to
764	* copy the fields and only access the copies.
765	*/
766	values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
767
768	/ serialized length (uint32 length + data) /
769	len = VARSIZE_ANY_EXHDR(values[dim][i]);
770	info[dim].nbytes += sizeof(uint32); / length /
771	info[dim].nbytes += len; / value (no header) /
772
773	/*
774	* During deserialization we'll build regular varlena values
775	* with full headers, and we need to align them properly.
776	*/
777	info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
778	}
779	}
780	else if (info[dim].typlen == -`2`) / cstring /
781	{
782	info[dim].nbytes = `0`;
783	info[dim].nbytes_aligned = `0`;
784	for (i = `0`; i < info[dim].nvalues; i++)
785	{
786	Size len;
787
788	/*
789	* For cstring, we do similar thing as for varlena - first we
790	* store the length as uint32 and then the data. We don't care
791	* about alignment, which means that during deserialization we
792	* need to copy the fields and only access the copies.
793	*/
794
795	/ c-strings include terminator, so +1 byte /
796	len = strlen(DatumGetCString(values[dim][i])) + `1`;
797	info[dim].nbytes += sizeof(uint32); / length /
798	info[dim].nbytes += len; / value /
799
800	/ space needed for properly aligned deserialized copies /
801	info[dim].nbytes_aligned += MAXALIGN(len);
802	}
803	}
804
805	/ we know (count>0) so there must be some data /
806	Assert(info[dim].nbytes > `0`);
807	}
808
809	/*
810	* Now we can finally compute how much space we'll actually need for the
811	* whole serialized MCV list (varlena header, MCV header, dimension info
812	* for each attribute, deduplicated values and items).
813	*/
814	total_length = (`3` * sizeof(uint32)) / magic + type + nitems /
815	+ sizeof(AttrNumber) / ndimensions /
816	+ (ndims * sizeof(Oid)); / attribute types /
817
818	/ dimension info /
819	total_length += ndims * sizeof(DimensionInfo);
820
821	/ add space for the arrays of deduplicated values /
822	for (i = `0`; i < ndims; i++)
823	total_length += info[i].nbytes;
824
825	/*
826	* And finally account for the items (those are fixed-length, thanks to
827	* replacing values with uint16 indexes into the deduplicated arrays).
828	*/
829	total_length += mcvlist->nitems * ITEM_SIZE(dim);
830
831	/*
832	* Allocate space for the whole serialized MCV list (we'll skip bytes, so
833	* we set them to zero to make the result more compressible).
834	*/
835	raw = (bytea *) palloc0(VARHDRSZ + total_length);
836	SET_VARSIZE(raw, VARHDRSZ + total_length);
837
838	ptr = VARDATA(raw);
839	endptr = ptr + total_length;
840
841	/ copy the MCV list header fields, one by one /
842	memcpy(ptr, &mcvlist->magic, sizeof(uint32));
843	ptr += sizeof(uint32);
844
845	memcpy(ptr, &mcvlist->type, sizeof(uint32));
846	ptr += sizeof(uint32);
847
848	memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
849	ptr += sizeof(uint32);
850
851	memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
852	ptr += sizeof(AttrNumber);
853
854	memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
855	ptr += (sizeof(Oid) * ndims);
856
857	/ store information about the attributes (data amounts, ...) /
858	memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
859	ptr += sizeof(DimensionInfo) * ndims;
860
861	/ Copy the deduplicated values for all attributes to the output. /
862	for (dim = `0`; dim < ndims; dim++)
863	{
864	/ remember the starting point for Asserts later /
865	char *start PG_USED_FOR_ASSERTS_ONLY = ptr;
866
867	for (i = `0`; i < info[dim].nvalues; i++)
868	{
869	Datum value = values[dim][i];
870
871	if (info[dim].typbyval) / passed by value /
872	{
873	Datum tmp;
874
875	/*
876	* For values passed by value, we need to copy just the
877	* significant bytes - we can't use memcpy directly, as that
878	* assumes little endian behavior. store_att_byval does
879	* almost what we need, but it requires properly aligned
880	* buffer - the output buffer does not guarantee that. So we
881	* simply use a local Datum variable (which guarantees proper
882	* alignment), and then copy the value from it.
883	*/
884	store_att_byval(&tmp, value, info[dim].typlen);
885
886	memcpy(ptr, &tmp, info[dim].typlen);
887	ptr += info[dim].typlen;
888	}
889	else if (info[dim].typlen > `0`) / passed by reference /
890	{
891	/ no special alignment needed, treated as char array /
892	memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
893	ptr += info[dim].typlen;
894	}
895	else if (info[dim].typlen == -`1`) / varlena /
896	{
897	uint32 len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
898
899	/ copy the length /
900	memcpy(ptr, &len, sizeof(uint32));
901	ptr += sizeof(uint32);
902
903	/ data from the varlena value (without the header) /
904	memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
905	ptr += len;
906	}
907	else if (info[dim].typlen == -`2`) / cstring /
908	{
909	uint32 len = (uint32) strlen(DatumGetCString(value)) + `1`;
910
911	/ copy the length /
912	memcpy(ptr, &len, sizeof(uint32));
913	ptr += sizeof(uint32);
914
915	/ value /
916	memcpy(ptr, DatumGetCString(value), len);
917	ptr += len;
918	}
919
920	/ no underflows or overflows /
921	Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
922	}
923
924	/ we should get exactly nbytes of data for this dimension /
925	Assert((ptr - start) == info[dim].nbytes);
926	}
927
928	/ Serialize the items, with uint16 indexes instead of the values. /
929	for (i = `0`; i < mcvlist->nitems; i++)
930	{
931	MCVItem *mcvitem = &mcvlist->items[i];
932
933	/ don't write beyond the allocated space /
934	Assert(ptr <= (endptr - ITEM_SIZE(dim)));
935
936	/ copy NULL and frequency flags into the serialized MCV /
937	memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
938	ptr += sizeof(bool) * ndims;
939
940	memcpy(ptr, &mcvitem->frequency, sizeof(double));
941	ptr += sizeof(double);
942
943	memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
944	ptr += sizeof(double);
945
946	/ store the indexes last /
947	for (dim = `0`; dim < ndims; dim++)
948	{
949	uint16 index = `0`;
950	Datum *value;
951
952	/ do the lookup only for non-NULL values /
953	if (!mcvitem->isnull[dim])
954	{
955	value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
956	info[dim].nvalues, sizeof(Datum),
957	compare_scalars_simple, &ssup[dim]);
958
959	Assert(value != NULL); / serialization or deduplication error /
960
961	/ compute index within the deduplicated array /
962	index = (uint16) (value - values[dim]);
963
964	/ check the index is within expected bounds /
965	Assert(index < info[dim].nvalues);
966	}
967
968	/ copy the index into the serialized MCV /
969	memcpy(ptr, &index, sizeof(uint16));
970	ptr += sizeof(uint16);
971	}
972
973	/ make sure we don't overflow the allocated value /
974	Assert(ptr <= endptr);
975	}
976
977	/ at this point we expect to match the total_length exactly /
978	Assert(ptr == endptr);
979
980	pfree(values);
981	pfree(counts);
982
983	return raw;
984	}
985
986	/*
987	* statext_mcv_deserialize
988	* Reads serialized MCV list into MCVList structure.
989	*
990	* All the memory needed by the MCV list is allocated as a single chunk, so
991	* it's possible to simply pfree() it at once.
992	*/
993	MCVList *
994	statext_mcv_deserialize(bytea *data)
995	{
996	int dim,
997	i;
998	Size expected_size;
999	MCVList *mcvlist;
1000	char *raw;
1001	char *ptr;
1002	char *endptr PG_USED_FOR_ASSERTS_ONLY;
1003
1004	int ndims,
1005	nitems;
1006	DimensionInfo *info = NULL;
1007
1008	/ local allocation buffer (used only for deserialization) /
1009	Datum **map = NULL;
1010
1011	/ MCV list /
1012	Size mcvlen;
1013
1014	/ buffer used for the result /
1015	Size datalen;
1016	char *dataptr;
1017	char *valuesptr;
1018	char *isnullptr;
1019
1020	if (data == NULL)
1021	return NULL;
1022
1023	/*
1024	* We can't possibly deserialize a MCV list if there's not even a complete
1025	* header. We need an explicit formula here, because we serialize the
1026	* header fields one by one, so we need to ignore struct alignment.
1027	*/
1028	if (VARSIZE_ANY(data) < MinSizeOfMCVList)
1029	elog(ERROR, "invalid MCV size %zd (expected at least %zu)",
1030	VARSIZE_ANY(data), MinSizeOfMCVList);
1031
1032	/ read the MCV list header /
1033	mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
1034
1035	/ pointer to the data part (skip the varlena header) /
1036	raw = (char *) data;
1037	ptr = VARDATA_ANY(raw);
1038	endptr = (char *) raw + VARSIZE_ANY(data);
1039
1040	/ get the header and perform further sanity checks /
1041	memcpy(&mcvlist->magic, ptr, sizeof(uint32));
1042	ptr += sizeof(uint32);
1043
1044	memcpy(&mcvlist->type, ptr, sizeof(uint32));
1045	ptr += sizeof(uint32);
1046
1047	memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
1048	ptr += sizeof(uint32);
1049
1050	memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
1051	ptr += sizeof(AttrNumber);
1052
1053	if (mcvlist->magic != STATS_MCV_MAGIC)
1054	elog(ERROR, "invalid MCV magic %u (expected %u)",
1055	mcvlist->magic, STATS_MCV_MAGIC);
1056
1057	if (mcvlist->type != STATS_MCV_TYPE_BASIC)
1058	elog(ERROR, "invalid MCV type %u (expected %u)",
1059	mcvlist->type, STATS_MCV_TYPE_BASIC);
1060
1061	if (mcvlist->ndimensions == `0`)
1062	elog(ERROR, "invalid zero-length dimension array in MCVList");
1063	else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) \|\|
1064	(mcvlist->ndimensions < `0`))
1065	elog(ERROR, "invalid length (%d) dimension array in MCVList",
1066	mcvlist->ndimensions);
1067
1068	if (mcvlist->nitems == `0`)
1069	elog(ERROR, "invalid zero-length item array in MCVList");
1070	else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
1071	elog(ERROR, "invalid length (%u) item array in MCVList",
1072	mcvlist->nitems);
1073
1074	nitems = mcvlist->nitems;
1075	ndims = mcvlist->ndimensions;
1076
1077	/*
1078	* Check amount of data including DimensionInfo for all dimensions and
1079	* also the serialized items (including uint16 indexes). Also, walk
1080	* through the dimension information and add it to the sum.
1081	*/
1082	expected_size = SizeOfMCVList(ndims, nitems);
1083
1084	/*
1085	* Check that we have at least the dimension and info records, along with
1086	* the items. We don't know the size of the serialized values yet. We need
1087	* to do this check first, before accessing the dimension info.
1088	*/
1089	if (VARSIZE_ANY(data) < expected_size)
1090	elog(ERROR, "invalid MCV size %zd (expected %zu)",
1091	VARSIZE_ANY(data), expected_size);
1092
1093	/ Now copy the array of type Oids. /
1094	memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
1095	ptr += (sizeof(Oid) * ndims);
1096
1097	/ Now it's safe to access the dimension info. /
1098	info = palloc(ndims * sizeof(DimensionInfo));
1099
1100	memcpy(info, ptr, ndims * sizeof(DimensionInfo));
1101	ptr += (ndims * sizeof(DimensionInfo));
1102
1103	/ account for the value arrays /
1104	for (dim = `0`; dim < ndims; dim++)
1105	{
1106	/*
1107	* XXX I wonder if we can/should rely on asserts here. Maybe those
1108	* checks should be done every time?
1109	*/
1110	Assert(info[dim].nvalues >= `0`);
1111	Assert(info[dim].nbytes >= `0`);
1112
1113	expected_size += info[dim].nbytes;
1114	}
1115
1116	/*
1117	* Now we know the total expected MCV size, including all the pieces
1118	* (header, dimension info. items and deduplicated data). So do the final
1119	* check on size.
1120	*/
1121	if (VARSIZE_ANY(data) != expected_size)
1122	elog(ERROR, "invalid MCV size %zd (expected %zu)",
1123	VARSIZE_ANY(data), expected_size);
1124
1125	/*
1126	* We need an array of Datum values for each dimension, so that we can
1127	* easily translate the uint16 indexes later. We also need a top-level
1128	* array of pointers to those per-dimension arrays.
1129	*
1130	* While allocating the arrays for dimensions, compute how much space we
1131	* need for a copy of the by-ref data, as we can't simply point to the
1132	* original values (it might go away).
1133	*/
1134	datalen = `0`; / space for by-ref data /
1135	map = (Datum *) palloc(ndims sizeof(Datum *));
1136
1137	for (dim = `0`; dim < ndims; dim++)
1138	{
1139	map[dim] = (Datum ) palloc(sizeof(Datum) info[dim].nvalues);
1140
1141	/ space needed for a copy of data for by-ref types /
1142	datalen += info[dim].nbytes_aligned;
1143	}
1144
1145	/*
1146	* Now resize the MCV list so that the allocation includes all the data.
1147	*
1148	* Allocate space for a copy of the data, as we can't simply reference the
1149	* serialized data - it's not aligned properly, and it may disappear while
1150	* we're still using the MCV list, e.g. due to catcache release.
1151	*
1152	* We do care about alignment here, because we will allocate all the pieces
1153	* at once, but then use pointers to different parts.
1154	*/
1155	mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1156
1157	/ arrays of values and isnull flags for all MCV items /
1158	mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
1159	mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
1160
1161	/ we don't quite need to align this, but it makes some asserts easier /
1162	mcvlen += MAXALIGN(datalen);
1163
1164	/ now resize the deserialized MCV list, and compute pointers to parts /
1165	mcvlist = repalloc(mcvlist, mcvlen);
1166
1167	/ pointer to the beginning of values/isnull arrays /
1168	valuesptr = (char *) mcvlist
1169	+ MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1170
1171	isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
1172
1173	dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
1174
1175	/*
1176	* Build mapping (index => value) for translating the serialized data into
1177	* the in-memory representation.
1178	*/
1179	for (dim = `0`; dim < ndims; dim++)
1180	{
1181	/ remember start position in the input array /
1182	char *start PG_USED_FOR_ASSERTS_ONLY = ptr;
1183
1184	if (info[dim].typbyval)
1185	{
1186	/ for by-val types we simply copy data into the mapping /
1187	for (i = `0`; i < info[dim].nvalues; i++)
1188	{
1189	Datum v = `0`;
1190
1191	memcpy(&v, ptr, info[dim].typlen);
1192	ptr += info[dim].typlen;
1193
1194	map[dim][i] = fetch_att(&v, true, info[dim].typlen);
1195
1196	/ no under/overflow of input array /
1197	Assert(ptr <= (start + info[dim].nbytes));
1198	}
1199	}
1200	else
1201	{
1202	/ for by-ref types we need to also make a copy of the data /
1203
1204	/ passed by reference, but fixed length (name, tid, ...) /
1205	if (info[dim].typlen > `0`)
1206	{
1207	for (i = `0`; i < info[dim].nvalues; i++)
1208	{
1209	memcpy(dataptr, ptr, info[dim].typlen);
1210	ptr += info[dim].typlen;
1211
1212	/ just point into the array /
1213	map[dim][i] = PointerGetDatum(dataptr);
1214	dataptr += MAXALIGN(info[dim].typlen);
1215	}
1216	}
1217	else if (info[dim].typlen == -`1`)
1218	{
1219	/ varlena /
1220	for (i = `0`; i < info[dim].nvalues; i++)
1221	{
1222	uint32 len;
1223
1224	/ read the uint32 length /
1225	memcpy(&len, ptr, sizeof(uint32));
1226	ptr += sizeof(uint32);
1227
1228	/ the length is data-only /
1229	SET_VARSIZE(dataptr, len + VARHDRSZ);
1230	memcpy(VARDATA(dataptr), ptr, len);
1231	ptr += len;
1232
1233	/ just point into the array /
1234	map[dim][i] = PointerGetDatum(dataptr);
1235
1236	/ skip to place of the next deserialized value /
1237	dataptr += MAXALIGN(len + VARHDRSZ);
1238	}
1239	}
1240	else if (info[dim].typlen == -`2`)
1241	{
1242	/ cstring /
1243	for (i = `0`; i < info[dim].nvalues; i++)
1244	{
1245	uint32 len;
1246
1247	memcpy(&len, ptr, sizeof(uint32));
1248	ptr += sizeof(uint32);
1249
1250	memcpy(dataptr, ptr, len);
1251	ptr += len;
1252
1253	/ just point into the array /
1254	map[dim][i] = PointerGetDatum(dataptr);
1255	dataptr += MAXALIGN(len);
1256	}
1257	}
1258
1259	/ no under/overflow of input array /
1260	Assert(ptr <= (start + info[dim].nbytes));
1261
1262	/ no overflow of the output mcv value /
1263	Assert(dataptr <= ((char *) mcvlist + mcvlen));
1264	}
1265
1266	/ check we consumed input data for this dimension exactly /
1267	Assert(ptr == (start + info[dim].nbytes));
1268	}
1269
1270	/ we should have also filled the MCV list exactly /
1271	Assert(dataptr == ((char *) mcvlist + mcvlen));
1272
1273	/ deserialize the MCV items and translate the indexes to Datums /
1274	for (i = `0`; i < nitems; i++)
1275	{
1276	MCVItem *item = &mcvlist->items[i];
1277
1278	item->values = (Datum *) valuesptr;
1279	valuesptr += MAXALIGN(sizeof(Datum) * ndims);
1280
1281	item->isnull = (bool *) isnullptr;
1282	isnullptr += MAXALIGN(sizeof(bool) * ndims);
1283
1284	memcpy(item->isnull, ptr, sizeof(bool) * ndims);
1285	ptr += sizeof(bool) * ndims;
1286
1287	memcpy(&item->frequency, ptr, sizeof(double));
1288	ptr += sizeof(double);
1289
1290	memcpy(&item->base_frequency, ptr, sizeof(double));
1291	ptr += sizeof(double);
1292
1293	/ finally translate the indexes (for non-NULL only) /
1294	for (dim = `0`; dim < ndims; dim++)
1295	{
1296	uint16 index;
1297
1298	memcpy(&index, ptr, sizeof(uint16));
1299	ptr += sizeof(uint16);
1300
1301	if (item->isnull[dim])
1302	continue;
1303
1304	item->values[dim] = map[dim][index];
1305	}
1306
1307	/ check we're not overflowing the input /
1308	Assert(ptr <= endptr);
1309	}
1310
1311	/ check that we processed all the data /
1312	Assert(ptr == endptr);
1313
1314	/ release the buffers used for mapping /
1315	for (dim = `0`; dim < ndims; dim++)
1316	pfree(map[dim]);
1317
1318	pfree(map);
1319
1320	return mcvlist;
1321	}
1322
1323	/*
1324	* SRF with details about buckets of a histogram:
1325	*
1326	* - item ID (0...nitems)
1327	* - values (string array)
1328	* - nulls only (boolean array)
1329	* - frequency (double precision)
1330	* - base_frequency (double precision)
1331	*
1332	* The input is the OID of the statistics, and there are no rows returned if
1333	* the statistics contains no histogram.
1334	*/
1335	Datum
1336	pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
1337	{
1338	FuncCallContext *funcctx;
1339
1340	/ stuff done only on the first call of the function /
1341	if (SRF_IS_FIRSTCALL())
1342	{
1343	MemoryContext oldcontext;
1344	MCVList *mcvlist;
1345	TupleDesc tupdesc;
1346
1347	/ create a function context for cross-call persistence /
1348	funcctx = SRF_FIRSTCALL_INIT();
1349
1350	/ switch to memory context appropriate for multiple function calls /
1351	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1352
1353	mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(`0`));
1354
1355	funcctx->user_fctx = mcvlist;
1356
1357	/ total number of tuples to be returned /
1358	funcctx->max_calls = `0`;
1359	if (funcctx->user_fctx != NULL)
1360	funcctx->max_calls = mcvlist->nitems;
1361
1362	/ Build a tuple descriptor for our result type /
1363	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1364	ereport(ERROR,
1365	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1366	errmsg("function returning record called in context "
1367	"that cannot accept type record")));
1368	tupdesc = BlessTupleDesc(tupdesc);
1369
1370	/*
1371	* generate attribute metadata needed later to produce tuples from raw
1372	* C strings
1373	*/
1374	funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
1375
1376	MemoryContextSwitchTo(oldcontext);
1377	}
1378
1379	/ stuff done on every call of the function /
1380	funcctx = SRF_PERCALL_SETUP();
1381
1382	if (funcctx->call_cntr < funcctx->max_calls) / do when there is more left to send /
1383	{
1384	Datum values[`5`];
1385	bool nulls[`5`];
1386	HeapTuple tuple;
1387	Datum result;
1388	ArrayBuildState *astate_values = NULL;
1389	ArrayBuildState *astate_nulls = NULL;
1390
1391	int i;
1392	MCVList *mcvlist;
1393	MCVItem *item;
1394
1395	mcvlist = (MCVList *) funcctx->user_fctx;
1396
1397	Assert(funcctx->call_cntr < mcvlist->nitems);
1398
1399	item = &mcvlist->items[funcctx->call_cntr];
1400
1401	for (i = `0`; i < mcvlist->ndimensions; i++)
1402	{
1403
1404	astate_nulls = accumArrayResult(astate_nulls,
1405	BoolGetDatum(item->isnull[i]),
1406	false,
1407	BOOLOID,
1408	CurrentMemoryContext);
1409
1410	if (!item->isnull[i])
1411	{
1412	bool isvarlena;
1413	Oid outfunc;
1414	FmgrInfo fmgrinfo;
1415	Datum val;
1416	text *txt;
1417
1418	/ lookup output func for the type /
1419	getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
1420	fmgr_info(outfunc, &fmgrinfo);
1421
1422	val = FunctionCall1(&fmgrinfo, item->values[i]);
1423	txt = cstring_to_text(DatumGetPointer(val));
1424
1425	astate_values = accumArrayResult(astate_values,
1426	PointerGetDatum(txt),
1427	false,
1428	TEXTOID,
1429	CurrentMemoryContext);
1430	}
1431	else
1432	astate_values = accumArrayResult(astate_values,
1433	(Datum) `0`,
1434	true,
1435	TEXTOID,
1436	CurrentMemoryContext);
1437	}
1438
1439	values[`0`] = Int32GetDatum(funcctx->call_cntr);
1440	values[`1`] = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
1441	values[`2`] = PointerGetDatum(makeArrayResult(astate_nulls, CurrentMemoryContext));
1442	values[`3`] = Float8GetDatum(item->frequency);
1443	values[`4`] = Float8GetDatum(item->base_frequency);
1444
1445	/ no NULLs in the tuple /
1446	memset(nulls, `0`, sizeof(nulls));
1447
1448	/ build a tuple /
1449	tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
1450
1451	/ make the tuple into a datum /
1452	result = HeapTupleGetDatum(tuple);
1453
1454	SRF_RETURN_NEXT(funcctx, result);
1455	}
1456	else / do when there is no more left /
1457	{
1458	SRF_RETURN_DONE(funcctx);
1459	}
1460	}
1461
1462	/*
1463	* pg_mcv_list_in - input routine for type pg_mcv_list.
1464	*
1465	* pg_mcv_list is real enough to be a table column, but it has no operations
1466	* of its own, and disallows input too
1467	*/
1468	Datum
1469	pg_mcv_list_in(PG_FUNCTION_ARGS)
1470	{
1471	/*
1472	* pg_mcv_list stores the data in binary form and parsing text input is
1473	* not needed, so disallow this.
1474	*/
1475	ereport(ERROR,
1476	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1477	errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1478
1479	PG_RETURN_VOID(); / keep compiler quiet /
1480	}
1481
1482
1483	/*
1484	* pg_mcv_list_out - output routine for type pg_mcv_list.
1485	*
1486	* MCV lists are serialized into a bytea value, so we simply call byteaout()
1487	* to serialize the value into text. But it'd be nice to serialize that into
1488	* a meaningful representation (e.g. for inspection by people).
1489	*
1490	* XXX This should probably return something meaningful, similar to what
1491	* pg_dependencies_out does. Not sure how to deal with the deduplicated
1492	* values, though - do we want to expand that or not?
1493	*/
1494	Datum
1495	pg_mcv_list_out(PG_FUNCTION_ARGS)
1496	{
1497	return byteaout(fcinfo);
1498	}
1499
1500	/*
1501	* pg_mcv_list_recv - binary input routine for type pg_mcv_list.
1502	*/
1503	Datum
1504	pg_mcv_list_recv(PG_FUNCTION_ARGS)
1505	{
1506	ereport(ERROR,
1507	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1508	errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1509
1510	PG_RETURN_VOID(); / keep compiler quiet /
1511	}
1512
1513	/*
1514	* pg_mcv_list_send - binary output routine for type pg_mcv_list.
1515	*
1516	* MCV lists are serialized in a bytea value (although the type is named
1517	* differently), so let's just send that.
1518	*/
1519	Datum
1520	pg_mcv_list_send(PG_FUNCTION_ARGS)
1521	{
1522	return byteasend(fcinfo);
1523	}
1524
1525	/*
1526	* mcv_get_match_bitmap
1527	* Evaluate clauses using the MCV list, and update the match bitmap.
1528	*
1529	* A match bitmap keeps match/mismatch status for each MCV item, and we
1530	* update it based on additional clauses. We also use it to skip items
1531	* that can't possibly match (e.g. item marked as "mismatch" can't change
1532	* to "match" when evaluating AND clause list).
1533	*
1534	* The function also returns a flag indicating whether there was an
1535	* equality condition for all attributes, the minimum frequency in the MCV
1536	* list, and a total MCV frequency (sum of frequencies for all items).
1537	*
1538	* XXX Currently the match bitmap uses a bool for each MCV item, which is
1539	* somewhat wasteful as we could do with just a single bit, thus reducing
1540	* the size to ~1/8. It would also allow us to combine bitmaps simply using
1541	* & and \|, which should be faster than min/max. The bitmaps are fairly
1542	* small, though (thanks to the cap on the MCV list size).
1543	*/
1544	static bool *
1545	mcv_get_match_bitmap(PlannerInfo root, List clauses,
1546	Bitmapset keys, MCVList mcvlist, bool is_or)
1547	{
1548	int i;
1549	ListCell *l;
1550	bool *matches;
1551
1552	/ The bitmap may be partially built. /
1553	Assert(clauses != NIL);
1554	Assert(list_length(clauses) >= `1`);
1555	Assert(mcvlist != NULL);
1556	Assert(mcvlist->nitems > `0`);
1557	Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
1558
1559	matches = palloc(sizeof(bool) * mcvlist->nitems);
1560	memset(matches, (is_or) ? false : true,
1561	sizeof(bool) * mcvlist->nitems);
1562
1563	/*
1564	* Loop through the list of clauses, and for each of them evaluate all the
1565	* MCV items not yet eliminated by the preceding clauses.
1566	*/
1567	foreach(l, clauses)
1568	{
1569	Node clause = (Node ) lfirst(l);
1570
1571	/ if it's a RestrictInfo, then extract the clause /
1572	if (IsA(clause, RestrictInfo))
1573	clause = (Node ) ((RestrictInfo ) clause)->clause;
1574
1575	/*
1576	* Handle the various types of clauses - OpClause, NullTest and
1577	* AND/OR/NOT
1578	*/
1579	if (is_opclause(clause))
1580	{
1581	OpExpr expr = (OpExpr ) clause;
1582	FmgrInfo opproc;
1583
1584	/ valid only after examine_opclause_expression returns true /
1585	Var *var;
1586	Const *cst;
1587	bool varonleft;
1588
1589	fmgr_info(get_opcode(expr->opno), &opproc);
1590
1591	/ extract the var and const from the expression /
1592	if (examine_opclause_expression(expr, &var, &cst, &varonleft))
1593	{
1594	int idx;
1595
1596	/ match the attribute to a dimension of the statistic /
1597	idx = bms_member_index(keys, var->varattno);
1598
1599	/*
1600	* Walk through the MCV items and evaluate the current clause.
1601	* We can skip items that were already ruled out, and
1602	* terminate if there are no remaining MCV items that might
1603	* possibly match.
1604	*/
1605	for (i = `0`; i < mcvlist->nitems; i++)
1606	{
1607	bool match = true;
1608	MCVItem *item = &mcvlist->items[i];
1609
1610	/*
1611	* When the MCV item or the Const value is NULL we can treat
1612	* this as a mismatch. We must not call the operator because
1613	* of strictness.
1614	*/
1615	if (item->isnull[idx] \|\| cst->constisnull)
1616	{
1617	matches[i] = RESULT_MERGE(matches[i], is_or, false);
1618	continue;
1619	}
1620
1621	/*
1622	* Skip MCV items that can't change result in the bitmap.
1623	* Once the value gets false for AND-lists, or true for
1624	* OR-lists, we don't need to look at more clauses.
1625	*/
1626	if (RESULT_IS_FINAL(matches[i], is_or))
1627	continue;
1628
1629	/*
1630	* First check whether the constant is below the lower
1631	* boundary (in that case we can skip the bucket, because
1632	* there's no overlap).
1633	*
1634	* We don't store collations used to build the statistics,
1635	* but we can use the collation for the attribute itself,
1636	* as stored in varcollid. We do reset the statistics after
1637	* a type change (including collation change), so this is
1638	* OK. We may need to relax this after allowing extended
1639	* statistics on expressions.
1640	*/
1641	if (varonleft)
1642	match = DatumGetBool(FunctionCall2Coll(&opproc,
1643	var->varcollid,
1644	item->values[idx],
1645	cst->constvalue));
1646	else
1647	match = DatumGetBool(FunctionCall2Coll(&opproc,
1648	var->varcollid,
1649	cst->constvalue,
1650	item->values[idx]));
1651
1652	/ update the match bitmap with the result /
1653	matches[i] = RESULT_MERGE(matches[i], is_or, match);
1654	}
1655	}
1656	}
1657	else if (IsA(clause, NullTest))
1658	{
1659	NullTest expr = (NullTest ) clause;
1660	Var var = (Var ) (expr->arg);
1661
1662	/ match the attribute to a dimension of the statistic /
1663	int idx = bms_member_index(keys, var->varattno);
1664
1665	/*
1666	* Walk through the MCV items and evaluate the current clause. We
1667	* can skip items that were already ruled out, and terminate if
1668	* there are no remaining MCV items that might possibly match.
1669	*/
1670	for (i = `0`; i < mcvlist->nitems; i++)
1671	{
1672	bool match = false; / assume mismatch /
1673	MCVItem *item = &mcvlist->items[i];
1674
1675	/ if the clause mismatches the MCV item, update the bitmap /
1676	switch (expr->nulltesttype)
1677	{
1678	case IS_NULL:
1679	match = (item->isnull[idx]) ? true : match;
1680	break;
1681
1682	case IS_NOT_NULL:
1683	match = (!item->isnull[idx]) ? true : match;
1684	break;
1685	}
1686
1687	/ now, update the match bitmap, depending on OR/AND type /
1688	matches[i] = RESULT_MERGE(matches[i], is_or, match);
1689	}
1690	}
1691	else if (is_orclause(clause) \|\| is_andclause(clause))
1692	{
1693	/ AND/OR clause, with all subclauses being compatible /
1694
1695	int i;
1696	BoolExpr bool_clause = ((BoolExpr ) clause);
1697	List *bool_clauses = bool_clause->args;
1698
1699	/ match/mismatch bitmap for each MCV item /
1700	bool *bool_matches = NULL;
1701
1702	Assert(bool_clauses != NIL);
1703	Assert(list_length(bool_clauses) >= `2`);
1704
1705	/ build the match bitmap for the OR-clauses /
1706	bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
1707	mcvlist, is_orclause(clause));
1708
1709	/*
1710	* Merge the bitmap produced by mcv_get_match_bitmap into the
1711	* current one. We need to consider if we're evaluating AND or OR
1712	* condition when merging the results.
1713	*/
1714	for (i = `0`; i < mcvlist->nitems; i++)
1715	matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
1716
1717	pfree(bool_matches);
1718	}
1719	else if (is_notclause(clause))
1720	{
1721	/ NOT clause, with all subclauses compatible /
1722
1723	int i;
1724	BoolExpr not_clause = ((BoolExpr ) clause);
1725	List *not_args = not_clause->args;
1726
1727	/ match/mismatch bitmap for each MCV item /
1728	bool *not_matches = NULL;
1729
1730	Assert(not_args != NIL);
1731	Assert(list_length(not_args) == `1`);
1732
1733	/ build the match bitmap for the NOT-clause /
1734	not_matches = mcv_get_match_bitmap(root, not_args, keys,
1735	mcvlist, false);
1736
1737	/*
1738	* Merge the bitmap produced by mcv_get_match_bitmap into the
1739	* current one. We're handling a NOT clause, so invert the result
1740	* before merging it into the global bitmap.
1741	*/
1742	for (i = `0`; i < mcvlist->nitems; i++)
1743	matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
1744
1745	pfree(not_matches);
1746	}
1747	else if (IsA(clause, Var))
1748	{
1749	/ Var (has to be a boolean Var, possibly from below NOT) /
1750
1751	Var var = (Var ) (clause);
1752
1753	/ match the attribute to a dimension of the statistic /
1754	int idx = bms_member_index(keys, var->varattno);
1755
1756	Assert(var->vartype == BOOLOID);
1757
1758	/*
1759	* Walk through the MCV items and evaluate the current clause. We
1760	* can skip items that were already ruled out, and terminate if
1761	* there are no remaining MCV items that might possibly match.
1762	*/
1763	for (i = `0`; i < mcvlist->nitems; i++)
1764	{
1765	MCVItem *item = &mcvlist->items[i];
1766	bool match = false;
1767
1768	/ if the item is NULL, it's a mismatch /
1769	if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
1770	match = true;
1771
1772	/ update the result bitmap /
1773	matches[i] = RESULT_MERGE(matches[i], is_or, match);
1774	}
1775	}
1776	else
1777	elog(ERROR, "unknown clause type: %d", clause->type);
1778	}
1779
1780	return matches;
1781	}
1782
1783
1784	/*
1785	* mcv_clauselist_selectivity
1786	* Return the selectivity estimate computed using an MCV list.
1787	*
1788	* First builds a bitmap of MCV items matching the clauses, and then sums
1789	* the frequencies of matching items.
1790	*
1791	* It also produces two additional interesting selectivities - total
1792	* selectivity of all the MCV items (not just the matching ones), and the
1793	* base frequency computed on the assumption of independence.
1794	*/
1795	Selectivity
1796	mcv_clauselist_selectivity(PlannerInfo root, StatisticExtInfo stat,
1797	List clauses, int* varRelid,
1798	JoinType jointype, SpecialJoinInfo *sjinfo,
1799	RelOptInfo *rel,
1800	Selectivity basesel, Selectivity totalsel)
1801	{
1802	int i;
1803	MCVList *mcv;
1804	Selectivity s = `0.0`;
1805
1806	/ match/mismatch bitmap for each MCV item /
1807	bool *matches = NULL;
1808
1809	/ load the MCV list stored in the statistics object /
1810	mcv = statext_mcv_load(stat->statOid);
1811
1812	/ build a match bitmap for the clauses /
1813	matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
1814
1815	/ sum frequencies for all the matching MCV items /
1816	*basesel = `0.0`;
1817	*totalsel = `0.0`;
1818	for (i = `0`; i < mcv->nitems; i++)
1819	{
1820	*totalsel += mcv->items[i].frequency;
1821
1822	if (matches[i] != false)
1823	{
1824	/ XXX Shouldn't the basesel be outside the if condition? /
1825	*basesel += mcv->items[i].base_frequency;
1826	s += mcv->items[i].frequency;
1827	}
1828	}
1829
1830	return s;
1831	}
1832

Browse the source code of PostgreSQL/src/backend/statistics/mcv.c