/*-------------------------------------------------------------------------
 *
 * array_typanalyze.c
 *    Functions for gathering statistics from array columns
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/utils/adt/array_typanalyze.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/tuptoaster.h"
#include "commands/vacuum.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/typcache.h"


/*
 * To avoid consuming too much memory, I/O, and CPU during analysis, and/or
 * too much space in the resulting pg_statistic rows, we ignore arrays that
 * are wider than ARRAY_WIDTH_THRESHOLD (after detoasting!).  Note that this
 * number is considerably more than the similar WIDTH_THRESHOLD limit used
 * in analyze.c's standard typanalyze code.
 */
#define ARRAY_WIDTH_THRESHOLD 0x10000

/* Extra data for compute_array_stats function */
typedef struct
{
    /* Information about array element type */
    Oid         type_id;        /* element type's OID */
    Oid         eq_opr;         /* default equality operator's OID */
    Oid         coll_id;        /* collation to use */
    bool        typbyval;       /* physical properties of element type */
    int16       typlen;
    char        typalign;

    /*
     * Lookup data for element type's comparison and hash functions (these
     * are in the type's typcache entry, which we expect to remain valid over
     * the lifespan of the ANALYZE run)
     */
    FmgrInfo   *cmp;
    FmgrInfo   *hash;

    /* Saved state from std_typanalyze() */
    AnalyzeAttrComputeStatsFunc std_compute_stats;
    void       *std_extra_data;
} ArrayAnalyzeExtraData;

/*
 * While compute_array_stats is running, we keep a pointer to the extra data
 * here for use by assorted subroutines.  compute_array_stats doesn't
 * currently need to be re-entrant, so avoiding this is not worth the extra
 * notational cruft that would be needed.
 */
static ArrayAnalyzeExtraData *array_extra_data;

/* A hash table entry for the Lossy Counting algorithm */
typedef struct
{
    Datum       key;            /* This is 'e' from the LC algorithm. */
    int         frequency;      /* This is 'f'. */
    int         delta;          /* And this is 'delta'. */
    int         last_container; /* For de-duplication of array elements. */
} TrackItem;

/* A hash table entry for distinct-elements counts */
typedef struct
{
    int         count;          /* Count of distinct elements in an array */
    int         frequency;      /* Number of arrays seen with this count */
} DECountItem;

static void compute_array_stats(VacAttrStats *stats,
                                AnalyzeAttrFetchFunc fetchfunc,
                                int samplerows, double totalrows);
static void prune_element_hashtable(HTAB *elements_tab, int b_current);
static uint32 element_hash(const void *key, Size keysize);
static int  element_match(const void *key1, const void *key2, Size keysize);
static int  element_compare(const void *key1, const void *key2);
static int  trackitem_compare_frequencies_desc(const void *e1, const void *e2);
static int  trackitem_compare_element(const void *e1, const void *e2);
static int  countitem_compare_count(const void *e1, const void *e2);


/*
 * array_typanalyze -- typanalyze function for array columns
 */
Datum
array_typanalyze(PG_FUNCTION_ARGS)
{
    VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
    Oid         element_typeid;
    TypeCacheEntry *typentry;
    ArrayAnalyzeExtraData *extra_data;

    /*
     * Call the standard typanalyze function.  It may fail to find needed
     * operators, in which case we also can't do anything, so just fail.
     */
    if (!std_typanalyze(stats))
        PG_RETURN_BOOL(false);

    /*
     * Check attribute data type is a varlena array (or a domain over one).
     */
    element_typeid = get_base_element_type(stats->attrtypid);
    if (!OidIsValid(element_typeid))
        elog(ERROR, "array_typanalyze was invoked for non-array type %u",
             stats->attrtypid);

    /*
     * Gather information about the element type.  If we fail to find
     * something, return leaving the state from std_typanalyze() in place.
     */
    typentry = lookup_type_cache(element_typeid,
                                 TYPECACHE_EQ_OPR |
                                 TYPECACHE_CMP_PROC_FINFO |
                                 TYPECACHE_HASH_PROC_FINFO);

    if (!OidIsValid(typentry->eq_opr) ||
        !OidIsValid(typentry->cmp_proc_finfo.fn_oid) ||
        !OidIsValid(typentry->hash_proc_finfo.fn_oid))
        PG_RETURN_BOOL(true);

    /* Store our findings for use by compute_array_stats() */
    extra_data = (ArrayAnalyzeExtraData *) palloc(sizeof(ArrayAnalyzeExtraData));
    extra_data->type_id = typentry->type_id;
    extra_data->eq_opr = typentry->eq_opr;
    extra_data->coll_id = stats->attrcollid;    /* collation we should use */
    extra_data->typbyval = typentry->typbyval;
    extra_data->typlen = typentry->typlen;
    extra_data->typalign = typentry->typalign;
    extra_data->cmp = &typentry->cmp_proc_finfo;
    extra_data->hash = &typentry->hash_proc_finfo;

    /* Save old compute_stats and extra_data for scalar statistics ... */
    extra_data->std_compute_stats = stats->compute_stats;
    extra_data->std_extra_data = stats->extra_data;

    /* ... and replace with our info */
    stats->compute_stats = compute_array_stats;
    stats->extra_data = extra_data;

    /*
     * Note we leave stats->minrows set as std_typanalyze set it.  Should it
     * be increased for array analysis purposes?
     */

    PG_RETURN_BOOL(true);
}

/*
 * compute_array_stats() -- compute statistics for an array column
 *
 * This function computes statistics useful for determining selectivity of
 * the array operators <@, &&, and @>.  It is invoked by ANALYZE via the
 * compute_stats hook after sample rows have been collected.
 *
 * We also invoke the standard compute_stats function, which will compute
 * "scalar" statistics relevant to the btree-style array comparison operators.
 * However, exact duplicates of an entire array may be rare despite many
 * arrays sharing individual elements.  This especially afflicts long arrays,
 * which are also liable to lack all scalar statistics due to the low
 * WIDTH_THRESHOLD used in analyze.c.  So, in addition to the standard stats,
 * we find the most common array elements and compute a histogram of distinct
 * element counts.
 *
 * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 * frequency counts over data streams" by G. S. Manku and R. Motwani, in
 * Proceedings of the 28th International Conference on Very Large Data Bases,
 * Hong Kong, China, August 2002, section 4.2. The paper is available at
 * http://www.vldb.org/conf/2002/S10P03.pdf
 *
 * The Lossy Counting (aka LC) algorithm goes like this:
 * Let s be the threshold frequency for an item (the minimum frequency we
 * are interested in) and epsilon the error margin for the frequency.  Let D
 * be a set of triples (e, f, delta), where e is an element value, f is that
 * element's frequency (actually, its current occurrence count) and delta is
 * the maximum error in f.  We start with D empty and process the elements in
 * batches of size w.  (The batch size is also known as "bucket size" and is
 * equal to 1/epsilon.)  Let the current batch number be b_current, starting
 * with 1.  For each element e we either increment its f count, if it's
 * already in D, or insert a new triple into D with values (e, 1, b_current
 * - 1).  After processing each batch we prune D, by removing from it all
 * elements with f + delta <= b_current.  After the algorithm finishes we
 * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 * where N is the total number of elements in the input.  We emit the
 * remaining elements with estimated frequency f/N.  The LC paper proves
 * that this algorithm finds all elements with true frequency at least s,
 * and that no frequency is overestimated or is underestimated by more than
 * epsilon.  Furthermore, given reasonable assumptions about the input
 * distribution, the required table size is no more than about 7 times w.
 *
 * In the absence of a principled basis for other particular values, we
 * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10.
 * But we leave out the correction for stopwords, which do not apply to
 * arrays.  These parameters give bucket width w = K/0.007 and maximum
 * expected hashtable size of about 1000 * K.
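 *
 * (As an illustrative back-of-the-envelope check of those formulas, not part
 * of the algorithm itself: with K = 100 we get s = 0.0007 and epsilon =
 * 0.00007, so the bucket width w = 1/epsilon is about 14286, i.e. K/0.007,
 * and the expected hashtable size is about 7 * w, i.e. roughly 100000 =
 * 1000 * K.)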
 *
 * Elements may repeat within an array.  Since duplicates do not change the
 * behavior of <@, && or @>, we want to count each element only once per
 * array.  Therefore, we store in the finished pg_statistic entry each
 * element's frequency as the fraction of all non-null rows that contain it.
 * We divide the raw counts by nonnull_cnt to get those figures.
 */
static void
compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
                    int samplerows, double totalrows)
{
    ArrayAnalyzeExtraData *extra_data;
    int         num_mcelem;
    int         null_cnt = 0;
    int         null_elem_cnt = 0;
    int         analyzed_rows = 0;

    /* This is D from the LC algorithm. */
    HTAB       *elements_tab;
    HASHCTL     elem_hash_ctl;
    HASH_SEQ_STATUS scan_status;

    /* This is the current bucket number from the LC algorithm */
    int         b_current;

    /* This is 'w' from the LC algorithm */
    int         bucket_width;
    int         array_no;
    int64       element_no;
    TrackItem  *item;
    int         slot_idx;
    HTAB       *count_tab;
    HASHCTL     count_hash_ctl;
    DECountItem *count_item;

    extra_data = (ArrayAnalyzeExtraData *) stats->extra_data;

    /*
     * Invoke analyze.c's standard analysis function to create scalar-style
     * stats for the column.  It will expect its own extra_data pointer, so
     * temporarily install that.
     */
    stats->extra_data = extra_data->std_extra_data;
    extra_data->std_compute_stats(stats, fetchfunc, samplerows, totalrows);
    stats->extra_data = extra_data;

    /*
     * Set up static pointer for use by subroutines.  We wait till here in
     * case std_compute_stats somehow recursively invokes us (probably not
     * possible, but ...)
     */
    array_extra_data = extra_data;

    /*
     * We want statistics_target * 10 elements in the MCELEM array.  This
     * multiplier is pretty arbitrary, but is meant to reflect the fact that
     * the number of individual elements tracked in pg_statistic ought to be
     * more than the number of values for a simple scalar column.
     */
    num_mcelem = stats->attr->attstattarget * 10;

    /*
     * We set bucket width equal to num_mcelem / 0.007 as per the comment
     * above.
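     * (For instance, with a statistics target of 100, the usual default,
     * num_mcelem is 1000 and bucket_width comes out to 142857.)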
     */
    bucket_width = num_mcelem * 1000 / 7;

    /*
     * Create the hashtable.  It will be in local memory, so we don't need to
     * worry about overflowing the initial size.  Also we don't need to pay
     * any attention to locking and memory management.
     */
    MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl));
    elem_hash_ctl.keysize = sizeof(Datum);
    elem_hash_ctl.entrysize = sizeof(TrackItem);
    elem_hash_ctl.hash = element_hash;
    elem_hash_ctl.match = element_match;
    elem_hash_ctl.hcxt = CurrentMemoryContext;
    elements_tab = hash_create("Analyzed elements table",
                               num_mcelem,
                               &elem_hash_ctl,
                               HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

    /* hashtable for array distinct elements counts */
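    /*
     * The key here is just the per-array distinct-element count (a plain
     * int), so the generic byte-wise hashing selected by HASH_BLOBS is
     * sufficient; no custom hash or match functions are needed.
     */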
    MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl));
    count_hash_ctl.keysize = sizeof(int);
    count_hash_ctl.entrysize = sizeof(DECountItem);
    count_hash_ctl.hcxt = CurrentMemoryContext;
    count_tab = hash_create("Array distinct element count table",
                            64,
                            &count_hash_ctl,
                            HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

    /* Initialize counters. */
    b_current = 1;
    element_no = 0;

    /* Loop over the arrays. */
    for (array_no = 0; array_no < samplerows; array_no++)
    {
        Datum       value;
        bool        isnull;
        ArrayType  *array;
        int         num_elems;
        Datum      *elem_values;
        bool       *elem_nulls;
        bool        null_present;
        int         j;
        int64       prev_element_no = element_no;
        int         distinct_count;
        bool        count_item_found;

        vacuum_delay_point();

        value = fetchfunc(stats, array_no, &isnull);
        if (isnull)
        {
            /* array is null, just count that */
            null_cnt++;
            continue;
        }

        /* Skip too-large values. */
        if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD)
            continue;
        else
            analyzed_rows++;

        /*
         * Now detoast the array if needed, and deconstruct into datums.
         */
        array = DatumGetArrayTypeP(value);

        Assert(ARR_ELEMTYPE(array) == extra_data->type_id);
        deconstruct_array(array,
                          extra_data->type_id,
                          extra_data->typlen,
                          extra_data->typbyval,
                          extra_data->typalign,
                          &elem_values, &elem_nulls, &num_elems);

        /*
         * We loop through the elements in the array and add them to our
         * tracking hashtable.
         */
        null_present = false;
        for (j = 0; j < num_elems; j++)
        {
            Datum       elem_value;
            bool        found;

            /* No null element processing other than flag setting here */
            if (elem_nulls[j])
            {
                null_present = true;
                continue;
            }

            /* Lookup current element in hashtable, adding it if new */
            elem_value = elem_values[j];
            item = (TrackItem *) hash_search(elements_tab,
                                             (const void *) &elem_value,
                                             HASH_ENTER, &found);

            if (found)
            {
                /* The element value is already on the tracking list */

                /*
                 * The operators we assist ignore duplicate array elements,
                 * so count a given distinct element only once per array.
                 */
                if (item->last_container == array_no)
                    continue;

                item->frequency++;
                item->last_container = array_no;
            }
            else
            {
                /* Initialize new tracking list element */

                /*
                 * If element type is pass-by-reference, we must copy it into
                 * palloc'd space, so that we can release the array below.
                 * (We do this so that the space needed for element values is
                 * limited by the size of the hashtable; if we kept all the
                 * array values around, it could be much more.)
                 */
                item->key = datumCopy(elem_value,
                                      extra_data->typbyval,
                                      extra_data->typlen);

                item->frequency = 1;
                item->delta = b_current - 1;
                item->last_container = array_no;
            }

            /* element_no is the number of elements processed (ie N) */
            element_no++;

            /* We prune the D structure after processing each bucket */
            if (element_no % bucket_width == 0)
            {
                prune_element_hashtable(elements_tab, b_current);
                b_current++;
            }
        }

        /* Count null element presence once per array. */
        if (null_present)
            null_elem_cnt++;

        /* Update frequency of the particular array distinct element count. */
        distinct_count = (int) (element_no - prev_element_no);
        count_item = (DECountItem *) hash_search(count_tab, &distinct_count,
                                                 HASH_ENTER,
                                                 &count_item_found);

        if (count_item_found)
            count_item->frequency++;
        else
            count_item->frequency = 1;

        /* Free memory allocated while detoasting. */
        if (PointerGetDatum(array) != value)
            pfree(array);
        pfree(elem_values);
        pfree(elem_nulls);
    }

    /* Skip pg_statistic slots occupied by standard statistics */
    slot_idx = 0;
    while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
        slot_idx++;
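    /* We will need two more free slots, one for MCELEM and one for DECHIST. */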
    if (slot_idx > STATISTIC_NUM_SLOTS - 2)
        elog(ERROR, "insufficient pg_statistic slots for array stats");

    /* We can only compute real stats if we found some non-null values. */
    if (analyzed_rows > 0)
    {
        int         nonnull_cnt = analyzed_rows;
        int         count_items_count;
        int         i;
        TrackItem **sort_table;
        int         track_len;
        int64       cutoff_freq;
        int64       minfreq,
                    maxfreq;

        /*
         * We assume the standard stats code already took care of setting
         * stats_valid, stanullfrac, stawidth, stadistinct.  We'd have to
         * re-compute those values if we wanted to not store the standard
         * stats.
         */

        /*
         * Construct an array of the interesting hashtable items, that is,
         * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
         * the minimum and maximum frequencies among these items.
         *
         * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
         * frequency is 9*N / bucket_width.
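         * (Spelled out: (s - epsilon) * N = (10*epsilon - epsilon) * N
         * = 9 * epsilon * N = 9 * N / bucket_width.)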
         */
        cutoff_freq = 9 * element_no / bucket_width;

        i = hash_get_num_entries(elements_tab); /* surely enough space */
        sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

        hash_seq_init(&scan_status, elements_tab);
        track_len = 0;
        minfreq = element_no;
        maxfreq = 0;
        while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
        {
            if (item->frequency > cutoff_freq)
            {
                sort_table[track_len++] = item;
                minfreq = Min(minfreq, item->frequency);
                maxfreq = Max(maxfreq, item->frequency);
            }
        }
        Assert(track_len <= i);

        /* emit some statistics for debug purposes */
        elog(DEBUG3, "compute_array_stats: target # mces = %d, "
             "bucket width = %d, "
             "# elements = " INT64_FORMAT ", hashtable size = %d, "
             "usable entries = %d",
             num_mcelem, bucket_width, element_no, i, track_len);

        /*
         * If we obtained more elements than we really want, get rid of those
         * with least frequencies.  The easiest way is to qsort the array into
         * descending frequency order and truncate the array.
         */
        if (num_mcelem < track_len)
        {
            qsort(sort_table, track_len, sizeof(TrackItem *),
                  trackitem_compare_frequencies_desc);
            /* reset minfreq to the smallest frequency we're keeping */
            minfreq = sort_table[num_mcelem - 1]->frequency;
        }
        else
            num_mcelem = track_len;

        /* Generate MCELEM slot entry */
        if (num_mcelem > 0)
        {
            MemoryContext old_context;
            Datum      *mcelem_values;
            float4     *mcelem_freqs;

            /*
             * We want to store statistics sorted on the element value using
             * the element type's default comparison function.  This permits
             * fast binary searches in selectivity estimation functions.
             */
            qsort(sort_table, num_mcelem, sizeof(TrackItem *),
                  trackitem_compare_element);

            /* Must copy the target values into anl_context */
            old_context = MemoryContextSwitchTo(stats->anl_context);

            /*
             * We sorted statistics on the element value, but we want to be
             * able to find the minimal and maximal frequencies without going
             * through all the values.  We also want the frequency of null
             * elements.  Store these three values at the end of mcelem_freqs.
             */
            mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
            mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4));

            /*
             * See comments above about use of nonnull_cnt as the divisor for
             * the final frequency estimates.
             */
            for (i = 0; i < num_mcelem; i++)
            {
                TrackItem  *item = sort_table[i];

                mcelem_values[i] = datumCopy(item->key,
                                             extra_data->typbyval,
                                             extra_data->typlen);
                mcelem_freqs[i] = (double) item->frequency /
                    (double) nonnull_cnt;
            }
            mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
            mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
            mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;

            MemoryContextSwitchTo(old_context);

            stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
            stats->staop[slot_idx] = extra_data->eq_opr;
            stats->stacoll[slot_idx] = extra_data->coll_id;
            stats->stanumbers[slot_idx] = mcelem_freqs;
            /* See above comment about extra stanumber entries */
            stats->numnumbers[slot_idx] = num_mcelem + 3;
            stats->stavalues[slot_idx] = mcelem_values;
            stats->numvalues[slot_idx] = num_mcelem;
            /* We are storing values of element type */
            stats->statypid[slot_idx] = extra_data->type_id;
            stats->statyplen[slot_idx] = extra_data->typlen;
            stats->statypbyval[slot_idx] = extra_data->typbyval;
            stats->statypalign[slot_idx] = extra_data->typalign;
            slot_idx++;
        }

        /* Generate DECHIST slot entry */
        count_items_count = hash_get_num_entries(count_tab);
        if (count_items_count > 0)
        {
            int         num_hist = stats->attr->attstattarget;
            DECountItem **sorted_count_items;
            int         j;
            int         delta;
            int64       frac;
            float4     *hist;

            /* num_hist must be at least 2 for the loop below to work */
            num_hist = Max(num_hist, 2);

            /*
             * Create an array of DECountItem pointers, and sort them into
             * increasing count order.
             */
            sorted_count_items = (DECountItem **)
                palloc(sizeof(DECountItem *) * count_items_count);
            hash_seq_init(&scan_status, count_tab);
            j = 0;
            while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
            {
                sorted_count_items[j++] = count_item;
            }
            qsort(sorted_count_items, count_items_count,
                  sizeof(DECountItem *), countitem_compare_count);

            /*
             * Prepare to fill stanumbers with the histogram, followed by the
             * average count.  This array must be stored in anl_context.
             */
            hist = (float4 *)
                MemoryContextAlloc(stats->anl_context,
                                   sizeof(float4) * (num_hist + 1));
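            /*
             * The extra last entry is the average number of distinct element
             * values per non-null array.
             */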
            hist[num_hist] = (double) element_no / (double) nonnull_cnt;

            /*----------
             * Construct the histogram of distinct-element counts (DECs).
             *
             * The object of this loop is to copy the min and max DECs to
             * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
             * in between (where "evenly-spaced" is with reference to the
             * whole input population of arrays).  If we had a complete sorted
             * array of DECs, one per analyzed row, the i'th hist value would
             * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
             * (compare the histogram-making loop in compute_scalar_stats()).
             * But instead of that we have the sorted_count_items[] array,
             * which holds unique DEC values with their frequencies (that is,
             * a run-length-compressed version of the full array).  So we
             * control advancing through sorted_count_items[] with the
             * variable "frac", which is defined as (x - y) * (num_hist - 1),
             * where x is the index in the notional DECs array corresponding
             * to the start of the next sorted_count_items[] element's run,
             * and y is the index in DECs from which we should take the next
             * histogram value.  We have to advance whenever x <= y, that is
             * frac <= 0.  The x component is the sum of the frequencies seen
             * so far (up through the current sorted_count_items[] element),
             * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
             * per the subscript calculation above.  (The subscript calculation
             * implies dropping any fractional part of y; in this formulation
             * that's handled by not advancing until frac reaches 1.)
             *
             * Even though frac has a bounded range, it could overflow int32
             * when working with very large statistics targets, so we do that
             * math in int64.
             *----------
             */
            delta = analyzed_rows - 1;
            j = 0;              /* current index in sorted_count_items */
            /* Initialize frac for sorted_count_items[0]; y is initially 0 */
            frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
            for (i = 0; i < num_hist; i++)
            {
                while (frac <= 0)
                {
                    /* Advance, and update x component of frac */
                    j++;
                    frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
                }
                hist[i] = sorted_count_items[j]->count;
                frac -= delta;  /* update y for upcoming i increment */
            }
            Assert(j == count_items_count - 1);

            stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
            stats->staop[slot_idx] = extra_data->eq_opr;
            stats->stacoll[slot_idx] = extra_data->coll_id;
            stats->stanumbers[slot_idx] = hist;
            stats->numnumbers[slot_idx] = num_hist + 1;
            slot_idx++;
        }
    }

    /*
     * We don't need to bother cleaning up any of our temporary palloc's.  The
     * hashtable should also go away, as it used a child memory context.
     */
}

/*
 * A function to prune the D structure from the Lossy Counting algorithm.
 * Consult compute_tsvector_stats() for wider explanation.
 */
static void
prune_element_hashtable(HTAB *elements_tab, int b_current)
{
    HASH_SEQ_STATUS scan_status;
    TrackItem  *item;

    hash_seq_init(&scan_status, elements_tab);
    while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
    {
        if (item->frequency + item->delta <= b_current)
        {
            Datum       value = item->key;

            if (hash_search(elements_tab, (const void *) &item->key,
                            HASH_REMOVE, NULL) == NULL)
                elog(ERROR, "hash table corrupted");
            /* We should free memory if element is not passed by value */
            if (!array_extra_data->typbyval)
                pfree(DatumGetPointer(value));
        }
    }
}

/*
 * Hash function for elements.
 *
 * We use the element type's default hash opclass, and the column collation
 * if the type is collation-sensitive.
 */
static uint32
element_hash(const void *key, Size keysize)
{
    Datum       d = *((const Datum *) key);
    Datum       h;

    h = FunctionCall1Coll(array_extra_data->hash,
                          array_extra_data->coll_id,
                          d);
    return DatumGetUInt32(h);
}

/*
 * Matching function for elements, to be used in hashtable lookups.
 */
static int
element_match(const void *key1, const void *key2, Size keysize)
{
    /* The keysize parameter is superfluous here */
    return element_compare(key1, key2);
}

/*
 * Comparison function for elements.
 *
 * We use the element type's default btree opclass, and the column collation
 * if the type is collation-sensitive.
 *
 * XXX consider using SortSupport infrastructure
 */
static int
element_compare(const void *key1, const void *key2)
{
    Datum       d1 = *((const Datum *) key1);
    Datum       d2 = *((const Datum *) key2);
    Datum       c;

    c = FunctionCall2Coll(array_extra_data->cmp,
                          array_extra_data->coll_id,
                          d1, d2);
    return DatumGetInt32(c);
}

/*
 * qsort() comparator for sorting TrackItems by frequencies (descending sort)
 */
static int
trackitem_compare_frequencies_desc(const void *e1, const void *e2)
{
    const TrackItem *const *t1 = (const TrackItem *const *) e1;
    const TrackItem *const *t2 = (const TrackItem *const *) e2;

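    /*
     * Frequencies are per-array counts bounded by the number of sample rows,
     * so this subtraction cannot overflow.
     */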
    return (*t2)->frequency - (*t1)->frequency;
}

/*
 * qsort() comparator for sorting TrackItems by element values
 */
static int
trackitem_compare_element(const void *e1, const void *e2)
{
    const TrackItem *const *t1 = (const TrackItem *const *) e1;
    const TrackItem *const *t2 = (const TrackItem *const *) e2;

    return element_compare(&(*t1)->key, &(*t2)->key);
}

/*
 * qsort() comparator for sorting DECountItems by count
 */
static int
countitem_compare_count(const void *e1, const void *e2)
{
    const DECountItem *const *t1 = (const DECountItem *const *) e1;
    const DECountItem *const *t2 = (const DECountItem *const *) e2;

    if ((*t1)->count < (*t2)->count)
        return -1;
    else if ((*t1)->count == (*t2)->count)
        return 0;
    else
        return 1;
}