rangetypes_typanalyze.c source code [PostgreSQL/src/backend/utils/adt/rangetypes_typanalyze.c]

1	/-------------------------------------------------------------------------*
2	*
3	* ragetypes_typanalyze.c
4	* Functions for gathering statistics from range columns
5	*
6	* For a range type column, histograms of lower and upper bounds, and
7	* the fraction of NULL and empty ranges are collected.
8	*
9	* Both histograms have the same length, and they are combined into a
10	* single array of ranges. This has the same shape as the histogram that
11	* std_typanalyze would collect, but the values are different. Each range
12	* in the array is a valid range, even though the lower and upper bounds
13	* come from different tuples. In theory, the standard scalar selectivity
14	* functions could be used with the combined histogram.
15	*
16	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
17	* Portions Copyright (c) 1994, Regents of the University of California
18	*
19	*
20	* IDENTIFICATION
21	* src/backend/utils/adt/rangetypes_typanalyze.c
22	*
23	*-------------------------------------------------------------------------
24	*/
25	#include "postgres.h"
26
27	#include "catalog/pg_operator.h"
28	#include "commands/vacuum.h"
29	#include "utils/float.h"
30	#include "utils/fmgrprotos.h"
31	#include "utils/lsyscache.h"
32	#include "utils/rangetypes.h"
33
34	static int float8_qsort_cmp(const void a1, const* void *a2);
35	static int range_bound_qsort_cmp(const void a1, const* void a2, void* *arg);
36	static void compute_range_stats(VacAttrStats *stats,
37	AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
38
39	/*
40	* range_typanalyze -- typanalyze function for range columns
41	*/
42	Datum
43	range_typanalyze(PG_FUNCTION_ARGS)
44	{
45	VacAttrStats stats = (VacAttrStats ) PG_GETARG_POINTER(`0`);
46	TypeCacheEntry *typcache;
47	Form_pg_attribute attr = stats->attr;
48
49	/ Get information about range type; note column might be a domain /
50	typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
51
52	if (attr->attstattarget < `0`)
53	attr->attstattarget = default_statistics_target;
54
55	stats->compute_stats = compute_range_stats;
56	stats->extra_data = typcache;
57	/ same as in std_typanalyze /
58	stats->minrows = `300` * attr->attstattarget;
59
60	PG_RETURN_BOOL(true);
61	}
62
63	/*
64	* Comparison function for sorting float8s, used for range lengths.
65	*/
66	static int
67	float8_qsort_cmp(const void a1, const* void *a2)
68	{
69	const float8 f1 = (const* float8 *) a1;
70	const float8 f2 = (const* float8 *) a2;
71
72	if (f1 < f2)
73	return -`1`;
74	else if (f1 == f2)
75	return `0`;
76	else
77	return `1`;
78	}
79
80	/*
81	* Comparison function for sorting RangeBounds.
82	*/
83	static int
84	range_bound_qsort_cmp(const void a1, const* void a2, void* *arg)
85	{
86	RangeBound b1 = (RangeBound ) a1;
87	RangeBound b2 = (RangeBound ) a2;
88	TypeCacheEntry typcache = (TypeCacheEntry ) arg;
89
90	return range_cmp_bounds(typcache, b1, b2);
91	}
92
93	/*
94	* compute_range_stats() -- compute statistics for a range column
95	*/
96	static void
97	compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
98	int samplerows, double totalrows)
99	{
100	TypeCacheEntry typcache = (TypeCacheEntry ) stats->extra_data;
101	bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
102	int null_cnt = `0`;
103	int non_null_cnt = `0`;
104	int non_empty_cnt = `0`;
105	int empty_cnt = `0`;
106	int range_no;
107	int slot_idx;
108	int num_bins = stats->attr->attstattarget;
109	int num_hist;
110	float8 *lengths;
111	RangeBound *lowers,
112	*uppers;
113	double total_width = `0`;
114
115	/ Allocate memory to hold range bounds and lengths of the sample ranges. /
116	lowers = (RangeBound ) palloc(sizeof(RangeBound) samplerows);
117	uppers = (RangeBound ) palloc(sizeof(RangeBound) samplerows);
118	lengths = (float8 ) palloc(sizeof(float8) samplerows);
119
120	/ Loop over the sample ranges. /
121	for (range_no = `0`; range_no < samplerows; range_no++)
122	{
123	Datum value;
124	bool isnull,
125	empty;
126	RangeType *range;
127	RangeBound lower,
128	upper;
129	float8 length;
130
131	vacuum_delay_point();
132
133	value = fetchfunc(stats, range_no, &isnull);
134	if (isnull)
135	{
136	/ range is null, just count that /
137	null_cnt++;
138	continue;
139	}
140
141	/*
142	* XXX: should we ignore wide values, like std_typanalyze does, to
143	* avoid bloating the statistics table?
144	*/
145	total_width += VARSIZE_ANY(DatumGetPointer(value));
146
147	/ Get range and deserialize it for further analysis. /
148	range = DatumGetRangeTypeP(value);
149	range_deserialize(typcache, range, &lower, &upper, &empty);
150
151	if (!empty)
152	{
153	/ Remember bounds and length for further usage in histograms /
154	lowers[non_empty_cnt] = lower;
155	uppers[non_empty_cnt] = upper;
156
157	if (lower.infinite \|\| upper.infinite)
158	{
159	/ Length of any kind of an infinite range is infinite /
160	length = get_float8_infinity();
161	}
162	else if (has_subdiff)
163	{
164	/*
165	* For an ordinary range, use subdiff function between upper
166	* and lower bound values.
167	*/
168	length = DatumGetFloat8(FunctionCall2Coll(
169	&typcache->rng_subdiff_finfo,
170	typcache->rng_collation,
171	upper.val, lower.val));
172	}
173	else
174	{
175	/ Use default value of 1.0 if no subdiff is available. /
176	length = `1.0`;
177	}
178	lengths[non_empty_cnt] = length;
179
180	non_empty_cnt++;
181	}
182	else
183	empty_cnt++;
184
185	non_null_cnt++;
186	}
187
188	slot_idx = `0`;
189
190	/ We can only compute real stats if we found some non-null values. /
191	if (non_null_cnt > `0`)
192	{
193	Datum *bound_hist_values;
194	Datum *length_hist_values;
195	int pos,
196	posfrac,
197	delta,
198	deltafrac,
199	i;
200	MemoryContext old_cxt;
201	float4 *emptyfrac;
202
203	stats->stats_valid = true;
204	/ Do the simple null-frac and width stats /
205	stats->stanullfrac = (double) null_cnt / (double) samplerows;
206	stats->stawidth = total_width / (double) non_null_cnt;
207
208	/ Estimate that non-null values are unique /
209	stats->stadistinct = -`1.0` * (`1.0` - stats->stanullfrac);
210
211	/ Must copy the target values into anl_context /
212	old_cxt = MemoryContextSwitchTo(stats->anl_context);
213
214	/*
215	* Generate a bounds histogram slot entry if there are at least two
216	* values.
217	*/
218	if (non_empty_cnt >= `2`)
219	{
220	/ Sort bound values /
221	qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
222	range_bound_qsort_cmp, typcache);
223	qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
224	range_bound_qsort_cmp, typcache);
225
226	num_hist = non_empty_cnt;
227	if (num_hist > num_bins)
228	num_hist = num_bins + `1`;
229
230	bound_hist_values = (Datum ) palloc(num_hist sizeof(Datum));
231
232	/*
233	* The object of this loop is to construct ranges from first and
234	* last entries in lowers[] and uppers[] along with evenly-spaced
235	* values in between. So the i'th value is a range of lowers[(i *
236	* (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
237	* (num_hist - 1)]. But computing that subscript directly risks
238	* integer overflow when the stats target is more than a couple
239	* thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
240	* at each step, tracking the integral and fractional parts of the
241	* sum separately.
242	*/
243	delta = (non_empty_cnt - `1`) / (num_hist - `1`);
244	deltafrac = (non_empty_cnt - `1`) % (num_hist - `1`);
245	pos = posfrac = `0`;
246
247	for (i = `0`; i < num_hist; i++)
248	{
249	bound_hist_values[i] = PointerGetDatum(range_serialize(
250	typcache, &lowers[pos], &uppers[pos], false));
251	pos += delta;
252	posfrac += deltafrac;
253	if (posfrac >= (num_hist - `1`))
254	{
255	/ fractional part exceeds 1, carry to integer part /
256	pos++;
257	posfrac -= (num_hist - `1`);
258	}
259	}
260
261	stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
262	stats->stavalues[slot_idx] = bound_hist_values;
263	stats->numvalues[slot_idx] = num_hist;
264	slot_idx++;
265	}
266
267	/*
268	* Generate a length histogram slot entry if there are at least two
269	* values.
270	*/
271	if (non_empty_cnt >= `2`)
272	{
273	/*
274	* Ascending sort of range lengths for further filling of
275	* histogram
276	*/
277	qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
278
279	num_hist = non_empty_cnt;
280	if (num_hist > num_bins)
281	num_hist = num_bins + `1`;
282
283	length_hist_values = (Datum ) palloc(num_hist sizeof(Datum));
284
285	/*
286	* The object of this loop is to copy the first and last lengths[]
287	* entries along with evenly-spaced values in between. So the i'th
288	* value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
289	* computing that subscript directly risks integer overflow when
290	* the stats target is more than a couple thousand. Instead we
291	* add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
292	* the integral and fractional parts of the sum separately.
293	*/
294	delta = (non_empty_cnt - `1`) / (num_hist - `1`);
295	deltafrac = (non_empty_cnt - `1`) % (num_hist - `1`);
296	pos = posfrac = `0`;
297
298	for (i = `0`; i < num_hist; i++)
299	{
300	length_hist_values[i] = Float8GetDatum(lengths[pos]);
301	pos += delta;
302	posfrac += deltafrac;
303	if (posfrac >= (num_hist - `1`))
304	{
305	/ fractional part exceeds 1, carry to integer part /
306	pos++;
307	posfrac -= (num_hist - `1`);
308	}
309	}
310	}
311	else
312	{
313	/*
314	* Even when we don't create the histogram, store an empty array
315	* to mean "no histogram". We can't just leave stavalues NULL,
316	* because get_attstatsslot() errors if you ask for stavalues, and
317	* it's NULL. We'll still store the empty fraction in stanumbers.
318	*/
319	length_hist_values = palloc(`0`);
320	num_hist = `0`;
321	}
322	stats->staop[slot_idx] = Float8LessOperator;
323	stats->stacoll[slot_idx] = InvalidOid;
324	stats->stavalues[slot_idx] = length_hist_values;
325	stats->numvalues[slot_idx] = num_hist;
326	stats->statypid[slot_idx] = FLOAT8OID;
327	stats->statyplen[slot_idx] = sizeof(float8);
328	#ifdef USE_FLOAT8_BYVAL
329	stats->statypbyval[slot_idx] = true;
330	#else
331	stats->statypbyval[slot_idx] = false;
332	#endif
333	stats->statypalign[slot_idx] = `'d'`;
334
335	/ Store the fraction of empty ranges /
336	emptyfrac = (float4 ) palloc(sizeof*(float4));
337	emptyfrac = ((double) empty_cnt) / ((double*) non_null_cnt);
338	stats->stanumbers[slot_idx] = emptyfrac;
339	stats->numnumbers[slot_idx] = `1`;
340
341	stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
342	slot_idx++;
343
344	MemoryContextSwitchTo(old_cxt);
345	}
346	else if (null_cnt > `0`)
347	{
348	/ We found only nulls; assume the column is entirely null /
349	stats->stats_valid = true;
350	stats->stanullfrac = `1.0`;
351	stats->stawidth = `0`; / "unknown" /
352	stats->stadistinct = `0.0`; / "unknown" /
353	}
354
355	/*
356	* We don't need to bother cleaning up any of our temporary palloc's. The
357	* hashtable should also go away, as it used a child memory context.
358	*/
359	}
360

Browse the source code of PostgreSQL/src/backend/utils/adt/rangetypes_typanalyze.c