1/*-------------------------------------------------------------------------
2 *
3 * ragetypes_typanalyze.c
4 * Functions for gathering statistics from range columns
5 *
6 * For a range type column, histograms of lower and upper bounds, and
7 * the fraction of NULL and empty ranges are collected.
8 *
9 * Both histograms have the same length, and they are combined into a
10 * single array of ranges. This has the same shape as the histogram that
11 * std_typanalyze would collect, but the values are different. Each range
12 * in the array is a valid range, even though the lower and upper bounds
13 * come from different tuples. In theory, the standard scalar selectivity
14 * functions could be used with the combined histogram.
15 *
16 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 *
20 * IDENTIFICATION
21 * src/backend/utils/adt/rangetypes_typanalyze.c
22 *
23 *-------------------------------------------------------------------------
24 */
25#include "postgres.h"
26
27#include "catalog/pg_operator.h"
28#include "commands/vacuum.h"
29#include "utils/float.h"
30#include "utils/fmgrprotos.h"
31#include "utils/lsyscache.h"
32#include "utils/rangetypes.h"
33
34static int float8_qsort_cmp(const void *a1, const void *a2);
35static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
36static void compute_range_stats(VacAttrStats *stats,
37 AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
38
39/*
40 * range_typanalyze -- typanalyze function for range columns
41 */
42Datum
43range_typanalyze(PG_FUNCTION_ARGS)
44{
45 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
46 TypeCacheEntry *typcache;
47 Form_pg_attribute attr = stats->attr;
48
49 /* Get information about range type; note column might be a domain */
50 typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
51
52 if (attr->attstattarget < 0)
53 attr->attstattarget = default_statistics_target;
54
55 stats->compute_stats = compute_range_stats;
56 stats->extra_data = typcache;
57 /* same as in std_typanalyze */
58 stats->minrows = 300 * attr->attstattarget;
59
60 PG_RETURN_BOOL(true);
61}
62
63/*
64 * Comparison function for sorting float8s, used for range lengths.
65 */
66static int
67float8_qsort_cmp(const void *a1, const void *a2)
68{
69 const float8 *f1 = (const float8 *) a1;
70 const float8 *f2 = (const float8 *) a2;
71
72 if (*f1 < *f2)
73 return -1;
74 else if (*f1 == *f2)
75 return 0;
76 else
77 return 1;
78}
79
80/*
81 * Comparison function for sorting RangeBounds.
82 */
83static int
84range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
85{
86 RangeBound *b1 = (RangeBound *) a1;
87 RangeBound *b2 = (RangeBound *) a2;
88 TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
89
90 return range_cmp_bounds(typcache, b1, b2);
91}
92
93/*
94 * compute_range_stats() -- compute statistics for a range column
95 */
96static void
97compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
98 int samplerows, double totalrows)
99{
100 TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
101 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
102 int null_cnt = 0;
103 int non_null_cnt = 0;
104 int non_empty_cnt = 0;
105 int empty_cnt = 0;
106 int range_no;
107 int slot_idx;
108 int num_bins = stats->attr->attstattarget;
109 int num_hist;
110 float8 *lengths;
111 RangeBound *lowers,
112 *uppers;
113 double total_width = 0;
114
115 /* Allocate memory to hold range bounds and lengths of the sample ranges. */
116 lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
117 uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
118 lengths = (float8 *) palloc(sizeof(float8) * samplerows);
119
120 /* Loop over the sample ranges. */
121 for (range_no = 0; range_no < samplerows; range_no++)
122 {
123 Datum value;
124 bool isnull,
125 empty;
126 RangeType *range;
127 RangeBound lower,
128 upper;
129 float8 length;
130
131 vacuum_delay_point();
132
133 value = fetchfunc(stats, range_no, &isnull);
134 if (isnull)
135 {
136 /* range is null, just count that */
137 null_cnt++;
138 continue;
139 }
140
141 /*
142 * XXX: should we ignore wide values, like std_typanalyze does, to
143 * avoid bloating the statistics table?
144 */
145 total_width += VARSIZE_ANY(DatumGetPointer(value));
146
147 /* Get range and deserialize it for further analysis. */
148 range = DatumGetRangeTypeP(value);
149 range_deserialize(typcache, range, &lower, &upper, &empty);
150
151 if (!empty)
152 {
153 /* Remember bounds and length for further usage in histograms */
154 lowers[non_empty_cnt] = lower;
155 uppers[non_empty_cnt] = upper;
156
157 if (lower.infinite || upper.infinite)
158 {
159 /* Length of any kind of an infinite range is infinite */
160 length = get_float8_infinity();
161 }
162 else if (has_subdiff)
163 {
164 /*
165 * For an ordinary range, use subdiff function between upper
166 * and lower bound values.
167 */
168 length = DatumGetFloat8(FunctionCall2Coll(
169 &typcache->rng_subdiff_finfo,
170 typcache->rng_collation,
171 upper.val, lower.val));
172 }
173 else
174 {
175 /* Use default value of 1.0 if no subdiff is available. */
176 length = 1.0;
177 }
178 lengths[non_empty_cnt] = length;
179
180 non_empty_cnt++;
181 }
182 else
183 empty_cnt++;
184
185 non_null_cnt++;
186 }
187
188 slot_idx = 0;
189
190 /* We can only compute real stats if we found some non-null values. */
191 if (non_null_cnt > 0)
192 {
193 Datum *bound_hist_values;
194 Datum *length_hist_values;
195 int pos,
196 posfrac,
197 delta,
198 deltafrac,
199 i;
200 MemoryContext old_cxt;
201 float4 *emptyfrac;
202
203 stats->stats_valid = true;
204 /* Do the simple null-frac and width stats */
205 stats->stanullfrac = (double) null_cnt / (double) samplerows;
206 stats->stawidth = total_width / (double) non_null_cnt;
207
208 /* Estimate that non-null values are unique */
209 stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
210
211 /* Must copy the target values into anl_context */
212 old_cxt = MemoryContextSwitchTo(stats->anl_context);
213
214 /*
215 * Generate a bounds histogram slot entry if there are at least two
216 * values.
217 */
218 if (non_empty_cnt >= 2)
219 {
220 /* Sort bound values */
221 qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
222 range_bound_qsort_cmp, typcache);
223 qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
224 range_bound_qsort_cmp, typcache);
225
226 num_hist = non_empty_cnt;
227 if (num_hist > num_bins)
228 num_hist = num_bins + 1;
229
230 bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
231
232 /*
233 * The object of this loop is to construct ranges from first and
234 * last entries in lowers[] and uppers[] along with evenly-spaced
235 * values in between. So the i'th value is a range of lowers[(i *
236 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
237 * (num_hist - 1)]. But computing that subscript directly risks
238 * integer overflow when the stats target is more than a couple
239 * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
240 * at each step, tracking the integral and fractional parts of the
241 * sum separately.
242 */
243 delta = (non_empty_cnt - 1) / (num_hist - 1);
244 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
245 pos = posfrac = 0;
246
247 for (i = 0; i < num_hist; i++)
248 {
249 bound_hist_values[i] = PointerGetDatum(range_serialize(
250 typcache, &lowers[pos], &uppers[pos], false));
251 pos += delta;
252 posfrac += deltafrac;
253 if (posfrac >= (num_hist - 1))
254 {
255 /* fractional part exceeds 1, carry to integer part */
256 pos++;
257 posfrac -= (num_hist - 1);
258 }
259 }
260
261 stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
262 stats->stavalues[slot_idx] = bound_hist_values;
263 stats->numvalues[slot_idx] = num_hist;
264 slot_idx++;
265 }
266
267 /*
268 * Generate a length histogram slot entry if there are at least two
269 * values.
270 */
271 if (non_empty_cnt >= 2)
272 {
273 /*
274 * Ascending sort of range lengths for further filling of
275 * histogram
276 */
277 qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
278
279 num_hist = non_empty_cnt;
280 if (num_hist > num_bins)
281 num_hist = num_bins + 1;
282
283 length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
284
285 /*
286 * The object of this loop is to copy the first and last lengths[]
287 * entries along with evenly-spaced values in between. So the i'th
288 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
289 * computing that subscript directly risks integer overflow when
290 * the stats target is more than a couple thousand. Instead we
291 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
292 * the integral and fractional parts of the sum separately.
293 */
294 delta = (non_empty_cnt - 1) / (num_hist - 1);
295 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
296 pos = posfrac = 0;
297
298 for (i = 0; i < num_hist; i++)
299 {
300 length_hist_values[i] = Float8GetDatum(lengths[pos]);
301 pos += delta;
302 posfrac += deltafrac;
303 if (posfrac >= (num_hist - 1))
304 {
305 /* fractional part exceeds 1, carry to integer part */
306 pos++;
307 posfrac -= (num_hist - 1);
308 }
309 }
310 }
311 else
312 {
313 /*
314 * Even when we don't create the histogram, store an empty array
315 * to mean "no histogram". We can't just leave stavalues NULL,
316 * because get_attstatsslot() errors if you ask for stavalues, and
317 * it's NULL. We'll still store the empty fraction in stanumbers.
318 */
319 length_hist_values = palloc(0);
320 num_hist = 0;
321 }
322 stats->staop[slot_idx] = Float8LessOperator;
323 stats->stacoll[slot_idx] = InvalidOid;
324 stats->stavalues[slot_idx] = length_hist_values;
325 stats->numvalues[slot_idx] = num_hist;
326 stats->statypid[slot_idx] = FLOAT8OID;
327 stats->statyplen[slot_idx] = sizeof(float8);
328#ifdef USE_FLOAT8_BYVAL
329 stats->statypbyval[slot_idx] = true;
330#else
331 stats->statypbyval[slot_idx] = false;
332#endif
333 stats->statypalign[slot_idx] = 'd';
334
335 /* Store the fraction of empty ranges */
336 emptyfrac = (float4 *) palloc(sizeof(float4));
337 *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
338 stats->stanumbers[slot_idx] = emptyfrac;
339 stats->numnumbers[slot_idx] = 1;
340
341 stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
342 slot_idx++;
343
344 MemoryContextSwitchTo(old_cxt);
345 }
346 else if (null_cnt > 0)
347 {
348 /* We found only nulls; assume the column is entirely null */
349 stats->stats_valid = true;
350 stats->stanullfrac = 1.0;
351 stats->stawidth = 0; /* "unknown" */
352 stats->stadistinct = 0.0; /* "unknown" */
353 }
354
355 /*
356 * We don't need to bother cleaning up any of our temporary palloc's. The
357 * hashtable should also go away, as it used a child memory context.
358 */
359}
360