| 1 | /*------------------------------------------------------------------------- | 
|---|
| 2 | * | 
|---|
| 3 | * ragetypes_typanalyze.c | 
|---|
| 4 | *	  Functions for gathering statistics from range columns | 
|---|
| 5 | * | 
|---|
| 6 | * For a range type column, histograms of lower and upper bounds, and | 
|---|
| 7 | * the fraction of NULL and empty ranges are collected. | 
|---|
| 8 | * | 
|---|
| 9 | * Both histograms have the same length, and they are combined into a | 
|---|
| 10 | * single array of ranges. This has the same shape as the histogram that | 
|---|
| 11 | * std_typanalyze would collect, but the values are different. Each range | 
|---|
| 12 | * in the array is a valid range, even though the lower and upper bounds | 
|---|
| 13 | * come from different tuples. In theory, the standard scalar selectivity | 
|---|
| 14 | * functions could be used with the combined histogram. | 
|---|
| 15 | * | 
|---|
| 16 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group | 
|---|
| 17 | * Portions Copyright (c) 1994, Regents of the University of California | 
|---|
| 18 | * | 
|---|
| 19 | * | 
|---|
| 20 | * IDENTIFICATION | 
|---|
| 21 | *	  src/backend/utils/adt/rangetypes_typanalyze.c | 
|---|
| 22 | * | 
|---|
| 23 | *------------------------------------------------------------------------- | 
|---|
| 24 | */ | 
|---|
| 25 | #include "postgres.h" | 
|---|
| 26 |  | 
|---|
| 27 | #include "catalog/pg_operator.h" | 
|---|
| 28 | #include "commands/vacuum.h" | 
|---|
| 29 | #include "utils/float.h" | 
|---|
| 30 | #include "utils/fmgrprotos.h" | 
|---|
| 31 | #include "utils/lsyscache.h" | 
|---|
| 32 | #include "utils/rangetypes.h" | 
|---|
| 33 |  | 
|---|
| 34 | static int	float8_qsort_cmp(const void *a1, const void *a2); | 
|---|
| 35 | static int	range_bound_qsort_cmp(const void *a1, const void *a2, void *arg); | 
|---|
| 36 | static void compute_range_stats(VacAttrStats *stats, | 
|---|
| 37 | AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows); | 
|---|
| 38 |  | 
|---|
| 39 | /* | 
|---|
| 40 | * range_typanalyze -- typanalyze function for range columns | 
|---|
| 41 | */ | 
|---|
| 42 | Datum | 
|---|
| 43 | range_typanalyze(PG_FUNCTION_ARGS) | 
|---|
| 44 | { | 
|---|
| 45 | VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0); | 
|---|
| 46 | TypeCacheEntry *typcache; | 
|---|
| 47 | Form_pg_attribute attr = stats->attr; | 
|---|
| 48 |  | 
|---|
| 49 | /* Get information about range type; note column might be a domain */ | 
|---|
| 50 | typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid)); | 
|---|
| 51 |  | 
|---|
| 52 | if (attr->attstattarget < 0) | 
|---|
| 53 | attr->attstattarget = default_statistics_target; | 
|---|
| 54 |  | 
|---|
| 55 | stats->compute_stats = compute_range_stats; | 
|---|
| 56 | stats->extra_data = typcache; | 
|---|
| 57 | /* same as in std_typanalyze */ | 
|---|
| 58 | stats->minrows = 300 * attr->attstattarget; | 
|---|
| 59 |  | 
|---|
| 60 | PG_RETURN_BOOL(true); | 
|---|
| 61 | } | 
|---|
| 62 |  | 
|---|
| 63 | /* | 
|---|
| 64 | * Comparison function for sorting float8s, used for range lengths. | 
|---|
| 65 | */ | 
|---|
| 66 | static int | 
|---|
| 67 | float8_qsort_cmp(const void *a1, const void *a2) | 
|---|
| 68 | { | 
|---|
| 69 | const float8 *f1 = (const float8 *) a1; | 
|---|
| 70 | const float8 *f2 = (const float8 *) a2; | 
|---|
| 71 |  | 
|---|
| 72 | if (*f1 < *f2) | 
|---|
| 73 | return -1; | 
|---|
| 74 | else if (*f1 == *f2) | 
|---|
| 75 | return 0; | 
|---|
| 76 | else | 
|---|
| 77 | return 1; | 
|---|
| 78 | } | 
|---|
| 79 |  | 
|---|
| 80 | /* | 
|---|
| 81 | * Comparison function for sorting RangeBounds. | 
|---|
| 82 | */ | 
|---|
| 83 | static int | 
|---|
| 84 | range_bound_qsort_cmp(const void *a1, const void *a2, void *arg) | 
|---|
| 85 | { | 
|---|
| 86 | RangeBound *b1 = (RangeBound *) a1; | 
|---|
| 87 | RangeBound *b2 = (RangeBound *) a2; | 
|---|
| 88 | TypeCacheEntry *typcache = (TypeCacheEntry *) arg; | 
|---|
| 89 |  | 
|---|
| 90 | return range_cmp_bounds(typcache, b1, b2); | 
|---|
| 91 | } | 
|---|
| 92 |  | 
|---|
| 93 | /* | 
|---|
| 94 | * compute_range_stats() -- compute statistics for a range column | 
|---|
| 95 | */ | 
|---|
| 96 | static void | 
|---|
| 97 | compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, | 
|---|
| 98 | int samplerows, double totalrows) | 
|---|
| 99 | { | 
|---|
| 100 | TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data; | 
|---|
| 101 | bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); | 
|---|
| 102 | int			null_cnt = 0; | 
|---|
| 103 | int			non_null_cnt = 0; | 
|---|
| 104 | int			non_empty_cnt = 0; | 
|---|
| 105 | int			empty_cnt = 0; | 
|---|
| 106 | int			range_no; | 
|---|
| 107 | int			slot_idx; | 
|---|
| 108 | int			num_bins = stats->attr->attstattarget; | 
|---|
| 109 | int			num_hist; | 
|---|
| 110 | float8	   *lengths; | 
|---|
| 111 | RangeBound *lowers, | 
|---|
| 112 | *uppers; | 
|---|
| 113 | double		total_width = 0; | 
|---|
| 114 |  | 
|---|
| 115 | /* Allocate memory to hold range bounds and lengths of the sample ranges. */ | 
|---|
| 116 | lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows); | 
|---|
| 117 | uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows); | 
|---|
| 118 | lengths = (float8 *) palloc(sizeof(float8) * samplerows); | 
|---|
| 119 |  | 
|---|
| 120 | /* Loop over the sample ranges. */ | 
|---|
| 121 | for (range_no = 0; range_no < samplerows; range_no++) | 
|---|
| 122 | { | 
|---|
| 123 | Datum		value; | 
|---|
| 124 | bool		isnull, | 
|---|
| 125 | empty; | 
|---|
| 126 | RangeType  *range; | 
|---|
| 127 | RangeBound	lower, | 
|---|
| 128 | upper; | 
|---|
| 129 | float8		length; | 
|---|
| 130 |  | 
|---|
| 131 | vacuum_delay_point(); | 
|---|
| 132 |  | 
|---|
| 133 | value = fetchfunc(stats, range_no, &isnull); | 
|---|
| 134 | if (isnull) | 
|---|
| 135 | { | 
|---|
| 136 | /* range is null, just count that */ | 
|---|
| 137 | null_cnt++; | 
|---|
| 138 | continue; | 
|---|
| 139 | } | 
|---|
| 140 |  | 
|---|
| 141 | /* | 
|---|
| 142 | * XXX: should we ignore wide values, like std_typanalyze does, to | 
|---|
| 143 | * avoid bloating the statistics table? | 
|---|
| 144 | */ | 
|---|
| 145 | total_width += VARSIZE_ANY(DatumGetPointer(value)); | 
|---|
| 146 |  | 
|---|
| 147 | /* Get range and deserialize it for further analysis. */ | 
|---|
| 148 | range = DatumGetRangeTypeP(value); | 
|---|
| 149 | range_deserialize(typcache, range, &lower, &upper, &empty); | 
|---|
| 150 |  | 
|---|
| 151 | if (!empty) | 
|---|
| 152 | { | 
|---|
| 153 | /* Remember bounds and length for further usage in histograms */ | 
|---|
| 154 | lowers[non_empty_cnt] = lower; | 
|---|
| 155 | uppers[non_empty_cnt] = upper; | 
|---|
| 156 |  | 
|---|
| 157 | if (lower.infinite || upper.infinite) | 
|---|
| 158 | { | 
|---|
| 159 | /* Length of any kind of an infinite range is infinite */ | 
|---|
| 160 | length = get_float8_infinity(); | 
|---|
| 161 | } | 
|---|
| 162 | else if (has_subdiff) | 
|---|
| 163 | { | 
|---|
| 164 | /* | 
|---|
| 165 | * For an ordinary range, use subdiff function between upper | 
|---|
| 166 | * and lower bound values. | 
|---|
| 167 | */ | 
|---|
| 168 | length = DatumGetFloat8(FunctionCall2Coll( | 
|---|
| 169 | &typcache->rng_subdiff_finfo, | 
|---|
| 170 | typcache->rng_collation, | 
|---|
| 171 | upper.val, lower.val)); | 
|---|
| 172 | } | 
|---|
| 173 | else | 
|---|
| 174 | { | 
|---|
| 175 | /* Use default value of 1.0 if no subdiff is available. */ | 
|---|
| 176 | length = 1.0; | 
|---|
| 177 | } | 
|---|
| 178 | lengths[non_empty_cnt] = length; | 
|---|
| 179 |  | 
|---|
| 180 | non_empty_cnt++; | 
|---|
| 181 | } | 
|---|
| 182 | else | 
|---|
| 183 | empty_cnt++; | 
|---|
| 184 |  | 
|---|
| 185 | non_null_cnt++; | 
|---|
| 186 | } | 
|---|
| 187 |  | 
|---|
| 188 | slot_idx = 0; | 
|---|
| 189 |  | 
|---|
| 190 | /* We can only compute real stats if we found some non-null values. */ | 
|---|
| 191 | if (non_null_cnt > 0) | 
|---|
| 192 | { | 
|---|
| 193 | Datum	   *bound_hist_values; | 
|---|
| 194 | Datum	   *length_hist_values; | 
|---|
| 195 | int			pos, | 
|---|
| 196 | posfrac, | 
|---|
| 197 | delta, | 
|---|
| 198 | deltafrac, | 
|---|
| 199 | i; | 
|---|
| 200 | MemoryContext old_cxt; | 
|---|
| 201 | float4	   *emptyfrac; | 
|---|
| 202 |  | 
|---|
| 203 | stats->stats_valid = true; | 
|---|
| 204 | /* Do the simple null-frac and width stats */ | 
|---|
| 205 | stats->stanullfrac = (double) null_cnt / (double) samplerows; | 
|---|
| 206 | stats->stawidth = total_width / (double) non_null_cnt; | 
|---|
| 207 |  | 
|---|
| 208 | /* Estimate that non-null values are unique */ | 
|---|
| 209 | stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); | 
|---|
| 210 |  | 
|---|
| 211 | /* Must copy the target values into anl_context */ | 
|---|
| 212 | old_cxt = MemoryContextSwitchTo(stats->anl_context); | 
|---|
| 213 |  | 
|---|
| 214 | /* | 
|---|
| 215 | * Generate a bounds histogram slot entry if there are at least two | 
|---|
| 216 | * values. | 
|---|
| 217 | */ | 
|---|
| 218 | if (non_empty_cnt >= 2) | 
|---|
| 219 | { | 
|---|
| 220 | /* Sort bound values */ | 
|---|
| 221 | qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound), | 
|---|
| 222 | range_bound_qsort_cmp, typcache); | 
|---|
| 223 | qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound), | 
|---|
| 224 | range_bound_qsort_cmp, typcache); | 
|---|
| 225 |  | 
|---|
| 226 | num_hist = non_empty_cnt; | 
|---|
| 227 | if (num_hist > num_bins) | 
|---|
| 228 | num_hist = num_bins + 1; | 
|---|
| 229 |  | 
|---|
| 230 | bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum)); | 
|---|
| 231 |  | 
|---|
| 232 | /* | 
|---|
| 233 | * The object of this loop is to construct ranges from first and | 
|---|
| 234 | * last entries in lowers[] and uppers[] along with evenly-spaced | 
|---|
| 235 | * values in between. So the i'th value is a range of lowers[(i * | 
|---|
| 236 | * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) / | 
|---|
| 237 | * (num_hist - 1)]. But computing that subscript directly risks | 
|---|
| 238 | * integer overflow when the stats target is more than a couple | 
|---|
| 239 | * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos | 
|---|
| 240 | * at each step, tracking the integral and fractional parts of the | 
|---|
| 241 | * sum separately. | 
|---|
| 242 | */ | 
|---|
| 243 | delta = (non_empty_cnt - 1) / (num_hist - 1); | 
|---|
| 244 | deltafrac = (non_empty_cnt - 1) % (num_hist - 1); | 
|---|
| 245 | pos = posfrac = 0; | 
|---|
| 246 |  | 
|---|
| 247 | for (i = 0; i < num_hist; i++) | 
|---|
| 248 | { | 
|---|
| 249 | bound_hist_values[i] = PointerGetDatum(range_serialize( | 
|---|
| 250 | typcache, &lowers[pos], &uppers[pos], false)); | 
|---|
| 251 | pos += delta; | 
|---|
| 252 | posfrac += deltafrac; | 
|---|
| 253 | if (posfrac >= (num_hist - 1)) | 
|---|
| 254 | { | 
|---|
| 255 | /* fractional part exceeds 1, carry to integer part */ | 
|---|
| 256 | pos++; | 
|---|
| 257 | posfrac -= (num_hist - 1); | 
|---|
| 258 | } | 
|---|
| 259 | } | 
|---|
| 260 |  | 
|---|
| 261 | stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM; | 
|---|
| 262 | stats->stavalues[slot_idx] = bound_hist_values; | 
|---|
| 263 | stats->numvalues[slot_idx] = num_hist; | 
|---|
| 264 | slot_idx++; | 
|---|
| 265 | } | 
|---|
| 266 |  | 
|---|
| 267 | /* | 
|---|
| 268 | * Generate a length histogram slot entry if there are at least two | 
|---|
| 269 | * values. | 
|---|
| 270 | */ | 
|---|
| 271 | if (non_empty_cnt >= 2) | 
|---|
| 272 | { | 
|---|
| 273 | /* | 
|---|
| 274 | * Ascending sort of range lengths for further filling of | 
|---|
| 275 | * histogram | 
|---|
| 276 | */ | 
|---|
| 277 | qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp); | 
|---|
| 278 |  | 
|---|
| 279 | num_hist = non_empty_cnt; | 
|---|
| 280 | if (num_hist > num_bins) | 
|---|
| 281 | num_hist = num_bins + 1; | 
|---|
| 282 |  | 
|---|
| 283 | length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum)); | 
|---|
| 284 |  | 
|---|
| 285 | /* | 
|---|
| 286 | * The object of this loop is to copy the first and last lengths[] | 
|---|
| 287 | * entries along with evenly-spaced values in between. So the i'th | 
|---|
| 288 | * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But | 
|---|
| 289 | * computing that subscript directly risks integer overflow when | 
|---|
| 290 | * the stats target is more than a couple thousand.  Instead we | 
|---|
| 291 | * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking | 
|---|
| 292 | * the integral and fractional parts of the sum separately. | 
|---|
| 293 | */ | 
|---|
| 294 | delta = (non_empty_cnt - 1) / (num_hist - 1); | 
|---|
| 295 | deltafrac = (non_empty_cnt - 1) % (num_hist - 1); | 
|---|
| 296 | pos = posfrac = 0; | 
|---|
| 297 |  | 
|---|
| 298 | for (i = 0; i < num_hist; i++) | 
|---|
| 299 | { | 
|---|
| 300 | length_hist_values[i] = Float8GetDatum(lengths[pos]); | 
|---|
| 301 | pos += delta; | 
|---|
| 302 | posfrac += deltafrac; | 
|---|
| 303 | if (posfrac >= (num_hist - 1)) | 
|---|
| 304 | { | 
|---|
| 305 | /* fractional part exceeds 1, carry to integer part */ | 
|---|
| 306 | pos++; | 
|---|
| 307 | posfrac -= (num_hist - 1); | 
|---|
| 308 | } | 
|---|
| 309 | } | 
|---|
| 310 | } | 
|---|
| 311 | else | 
|---|
| 312 | { | 
|---|
| 313 | /* | 
|---|
| 314 | * Even when we don't create the histogram, store an empty array | 
|---|
| 315 | * to mean "no histogram". We can't just leave stavalues NULL, | 
|---|
| 316 | * because get_attstatsslot() errors if you ask for stavalues, and | 
|---|
| 317 | * it's NULL. We'll still store the empty fraction in stanumbers. | 
|---|
| 318 | */ | 
|---|
| 319 | length_hist_values = palloc(0); | 
|---|
| 320 | num_hist = 0; | 
|---|
| 321 | } | 
|---|
| 322 | stats->staop[slot_idx] = Float8LessOperator; | 
|---|
| 323 | stats->stacoll[slot_idx] = InvalidOid; | 
|---|
| 324 | stats->stavalues[slot_idx] = length_hist_values; | 
|---|
| 325 | stats->numvalues[slot_idx] = num_hist; | 
|---|
| 326 | stats->statypid[slot_idx] = FLOAT8OID; | 
|---|
| 327 | stats->statyplen[slot_idx] = sizeof(float8); | 
|---|
| 328 | #ifdef USE_FLOAT8_BYVAL | 
|---|
| 329 | stats->statypbyval[slot_idx] = true; | 
|---|
| 330 | #else | 
|---|
| 331 | stats->statypbyval[slot_idx] = false; | 
|---|
| 332 | #endif | 
|---|
| 333 | stats->statypalign[slot_idx] = 'd'; | 
|---|
| 334 |  | 
|---|
| 335 | /* Store the fraction of empty ranges */ | 
|---|
| 336 | emptyfrac = (float4 *) palloc(sizeof(float4)); | 
|---|
| 337 | *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt); | 
|---|
| 338 | stats->stanumbers[slot_idx] = emptyfrac; | 
|---|
| 339 | stats->numnumbers[slot_idx] = 1; | 
|---|
| 340 |  | 
|---|
| 341 | stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM; | 
|---|
| 342 | slot_idx++; | 
|---|
| 343 |  | 
|---|
| 344 | MemoryContextSwitchTo(old_cxt); | 
|---|
| 345 | } | 
|---|
| 346 | else if (null_cnt > 0) | 
|---|
| 347 | { | 
|---|
| 348 | /* We found only nulls; assume the column is entirely null */ | 
|---|
| 349 | stats->stats_valid = true; | 
|---|
| 350 | stats->stanullfrac = 1.0; | 
|---|
| 351 | stats->stawidth = 0;	/* "unknown" */ | 
|---|
| 352 | stats->stadistinct = 0.0;	/* "unknown" */ | 
|---|
| 353 | } | 
|---|
| 354 |  | 
|---|
| 355 | /* | 
|---|
| 356 | * We don't need to bother cleaning up any of our temporary palloc's. The | 
|---|
| 357 | * hashtable should also go away, as it used a child memory context. | 
|---|
| 358 | */ | 
|---|
| 359 | } | 
|---|
| 360 |  | 
|---|