1/*-------------------------------------------------------------------------
2 *
3 * pg_statistic.h
4 * definition of the "statistics" system catalog (pg_statistic)
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/include/catalog/pg_statistic.h
11 *
12 * NOTES
13 * The Catalog.pm module reads this file and derives schema
14 * information.
15 *
16 *-------------------------------------------------------------------------
17 */
18#ifndef PG_STATISTIC_H
19#define PG_STATISTIC_H
20
21#include "catalog/genbki.h"
22#include "catalog/pg_statistic_d.h"
23
24/* ----------------
25 * pg_statistic definition. cpp turns this into
26 * typedef struct FormData_pg_statistic
27 * ----------------
28 */
29CATALOG(pg_statistic,2619,StatisticRelationId)
30{
31 /* These fields form the unique key for the entry: */
32 Oid starelid; /* relation containing attribute */
33 int16 staattnum; /* attribute (column) stats are for */
34 bool stainherit; /* true if inheritance children are included */
35
36 /* the fraction of the column's entries that are NULL: */
37 float4 stanullfrac;
38
39 /*
40 * stawidth is the average width in bytes of non-null entries. For
41 * fixed-width datatypes this is of course the same as the typlen, but for
42 * var-width types it is more useful. Note that this is the average width
43 * of the data as actually stored, post-TOASTing (eg, for a
44 * moved-out-of-line value, only the size of the pointer object is
45 * counted). This is the appropriate definition for the primary use of
46 * the statistic, which is to estimate sizes of in-memory hash tables of
47 * tuples.
48 */
49 int32 stawidth;
50
51 /* ----------------
52 * stadistinct indicates the (approximate) number of distinct non-null
53 * data values in the column. The interpretation is:
54 *    0     unknown or not computed
55 *    > 0   actual number of distinct values
56 *    < 0   negative of multiplier for number of rows
57 * The special negative case allows us to cope with columns that are
58 * unique (stadistinct = -1) or nearly so (for example, a column in which
59 * non-null values appear about twice on the average could be represented
60 * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the
61 * column is nulls). Because the number-of-rows statistic in pg_class may
62 * be updated more frequently than pg_statistic is, it's important to be
63 * able to describe such situations as a multiple of the number of rows,
64 * rather than a fixed number of distinct values. But in other cases a
65 * fixed number is correct (eg, a boolean column).
66 * ----------------
67 */
68 float4 stadistinct;
69
70 /* ----------------
71 * To allow keeping statistics on different kinds of datatypes,
72 * we do not hard-wire any particular meaning for the remaining
73 * statistical fields. Instead, we provide several "slots" in which
74 * statistical data can be placed. Each slot includes:
75 *    kind      integer code identifying kind of data (see below)
76 *    op        OID of associated operator, if needed
77 *    coll      OID of relevant collation, or 0 if none
78 *    numbers   float4 array (for statistical values)
79 *    values    anyarray (for representations of data values)
80 * The kind, operator, and collation fields are never NULL; they are zeroes
81 * in an unused slot. The numbers and values fields are NULL in an
82 * unused slot, and might also be NULL in a used slot if the slot kind
83 * has no need for one or the other.
84 * ----------------
85 */
86
87 int16 stakind1; /* stakindN: kind code of slot N, 0 if slot is unused */
88 int16 stakind2;
89 int16 stakind3;
90 int16 stakind4;
91 int16 stakind5;
92
93 Oid staop1; /* staopN: operator OID of slot N, 0 if none or unused */
94 Oid staop2;
95 Oid staop3;
96 Oid staop4;
97 Oid staop5;
98
99 Oid stacoll1; /* stacollN: collation OID of slot N, 0 if none or unused */
100 Oid stacoll2;
101 Oid stacoll3;
102 Oid stacoll4;
103 Oid stacoll5;
104
105#ifdef CATALOG_VARLEN /* variable-length fields start here */
106 float4 stanumbers1[1]; /* stanumbersN: float4 array of slot N, or NULL */
107 float4 stanumbers2[1];
108 float4 stanumbers3[1];
109 float4 stanumbers4[1];
110 float4 stanumbers5[1];
111
112 /*
113 * Values in these arrays are values of the column's data type, or of some
114 * related type such as an array element type. We presently have to cheat
115 * quite a bit to allow polymorphic arrays of this kind, but perhaps
116 * someday it'll be a less bogus facility.
117 */
118 anyarray stavalues1; /* stavaluesN: data values of slot N, or NULL */
119 anyarray stavalues2;
120 anyarray stavalues3;
121 anyarray stavalues4;
122 anyarray stavalues5;
123#endif
124} FormData_pg_statistic;
125
126#define STATISTIC_NUM_SLOTS 5 /* number of stakindN/staopN/stacollN/stanumbersN/stavaluesN slots in FormData_pg_statistic */
127
128
129/* ----------------
130 * Form_pg_statistic corresponds to a pointer to a tuple with
131 * the format of the pg_statistic relation.
132 * ----------------
133 */
134typedef FormData_pg_statistic *Form_pg_statistic;
135
136#ifdef EXPOSE_TO_CLIENT_CODE
137
138/*
139 * Several statistical slot "kinds" are defined by core PostgreSQL, as
140 * documented below. Also, custom data types can define their own "kind"
141 * codes by mutual agreement between a custom typanalyze routine and the
142 * selectivity estimation functions of the type's operators.
143 *
144 * Code reading the pg_statistic relation should not assume that a particular
145 * data "kind" will appear in any particular slot. Instead, search the
146 * stakind fields to see if the desired data is available. (The standard
147 * function get_attstatsslot() may be used for this.)
148 */
149
150/*
151 * The present allocation of "kind" codes is:
152 *
153 * 1-99: reserved for assignment by the core PostgreSQL project
154 * (values in this range will be documented in this file)
155 * 100-199: reserved for assignment by the PostGIS project
156 * (values to be documented in PostGIS documentation)
157 * 200-299: reserved for assignment by the ESRI ST_Geometry project
158 * (values to be documented in ESRI ST_Geometry documentation)
159 * 300-9999: reserved for future public assignments
160 *
161 * For private use you may choose a "kind" code at random in the range
162 * 10000-30000. However, for code that is to be widely disseminated it is
163 * better to obtain a publicly defined "kind" code by request from the
164 * PostgreSQL Global Development Group.
165 */
166
167/*
168 * In a "most common values" slot, staop is the OID of the "=" operator
169 * used to decide whether values are the same or not, and stacoll is the
170 * collation used (same as column's collation). stavalues contains
171 * the K most common non-null values appearing in the column, and stanumbers
172 * contains their frequencies (fractions of total row count). The values
173 * shall be ordered in decreasing frequency. Note that since the arrays are
174 * variable-size, K may be chosen at ANALYZE time. Values should
175 * not appear in MCV unless they have been observed to occur more than once;
176 * a unique column will have no MCV slot.
177 */
178#define STATISTIC_KIND_MCV 1 /* most common values */
179
180/*
181 * A "histogram" slot describes the distribution of scalar data. staop is
182 * the OID of the "<" operator that describes the sort ordering, and stacoll
183 * is the relevant collation. (In theory more than one histogram could appear,
184 * if a datatype has more than one useful sort operator or we care about more
185 * than one collation. Currently the collation will always be that of the
186 * underlying column.) stavalues contains M (>=2) non-null values that
187 * divide the non-null column data values into M-1 bins of approximately equal
188 * population. The first stavalues item is the MIN and the last is the MAX.
189 * stanumbers is not used and should be NULL. IMPORTANT POINT: if an MCV
190 * slot is also provided, then the histogram describes the data distribution
191 * *after removing the values listed in MCV* (thus, it's a "compressed
192 * histogram" in the technical parlance). This allows a more accurate
193 * representation of the distribution of a column with some very-common
194 * values. In a column with only a few distinct values, it's possible that
195 * the MCV list describes the entire data population; in this case the
196 * histogram reduces to empty and should be omitted.
197 */
198#define STATISTIC_KIND_HISTOGRAM 2 /* scalar histogram (excludes MCV values) */
199
200/*
201 * A "correlation" slot describes the correlation between the physical order
202 * of table tuples and the ordering of data values of this column, as seen
203 * by the "<" operator identified by staop with the collation identified by
204 * stacoll. (As with the histogram, more than one entry could theoretically
205 * appear.) stavalues is not used and should be NULL. stanumbers contains
206 * a single entry, the correlation coefficient between the sequence of data
207 * values and the sequence of their actual tuple positions. The coefficient
208 * ranges from +1 to -1.
209 */
210#define STATISTIC_KIND_CORRELATION 3 /* physical order vs. value order correlation */
211
212/*
213 * A "most common elements" slot is similar to a "most common values" slot,
214 * except that it stores the most common non-null *elements* of the column
215 * values. This is useful when the column datatype is an array or some other
216 * type with identifiable elements (for instance, tsvector). staop contains
217 * the equality operator appropriate to the element type, and stacoll
218 * contains the collation to use with it. stavalues contains
219 * the most common element values, and stanumbers their frequencies. Unlike
220 * MCV slots, frequencies are measured as the fraction of non-null rows the
221 * element value appears in, not as a fraction of all rows. Also unlike
222 * MCV slots, the values are sorted into the element type's default order
223 * (to support binary search for a particular value). Since this puts the
224 * minimum and maximum frequencies at unpredictable spots in stanumbers,
225 * there are two extra members of stanumbers, holding copies of the minimum
226 * and maximum frequencies. Optionally, there can be a third extra member,
227 * which holds the frequency of null elements (expressed in the same terms:
228 * the fraction of non-null rows that contain at least one null element). If
229 * this member is omitted, the column is presumed to contain no null elements.
230 *
231 * Note: in current usage for tsvector columns, the stavalues elements are of
232 * type text, even though their representation within tsvector is not
233 * exactly text.
234 */
235#define STATISTIC_KIND_MCELEM 4 /* most common elements */
236
237/*
238 * A "distinct elements count histogram" slot describes the distribution of
239 * the number of distinct element values present in each row of an array-type
240 * column. Only non-null rows are considered, and only non-null elements.
241 * staop contains the equality operator appropriate to the element type,
242 * and stacoll contains the collation to use with it.
243 * stavalues is not used and should be NULL. The last member of stanumbers is
244 * the average count of distinct element values over all non-null rows. The
245 * preceding M (>=2) members form a histogram that divides the population of
246 * distinct-elements counts into M-1 bins of approximately equal population.
247 * The first of these is the minimum observed count, and the last the maximum.
248 */
249#define STATISTIC_KIND_DECHIST 5 /* distinct elements count histogram */
250
251/*
252 * A "length histogram" slot describes the distribution of range lengths in
253 * rows of a range-type column. stanumbers contains a single entry, the
254 * fraction of empty ranges. stavalues is a histogram of non-empty lengths, in
255 * a format similar to STATISTIC_KIND_HISTOGRAM: it contains M (>=2) length
256 * values that divide the observed lengths into M-1 bins of approximately
257 * equal population. The lengths are stored as float8s, as measured by the
258 * range type's subdiff function. Only non-null rows are considered.
259 */
260#define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM 6 /* histogram of range lengths */
261
262/*
263 * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
264 * a range-type column. stavalues contains M (>=2) range values that divide
265 * the column data values into M-1 bins of approximately equal population.
266 * Unlike a regular scalar histogram, this is actually two histograms combined
267 * into a single array, with the lower bounds of each value forming a
268 * histogram of lower bounds, and the upper bounds a histogram of upper
269 * bounds. Only non-NULL, non-empty ranges are included.
270 */
271#define STATISTIC_KIND_BOUNDS_HISTOGRAM 7 /* histogram of range lower/upper bounds */
272
273#endif /* EXPOSE_TO_CLIENT_CODE */
274
275#endif /* PG_STATISTIC_H */
276