sampling.c source code [PostgreSQL/src/backend/utils/misc/sampling.c]

1	/-------------------------------------------------------------------------*
2	*
3	* sampling.c
4	* Relation block sampling routines.
5	*
6	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7	* Portions Copyright (c) 1994, Regents of the University of California
8	*
9	*
10	* IDENTIFICATION
11	* src/backend/utils/misc/sampling.c
12	*
13	*-------------------------------------------------------------------------
14	*/
15
16	#include "postgres.h"
17
18	#include <math.h>
19
20	#include "utils/sampling.h"
21
22
23	/*
24	* BlockSampler_Init -- prepare for random sampling of blocknumbers
25	*
26	* BlockSampler provides algorithm for block level sampling of a relation
27	* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28	* It selects a random sample of samplesize blocks out of
29	* the nblocks blocks in the table. If the table has less than
30	* samplesize blocks, all blocks are selected.
31	*
32	* Since we know the total number of blocks in advance, we can use the
33	* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34	* algorithm.
35	*/
36	void
37	BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38	long randseed)
39	{
40	bs->N = nblocks; / measured table size /
41
42	/*
43	* If we decide to reduce samplesize for tables that have less or not much
44	* more than samplesize blocks, here is the place to do it.
45	*/
46	bs->n = samplesize;
47	bs->t = `0`; / blocks scanned so far /
48	bs->m = `0`; / blocks selected so far /
49
50	sampler_random_init_state(randseed, bs->randstate);
51	}
52
53	bool
54	BlockSampler_HasMore(BlockSampler bs)
55	{
56	return (bs->t < bs->N) && (bs->m < bs->n);
57	}
58
59	BlockNumber
60	BlockSampler_Next(BlockSampler bs)
61	{
62	BlockNumber K = bs->N - bs->t; / remaining blocks /
63	int k = bs->n - bs->m; / blocks still to sample /
64	double p; / probability to skip block /
65	double V; / random /
66
67	Assert(BlockSampler_HasMore(bs)); / hence K > 0 and k > 0 /
68
69	if ((BlockNumber) k >= K)
70	{
71	/ need all the rest /
72	bs->m++;
73	return bs->t++;
74	}
75
76	/----------*
77	* It is not obvious that this code matches Knuth's Algorithm S.
78	* Knuth says to skip the current block with probability 1 - k/K.
79	* If we are to skip, we should advance t (hence decrease K), and
80	* repeat the same probabilistic test for the next block. The naive
81	* implementation thus requires a sampler_random_fract() call for each
82	* block number. But we can reduce this to one sampler_random_fract()
83	* call per selected block, by noting that each time the while-test
84	* succeeds, we can reinterpret V as a uniform random number in the range
85	* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86	* the appropriate fraction of its former value, and our next loop
87	* makes the appropriate probabilistic test.
88	*
89	* We have initially K > k > 0. If the loop reduces K to equal k,
90	* the next while-test must fail since p will become exactly zero
91	* (we assume there will not be roundoff error in the division).
92	* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93	* to be doubly sure about roundoff error.) Therefore K cannot become
94	* less than k, which means that we cannot fail to select enough blocks.
95	*----------
96	*/
97	V = sampler_random_fract(bs->randstate);
98	p = `1.0` - (double) k / (double) K;
99	while (V < p)
100	{
101	/ skip /
102	bs->t++;
103	K--; / keep K == N - t /
104
105	/ adjust p to be new cutoff point in reduced range /
106	p = `1.0` - (double) k / (double*) K;
107	}
108
109	/ select /
110	bs->m++;
111	return bs->t++;
112	}
113
114	/*
115	* These two routines embody Algorithm Z from "Random sampling with a
116	* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117	* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
118	* of the count S of records to skip before processing another record.
119	* It is computed primarily based on t, the number of records already read.
120	* The only extra state needed between calls is W, a random state variable.
121	*
122	* reservoir_init_selection_state computes the initial W value.
123	*
124	* Given that we've already read t records (t >= n), reservoir_get_next_S
125	* determines the number of records to skip before the next record is
126	* processed.
127	*/
128	void
129	reservoir_init_selection_state(ReservoirState rs, int n)
130	{
131	/*
132	* Reservoir sampling is not used anywhere where it would need to return
133	* repeatable results so we can initialize it randomly.
134	*/
135	sampler_random_init_state(random(), rs->randstate);
136
137	/ Initial value of W (for use when Algorithm Z is first applied) /
138	rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139	}
140
141	double
142	reservoir_get_next_S(ReservoirState rs, double t, int n)
143	{
144	double S;
145
146	/ The magic constant here is T from Vitter's paper /
147	if (t <= (`22.0` * n))
148	{
149	/ Process records using Algorithm X until t is large enough /
150	double V,
151	quot;
152
153	V = sampler_random_fract(rs->randstate); / Generate V /
154	S = `0`;
155	t += `1`;
156	/ Note: "num" in Vitter's code is always equal to t - n /
157	quot = (t - (double) n) / t;
158	/ Find min S satisfying (4.1) /
159	while (quot > V)
160	{
161	S += `1`;
162	t += `1`;
163	quot = (t - (double*) n) / t;
164	}
165	}
166	else
167	{
168	/ Now apply Algorithm Z /
169	double W = rs->W;
170	double term = t - (double) n + `1`;
171
172	for (;;)
173	{
174	double numer,
175	numer_lim,
176	denom;
177	double U,
178	X,
179	lhs,
180	rhs,
181	y,
182	tmp;
183
184	/ Generate U and X /
185	U = sampler_random_fract(rs->randstate);
186	X = t * (W - `1.0`);
187	S = floor(X); / S is tentatively set to floor(X) /
188	/ Test if U <= h(S)/cg(X) in the manner of (6.3) /
189	tmp = (t + `1`) / term;
190	lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191	rhs = (((t + X) / (term + S)) * term) / t;
192	if (lhs <= rhs)
193	{
194	W = rhs / lhs;
195	break;
196	}
197	/ Test if U <= f(S)/cg(X) /
198	y = (((U * (t + `1`)) / term) * (t + S + `1`)) / (t + X);
199	if ((double) n < S)
200	{
201	denom = t;
202	numer_lim = term + S;
203	}
204	else
205	{
206	denom = t - (double) n + S;
207	numer_lim = t + `1`;
208	}
209	for (numer = t + S; numer >= numer_lim; numer -= `1`)
210	{
211	y *= numer / denom;
212	denom -= `1`;
213	}
214	W = exp(-log(sampler_random_fract(rs->randstate)) / n); / Generate W in advance /
215	if (exp(log(y) / n) <= (t + X) / t)
216	break;
217	}
218	rs->W = W;
219	}
220	return S;
221	}
222
223
224	/----------*
225	* Random number generator used by sampling
226	*----------
227	*/
228	void
229	sampler_random_init_state(long seed, SamplerRandomState randstate)
230	{
231	randstate[`0`] = `0x330e`; / same as pg_erand48, but could be anything /
232	randstate[`1`] = (unsigned short) seed;
233	randstate[`2`] = (unsigned short) (seed >> `16`);
234	}
235
236	/ Select a random value R uniformly distributed in (0 - 1) /
237	double
238	sampler_random_fract(SamplerRandomState randstate)
239	{
240	double res;
241
242	/ pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 /
243	do
244	{
245	res = pg_erand48(randstate);
246	} while (res == `0.0`);
247	return res;
248	}
249
250
251	/*
252	* Backwards-compatible API for block sampling
253	*
254	* This code is now deprecated, but since it's still in use by many FDWs,
255	* we should keep it for awhile at least. The functionality is the same as
256	* sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257	* except that a common random state is used across all callers.
258	*/
259	static ReservoirStateData oldrs;
260
261	double
262	anl_random_fract(void)
263	{
264	/ initialize if first time through /
265	if (oldrs.randstate[`0`] == `0`)
266	sampler_random_init_state(random(), oldrs.randstate);
267
268	/ and compute a random fraction /
269	return sampler_random_fract(oldrs.randstate);
270	}
271
272	double
273	anl_init_selection_state(int n)
274	{
275	/ initialize if first time through /
276	if (oldrs.randstate[`0`] == `0`)
277	sampler_random_init_state(random(), oldrs.randstate);
278
279	/ Initial value of W (for use when Algorithm Z is first applied) /
280	return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281	}
282
283	double
284	anl_get_next_S(double t, int n, double *stateptr)
285	{
286	double result;
287
288	oldrs.W = *stateptr;
289	result = reservoir_get_next_S(&oldrs, t, n);
290	*stateptr = oldrs.W;
291	return result;
292	}
293

Browse the source code of PostgreSQL/src/backend/utils/misc/sampling.c