1/*-------------------------------------------------------------------------
2 *
3 * sampling.c
4 * Relation block sampling routines.
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/misc/sampling.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include <math.h>
19
20#include "utils/sampling.h"
21
22
23/*
24 * BlockSampler_Init -- prepare for random sampling of blocknumbers
25 *
26 * BlockSampler provides algorithm for block level sampling of a relation
27 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28 * It selects a random sample of samplesize blocks out of
29 * the nblocks blocks in the table. If the table has less than
30 * samplesize blocks, all blocks are selected.
31 *
32 * Since we know the total number of blocks in advance, we can use the
33 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34 * algorithm.
35 */
36void
37BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38 long randseed)
39{
40 bs->N = nblocks; /* measured table size */
41
42 /*
43 * If we decide to reduce samplesize for tables that have less or not much
44 * more than samplesize blocks, here is the place to do it.
45 */
46 bs->n = samplesize;
47 bs->t = 0; /* blocks scanned so far */
48 bs->m = 0; /* blocks selected so far */
49
50 sampler_random_init_state(randseed, bs->randstate);
51}
52
53bool
54BlockSampler_HasMore(BlockSampler bs)
55{
56 return (bs->t < bs->N) && (bs->m < bs->n);
57}
58
59BlockNumber
60BlockSampler_Next(BlockSampler bs)
61{
62 BlockNumber K = bs->N - bs->t; /* remaining blocks */
63 int k = bs->n - bs->m; /* blocks still to sample */
64 double p; /* probability to skip block */
65 double V; /* random */
66
67 Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
68
69 if ((BlockNumber) k >= K)
70 {
71 /* need all the rest */
72 bs->m++;
73 return bs->t++;
74 }
75
76 /*----------
77 * It is not obvious that this code matches Knuth's Algorithm S.
78 * Knuth says to skip the current block with probability 1 - k/K.
79 * If we are to skip, we should advance t (hence decrease K), and
80 * repeat the same probabilistic test for the next block. The naive
81 * implementation thus requires a sampler_random_fract() call for each
82 * block number. But we can reduce this to one sampler_random_fract()
83 * call per selected block, by noting that each time the while-test
84 * succeeds, we can reinterpret V as a uniform random number in the range
85 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86 * the appropriate fraction of its former value, and our next loop
87 * makes the appropriate probabilistic test.
88 *
89 * We have initially K > k > 0. If the loop reduces K to equal k,
90 * the next while-test must fail since p will become exactly zero
91 * (we assume there will not be roundoff error in the division).
92 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93 * to be doubly sure about roundoff error.) Therefore K cannot become
94 * less than k, which means that we cannot fail to select enough blocks.
95 *----------
96 */
97 V = sampler_random_fract(bs->randstate);
98 p = 1.0 - (double) k / (double) K;
99 while (V < p)
100 {
101 /* skip */
102 bs->t++;
103 K--; /* keep K == N - t */
104
105 /* adjust p to be new cutoff point in reduced range */
106 p *= 1.0 - (double) k / (double) K;
107 }
108
109 /* select */
110 bs->m++;
111 return bs->t++;
112}
113
114/*
115 * These two routines embody Algorithm Z from "Random sampling with a
116 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117 * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
118 * of the count S of records to skip before processing another record.
119 * It is computed primarily based on t, the number of records already read.
120 * The only extra state needed between calls is W, a random state variable.
121 *
122 * reservoir_init_selection_state computes the initial W value.
123 *
124 * Given that we've already read t records (t >= n), reservoir_get_next_S
125 * determines the number of records to skip before the next record is
126 * processed.
127 */
128void
129reservoir_init_selection_state(ReservoirState rs, int n)
130{
131 /*
132 * Reservoir sampling is not used anywhere where it would need to return
133 * repeatable results so we can initialize it randomly.
134 */
135 sampler_random_init_state(random(), rs->randstate);
136
137 /* Initial value of W (for use when Algorithm Z is first applied) */
138 rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139}
140
141double
142reservoir_get_next_S(ReservoirState rs, double t, int n)
143{
144 double S;
145
146 /* The magic constant here is T from Vitter's paper */
147 if (t <= (22.0 * n))
148 {
149 /* Process records using Algorithm X until t is large enough */
150 double V,
151 quot;
152
153 V = sampler_random_fract(rs->randstate); /* Generate V */
154 S = 0;
155 t += 1;
156 /* Note: "num" in Vitter's code is always equal to t - n */
157 quot = (t - (double) n) / t;
158 /* Find min S satisfying (4.1) */
159 while (quot > V)
160 {
161 S += 1;
162 t += 1;
163 quot *= (t - (double) n) / t;
164 }
165 }
166 else
167 {
168 /* Now apply Algorithm Z */
169 double W = rs->W;
170 double term = t - (double) n + 1;
171
172 for (;;)
173 {
174 double numer,
175 numer_lim,
176 denom;
177 double U,
178 X,
179 lhs,
180 rhs,
181 y,
182 tmp;
183
184 /* Generate U and X */
185 U = sampler_random_fract(rs->randstate);
186 X = t * (W - 1.0);
187 S = floor(X); /* S is tentatively set to floor(X) */
188 /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189 tmp = (t + 1) / term;
190 lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191 rhs = (((t + X) / (term + S)) * term) / t;
192 if (lhs <= rhs)
193 {
194 W = rhs / lhs;
195 break;
196 }
197 /* Test if U <= f(S)/cg(X) */
198 y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199 if ((double) n < S)
200 {
201 denom = t;
202 numer_lim = term + S;
203 }
204 else
205 {
206 denom = t - (double) n + S;
207 numer_lim = t + 1;
208 }
209 for (numer = t + S; numer >= numer_lim; numer -= 1)
210 {
211 y *= numer / denom;
212 denom -= 1;
213 }
214 W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215 if (exp(log(y) / n) <= (t + X) / t)
216 break;
217 }
218 rs->W = W;
219 }
220 return S;
221}
222
223
224/*----------
225 * Random number generator used by sampling
226 *----------
227 */
228void
229sampler_random_init_state(long seed, SamplerRandomState randstate)
230{
231 randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
232 randstate[1] = (unsigned short) seed;
233 randstate[2] = (unsigned short) (seed >> 16);
234}
235
236/* Select a random value R uniformly distributed in (0 - 1) */
237double
238sampler_random_fract(SamplerRandomState randstate)
239{
240 double res;
241
242 /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243 do
244 {
245 res = pg_erand48(randstate);
246 } while (res == 0.0);
247 return res;
248}
249
250
251/*
252 * Backwards-compatible API for block sampling
253 *
254 * This code is now deprecated, but since it's still in use by many FDWs,
255 * we should keep it for awhile at least. The functionality is the same as
256 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257 * except that a common random state is used across all callers.
258 */
259static ReservoirStateData oldrs;
260
261double
262anl_random_fract(void)
263{
264 /* initialize if first time through */
265 if (oldrs.randstate[0] == 0)
266 sampler_random_init_state(random(), oldrs.randstate);
267
268 /* and compute a random fraction */
269 return sampler_random_fract(oldrs.randstate);
270}
271
272double
273anl_init_selection_state(int n)
274{
275 /* initialize if first time through */
276 if (oldrs.randstate[0] == 0)
277 sampler_random_init_state(random(), oldrs.randstate);
278
279 /* Initial value of W (for use when Algorithm Z is first applied) */
280 return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281}
282
283double
284anl_get_next_S(double t, int n, double *stateptr)
285{
286 double result;
287
288 oldrs.W = *stateptr;
289 result = reservoir_get_next_S(&oldrs, t, n);
290 *stateptr = oldrs.W;
291 return result;
292}
293