1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * sampling.c |
4 | * Relation block sampling routines. |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/utils/misc/sampling.c |
12 | * |
13 | *------------------------------------------------------------------------- |
14 | */ |
15 | |
16 | #include "postgres.h" |
17 | |
18 | #include <math.h> |
19 | |
20 | #include "utils/sampling.h" |
21 | |
22 | |
23 | /* |
24 | * BlockSampler_Init -- prepare for random sampling of blocknumbers |
25 | * |
26 | * BlockSampler provides algorithm for block level sampling of a relation |
27 | * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB") |
28 | * It selects a random sample of samplesize blocks out of |
29 | * the nblocks blocks in the table. If the table has less than |
30 | * samplesize blocks, all blocks are selected. |
31 | * |
32 | * Since we know the total number of blocks in advance, we can use the |
33 | * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's |
34 | * algorithm. |
35 | */ |
36 | void |
37 | BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, |
38 | long randseed) |
39 | { |
40 | bs->N = nblocks; /* measured table size */ |
41 | |
42 | /* |
43 | * If we decide to reduce samplesize for tables that have less or not much |
44 | * more than samplesize blocks, here is the place to do it. |
45 | */ |
46 | bs->n = samplesize; |
47 | bs->t = 0; /* blocks scanned so far */ |
48 | bs->m = 0; /* blocks selected so far */ |
49 | |
50 | sampler_random_init_state(randseed, bs->randstate); |
51 | } |
52 | |
53 | bool |
54 | BlockSampler_HasMore(BlockSampler bs) |
55 | { |
56 | return (bs->t < bs->N) && (bs->m < bs->n); |
57 | } |
58 | |
59 | BlockNumber |
60 | BlockSampler_Next(BlockSampler bs) |
61 | { |
62 | BlockNumber K = bs->N - bs->t; /* remaining blocks */ |
63 | int k = bs->n - bs->m; /* blocks still to sample */ |
64 | double p; /* probability to skip block */ |
65 | double V; /* random */ |
66 | |
67 | Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */ |
68 | |
69 | if ((BlockNumber) k >= K) |
70 | { |
71 | /* need all the rest */ |
72 | bs->m++; |
73 | return bs->t++; |
74 | } |
75 | |
76 | /*---------- |
77 | * It is not obvious that this code matches Knuth's Algorithm S. |
78 | * Knuth says to skip the current block with probability 1 - k/K. |
79 | * If we are to skip, we should advance t (hence decrease K), and |
80 | * repeat the same probabilistic test for the next block. The naive |
81 | * implementation thus requires a sampler_random_fract() call for each |
82 | * block number. But we can reduce this to one sampler_random_fract() |
83 | * call per selected block, by noting that each time the while-test |
84 | * succeeds, we can reinterpret V as a uniform random number in the range |
85 | * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be |
86 | * the appropriate fraction of its former value, and our next loop |
87 | * makes the appropriate probabilistic test. |
88 | * |
89 | * We have initially K > k > 0. If the loop reduces K to equal k, |
90 | * the next while-test must fail since p will become exactly zero |
91 | * (we assume there will not be roundoff error in the division). |
92 | * (Note: Knuth suggests a "<=" loop condition, but we use "<" just |
93 | * to be doubly sure about roundoff error.) Therefore K cannot become |
94 | * less than k, which means that we cannot fail to select enough blocks. |
95 | *---------- |
96 | */ |
97 | V = sampler_random_fract(bs->randstate); |
98 | p = 1.0 - (double) k / (double) K; |
99 | while (V < p) |
100 | { |
101 | /* skip */ |
102 | bs->t++; |
103 | K--; /* keep K == N - t */ |
104 | |
105 | /* adjust p to be new cutoff point in reduced range */ |
106 | p *= 1.0 - (double) k / (double) K; |
107 | } |
108 | |
109 | /* select */ |
110 | bs->m++; |
111 | return bs->t++; |
112 | } |
113 | |
114 | /* |
115 | * These two routines embody Algorithm Z from "Random sampling with a |
116 | * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 |
117 | * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms |
118 | * of the count S of records to skip before processing another record. |
119 | * It is computed primarily based on t, the number of records already read. |
120 | * The only extra state needed between calls is W, a random state variable. |
121 | * |
122 | * reservoir_init_selection_state computes the initial W value. |
123 | * |
124 | * Given that we've already read t records (t >= n), reservoir_get_next_S |
125 | * determines the number of records to skip before the next record is |
126 | * processed. |
127 | */ |
128 | void |
129 | reservoir_init_selection_state(ReservoirState rs, int n) |
130 | { |
131 | /* |
132 | * Reservoir sampling is not used anywhere where it would need to return |
133 | * repeatable results so we can initialize it randomly. |
134 | */ |
135 | sampler_random_init_state(random(), rs->randstate); |
136 | |
137 | /* Initial value of W (for use when Algorithm Z is first applied) */ |
138 | rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n); |
139 | } |
140 | |
141 | double |
142 | reservoir_get_next_S(ReservoirState rs, double t, int n) |
143 | { |
144 | double S; |
145 | |
146 | /* The magic constant here is T from Vitter's paper */ |
147 | if (t <= (22.0 * n)) |
148 | { |
149 | /* Process records using Algorithm X until t is large enough */ |
150 | double V, |
151 | quot; |
152 | |
153 | V = sampler_random_fract(rs->randstate); /* Generate V */ |
154 | S = 0; |
155 | t += 1; |
156 | /* Note: "num" in Vitter's code is always equal to t - n */ |
157 | quot = (t - (double) n) / t; |
158 | /* Find min S satisfying (4.1) */ |
159 | while (quot > V) |
160 | { |
161 | S += 1; |
162 | t += 1; |
163 | quot *= (t - (double) n) / t; |
164 | } |
165 | } |
166 | else |
167 | { |
168 | /* Now apply Algorithm Z */ |
169 | double W = rs->W; |
170 | double term = t - (double) n + 1; |
171 | |
172 | for (;;) |
173 | { |
174 | double numer, |
175 | numer_lim, |
176 | denom; |
177 | double U, |
178 | X, |
179 | lhs, |
180 | rhs, |
181 | y, |
182 | tmp; |
183 | |
184 | /* Generate U and X */ |
185 | U = sampler_random_fract(rs->randstate); |
186 | X = t * (W - 1.0); |
187 | S = floor(X); /* S is tentatively set to floor(X) */ |
188 | /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ |
189 | tmp = (t + 1) / term; |
190 | lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); |
191 | rhs = (((t + X) / (term + S)) * term) / t; |
192 | if (lhs <= rhs) |
193 | { |
194 | W = rhs / lhs; |
195 | break; |
196 | } |
197 | /* Test if U <= f(S)/cg(X) */ |
198 | y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); |
199 | if ((double) n < S) |
200 | { |
201 | denom = t; |
202 | numer_lim = term + S; |
203 | } |
204 | else |
205 | { |
206 | denom = t - (double) n + S; |
207 | numer_lim = t + 1; |
208 | } |
209 | for (numer = t + S; numer >= numer_lim; numer -= 1) |
210 | { |
211 | y *= numer / denom; |
212 | denom -= 1; |
213 | } |
214 | W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */ |
215 | if (exp(log(y) / n) <= (t + X) / t) |
216 | break; |
217 | } |
218 | rs->W = W; |
219 | } |
220 | return S; |
221 | } |
222 | |
223 | |
224 | /*---------- |
225 | * Random number generator used by sampling |
226 | *---------- |
227 | */ |
228 | void |
229 | sampler_random_init_state(long seed, SamplerRandomState randstate) |
230 | { |
231 | randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */ |
232 | randstate[1] = (unsigned short) seed; |
233 | randstate[2] = (unsigned short) (seed >> 16); |
234 | } |
235 | |
236 | /* Select a random value R uniformly distributed in (0 - 1) */ |
237 | double |
238 | sampler_random_fract(SamplerRandomState randstate) |
239 | { |
240 | double res; |
241 | |
242 | /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */ |
243 | do |
244 | { |
245 | res = pg_erand48(randstate); |
246 | } while (res == 0.0); |
247 | return res; |
248 | } |
249 | |
250 | |
251 | /* |
252 | * Backwards-compatible API for block sampling |
253 | * |
254 | * This code is now deprecated, but since it's still in use by many FDWs, |
255 | * we should keep it for awhile at least. The functionality is the same as |
256 | * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S, |
257 | * except that a common random state is used across all callers. |
258 | */ |
259 | static ReservoirStateData oldrs; |
260 | |
261 | double |
262 | anl_random_fract(void) |
263 | { |
264 | /* initialize if first time through */ |
265 | if (oldrs.randstate[0] == 0) |
266 | sampler_random_init_state(random(), oldrs.randstate); |
267 | |
268 | /* and compute a random fraction */ |
269 | return sampler_random_fract(oldrs.randstate); |
270 | } |
271 | |
272 | double |
273 | anl_init_selection_state(int n) |
274 | { |
275 | /* initialize if first time through */ |
276 | if (oldrs.randstate[0] == 0) |
277 | sampler_random_init_state(random(), oldrs.randstate); |
278 | |
279 | /* Initial value of W (for use when Algorithm Z is first applied) */ |
280 | return exp(-log(sampler_random_fract(oldrs.randstate)) / n); |
281 | } |
282 | |
283 | double |
284 | anl_get_next_S(double t, int n, double *stateptr) |
285 | { |
286 | double result; |
287 | |
288 | oldrs.W = *stateptr; |
289 | result = reservoir_get_next_S(&oldrs, t, n); |
290 | *stateptr = oldrs.W; |
291 | return result; |
292 | } |
293 | |