| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * sampling.c |
| 4 | * Relation block sampling routines. |
| 5 | * |
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * Portions Copyright (c) 1994, Regents of the University of California |
| 8 | * |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/backend/utils/misc/sampling.c |
| 12 | * |
| 13 | *------------------------------------------------------------------------- |
| 14 | */ |
| 15 | |
| 16 | #include "postgres.h" |
| 17 | |
| 18 | #include <math.h> |
| 19 | |
| 20 | #include "utils/sampling.h" |
| 21 | |
| 22 | |
| 23 | /* |
| 24 | * BlockSampler_Init -- prepare for random sampling of blocknumbers |
| 25 | * |
| 26 | * BlockSampler provides algorithm for block level sampling of a relation |
| 27 | * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB") |
| 28 | * It selects a random sample of samplesize blocks out of |
| 29 | * the nblocks blocks in the table. If the table has less than |
| 30 | * samplesize blocks, all blocks are selected. |
| 31 | * |
| 32 | * Since we know the total number of blocks in advance, we can use the |
| 33 | * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's |
| 34 | * algorithm. |
| 35 | */ |
| 36 | void |
| 37 | BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, |
| 38 | long randseed) |
| 39 | { |
| 40 | bs->N = nblocks; /* measured table size */ |
| 41 | |
| 42 | /* |
| 43 | * If we decide to reduce samplesize for tables that have less or not much |
| 44 | * more than samplesize blocks, here is the place to do it. |
| 45 | */ |
| 46 | bs->n = samplesize; |
| 47 | bs->t = 0; /* blocks scanned so far */ |
| 48 | bs->m = 0; /* blocks selected so far */ |
| 49 | |
| 50 | sampler_random_init_state(randseed, bs->randstate); |
| 51 | } |
| 52 | |
| 53 | bool |
| 54 | BlockSampler_HasMore(BlockSampler bs) |
| 55 | { |
| 56 | return (bs->t < bs->N) && (bs->m < bs->n); |
| 57 | } |
| 58 | |
| 59 | BlockNumber |
| 60 | BlockSampler_Next(BlockSampler bs) |
| 61 | { |
| 62 | BlockNumber K = bs->N - bs->t; /* remaining blocks */ |
| 63 | int k = bs->n - bs->m; /* blocks still to sample */ |
| 64 | double p; /* probability to skip block */ |
| 65 | double V; /* random */ |
| 66 | |
| 67 | Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */ |
| 68 | |
| 69 | if ((BlockNumber) k >= K) |
| 70 | { |
| 71 | /* need all the rest */ |
| 72 | bs->m++; |
| 73 | return bs->t++; |
| 74 | } |
| 75 | |
| 76 | /*---------- |
| 77 | * It is not obvious that this code matches Knuth's Algorithm S. |
| 78 | * Knuth says to skip the current block with probability 1 - k/K. |
| 79 | * If we are to skip, we should advance t (hence decrease K), and |
| 80 | * repeat the same probabilistic test for the next block. The naive |
| 81 | * implementation thus requires a sampler_random_fract() call for each |
| 82 | * block number. But we can reduce this to one sampler_random_fract() |
| 83 | * call per selected block, by noting that each time the while-test |
| 84 | * succeeds, we can reinterpret V as a uniform random number in the range |
| 85 | * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be |
| 86 | * the appropriate fraction of its former value, and our next loop |
| 87 | * makes the appropriate probabilistic test. |
| 88 | * |
| 89 | * We have initially K > k > 0. If the loop reduces K to equal k, |
| 90 | * the next while-test must fail since p will become exactly zero |
| 91 | * (we assume there will not be roundoff error in the division). |
| 92 | * (Note: Knuth suggests a "<=" loop condition, but we use "<" just |
| 93 | * to be doubly sure about roundoff error.) Therefore K cannot become |
| 94 | * less than k, which means that we cannot fail to select enough blocks. |
| 95 | *---------- |
| 96 | */ |
| 97 | V = sampler_random_fract(bs->randstate); |
| 98 | p = 1.0 - (double) k / (double) K; |
| 99 | while (V < p) |
| 100 | { |
| 101 | /* skip */ |
| 102 | bs->t++; |
| 103 | K--; /* keep K == N - t */ |
| 104 | |
| 105 | /* adjust p to be new cutoff point in reduced range */ |
| 106 | p *= 1.0 - (double) k / (double) K; |
| 107 | } |
| 108 | |
| 109 | /* select */ |
| 110 | bs->m++; |
| 111 | return bs->t++; |
| 112 | } |
| 113 | |
| 114 | /* |
| 115 | * These two routines embody Algorithm Z from "Random sampling with a |
| 116 | * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 |
| 117 | * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms |
| 118 | * of the count S of records to skip before processing another record. |
| 119 | * It is computed primarily based on t, the number of records already read. |
| 120 | * The only extra state needed between calls is W, a random state variable. |
| 121 | * |
| 122 | * reservoir_init_selection_state computes the initial W value. |
| 123 | * |
| 124 | * Given that we've already read t records (t >= n), reservoir_get_next_S |
| 125 | * determines the number of records to skip before the next record is |
| 126 | * processed. |
| 127 | */ |
| 128 | void |
| 129 | reservoir_init_selection_state(ReservoirState rs, int n) |
| 130 | { |
| 131 | /* |
| 132 | * Reservoir sampling is not used anywhere where it would need to return |
| 133 | * repeatable results so we can initialize it randomly. |
| 134 | */ |
| 135 | sampler_random_init_state(random(), rs->randstate); |
| 136 | |
| 137 | /* Initial value of W (for use when Algorithm Z is first applied) */ |
| 138 | rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n); |
| 139 | } |
| 140 | |
| 141 | double |
| 142 | reservoir_get_next_S(ReservoirState rs, double t, int n) |
| 143 | { |
| 144 | double S; |
| 145 | |
| 146 | /* The magic constant here is T from Vitter's paper */ |
| 147 | if (t <= (22.0 * n)) |
| 148 | { |
| 149 | /* Process records using Algorithm X until t is large enough */ |
| 150 | double V, |
| 151 | quot; |
| 152 | |
| 153 | V = sampler_random_fract(rs->randstate); /* Generate V */ |
| 154 | S = 0; |
| 155 | t += 1; |
| 156 | /* Note: "num" in Vitter's code is always equal to t - n */ |
| 157 | quot = (t - (double) n) / t; |
| 158 | /* Find min S satisfying (4.1) */ |
| 159 | while (quot > V) |
| 160 | { |
| 161 | S += 1; |
| 162 | t += 1; |
| 163 | quot *= (t - (double) n) / t; |
| 164 | } |
| 165 | } |
| 166 | else |
| 167 | { |
| 168 | /* Now apply Algorithm Z */ |
| 169 | double W = rs->W; |
| 170 | double term = t - (double) n + 1; |
| 171 | |
| 172 | for (;;) |
| 173 | { |
| 174 | double numer, |
| 175 | numer_lim, |
| 176 | denom; |
| 177 | double U, |
| 178 | X, |
| 179 | lhs, |
| 180 | rhs, |
| 181 | y, |
| 182 | tmp; |
| 183 | |
| 184 | /* Generate U and X */ |
| 185 | U = sampler_random_fract(rs->randstate); |
| 186 | X = t * (W - 1.0); |
| 187 | S = floor(X); /* S is tentatively set to floor(X) */ |
| 188 | /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ |
| 189 | tmp = (t + 1) / term; |
| 190 | lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); |
| 191 | rhs = (((t + X) / (term + S)) * term) / t; |
| 192 | if (lhs <= rhs) |
| 193 | { |
| 194 | W = rhs / lhs; |
| 195 | break; |
| 196 | } |
| 197 | /* Test if U <= f(S)/cg(X) */ |
| 198 | y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); |
| 199 | if ((double) n < S) |
| 200 | { |
| 201 | denom = t; |
| 202 | numer_lim = term + S; |
| 203 | } |
| 204 | else |
| 205 | { |
| 206 | denom = t - (double) n + S; |
| 207 | numer_lim = t + 1; |
| 208 | } |
| 209 | for (numer = t + S; numer >= numer_lim; numer -= 1) |
| 210 | { |
| 211 | y *= numer / denom; |
| 212 | denom -= 1; |
| 213 | } |
| 214 | W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */ |
| 215 | if (exp(log(y) / n) <= (t + X) / t) |
| 216 | break; |
| 217 | } |
| 218 | rs->W = W; |
| 219 | } |
| 220 | return S; |
| 221 | } |
| 222 | |
| 223 | |
| 224 | /*---------- |
| 225 | * Random number generator used by sampling |
| 226 | *---------- |
| 227 | */ |
| 228 | void |
| 229 | sampler_random_init_state(long seed, SamplerRandomState randstate) |
| 230 | { |
| 231 | randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */ |
| 232 | randstate[1] = (unsigned short) seed; |
| 233 | randstate[2] = (unsigned short) (seed >> 16); |
| 234 | } |
| 235 | |
| 236 | /* Select a random value R uniformly distributed in (0 - 1) */ |
| 237 | double |
| 238 | sampler_random_fract(SamplerRandomState randstate) |
| 239 | { |
| 240 | double res; |
| 241 | |
| 242 | /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */ |
| 243 | do |
| 244 | { |
| 245 | res = pg_erand48(randstate); |
| 246 | } while (res == 0.0); |
| 247 | return res; |
| 248 | } |
| 249 | |
| 250 | |
| 251 | /* |
| 252 | * Backwards-compatible API for block sampling |
| 253 | * |
| 254 | * This code is now deprecated, but since it's still in use by many FDWs, |
| 255 | * we should keep it for awhile at least. The functionality is the same as |
| 256 | * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S, |
| 257 | * except that a common random state is used across all callers. |
| 258 | */ |
| 259 | static ReservoirStateData oldrs; |
| 260 | |
| 261 | double |
| 262 | anl_random_fract(void) |
| 263 | { |
| 264 | /* initialize if first time through */ |
| 265 | if (oldrs.randstate[0] == 0) |
| 266 | sampler_random_init_state(random(), oldrs.randstate); |
| 267 | |
| 268 | /* and compute a random fraction */ |
| 269 | return sampler_random_fract(oldrs.randstate); |
| 270 | } |
| 271 | |
| 272 | double |
| 273 | anl_init_selection_state(int n) |
| 274 | { |
| 275 | /* initialize if first time through */ |
| 276 | if (oldrs.randstate[0] == 0) |
| 277 | sampler_random_init_state(random(), oldrs.randstate); |
| 278 | |
| 279 | /* Initial value of W (for use when Algorithm Z is first applied) */ |
| 280 | return exp(-log(sampler_random_fract(oldrs.randstate)) / n); |
| 281 | } |
| 282 | |
| 283 | double |
| 284 | anl_get_next_S(double t, int n, double *stateptr) |
| 285 | { |
| 286 | double result; |
| 287 | |
| 288 | oldrs.W = *stateptr; |
| 289 | result = reservoir_get_next_S(&oldrs, t, n); |
| 290 | *stateptr = oldrs.W; |
| 291 | return result; |
| 292 | } |
| 293 | |