1/*-------------------------------------------------------------------------
2 *
3 * system.c
4 * support routines for SYSTEM tablesample method
5 *
6 * To ensure repeatability of samples, it is necessary that selection of a
7 * given tuple be history-independent; otherwise syncscanning would break
8 * repeatability, to say nothing of logically-irrelevant maintenance such
9 * as physical extension or shortening of the relation.
10 *
11 * To achieve that, we proceed by hashing each candidate block number together
12 * with the active seed, and then selecting it if the hash is less than the
13 * cutoff value computed from the selection probability by BeginSampleScan.
14 *
15 *
16 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 * IDENTIFICATION
20 * src/backend/access/tablesample/system.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <math.h>
28
29#include "access/relscan.h"
30#include "access/tsmapi.h"
31#include "catalog/pg_type.h"
32#include "optimizer/optimizer.h"
33#include "utils/builtins.h"
34#include "utils/hashutils.h"
35
36
37/* Private state */
38typedef struct
39{
40 uint64 cutoff; /* select blocks with hash less than this */
41 uint32 seed; /* random seed */
42 BlockNumber nextblock; /* next block to consider sampling */
43 OffsetNumber lt; /* last tuple returned from current block */
44} SystemSamplerData;
45
46
47static void system_samplescangetsamplesize(PlannerInfo *root,
48 RelOptInfo *baserel,
49 List *paramexprs,
50 BlockNumber *pages,
51 double *tuples);
52static void system_initsamplescan(SampleScanState *node,
53 int eflags);
54static void system_beginsamplescan(SampleScanState *node,
55 Datum *params,
56 int nparams,
57 uint32 seed);
58static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
59static OffsetNumber system_nextsampletuple(SampleScanState *node,
60 BlockNumber blockno,
61 OffsetNumber maxoffset);
62
63
64/*
65 * Create a TsmRoutine descriptor for the SYSTEM method.
66 */
67Datum
68tsm_system_handler(PG_FUNCTION_ARGS)
69{
70 TsmRoutine *tsm = makeNode(TsmRoutine);
71
72 tsm->parameterTypes = list_make1_oid(FLOAT4OID);
73 tsm->repeatable_across_queries = true;
74 tsm->repeatable_across_scans = true;
75 tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
76 tsm->InitSampleScan = system_initsamplescan;
77 tsm->BeginSampleScan = system_beginsamplescan;
78 tsm->NextSampleBlock = system_nextsampleblock;
79 tsm->NextSampleTuple = system_nextsampletuple;
80 tsm->EndSampleScan = NULL;
81
82 PG_RETURN_POINTER(tsm);
83}
84
85/*
86 * Sample size estimation.
87 */
88static void
89system_samplescangetsamplesize(PlannerInfo *root,
90 RelOptInfo *baserel,
91 List *paramexprs,
92 BlockNumber *pages,
93 double *tuples)
94{
95 Node *pctnode;
96 float4 samplefract;
97
98 /* Try to extract an estimate for the sample percentage */
99 pctnode = (Node *) linitial(paramexprs);
100 pctnode = estimate_expression_value(root, pctnode);
101
102 if (IsA(pctnode, Const) &&
103 !((Const *) pctnode)->constisnull)
104 {
105 samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
106 if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
107 samplefract /= 100.0f;
108 else
109 {
110 /* Default samplefract if the value is bogus */
111 samplefract = 0.1f;
112 }
113 }
114 else
115 {
116 /* Default samplefract if we didn't obtain a non-null Const */
117 samplefract = 0.1f;
118 }
119
120 /* We'll visit a sample of the pages ... */
121 *pages = clamp_row_est(baserel->pages * samplefract);
122
123 /* ... and hopefully get a representative number of tuples from them */
124 *tuples = clamp_row_est(baserel->tuples * samplefract);
125}
126
127/*
128 * Initialize during executor setup.
129 */
130static void
131system_initsamplescan(SampleScanState *node, int eflags)
132{
133 node->tsm_state = palloc0(sizeof(SystemSamplerData));
134}
135
136/*
137 * Examine parameters and prepare for a sample scan.
138 */
139static void
140system_beginsamplescan(SampleScanState *node,
141 Datum *params,
142 int nparams,
143 uint32 seed)
144{
145 SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
146 double percent = DatumGetFloat4(params[0]);
147 double dcutoff;
148
149 if (percent < 0 || percent > 100 || isnan(percent))
150 ereport(ERROR,
151 (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
152 errmsg("sample percentage must be between 0 and 100")));
153
154 /*
155 * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
156 * store that as a uint64, of course. Note that this gives strictly
157 * correct behavior at the limits of zero or one probability.
158 */
159 dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
160 sampler->cutoff = (uint64) dcutoff;
161 sampler->seed = seed;
162 sampler->nextblock = 0;
163 sampler->lt = InvalidOffsetNumber;
164
165 /*
166 * Bulkread buffer access strategy probably makes sense unless we're
167 * scanning a very small fraction of the table. The 1% cutoff here is a
168 * guess. We should use pagemode visibility checking, since we scan all
169 * tuples on each selected page.
170 */
171 node->use_bulkread = (percent >= 1);
172 node->use_pagemode = true;
173}
174
175/*
176 * Select next block to sample.
177 */
178static BlockNumber
179system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
180{
181 SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
182 BlockNumber nextblock = sampler->nextblock;
183 uint32 hashinput[2];
184
185 /*
186 * We compute the hash by applying hash_any to an array of 2 uint32's
187 * containing the block number and seed. This is efficient to set up, and
188 * with the current implementation of hash_any, it gives
189 * machine-independent results, which is a nice property for regression
190 * testing.
191 *
192 * These words in the hash input are the same throughout the block:
193 */
194 hashinput[1] = sampler->seed;
195
196 /*
197 * Loop over block numbers until finding suitable block or reaching end of
198 * relation.
199 */
200 for (; nextblock < nblocks; nextblock++)
201 {
202 uint32 hash;
203
204 hashinput[0] = nextblock;
205
206 hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
207 (int) sizeof(hashinput)));
208 if (hash < sampler->cutoff)
209 break;
210 }
211
212 if (nextblock < nblocks)
213 {
214 /* Found a suitable block; remember where we should start next time */
215 sampler->nextblock = nextblock + 1;
216 return nextblock;
217 }
218
219 /* Done, but let's reset nextblock to 0 for safety. */
220 sampler->nextblock = 0;
221 return InvalidBlockNumber;
222}
223
224/*
225 * Select next sampled tuple in current block.
226 *
227 * In block sampling, we just want to sample all the tuples in each selected
228 * block.
229 *
230 * It is OK here to return an offset without knowing if the tuple is visible
231 * (or even exists); nodeSamplescan.c will deal with that.
232 *
233 * When we reach end of the block, return InvalidOffsetNumber which tells
234 * SampleScan to go to next block.
235 */
236static OffsetNumber
237system_nextsampletuple(SampleScanState *node,
238 BlockNumber blockno,
239 OffsetNumber maxoffset)
240{
241 SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
242 OffsetNumber tupoffset = sampler->lt;
243
244 /* Advance to next possible offset on page */
245 if (tupoffset == InvalidOffsetNumber)
246 tupoffset = FirstOffsetNumber;
247 else
248 tupoffset++;
249
250 /* Done? */
251 if (tupoffset > maxoffset)
252 tupoffset = InvalidOffsetNumber;
253
254 sampler->lt = tupoffset;
255
256 return tupoffset;
257}
258