1/*-------------------------------------------------------------------------
2 *
3 * syncscan.c
4 * heap scan synchronization support
5 *
6 * When multiple backends run a sequential scan on the same table, we try
7 * to keep them synchronized to reduce the overall I/O needed. The goal is
8 * to read each page into shared buffer cache only once, and let all backends
9 * that take part in the shared scan process the page before it falls out of
10 * the cache.
11 *
12 * Since the "leader" in a pack of backends doing a seqscan will have to wait
13 * for I/O, while the "followers" don't, there is a strong self-synchronizing
14 * effect once we can get the backends examining approximately the same part
15 * of the table at the same time. Hence all that is really needed is to get
16 * a new backend beginning a seqscan to begin it close to where other backends
17 * are reading. We can scan the table circularly, from block X up to the
18 * end and then from block 0 to X-1, to ensure we visit all rows while still
19 * participating in the common scan.
20 *
21 * To accomplish that, we keep track of the scan position of each table, and
22 * start new scans close to where the previous scan(s) are. We don't try to
23 * do any extra synchronization to keep the scans together afterwards; some
24 * scans might progress much more slowly than others, for example if the
25 * results need to be transferred to the client over a slow network, and we
26 * don't want such queries to slow down others.
27 *
28 * There can realistically only be a few large sequential scans on different
29 * tables in progress at any time. Therefore we just keep the scan positions
30 * in a small LRU list which we scan every time we need to look up or update a
31 * scan position. The whole mechanism is only applied for tables exceeding
32 * a threshold size (but that is not the concern of this module).
33 *
34 * INTERFACE ROUTINES
35 * ss_get_location - return current scan location of a relation
36 * ss_report_location - update current scan location
37 *
38 *
39 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
40 * Portions Copyright (c) 1994, Regents of the University of California
41 *
42 * IDENTIFICATION
43 * src/backend/access/heap/syncscan.c
44 *
45 *-------------------------------------------------------------------------
46 */
47#include "postgres.h"
48
49#include "access/heapam.h"
50#include "miscadmin.h"
51#include "storage/lwlock.h"
52#include "storage/shmem.h"
53#include "utils/rel.h"
54
55
56/* GUC variables */
57#ifdef TRACE_SYNCSCAN
58bool trace_syncscan = false;
59#endif
60
61
62/*
63 * Size of the LRU list.
64 *
65 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
66 *
67 * XXX: What's a good value? It should be large enough to hold the
68 * maximum number of large tables scanned simultaneously. But a larger value
69 * means more traversing of the LRU list when starting a new scan.
70 */
71#define SYNC_SCAN_NELEM 20
72
73/*
74 * Interval between reports of the location of the current scan, in pages.
75 *
76 * Note: This should be smaller than the ring size (see buffer/freelist.c)
77 * we use for bulk reads. Otherwise a scan joining other scans might start
78 * from a page that's no longer in the buffer cache. This is a bit fuzzy;
79 * there's no guarantee that the new scan will read the page before it leaves
80 * the buffer cache anyway, and on the other hand the page is most likely
81 * still in the OS cache.
82 */
83#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
84
85
86/*
87 * The scan locations structure is essentially a doubly-linked LRU with head
88 * and tail pointer, but designed to hold a fixed maximum number of elements in
89 * fixed-size shared memory.
90 */
91typedef struct ss_scan_location_t
92{
93 RelFileNode relfilenode; /* identity of a relation */
94 BlockNumber location; /* last-reported location in the relation */
95} ss_scan_location_t;
96
97typedef struct ss_lru_item_t
98{
99 struct ss_lru_item_t *prev;
100 struct ss_lru_item_t *next;
101 ss_scan_location_t location;
102} ss_lru_item_t;
103
104typedef struct ss_scan_locations_t
105{
106 ss_lru_item_t *head;
107 ss_lru_item_t *tail;
108 ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
109} ss_scan_locations_t;
110
111#define SizeOfScanLocations(N) \
112 (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
113
114/* Pointer to struct in shared memory */
115static ss_scan_locations_t *scan_locations;
116
117/* prototypes for internal functions */
118static BlockNumber ss_search(RelFileNode relfilenode,
119 BlockNumber location, bool set);
120
121
122/*
123 * SyncScanShmemSize --- report amount of shared memory space needed
124 */
125Size
126SyncScanShmemSize(void)
127{
128 return SizeOfScanLocations(SYNC_SCAN_NELEM);
129}
130
131/*
132 * SyncScanShmemInit --- initialize this module's shared memory
133 */
134void
135SyncScanShmemInit(void)
136{
137 int i;
138 bool found;
139
140 scan_locations = (ss_scan_locations_t *)
141 ShmemInitStruct("Sync Scan Locations List",
142 SizeOfScanLocations(SYNC_SCAN_NELEM),
143 &found);
144
145 if (!IsUnderPostmaster)
146 {
147 /* Initialize shared memory area */
148 Assert(!found);
149
150 scan_locations->head = &scan_locations->items[0];
151 scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
152
153 for (i = 0; i < SYNC_SCAN_NELEM; i++)
154 {
155 ss_lru_item_t *item = &scan_locations->items[i];
156
157 /*
158 * Initialize all slots with invalid values. As scans are started,
159 * these invalid entries will fall off the LRU list and get
160 * replaced with real entries.
161 */
162 item->location.relfilenode.spcNode = InvalidOid;
163 item->location.relfilenode.dbNode = InvalidOid;
164 item->location.relfilenode.relNode = InvalidOid;
165 item->location.location = InvalidBlockNumber;
166
167 item->prev = (i > 0) ?
168 (&scan_locations->items[i - 1]) : NULL;
169 item->next = (i < SYNC_SCAN_NELEM - 1) ?
170 (&scan_locations->items[i + 1]) : NULL;
171 }
172 }
173 else
174 Assert(found);
175}
176
177/*
178 * ss_search --- search the scan_locations structure for an entry with the
179 * given relfilenode.
180 *
181 * If "set" is true, the location is updated to the given location. If no
182 * entry for the given relfilenode is found, it will be created at the head
183 * of the list with the given location, even if "set" is false.
184 *
185 * In any case, the location after possible update is returned.
186 *
187 * Caller is responsible for having acquired suitable lock on the shared
188 * data structure.
189 */
190static BlockNumber
191ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
192{
193 ss_lru_item_t *item;
194
195 item = scan_locations->head;
196 for (;;)
197 {
198 bool match;
199
200 match = RelFileNodeEquals(item->location.relfilenode, relfilenode);
201
202 if (match || item->next == NULL)
203 {
204 /*
205 * If we reached the end of list and no match was found, take over
206 * the last entry
207 */
208 if (!match)
209 {
210 item->location.relfilenode = relfilenode;
211 item->location.location = location;
212 }
213 else if (set)
214 item->location.location = location;
215
216 /* Move the entry to the front of the LRU list */
217 if (item != scan_locations->head)
218 {
219 /* unlink */
220 if (item == scan_locations->tail)
221 scan_locations->tail = item->prev;
222 item->prev->next = item->next;
223 if (item->next)
224 item->next->prev = item->prev;
225
226 /* link */
227 item->prev = NULL;
228 item->next = scan_locations->head;
229 scan_locations->head->prev = item;
230 scan_locations->head = item;
231 }
232
233 return item->location.location;
234 }
235
236 item = item->next;
237 }
238
239 /* not reached */
240}
241
242/*
243 * ss_get_location --- get the optimal starting location for scan
244 *
245 * Returns the last-reported location of a sequential scan on the
246 * relation, or 0 if no valid location is found.
247 *
248 * We expect the caller has just done RelationGetNumberOfBlocks(), and
249 * so that number is passed in rather than computing it again. The result
250 * is guaranteed less than relnblocks (assuming that's > 0).
251 */
252BlockNumber
253ss_get_location(Relation rel, BlockNumber relnblocks)
254{
255 BlockNumber startloc;
256
257 LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
258 startloc = ss_search(rel->rd_node, 0, false);
259 LWLockRelease(SyncScanLock);
260
261 /*
262 * If the location is not a valid block number for this scan, start at 0.
263 *
264 * This can happen if for instance a VACUUM truncated the table since the
265 * location was saved.
266 */
267 if (startloc >= relnblocks)
268 startloc = 0;
269
270#ifdef TRACE_SYNCSCAN
271 if (trace_syncscan)
272 elog(LOG,
273 "SYNC_SCAN: start \"%s\" (size %u) at %u",
274 RelationGetRelationName(rel), relnblocks, startloc);
275#endif
276
277 return startloc;
278}
279
280/*
281 * ss_report_location --- update the current scan location
282 *
283 * Writes an entry into the shared Sync Scan state of the form
284 * (relfilenode, blocknumber), overwriting any existing entry for the
285 * same relfilenode.
286 */
287void
288ss_report_location(Relation rel, BlockNumber location)
289{
290#ifdef TRACE_SYNCSCAN
291 if (trace_syncscan)
292 {
293 if ((location % 1024) == 0)
294 elog(LOG,
295 "SYNC_SCAN: scanning \"%s\" at %u",
296 RelationGetRelationName(rel), location);
297 }
298#endif
299
300 /*
301 * To reduce lock contention, only report scan progress every N pages. For
302 * the same reason, don't block if the lock isn't immediately available.
303 * Missing a few updates isn't critical, it just means that a new scan
304 * that wants to join the pack will start a little bit behind the head of
305 * the scan. Hopefully the pages are still in OS cache and the scan
306 * catches up quickly.
307 */
308 if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
309 {
310 if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
311 {
312 (void) ss_search(rel->rd_node, location, true);
313 LWLockRelease(SyncScanLock);
314 }
315#ifdef TRACE_SYNCSCAN
316 else if (trace_syncscan)
317 elog(LOG,
318 "SYNC_SCAN: missed update for \"%s\" at %u",
319 RelationGetRelationName(rel), location);
320#endif
321 }
322}
323