/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  heap scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *	ss_get_location		- return current scan location of a relation
 *	ss_report_location	- update current scan location
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/syncscan.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/rel.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool		trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value?  It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
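
/*
 * For example, with the default BLCKSZ of 8192 bytes this works out to
 * 128 * 1024 / 8192 = 16 pages between reports.
 */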


/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER];	/* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;
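
/*
 * Note: the LRU list always contains exactly SYNC_SCAN_NELEM items; entries
 * are recycled in place rather than removed, so head and tail remain valid
 * once SyncScanShmemInit has populated the array.
 */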

#define SizeOfScanLocations(N) \
	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
							 BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}
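
/*
 * SyncScanShmemSize and SyncScanShmemInit (below) are called during
 * shared-memory setup at postmaster startup; see
 * CreateSharedMemoryAndSemaphores in src/backend/storage/ipc/ipci.c.
 */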

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values. As scans are started,
			 * these invalid entries will fall off the LRU list and get
			 * replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take over
			 * the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}
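
/*
 * To illustrate ss_search's move-to-front behavior: if the list holds
 * entries for relations A, B, C (head to tail) and ss_search is called for
 * C, entry C is unlinked and relinked at the head, giving C, A, B.  A call
 * for an unknown relation D instead takes over the least-recently-used tail
 * entry C and moves it to the head, giving D, A, B.
 */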

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif

	/*
	 * To reduce lock contention, only report scan progress every
	 * SYNC_SCAN_REPORT_INTERVAL pages.  For the same reason, don't block if
	 * the lock isn't immediately available.  Missing a few updates isn't
	 * critical; it just means that a new scan that wants to join the pack
	 * will start a little bit behind the head of the scan.  Hopefully the
	 * pages are still in OS cache and the scan catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}
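
/*
 * Usage illustration (a sketch, not part of this module and not compiled):
 * a heap-scan driver such as heapam.c obtains a starting block with
 * ss_get_location, scans the table circularly, and calls ss_report_location
 * for each page it visits; the rate limiting happens inside that routine.
 * The names example_synchronized_seqscan and example_process_page are
 * hypothetical, and RelationGetNumberOfBlocks would require bufmgr.h.
 */
#ifdef NOT_USED
static void
example_synchronized_seqscan(Relation rel,
							 void (*example_process_page) (Relation, BlockNumber))
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber start;
	BlockNumber blkno;

	if (nblocks == 0)
		return;					/* nothing to scan */

	/* Join any in-progress scan close to where it is currently reading */
	start = ss_get_location(rel, nblocks);
	blkno = start;

	do
	{
		/* Tell other backends where this scan currently is */
		ss_report_location(rel, blkno);

		example_process_page(rel, blkno);	/* hypothetical per-page work */

		/* Advance circularly, wrapping from the last block back to 0 */
		blkno = (blkno + 1) % nblocks;
	} while (blkno != start);
}
#endif							/* NOT_USED */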