/*-------------------------------------------------------------------------
 *
 * syncscan.c
 *	  heap scan synchronization support
 *
 * When multiple backends run a sequential scan on the same table, we try
 * to keep them synchronized to reduce the overall I/O needed.  The goal is
 * to read each page into shared buffer cache only once, and let all backends
 * that take part in the shared scan process the page before it falls out of
 * the cache.
 *
 * Since the "leader" in a pack of backends doing a seqscan will have to wait
 * for I/O, while the "followers" don't, there is a strong self-synchronizing
 * effect once we can get the backends examining approximately the same part
 * of the table at the same time.  Hence all that is really needed is to get
 * a new backend beginning a seqscan to begin it close to where other backends
 * are reading.  We can scan the table circularly, from block X up to the
 * end and then from block 0 to X-1, to ensure we visit all rows while still
 * participating in the common scan.
 *
 * To accomplish that, we keep track of the scan position of each table, and
 * start new scans close to where the previous scan(s) are.  We don't try to
 * do any extra synchronization to keep the scans together afterwards; some
 * scans might progress much more slowly than others, for example if the
 * results need to be transferred to the client over a slow network, and we
 * don't want such queries to slow down others.
 *
 * There can realistically only be a few large sequential scans on different
 * tables in progress at any time.  Therefore we just keep the scan positions
 * in a small LRU list which we scan every time we need to look up or update a
 * scan position.  The whole mechanism is only applied for tables exceeding
 * a threshold size (but that is not the concern of this module).
 *
 * INTERFACE ROUTINES
 *	ss_get_location		- return current scan location of a relation
 *	ss_report_location	- update current scan location
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/syncscan.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/rel.h"


/* GUC variables */
#ifdef TRACE_SYNCSCAN
bool		trace_syncscan = false;
#endif


/*
 * Size of the LRU list.
 *
 * Note: the code assumes that SYNC_SCAN_NELEM > 1.
 *
 * XXX: What's a good value?  It should be large enough to hold the
 * maximum number of large tables scanned simultaneously.  But a larger value
 * means more traversing of the LRU list when starting a new scan.
 */
#define SYNC_SCAN_NELEM 20

/*
 * Interval between reports of the location of the current scan, in pages.
 *
 * Note: This should be smaller than the ring size (see buffer/freelist.c)
 * we use for bulk reads.  Otherwise a scan joining other scans might start
 * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
 * there's no guarantee that the new scan will read the page before it leaves
 * the buffer cache anyway, and on the other hand the page is most likely
 * still in the OS cache.
 */
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
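
/*
 * For example, with the default BLCKSZ of 8192 bytes this works out to
 * 128 * 1024 / 8192 = 16 pages between reports.
 */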


/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER];	/* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;
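
/*
 * Note: the LRU list always contains exactly SYNC_SCAN_NELEM items; entries
 * are recycled in place rather than removed, so head and tail remain valid
 * once SyncScanShmemInit has populated the array.
 */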

#define SizeOfScanLocations(N) \
	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
							 BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}
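
/*
 * SyncScanShmemSize and SyncScanShmemInit (below) are called during
 * shared-memory setup at postmaster startup; see
 * CreateSharedMemoryAndSemaphores in src/backend/storage/ipc/ipci.c.
 */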

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values. As scans are started,
			 * these invalid entries will fall off the LRU list and get
			 * replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take over
			 * the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}
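
/*
 * To illustrate ss_search's move-to-front behavior: if the list holds
 * entries for relations A, B, C (head to tail) and ss_search is called for
 * C, entry C is unlinked and relinked at the head, giving C, A, B.  A call
 * for an unknown relation D instead takes over the least-recently-used tail
 * entry C and moves it to the head, giving D, A, B.
 */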

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif

	/*
	 * To reduce lock contention, only report scan progress every
	 * SYNC_SCAN_REPORT_INTERVAL pages.  For the same reason, don't block if
	 * the lock isn't immediately available.  Missing a few updates isn't
	 * critical; it just means that a new scan that wants to join the pack
	 * will start a little bit behind the head of the scan.  Hopefully the
	 * pages are still in OS cache and the scan catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}
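
/*
 * Usage illustration (a sketch, not part of this module and not compiled):
 * a heap-scan driver such as heapam.c obtains a starting block with
 * ss_get_location, scans the table circularly, and calls ss_report_location
 * for each page it visits; the rate limiting happens inside that routine.
 * The names example_synchronized_seqscan and example_process_page are
 * hypothetical, and RelationGetNumberOfBlocks would require bufmgr.h.
 */
#ifdef NOT_USED
static void
example_synchronized_seqscan(Relation rel,
							 void (*example_process_page) (Relation, BlockNumber))
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber start;
	BlockNumber blkno;

	if (nblocks == 0)
		return;					/* nothing to scan */

	/* Join any in-progress scan close to where it is currently reading */
	start = ss_get_location(rel, nblocks);
	blkno = start;

	do
	{
		/* Tell other backends where this scan currently is */
		ss_report_location(rel, blkno);

		example_process_page(rel, blkno);	/* hypothetical per-page work */

		/* Advance circularly, wrapping from the last block back to 0 */
		blkno = (blkno + 1) % nblocks;
	} while (blkno != start);
}
#endif							/* NOT_USED */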