/*-------------------------------------------------------------------------
 *
 * freelist.c
 *	  routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"

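/*
 * Read an "int" from shared memory exactly once, preventing the compiler
 * from caching or re-reading it.  Used below for the lock-free read of
 * StrategyControl->bgwprocno.
 */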
#define INT_ACCESS_ONCE(var)	((int) (*((volatile int *) &(var))))


/*
 * The shared freelist control information.
 */
typedef struct
{
	/* Spinlock: protects the values below */
	slock_t		buffer_strategy_lock;

	/*
	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
	 * this isn't a concrete buffer - we only ever increase the value. So, to
	 * get an actual buffer, it needs to be used modulo NBuffers.
	 */
	pg_atomic_uint32 nextVictimBuffer;

	int			firstFreeBuffer;	/* Head of list of unused buffers */
	int			lastFreeBuffer; /* Tail of list of unused buffers */

	/*
	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
	 * when the list is empty)
	 */

	/*
	 * Statistics.  These counters should be wide enough that they can't
	 * overflow during a single bgwriter cycle.
	 */
	uint32		completePasses; /* Complete cycles of the clock sweep */
	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */

	/*
	 * Bgworker process to be notified upon activity or -1 if none. See
	 * StrategyNotifyBgWriter.
	 */
	int			bgwprocno;
} BufferStrategyControl;

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
	/* Overall strategy type */
	BufferAccessStrategyType btype;
	/* Number of elements in buffers[] array */
	int			ring_size;

	/*
	 * Index of the "current" slot in the ring, ie, the one most recently
	 * returned by GetBufferFromRing.
	 */
	int			current;

	/*
	 * True if the buffer just returned by StrategyGetBuffer had been in the
	 * ring already.
	 */
	bool		current_was_in_ring;

	/*
	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
	 * have not yet selected a buffer for this ring slot.  For allocation
	 * simplicity this is palloc'd together with the fixed fields of the
	 * struct.
	 */
	Buffer		buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;


/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
									 uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
							BufferDesc *buf);

/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 */
static inline uint32
ClockSweepTick(void)
{
	uint32		victim;

	/*
	 * Atomically move hand ahead one buffer - if there are several processes
	 * doing this, this can lead to buffers being returned slightly out of
	 * apparent order.
	 */
	victim =
		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

	if (victim >= NBuffers)
	{
		uint32		originalVictim = victim;

		/* always wrap what we look up in BufferDescriptors */
		victim = victim % NBuffers;

		/*
		 * If we're the one that just caused a wraparound, force
		 * completePasses to be incremented while holding the spinlock. We
		 * need the spinlock so StrategySyncStart() can return a consistent
		 * value consisting of nextVictimBuffer and completePasses.
		 */
		if (victim == 0)
		{
			uint32		expected;
			uint32		wrapped;
			bool		success = false;

			expected = originalVictim + 1;

			while (!success)
			{
				/*
				 * Acquire the spinlock while increasing completePasses. That
				 * allows other readers to read nextVictimBuffer and
				 * completePasses in a consistent manner which is required for
				 * StrategySyncStart().  In theory delaying the increment
				 * could lead to an overflow of nextVictimBuffer, but that's
				 * highly unlikely and wouldn't be particularly harmful.
				 */
				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

				wrapped = expected % NBuffers;

				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
														 &expected, wrapped);
				if (success)
					StrategyControl->completePasses++;
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
			}
		}
	}
	return victim;
}

/*
 * have_free_buffer -- a lockless check to see if there is a free buffer in
 *					   the buffer pool.
 *
 * A true result may become stale as soon as other processes take buffers off
 * the freelist, so callers that strictly require a free buffer should not
 * rely on this check.
 */
bool
have_free_buffer(void)
{
	if (StrategyControl->firstFreeBuffer >= 0)
		return true;
	else
		return false;
}

/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	int			bgwprocno;
	int			trycounter;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need buffer_strategy_lock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy, buf_state);
		if (buf != NULL)
			return buf;
	}

	/*
	 * If asked, we need to wake the bgwriter. Since we don't want to rely on
	 * a spinlock for this we force a read from shared memory once, and then
	 * set the latch based on that value. We need to go to these lengths
	 * because otherwise bgwprocno might be reset while/after we check, since
	 * the compiler might just reread the value from memory.
	 *
	 * This can possibly set the latch of the wrong process if the bgwriter
	 * dies in the wrong moment. But since PGPROC->procLatch is never
	 * deallocated the worst consequence of that is that we set the latch of
	 * some arbitrary process.
	 */
	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
	if (bgwprocno != -1)
	{
		/* reset bgwprocno first, before setting the latch */
		StrategyControl->bgwprocno = -1;

		/*
		 * Not acquiring ProcArrayLock here which is slightly icky. It's
		 * actually fine because procLatch isn't ever freed, so we just can
		 * potentially set the wrong process' (or no process') latch.
		 */
		SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
	}

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

	/*
	 * First check, without acquiring the lock, whether there are buffers in
	 * the freelist. Since we otherwise don't require the spinlock in every
	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
	 * uselessly in most cases. That obviously leaves a race where a buffer is
	 * put on the freelist but we don't see the store yet - but that's pretty
	 * harmless, it'll just get used during the next buffer acquisition.
	 *
	 * If there are buffers on the freelist, acquire the spinlock to pop one
	 * buffer off the freelist. Then check whether that buffer is usable and
	 * repeat if not.
	 *
	 * Note that the freeNext fields are considered to be protected by the
	 * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
	 * to manipulate them without holding the buffer spinlock.
	 */
	if (StrategyControl->firstFreeBuffer >= 0)
	{
		while (true)
		{
			/* Acquire the spinlock to remove element from the freelist */
			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

			if (StrategyControl->firstFreeBuffer < 0)
			{
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
				break;
			}

			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

			/* Unconditionally remove buffer from freelist */
			StrategyControl->firstFreeBuffer = buf->freeNext;
			buf->freeNext = FREENEXT_NOT_IN_LIST;

			/*
			 * Release the lock so someone else can access the freelist while
			 * we check out this buffer.
			 */
			SpinLockRelease(&StrategyControl->buffer_strategy_lock);

			/*
			 * If the buffer is pinned or has a nonzero usage_count, we cannot
			 * use it; discard it and retry.  (This can only happen if VACUUM
			 * put a valid buffer in the freelist and then someone else used
			 * it before we got to it.  It's probably impossible altogether as
			 * of 8.3, but we'd better check anyway.)
			 */
			local_buf_state = LockBufHdr(buf);
			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
			{
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
			UnlockBufHdr(buf, local_buf_state);
		}
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = GetBufferDescriptor(ClockSweepTick());

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
		 * it; decrement the usage_count (unless pinned) and keep scanning.
		 */
		local_buf_state = LockBufHdr(buf);

		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
		{
			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
			{
				local_buf_state -= BUF_USAGECOUNT_ONE;

				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state changes,
			 * so all the buffers are pinned (or were when we looked at them).
			 * We could hope that someone will free one eventually, but it's
			 * probably better to fail than to risk getting stuck in an
			 * infinite loop.
			 */
			UnlockBufHdr(buf, local_buf_state);
			elog(ERROR, "no unpinned buffers available");
		}
		UnlockBufHdr(buf, local_buf_state);
	}
}

/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(BufferDesc *buf)
{
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

	/*
	 * It is possible that we are told to put something in the freelist that
	 * is already in it; don't screw up the list if so.
	 */
	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
	{
		buf->freeNext = StrategyControl->firstFreeBuffer;
		if (buf->freeNext < 0)
			StrategyControl->lastFreeBuffer = buf->buf_id;
		StrategyControl->firstFreeBuffer = buf->buf_id;
	}

	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
	uint32		nextVictimBuffer;
	int			result;

	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
	result = nextVictimBuffer % NBuffers;

	if (complete_passes)
	{
		*complete_passes = StrategyControl->completePasses;

		/*
		 * Additionally add the number of wraparounds that happened before
		 * completePasses could be incremented. C.f. ClockSweepTick().
		 */
		*complete_passes += nextVictimBuffer / NBuffers;
	}

	if (num_buf_alloc)
	{
		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
	}
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
	return result;
}

/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
	/*
	 * We acquire buffer_strategy_lock just to ensure that the store appears
	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
	 * infrequently, so there's no performance penalty from being safe.
	 */
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	StrategyControl->bgwprocno = bgwprocno;
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
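
/*
 * For orientation: the bgwriter's hibernation path uses this roughly as
 * follows (a sketch, not a verbatim copy of bgwriter.c):
 *
 *		StrategyNotifyBgWriter(MyProc->pgprocno);
 *		... sleep on the process latch with a long timeout ...
 *		StrategyNotifyBgWriter(-1);
 *
 * so that a concurrent StrategyGetBuffer() call sets the bgwriter's latch
 * and ends the long sleep early.
 */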


/*
 * StrategyShmemSize
 *
 * estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
	Size		size = 0;

	/* size of lookup hash table ... see comment in StrategyInitialize */
	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

	/* size of the shared replacement strategy control block */
	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

	return size;
}

/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *		Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
	bool		found;

	/*
	 * Initialize the shared buffer lookup hashtable.
	 *
	 * Since we can't tolerate running out of lookup table entries, we must be
	 * sure to specify an adequate table size here.  The maximum steady-state
	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
	 * a new entry before deleting the old.  In principle this could be
	 * happening in each partition concurrently, so we could need as many as
	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
	 */
	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

	/*
	 * Get or create the shared strategy control block
	 */
	StrategyControl = (BufferStrategyControl *)
		ShmemInitStruct("Buffer Strategy Status",
						sizeof(BufferStrategyControl),
						&found);

	if (!found)
	{
		/*
		 * Only done once, usually in postmaster
		 */
		Assert(init);

		SpinLockInit(&StrategyControl->buffer_strategy_lock);

		/*
		 * Grab the whole linked list of free buffers for our strategy. We
		 * assume it was previously set up by InitBufferPool().
		 */
		StrategyControl->firstFreeBuffer = 0;
		StrategyControl->lastFreeBuffer = NBuffers - 1;

		/* Initialize the clock sweep pointer */
		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

		/* Clear statistics */
		StrategyControl->completePasses = 0;
		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

		/* No pending notification */
		StrategyControl->bgwprocno = -1;
	}
	else
		Assert(!init);
}


/* ----------------------------------------------------------------
 *				Backend-private buffer ring management
 * ----------------------------------------------------------------
 */


/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
	BufferAccessStrategy strategy;
	int			ring_size;

	/*
	 * Select ring size to use.  See buffer/README for rationales.
	 *
	 * Note: if you change the ring size for BAS_BULKREAD, see also
	 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
	 */
	switch (btype)
	{
		case BAS_NORMAL:
			/* if someone asks for NORMAL, just give 'em a "default" object */
			return NULL;

		case BAS_BULKREAD:
			ring_size = 256 * 1024 / BLCKSZ;
			break;
		case BAS_BULKWRITE:
			ring_size = 16 * 1024 * 1024 / BLCKSZ;
			break;
		case BAS_VACUUM:
			ring_size = 256 * 1024 / BLCKSZ;
			break;

		default:
			elog(ERROR, "unrecognized buffer access strategy: %d",
				 (int) btype);
			return NULL;		/* keep compiler quiet */
	}

	/* Make sure ring isn't an undue fraction of shared buffers */
	ring_size = Min(NBuffers / 8, ring_size);

	/* Allocate the object and initialize all elements to zeroes */
	strategy = (BufferAccessStrategy)
		palloc0(offsetof(BufferAccessStrategyData, buffers) +
				ring_size * sizeof(Buffer));

	/* Set fields that don't start out zero */
	strategy->btype = btype;
	strategy->ring_size = ring_size;

	return strategy;
}

/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
	/* don't crash if called on a "default" strategy */
	if (strategy != NULL)
		pfree(strategy);
}
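
/*
 * Example (a sketch of typical caller usage, not code from this module):
 * bulk operations such as VACUUM create a strategy once, pass it to the
 * buffer manager on each read, and free it when done:
 *
 *		BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_VACUUM);
 *
 *		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 *								 bstrategy);
 *		... use and release the buffer ...
 *
 *		FreeAccessStrategy(bstrategy);
 */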

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty.
 *
 * The bufhdr spin lock is held on the returned buffer.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	Buffer		bufnum;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/* Advance to next ring slot */
	if (++strategy->current >= strategy->ring_size)
		strategy->current = 0;

	/*
	 * If the slot hasn't been filled yet, tell the caller to allocate a new
	 * buffer with the normal allocation strategy.  He will then fill this
	 * slot by calling AddBufferToRing with the new buffer.
	 */
	bufnum = strategy->buffers[strategy->current];
	if (bufnum == InvalidBuffer)
	{
		strategy->current_was_in_ring = false;
		return NULL;
	}

	/*
	 * If the buffer is pinned we cannot use it under any circumstances.
	 *
	 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
	 * since our own previous usage of the ring element would have left it
	 * there, but it might've been decremented by clock sweep since then). A
	 * higher usage_count indicates someone else has touched the buffer, so we
	 * shouldn't re-use it.
	 */
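	/* Buffer numbers are 1-based; buffer descriptor indexes are 0-based. */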
	buf = GetBufferDescriptor(bufnum - 1);
	local_buf_state = LockBufHdr(buf);
	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
	{
		strategy->current_was_in_ring = true;
		*buf_state = local_buf_state;
		return buf;
	}
	UnlockBufHdr(buf, local_buf_state);

	/*
	 * Tell caller to allocate a new buffer with the normal allocation
	 * strategy.  He'll then replace this ring element via AddBufferToRing.
	 */
	strategy->current_was_in_ring = false;
	return NULL;
}

/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}
673
674/*
675 * StrategyRejectBuffer -- consider rejecting a dirty buffer
676 *
677 * When a nondefault strategy is used, the buffer manager calls this function
678 * when it turns out that the buffer selected by StrategyGetBuffer needs to
679 * be written out and doing so would require flushing WAL too. This gives us
680 * a chance to choose a different victim.
681 *
682 * Returns true if buffer manager should ask for a new victim, and false
683 * if this buffer should be written and re-used.
684 */
685bool
686StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
687{
688 /* We only do this in bulkread mode */
689 if (strategy->btype != BAS_BULKREAD)
690 return false;
691
692 /* Don't muck with behavior of normal buffer-replacement strategy */
693 if (!strategy->current_was_in_ring ||
694 strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
695 return false;
696
697 /*
698 * Remove the dirty buffer from the ring; necessary to prevent infinite
699 * loop if all ring members are dirty.
700 */
701 strategy->buffers[strategy->current] = InvalidBuffer;
702
703 return true;
704}
705