/*-------------------------------------------------------------------------
 *
 * freelist.c
 *	  routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"

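/*
 * Read an "int" from shared memory exactly once, preventing the compiler
 * from caching or re-reading it.  Used below for the lock-free read of
 * StrategyControl->bgwprocno.
 */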
#define INT_ACCESS_ONCE(var)	((int) (*((volatile int *) &(var))))


/*
 * The shared freelist control information.
 */
typedef struct
{
	/* Spinlock: protects the values below */
	slock_t		buffer_strategy_lock;

	/*
	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
	 * this isn't a concrete buffer - we only ever increase the value. So, to
	 * get an actual buffer, it needs to be used modulo NBuffers.
	 */
	pg_atomic_uint32 nextVictimBuffer;

	int			firstFreeBuffer;	/* Head of list of unused buffers */
	int			lastFreeBuffer; /* Tail of list of unused buffers */

	/*
	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
	 * when the list is empty)
	 */

	/*
	 * Statistics.  These counters should be wide enough that they can't
	 * overflow during a single bgwriter cycle.
	 */
	uint32		completePasses; /* Complete cycles of the clock sweep */
	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */

	/*
	 * Bgworker process to be notified upon activity or -1 if none. See
	 * StrategyNotifyBgWriter.
	 */
	int			bgwprocno;
} BufferStrategyControl;

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
	/* Overall strategy type */
	BufferAccessStrategyType btype;
	/* Number of elements in buffers[] array */
	int			ring_size;

	/*
	 * Index of the "current" slot in the ring, ie, the one most recently
	 * returned by GetBufferFromRing.
	 */
	int			current;

	/*
	 * True if the buffer just returned by StrategyGetBuffer had been in the
	 * ring already.
	 */
	bool		current_was_in_ring;

	/*
	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
	 * have not yet selected a buffer for this ring slot.  For allocation
	 * simplicity this is palloc'd together with the fixed fields of the
	 * struct.
	 */
	Buffer		buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;


/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
									 uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
							BufferDesc *buf);

/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 */
static inline uint32
ClockSweepTick(void)
{
	uint32		victim;

	/*
	 * Atomically move hand ahead one buffer - if there are several processes
	 * doing this, this can lead to buffers being returned slightly out of
	 * apparent order.
	 */
	victim =
		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

	if (victim >= NBuffers)
	{
		uint32		originalVictim = victim;

		/* always wrap what we look up in BufferDescriptors */
		victim = victim % NBuffers;

		/*
		 * If we're the one that just caused a wraparound, force
		 * completePasses to be incremented while holding the spinlock. We
		 * need the spinlock so StrategySyncStart() can return a consistent
		 * value consisting of nextVictimBuffer and completePasses.
		 */
		if (victim == 0)
		{
			uint32		expected;
			uint32		wrapped;
			bool		success = false;

			expected = originalVictim + 1;

			while (!success)
			{
				/*
				 * Acquire the spinlock while increasing completePasses. That
				 * allows other readers to read nextVictimBuffer and
				 * completePasses in a consistent manner which is required for
				 * StrategySyncStart().  In theory delaying the increment
				 * could lead to an overflow of nextVictimBuffer, but that's
				 * highly unlikely and wouldn't be particularly harmful.
				 */
				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

				wrapped = expected % NBuffers;

				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
														 &expected, wrapped);
				if (success)
					StrategyControl->completePasses++;
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
			}
		}
	}
	return victim;
}

/*
 * have_free_buffer -- a lockless check to see if there is a free buffer in
 *					   the buffer pool.
 *
 * A true result may become stale as soon as other processes take buffers off
 * the freelist, so callers that strictly require a free buffer should not
 * rely on this check.
 */
bool
have_free_buffer(void)
{
	if (StrategyControl->firstFreeBuffer >= 0)
		return true;
	else
		return false;
}

/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	int			bgwprocno;
	int			trycounter;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need buffer_strategy_lock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy, buf_state);
		if (buf != NULL)
			return buf;
	}

	/*
	 * If asked, we need to wake the bgwriter. Since we don't want to rely on
	 * a spinlock for this we force a read from shared memory once, and then
	 * set the latch based on that value. We need to go to these lengths
	 * because otherwise bgwprocno might be reset while/after we check, since
	 * the compiler might just reread the value from memory.
	 *
	 * This can possibly set the latch of the wrong process if the bgwriter
	 * dies in the wrong moment. But since PGPROC->procLatch is never
	 * deallocated the worst consequence of that is that we set the latch of
	 * some arbitrary process.
	 */
	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
	if (bgwprocno != -1)
	{
		/* reset bgwprocno first, before setting the latch */
		StrategyControl->bgwprocno = -1;

		/*
		 * Not acquiring ProcArrayLock here which is slightly icky. It's
		 * actually fine because procLatch isn't ever freed, so we just can
		 * potentially set the wrong process' (or no process') latch.
		 */
		SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
	}

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

	/*
	 * First check, without acquiring the lock, whether there are buffers in
	 * the freelist. Since we otherwise don't require the spinlock in every
	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
	 * uselessly in most cases. That obviously leaves a race where a buffer is
	 * put on the freelist but we don't see the store yet - but that's pretty
	 * harmless, it'll just get used during the next buffer acquisition.
	 *
	 * If there are buffers on the freelist, acquire the spinlock to pop one
	 * buffer off the freelist. Then check whether that buffer is usable and
	 * repeat if not.
	 *
	 * Note that the freeNext fields are considered to be protected by the
	 * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
	 * to manipulate them without holding the buffer spinlock.
	 */
	if (StrategyControl->firstFreeBuffer >= 0)
	{
		while (true)
		{
			/* Acquire the spinlock to remove element from the freelist */
			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

			if (StrategyControl->firstFreeBuffer < 0)
			{
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
				break;
			}

			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

			/* Unconditionally remove buffer from freelist */
			StrategyControl->firstFreeBuffer = buf->freeNext;
			buf->freeNext = FREENEXT_NOT_IN_LIST;

			/*
			 * Release the lock so someone else can access the freelist while
			 * we check out this buffer.
			 */
			SpinLockRelease(&StrategyControl->buffer_strategy_lock);

			/*
			 * If the buffer is pinned or has a nonzero usage_count, we cannot
			 * use it; discard it and retry.  (This can only happen if VACUUM
			 * put a valid buffer in the freelist and then someone else used
			 * it before we got to it.  It's probably impossible altogether as
			 * of 8.3, but we'd better check anyway.)
			 */
			local_buf_state = LockBufHdr(buf);
			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
			{
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
			UnlockBufHdr(buf, local_buf_state);
		}
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = GetBufferDescriptor(ClockSweepTick());

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
		 * it; decrement the usage_count (unless pinned) and keep scanning.
		 */
		local_buf_state = LockBufHdr(buf);

		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
		{
			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
			{
				local_buf_state -= BUF_USAGECOUNT_ONE;

				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state changes,
			 * so all the buffers are pinned (or were when we looked at them).
			 * We could hope that someone will free one eventually, but it's
			 * probably better to fail than to risk getting stuck in an
			 * infinite loop.
			 */
			UnlockBufHdr(buf, local_buf_state);
			elog(ERROR, "no unpinned buffers available");
		}
		UnlockBufHdr(buf, local_buf_state);
	}
}

/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(BufferDesc *buf)
{
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

	/*
	 * It is possible that we are told to put something in the freelist that
	 * is already in it; don't screw up the list if so.
	 */
	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
	{
		buf->freeNext = StrategyControl->firstFreeBuffer;
		if (buf->freeNext < 0)
			StrategyControl->lastFreeBuffer = buf->buf_id;
		StrategyControl->firstFreeBuffer = buf->buf_id;
	}

	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
	uint32		nextVictimBuffer;
	int			result;

	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
	result = nextVictimBuffer % NBuffers;

	if (complete_passes)
	{
		*complete_passes = StrategyControl->completePasses;

		/*
		 * Additionally add the number of wraparounds that happened before
		 * completePasses could be incremented. C.f. ClockSweepTick().
		 */
		*complete_passes += nextVictimBuffer / NBuffers;
	}

	if (num_buf_alloc)
	{
		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
	}
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
	return result;
}

/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
	/*
	 * We acquire buffer_strategy_lock just to ensure that the store appears
	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
	 * infrequently, so there's no performance penalty from being safe.
	 */
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	StrategyControl->bgwprocno = bgwprocno;
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}
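
/*
 * For orientation: the bgwriter's hibernation path uses this roughly as
 * follows (a sketch, not a verbatim copy of bgwriter.c):
 *
 *		StrategyNotifyBgWriter(MyProc->pgprocno);
 *		... sleep on the process latch with a long timeout ...
 *		StrategyNotifyBgWriter(-1);
 *
 * so that a concurrent StrategyGetBuffer() call sets the bgwriter's latch
 * and ends the long sleep early.
 */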


/*
 * StrategyShmemSize
 *
 * estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
	Size		size = 0;

	/* size of lookup hash table ... see comment in StrategyInitialize */
	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

	/* size of the shared replacement strategy control block */
	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

	return size;
}

/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *		Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
	bool		found;

	/*
	 * Initialize the shared buffer lookup hashtable.
	 *
	 * Since we can't tolerate running out of lookup table entries, we must be
	 * sure to specify an adequate table size here.  The maximum steady-state
	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
	 * a new entry before deleting the old.  In principle this could be
	 * happening in each partition concurrently, so we could need as many as
	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
	 */
	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

	/*
	 * Get or create the shared strategy control block
	 */
	StrategyControl = (BufferStrategyControl *)
		ShmemInitStruct("Buffer Strategy Status",
						sizeof(BufferStrategyControl),
						&found);

	if (!found)
	{
		/*
		 * Only done once, usually in postmaster
		 */
		Assert(init);

		SpinLockInit(&StrategyControl->buffer_strategy_lock);

		/*
		 * Grab the whole linked list of free buffers for our strategy. We
		 * assume it was previously set up by InitBufferPool().
		 */
		StrategyControl->firstFreeBuffer = 0;
		StrategyControl->lastFreeBuffer = NBuffers - 1;

		/* Initialize the clock sweep pointer */
		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

		/* Clear statistics */
		StrategyControl->completePasses = 0;
		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

		/* No pending notification */
		StrategyControl->bgwprocno = -1;
	}
	else
		Assert(!init);
}


/* ----------------------------------------------------------------
 *				Backend-private buffer ring management
 * ----------------------------------------------------------------
 */


/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
	BufferAccessStrategy strategy;
	int			ring_size;

	/*
	 * Select ring size to use.  See buffer/README for rationales.
	 *
	 * Note: if you change the ring size for BAS_BULKREAD, see also
	 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
	 */
	switch (btype)
	{
		case BAS_NORMAL:
			/* if someone asks for NORMAL, just give 'em a "default" object */
			return NULL;

		case BAS_BULKREAD:
			ring_size = 256 * 1024 / BLCKSZ;
			break;
		case BAS_BULKWRITE:
			ring_size = 16 * 1024 * 1024 / BLCKSZ;
			break;
		case BAS_VACUUM:
			ring_size = 256 * 1024 / BLCKSZ;
			break;

		default:
			elog(ERROR, "unrecognized buffer access strategy: %d",
				 (int) btype);
			return NULL;		/* keep compiler quiet */
	}

	/* Make sure ring isn't an undue fraction of shared buffers */
	ring_size = Min(NBuffers / 8, ring_size);

	/* Allocate the object and initialize all elements to zeroes */
	strategy = (BufferAccessStrategy)
		palloc0(offsetof(BufferAccessStrategyData, buffers) +
				ring_size * sizeof(Buffer));

	/* Set fields that don't start out zero */
	strategy->btype = btype;
	strategy->ring_size = ring_size;

	return strategy;
}

/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
	/* don't crash if called on a "default" strategy */
	if (strategy != NULL)
		pfree(strategy);
}
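
/*
 * Example (a sketch of typical caller usage, not code from this module):
 * bulk operations such as VACUUM create a strategy once, pass it to the
 * buffer manager on each read, and free it when done:
 *
 *		BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_VACUUM);
 *
 *		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 *								 bstrategy);
 *		... use and release the buffer ...
 *
 *		FreeAccessStrategy(bstrategy);
 */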

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty.
 *
 * The bufhdr spin lock is held on the returned buffer.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	Buffer		bufnum;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/* Advance to next ring slot */
	if (++strategy->current >= strategy->ring_size)
		strategy->current = 0;

	/*
	 * If the slot hasn't been filled yet, tell the caller to allocate a new
	 * buffer with the normal allocation strategy.  He will then fill this
	 * slot by calling AddBufferToRing with the new buffer.
	 */
	bufnum = strategy->buffers[strategy->current];
	if (bufnum == InvalidBuffer)
	{
		strategy->current_was_in_ring = false;
		return NULL;
	}

	/*
	 * If the buffer is pinned we cannot use it under any circumstances.
	 *
	 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
	 * since our own previous usage of the ring element would have left it
	 * there, but it might've been decremented by clock sweep since then). A
	 * higher usage_count indicates someone else has touched the buffer, so we
	 * shouldn't re-use it.
	 */
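	/* Buffer numbers are 1-based; buffer descriptor indexes are 0-based. */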
	buf = GetBufferDescriptor(bufnum - 1);
	local_buf_state = LockBufHdr(buf);
	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
	{
		strategy->current_was_in_ring = true;
		*buf_state = local_buf_state;
		return buf;
	}
	UnlockBufHdr(buf, local_buf_state);

	/*
	 * Tell caller to allocate a new buffer with the normal allocation
	 * strategy.  He'll then replace this ring element via AddBufferToRing.
	 */
	strategy->current_was_in_ring = false;
	return NULL;
}

/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}
673
674/*
675 * StrategyRejectBuffer -- consider rejecting a dirty buffer
676 *
677 * When a nondefault strategy is used, the buffer manager calls this function
678 * when it turns out that the buffer selected by StrategyGetBuffer needs to
679 * be written out and doing so would require flushing WAL too. This gives us
680 * a chance to choose a different victim.
681 *
682 * Returns true if buffer manager should ask for a new victim, and false
683 * if this buffer should be written and re-used.
684 */
685bool
686StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
687{
688 /* We only do this in bulkread mode */
689 if (strategy->btype != BAS_BULKREAD)
690 return false;
691
692 /* Don't muck with behavior of normal buffer-replacement strategy */
693 if (!strategy->current_was_in_ring ||
694 strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
695 return false;
696
697 /*
698 * Remove the dirty buffer from the ring; necessary to prevent infinite
699 * loop if all ring members are dirty.
700 */
701 strategy->buffers[strategy->current] = InvalidBuffer;
702
703 return true;
704}
705