/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"


/*
 * Buffer state is a single 32-bit variable where the following data is
 * combined.
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows some operations to be performed without
 * locking the buffer header, by modifying them together with a CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
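
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): unpacking the three components of a buffer state word that was read
 * with pg_atomic_read_u32().
 */
static inline void
BufStateExampleDecode(uint32 buf_state,
					  uint32 *refcount, uint32 *usagecount, uint32 *flags)
{
	*refcount = BUF_STATE_GET_REFCOUNT(buf_state);
	*usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
	*flags = buf_state & BUF_FLAG_MASK;
}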

/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED (1U << 22)	/* buffer header is locked */
#define BM_DIRTY (1U << 23)	/* data needs writing */
#define BM_VALID (1U << 24)	/* data is valid */
#define BM_TAG_VALID (1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS (1U << 26)	/* read or write in progress */
#define BM_IO_ERROR (1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED (1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER (1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT (1U << 31)	/* permanent buffer (not unlogged,
								 * or init fork) */

/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT 5
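
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): the clock sweep in freelist.c can only evict a buffer once it is
 * unpinned and its usage count has been decremented to zero.
 */
static inline bool
ClockSweepExampleIsVictim(uint32 buf_state)
{
	return BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
		BUF_STATE_GET_USAGECOUNT(buf_state) == 0;
}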

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)
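
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): building a tag for a target block and checking whether an existing
 * buffer already holds that block.
 */
static inline bool
BufferTagExampleMatches(const BufferTag *existing, RelFileNode rnode,
						ForkNumber forkNum, BlockNumber blockNum)
{
	BufferTag	newTag;

	INIT_BUFFERTAG(newTag, rnode, forkNum, blockNum);
	return BUFFERTAGS_EQUAL(*existing, newTag);
}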

/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
		BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
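
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): mapping a tag's hash code (as computed by BufTableHashCode(),
 * declared further below) to its partition number and the LWLock that
 * protects that partition.
 */
static inline LWLock *
BufMappingExamplePartitionLock(uint32 hashcode, int *partition)
{
	*partition = BufTableHashPartition(hashcode);
	return BufMappingPartitionLockByIndex(*partition);
}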

/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pid fields.  In general, the buffer header
 * lock is a spinlock which is combined with the flags, refcount and
 * usagecount into a single atomic variable.  This layout allows us to do some
 * operations in a single atomic operation, without actually acquiring and
 * releasing the spinlock; for instance, increasing or decreasing the
 * refcount.  The buf_id field never changes after initialization, so it does
 * not need locking.  freeNext is protected by the buffer_strategy_lock, not
 * the buffer header lock.  The LWLock can take care of itself.  The buffer
 * header lock is *not* used to control access to the data in the buffer!
 *
 * It's assumed that nobody else changes the state field while the buffer
 * header lock is held.  Thus the buffer header lock holder can do complex
 * updates of the state variable in a single write, simultaneously with lock
 * release (clearing the BM_LOCKED flag).  On the other hand, updating the
 * state without holding the buffer header lock is restricted to CAS, which
 * ensures that the BM_LOCKED flag is not set.  Atomic increment/decrement,
 * OR/AND etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either.  To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
	BufferTag tag;				/* ID of page contained in buffer */
	int buf_id;					/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int wait_backend_pid;		/* backend PID of pin-count waiter */
	int freeNext;				/* link in freelist chain */

	LWLock content_lock;		/* to lock access to buffer contents */
} BufferDesc;
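
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): pinning a buffer by bumping the refcount with a compare-and-swap
 * loop, as described above.  Unlike the real pinning code, this sketch
 * simply gives up if the header spinlock happens to be held.
 */
static inline bool
BufferDescExampleTryPin(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		uint32		buf_state = old_buf_state;

		/* a CAS must never succeed while the header spinlock is held */
		if (buf_state & BM_LOCKED)
			return false;

		buf_state += BUF_REFCOUNT_ONE;
		if (pg_atomic_compare_exchange_u32(&buf->state,
										   &old_buf_state, buf_state))
			return true;
		/* CAS failed; old_buf_state now holds the current value, so retry */
	}
}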

/*
 * Concurrent access to buffer headers has proven to be more efficient when
 * they are cache-line aligned.  So we force the start of the
 * BufferDescriptors array to be on a cache line boundary and force the
 * elements to be cache line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which
 * probably all are 64-bit these days, and the space wastage would be a bit
 * more noticeable on 32-bit systems, we don't force the stride to be cache
 * line sized on those.  If somebody does actual performance testing, we can
 * reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned; as there's
 * no concurrent access to those, it's unlikely to be beneficial.
 *
 * We use 64 bytes as the cache line size here, because that's the most
 * common size.  Making it bigger would waste memory.  Even on a platform
 * with 32- or 128-byte cache lines, aligning to a consistent boundary is
 * still helpful for reducing false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc bufferdesc;
	char pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
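
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): the content lock, not the buffer header lock, is what protects the
 * page data itself.
 */
static inline void
BufferDescExampleWithContentLock(BufferDesc *bdesc)
{
	LWLockAcquire(BufferDescriptorGetContentLock(bdesc), LW_EXCLUSIVE);
	/* ... examine or modify the page contents here ... */
	LWLockRelease(BufferDescriptorGetContentLock(bdesc));
}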

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST (-1)
#define FREENEXT_NOT_IN_LIST (-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s) \
	do { \
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)
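
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): LockBufHdr() returns the state word with BM_LOCKED set; the caller
 * modifies that local copy and publishes it with UnlockBufHdr(), which also
 * clears BM_LOCKED.
 */
static inline void
BufHdrExampleSetFlags(BufferDesc *desc, uint32 flags)
{
	uint32		buf_state = LockBufHdr(desc);

	buf_state |= flags;
	UnlockBufHdr(desc, buf_state);
}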


/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int *max_pending;

	/* current number of pending writeback requests */
	int nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;

/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure used to sort buffers per file during checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid tsId;
	Oid relNode;
	ForkNumber forkNum;
	BlockNumber blockNum;
	int buf_id;
} CkptSortItem;

extern CkptSortItem *CkptBufferIds;
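
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): a CkptSortItem is simply the buffer's tag flattened into sortable
 * fields, plus the buffer's index.
 */
static inline void
CkptSortItemExampleFill(CkptSortItem *item, const BufferDesc *bdesc)
{
	item->tsId = bdesc->tag.rnode.spcNode;
	item->relNode = bdesc->tag.rnode.relNode;
	item->forkNum = bdesc->tag.forkNum;
	item->blockNum = bdesc->tag.blockNum;
	item->buf_id = bdesc->buf_id;
}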

/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
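
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): after writing a buffer out, its tag is remembered in a
 * WritebackContext so that neighboring writes can be coalesced; the pending
 * requests are then flushed to the OS (here immediately, for simplicity).
 */
static inline void
WritebackExampleScheduleAndFlush(WritebackContext *wb_context, BufferTag *tag)
{
	ScheduleBufferTagForWriteback(wb_context, tag);
	IssuePendingWritebacks(wb_context);
}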

/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
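
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): StrategyGetBuffer() hands back a victim buffer with its header
 * spinlock already held and the locked state word in *buf_state.  Real
 * callers (see BufferAlloc in bufmgr.c) pin the victim before releasing the
 * header lock; this sketch just releases it.
 */
static inline BufferDesc *
StrategyExampleGetVictim(BufferAccessStrategy strategy)
{
	uint32		buf_state;
	BufferDesc *buf = StrategyGetBuffer(strategy, &buf_state);

	UnlockBufHdr(buf, buf_state);

	return buf;
}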

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
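
/*
 * Illustrative sketch (hypothetical helper, not part of the buffer manager
 * API): the full lookup protocol against the partitioned buffer mapping
 * table, holding only the relevant partition lock, in shared mode.
 */
static inline int
BufTableExampleLookup(BufferTag *tag)
{
	uint32		hashcode = BufTableHashCode(tag);
	LWLock	   *partitionLock = BufMappingPartitionLock(hashcode);
	int			buf_id;

	LWLockAcquire(partitionLock, LW_SHARED);
	buf_id = BufTableLookup(tag, hashcode);
	LWLockRelease(partitionLock);

	return buf_id;				/* -1 if no buffer holds this tag */
}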

/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
								BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
										BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */