/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "port/atomics.h"
#include "storage/spin.h"
#include "utils/relcache.h"


/*
 * Buffer state is a single 32-bit variable that combines the following data:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows us to perform some operations without
 * locking the buffer header, by modifying them all together in a CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
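
/*
 * For exposition only, a hypothetical helper (not part of the real buffer
 * manager API) sketching how the packed fields decode: a state word holding
 * a refcount of 3 and a usage count of 2 reads back as exactly those values.
 */
static inline bool
IllustrateBufStateDecoding(void)
{
	uint32		state = 3 * BUF_REFCOUNT_ONE + 2 * BUF_USAGECOUNT_ONE;

	/* Each extraction undoes the packing above, so this is always true. */
	return BUF_STATE_GET_REFCOUNT(state) == 3 &&
		BUF_STATE_GET_USAGECOUNT(state) == 2;
}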

/*
 * Flags for buffer descriptors
 *
 * Note: TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED (1U << 22)	/* buffer header is locked */
#define BM_DIRTY (1U << 23)	/* data needs writing */
#define BM_VALID (1U << 24)	/* data is valid */
#define BM_TAG_VALID (1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS (1U << 26)	/* read or write in progress */
#define BM_IO_ERROR (1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED (1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER (1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT (1U << 31)	/* permanent buffer (not unlogged,
								 * or init fork) */

/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm. A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT 5
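
/*
 * A hedged sketch of the per-visit arithmetic (the real logic lives in
 * freelist.c's StrategyGetBuffer, which performs it under the buffer header
 * lock): each clock-sweep pass decrements a nonzero usage count, and a
 * buffer whose count has reached zero is an eviction candidate if it is
 * also unpinned. This helper is illustrative only and operates on a bare
 * state word.
 */
static inline uint32
IllustrativeClockSweepTick(uint32 buf_state)
{
	if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
		return buf_state - BUF_USAGECOUNT_ONE;	/* recently used; spare it */
	return buf_state;			/* usage count 0: candidate victim */
}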

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries. It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel). The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to start of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)
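
/*
 * Illustration only (a hypothetical helper; the relation and block number
 * supplied by the caller carry no special meaning here): how the macros
 * above combine to build and compare tags. MAIN_FORKNUM is the relation's
 * main fork.
 */
static inline bool
IllustrateBufferTagUsage(RelFileNode rnode, BlockNumber blockNum)
{
	BufferTag	tag1;
	BufferTag	tag2;

	CLEAR_BUFFERTAG(tag1);		/* reset every field to its invalid value */
	INIT_BUFFERTAG(tag1, rnode, MAIN_FORKNUM, blockNum);
	INIT_BUFFERTAG(tag2, rnode, MAIN_FORKNUM, blockNum);

	return BUFFERTAGS_EQUAL(tag1, tag2);	/* same fields, so always true */
}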

/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
		BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)

/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pid fields. In general, the buffer header
 * lock is a spinlock which is combined with the flags, refcount and
 * usagecount into a single atomic variable. This layout allows us to do some
 * operations in a single atomic operation, without actually acquiring and
 * releasing the spinlock; for instance, increasing or decreasing the
 * refcount. The buf_id field never changes after initialization, so it does
 * not need locking. freeNext is protected by the buffer_strategy_lock, not
 * the buffer header lock. The LWLock can take care of itself. The buffer
 * header lock is *not* used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while buffer header lock
 * is held. Thus the buffer header lock holder can do complex updates of the
 * state variable in a single write, simultaneously with lock release
 * (clearing the BM_LOCKED flag). On the other hand, updating the state
 * without holding the buffer header lock is restricted to CAS, which ensures
 * that the BM_LOCKED flag is not set. Atomic increment/decrement, OR/AND
 * etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned. Hence, a backend may need to wait for all other pins
 * to go away. This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER. At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either. To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members. Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pid;	/* backend PID of pin-count waiter */
	int			freeNext;		/* link in freelist chain */

	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;
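
/*
 * Illustrative sketch only, not bufmgr.c's code: the comment above says that
 * unlocked state updates must go through CAS. A pin could be taken this way;
 * the real PinBuffer() follows the same general pattern, but additionally
 * maintains the usage count and spin-delays politely while BM_LOCKED is set.
 */
static inline uint32
IllustrativePinViaCAS(BufferDesc *buf)
{
	uint32		old_state = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		/* CAS must not succeed while the header spinlock is held. */
		if (old_state & BM_LOCKED)
		{
			old_state = pg_atomic_read_u32(&buf->state);
			continue;
		}

		/*
		 * On failure, CAS refreshes old_state with the current value, so we
		 * simply retry.
		 */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_state,
										   old_state + BUF_REFCOUNT_ONE))
			return old_state + BUF_REFCOUNT_ONE;
	}
}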

/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned. So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which these
 * days probably all run on 64-bit hardware, and the space wastage would be a
 * bit more noticeable on 32-bit systems, we don't force the stride to be
 * cache line sized on those. If somebody does actual performance testing,
 * we can reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use 64 bytes as the cache line size here, because that's the most
 * common size. Making it bigger would be a waste of memory. Even if running
 * on a platform with either 32 or 128 byte line sizes, it's good to align to
 * 64-byte boundaries and avoid false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST (-1)
#define FREENEXT_NOT_IN_LIST (-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock. Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s) \
	do { \
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)
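
/*
 * Usage sketch only (a hypothetical helper, not code from bufmgr.c, whose
 * flag updates use this pattern or its CAS equivalent): under the header
 * lock, edit the state copy returned by LockBufHdr, then let UnlockBufHdr
 * publish the new flags and the lock release together in a single write, as
 * the BufferDesc comment above describes.
 */
static inline void
IllustrativeSetDirtyFlags(BufferDesc *buf)
{
	uint32		buf_state;

	buf_state = LockBufHdr(buf);	/* returns the state, BM_LOCKED set */
	buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
	UnlockBufHdr(buf, buf_state);	/* writes the state, BM_LOCKED cleared */
}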


/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;

/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;
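
/*
 * For exposition (a sketch under the assumption that buffer refers to a
 * pinned shared buffer, i.e. buffer > 0): Buffers are 1-based while buf_id
 * is 0-based, so conversions go through the +1/-1 pair below, and content
 * locks are reached through the padded-union accessor macros.
 */
static inline void
IllustrateDescriptorAccess(Buffer buffer)
{
	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);

	Assert(BufferDescriptorGetBuffer(bufHdr) == buffer);

	/* Examine page contents under the shared content lock. */
	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
	/* ... read the page here ... */
	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
}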

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure used to sort buffers per file during checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	Oid			relNode;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;

extern CkptSortItem *CkptBufferIds;
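
/*
 * Illustrative comparator only; bufmgr.c's own ckpt_buforder_comparator
 * implements this ordering idea. Sorting by tablespace, relation, fork and
 * block lets the checkpointer write each file sequentially.
 */
static inline int
IllustrativeCkptItemCompare(const CkptSortItem *a, const CkptSortItem *b)
{
	if (a->tsId != b->tsId)
		return (a->tsId < b->tsId) ? -1 : 1;
	if (a->relNode != b->relNode)
		return (a->relNode < b->relNode) ? -1 : 1;
	if (a->forkNum != b->forkNum)
		return (a->forkNum < b->forkNum) ? -1 : 1;
	if (a->blockNum != b->blockNum)
		return (a->blockNum < b->blockNum) ? -1 : 1;
	return 0;
}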

/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
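
/*
 * Hedged usage sketch (a hypothetical caller; real callers such as
 * BufferSync live in bufmgr.c): a writeback context batches flushed-buffer
 * tags, and ScheduleBufferTagForWriteback issues a batch to the OS on its
 * own once *flush_after requests have accumulated.
 */
static inline void
IllustrateWritebackUsage(BufferTag *tag, int *flush_after)
{
	WritebackContext wb_context;

	WritebackContextInit(&wb_context, flush_after);

	/* Queue a single tag; batches are issued automatically when full. */
	ScheduleBufferTagForWriteback(&wb_context, tag);

	/* Force out anything still pending. */
	IssuePendingWritebacks(&wb_context);
}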

/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
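
/*
 * Illustrative sketch of the partitioned-lookup protocol described above the
 * BufMappingPartitionLock() macro (assumption-laden; the real sequence in
 * bufmgr.c's BufferAlloc pins the buffer before releasing the partition
 * lock, since the mapping may change the instant the lock is dropped).
 */
static inline int
IllustrativeBufTableProbe(BufferTag *tag)
{
	uint32		hashcode = BufTableHashCode(tag);
	LWLock	   *partitionLock = BufMappingPartitionLock(hashcode);
	int			buf_id;

	LWLockAcquire(partitionLock, LW_SHARED);
	buf_id = BufTableLookup(tag, hashcode); /* -1 if not found */
	LWLockRelease(partitionLock);

	return buf_id;
}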

/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
								BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
										BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */