/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "port/atomics.h"
#include "storage/spin.h"
#include "utils/relcache.h"


/*
 * Buffer state is a single 32-bit variable in which the following data are
 * combined:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows us to perform some operations without
 * locking the buffer header, by modifying them all together with a CAS loop.
 *
 * The definitions of the buffer state components are below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
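
/*
 * Illustrative sketch (not part of this header): given a state word read
 * with pg_atomic_read_u32(), the individual components can be extracted
 * with the macros above, e.g.
 *
 *		uint32	state = pg_atomic_read_u32(&buf->state);
 *		uint32	refcount = BUF_STATE_GET_REFCOUNT(state);
 *		uint32	usagecount = BUF_STATE_GET_USAGECOUNT(state);
 *		bool	dirty = (state & BM_DIRTY) != 0;
 *
 * Such an unlocked read is only a snapshot; see the BufferDesc comments
 * below for the rules on modifying the state word.
 */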

/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
#define BM_DIRTY				(1U << 23)	/* data needs writing */
#define BM_VALID				(1U << 24)	/* data is valid */
#define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
#define BM_IO_ERROR				(1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT			(1U << 31)	/* permanent buffer (not unlogged,
											 * or init fork) */

/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5
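
/*
 * Illustrative sketch of the tradeoff (loosely based on the clock sweep in
 * freelist.c, many details omitted): each pass over an unpinned buffer with
 * a nonzero usage count merely ages it by one; only a buffer with both a
 * zero refcount and a zero usage count is taken as a victim.
 *
 *		buf_state = LockBufHdr(buf);
 *		if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
 *			BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
 *			... use this buffer as the victim ...
 *		else if (BUF_STATE_GET_REFCOUNT(buf_state) == 0)
 *			buf_state -= BUF_USAGECOUNT_ONE;
 *		UnlockBufHdr(buf, buf_state);
 *
 * Hence a buffer at BM_MAX_USAGE_COUNT may survive up to that many complete
 * sweeps before becoming eligible for replacement.
 */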

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)
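
/*
 * Typical use (a sketch, not part of this header): build a tag for a block
 * of a relation and compare it against a descriptor's tag:
 *
 *		BufferTag	newTag;
 *
 *		INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 *		if (BUFFERTAGS_EQUAL(newTag, buf->tag))
 *			... the buffer already holds the desired block ...
 */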

/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
		BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
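
/*
 * Sketch of the lookup protocol implied above (simplified from its use in
 * bufmgr.c): hash the tag once, take the corresponding partition lock in
 * shared mode, and probe the mapping table while holding that lock.
 *
 *		uint32	newHash = BufTableHashCode(&newTag);
 *		LWLock *newPartitionLock = BufMappingPartitionLock(newHash);
 *
 *		LWLockAcquire(newPartitionLock, LW_SHARED);
 *		buf_id = BufTableLookup(&newTag, newHash);
 *		...
 *		LWLockRelease(newPartitionLock);
 */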

/*
 *	BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or
 * change the tag, state or wait_backend_pid fields.  In general, the buffer
 * header lock is a spinlock which is combined with the flags, refcount and
 * usagecount into a single atomic variable.  This layout allows us to do
 * some operations in a single atomic operation, without actually acquiring
 * and releasing the spinlock; for instance, increasing or decreasing the
 * refcount.  The buf_id field never changes after initialization, so it does
 * not need locking.  freeNext is protected by the buffer_strategy_lock, not
 * the buffer header lock.  The LWLock can take care of itself.  The buffer
 * header lock is *not* used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while the buffer header
 * lock is held.  Thus the holder of the buffer header lock can do complex
 * updates of the state variable in a single write, simultaneously with the
 * lock release (clearing the BM_LOCKED flag).  On the other hand, updating
 * the state without holding the buffer header lock is restricted to CAS,
 * which ensures that the BM_LOCKED flag is not set.  Atomic
 * increment/decrement, OR/AND etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either. To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pid;	/* backend PID of pin-count waiter */
	int			freeNext;		/* link in freelist chain */

	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;
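
/*
 * Illustrative sketch of an unlocked state update as described above
 * (loosely following PinBuffer() in bufmgr.c): retry a compare-and-swap
 * until it succeeds, but only when BM_LOCKED is not set in the value about
 * to be replaced.
 *
 *		old_buf_state = pg_atomic_read_u32(&buf->state);
 *		for (;;)
 *		{
 *			if (old_buf_state & BM_LOCKED)
 *				old_buf_state = WaitBufHdrUnlocked(buf);
 *
 *			buf_state = old_buf_state;
 *			buf_state += BUF_REFCOUNT_ONE;
 *
 *			if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
 *											   buf_state))
 *				break;
 *		}
 *
 * (WaitBufHdrUnlocked() here stands for a spin-wait until the header lock
 * is released; bufmgr.c has a private helper of that name.)
 */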

/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned. So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which these
 * days probably all run on 64-bit hardware, and the space wastage would be a
 * bit more noticeable on 32-bit systems, we don't force the stride to be
 * cache line sized on those. If somebody does actual performance testing, we
 * can reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those, it's unlikely to be beneficial.
 *
 * We use 64 bytes as the cache line size here, because that's the most
 * common size. Making it bigger would be a waste of memory. Even if running
 * on a platform with either 32 or 128 byte line sizes, it's good to align to
 * boundaries and avoid false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;
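
/*
 * A hypothetical compile-time check of the padding assumption (a sketch;
 * only meaningful where the stride is actually padded, i.e. 64-bit builds,
 * and any real check of this kind belongs in a .c file rather than here):
 *
 *		#if SIZEOF_VOID_P == 8
 *		StaticAssertStmt(sizeof(BufferDesc) <= BUFFERDESC_PAD_TO_SIZE,
 *						 "BufferDesc is larger than its padded size");
 *		#endif
 */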

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s)	\
	do {	\
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)
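
/*
 * Typical usage pattern (a sketch): LockBufHdr() returns the state word with
 * BM_LOCKED set; the caller modifies its local copy and hands the final
 * value to UnlockBufHdr(), which clears BM_LOCKED in the same write.
 *
 *		buf_state = LockBufHdr(buf);
 *		buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
 *		UnlockBufHdr(buf, buf_state);
 */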


/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;
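
/*
 * Sketch of the intended flow (simplified; the real callers live in
 * bufmgr.c): initialize a context with a pointer to the setting that bounds
 * coalescing, queue tags as buffers are written out, and eventually push the
 * accumulated requests to the OS.
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *		...
 *		ScheduleBufferTagForWriteback(&wb_context, &buf->tag);
 *		...
 *		IssuePendingWritebacks(&wb_context);
 */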

/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure used for sorting buffers per file (and block within file) during
 * checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	Oid			relNode;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;
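
/*
 * Sketch of how these entries are intended to be ordered (loosely following
 * the comparator in bufmgr.c): sort by tablespace, then relation, then fork,
 * then block number, so that a checkpoint writes each file sequentially.
 *
 *		if (a->tsId != b->tsId)
 *			return (a->tsId < b->tsId) ? -1 : 1;
 *		if (a->relNode != b->relNode)
 *			return (a->relNode < b->relNode) ? -1 : 1;
 *		if (a->forkNum != b->forkNum)
 *			return (a->forkNum < b->forkNum) ? -1 : 1;
 *		if (a->blockNum != b->blockNum)
 *			return (a->blockNum < b->blockNum) ? -1 : 1;
 *		return 0;
 */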

extern CkptSortItem *CkptBufferIds;

/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);

/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);

/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
								BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
										BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */