| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * slru.c |
| 4 | * Simple LRU buffering for transaction status logfiles |
| 5 | * |
| 6 | * We use a simple least-recently-used scheme to manage a pool of page |
| 7 | * buffers. Under ordinary circumstances we expect that write |
| 8 | * traffic will occur mostly to the latest page (and to the just-prior |
| 9 | * page, soon after a page transition). Read traffic will probably touch |
| 10 | * a larger span of pages, but in any case a fairly small number of page |
| 11 | * buffers should be sufficient. So, we just search the buffers using plain |
| 12 | * linear search; there's no need for a hashtable or anything fancy. |
| 13 | * The management algorithm is straight LRU except that we will never swap |
| 14 | * out the latest page (since we know it's going to be hit again eventually). |
| 15 | * |
| 16 | * We use a control LWLock to protect the shared data structures, plus |
| 17 | * per-buffer LWLocks that synchronize I/O for each buffer. The control lock |
| 18 | * must be held to examine or modify any shared state. A process that is |
| 19 | * reading in or writing out a page buffer does not hold the control lock, |
| 20 | * only the per-buffer lock for the buffer it is working on. |
| 21 | * |
| 22 | * "Holding the control lock" means exclusive lock in all cases except for |
| 23 | * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for |
| 24 | * the implications of that. |
| 25 | * |
| 26 | * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively |
| 27 | * before releasing the control lock. The per-buffer lock is released after |
| 28 | * completing the I/O, re-acquiring the control lock, and updating the shared |
| 29 | * state. (Deadlock is not possible here, because we never try to initiate |
| 30 | * I/O when someone else is already doing I/O on the same buffer.) |
| 31 | * To wait for I/O to complete, release the control lock, acquire the |
| 32 | * per-buffer lock in shared mode, immediately release the per-buffer lock, |
| 33 | * reacquire the control lock, and then recheck state (since arbitrary things |
| 34 | * could have happened while we didn't have the lock). |
| 35 | * |
| 36 | * As with the regular buffer manager, it is possible for another process |
| 37 | * to re-dirty a page that is currently being written out. This is handled |
| 38 | * by re-setting the page's page_dirty flag. |
| 39 | * |
| 40 | * |
| 41 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 42 | * Portions Copyright (c) 1994, Regents of the University of California |
| 43 | * |
| 44 | * src/backend/access/transam/slru.c |
| 45 | * |
| 46 | *------------------------------------------------------------------------- |
| 47 | */ |
| 48 | #include "postgres.h" |
| 49 | |
| 50 | #include <fcntl.h> |
| 51 | #include <sys/stat.h> |
| 52 | #include <unistd.h> |
| 53 | |
| 54 | #include "access/slru.h" |
| 55 | #include "access/transam.h" |
| 56 | #include "access/xlog.h" |
| 57 | #include "pgstat.h" |
| 58 | #include "storage/fd.h" |
| 59 | #include "storage/shmem.h" |
| 60 | #include "miscadmin.h" |
| 61 | |
| 62 | |
/*
 * Construct the physical file path for an SLRU segment: the segment number
 * is rendered as four uppercase hex digits under the cache's directory.
 */
#define SlruFileName(ctl, path, seg) \
	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
| 65 | |
| 66 | /* |
| 67 | * During SimpleLruFlush(), we will usually not need to write/fsync more |
| 68 | * than one or two physical files, but we may need to write several pages |
| 69 | * per file. We can consolidate the I/O requests by leaving files open |
| 70 | * until control returns to SimpleLruFlush(). This data structure remembers |
| 71 | * which files are open. |
| 72 | */ |
| 73 | #define MAX_FLUSH_BUFFERS 16 |
| 74 | |
typedef struct SlruFlushData
{
	int			num_files;		/* # files actually open */
	int			fd[MAX_FLUSH_BUFFERS];	/* their FD's */
	int			segno[MAX_FLUSH_BUFFERS];	/* their log seg#s */
} SlruFlushData;

/* Flush state is passed around by pointer; NULL means "standalone write" */
typedef struct SlruFlushData *SlruFlush;
| 83 | |
| 84 | /* |
| 85 | * Macro to mark a buffer slot "most recently used". Note multiple evaluation |
| 86 | * of arguments! |
| 87 | * |
| 88 | * The reason for the if-test is that there are often many consecutive |
| 89 | * accesses to the same page (particularly the latest page). By suppressing |
| 90 | * useless increments of cur_lru_count, we reduce the probability that old |
| 91 | * pages' counts will "wrap around" and make them appear recently used. |
| 92 | * |
| 93 | * We allow this code to be executed concurrently by multiple processes within |
| 94 | * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic, |
| 95 | * this should not cause any completely-bogus values to enter the computation. |
| 96 | * However, it is possible for either cur_lru_count or individual |
| 97 | * page_lru_count entries to be "reset" to lower values than they should have, |
| 98 | * in case a process is delayed while it executes this macro. With care in |
| 99 | * SlruSelectLRUPage(), this does little harm, and in any case the absolute |
| 100 | * worst possible consequence is a nonoptimal choice of page to evict. The |
| 101 | * gain from allowing concurrent reads of SLRU pages seems worth it. |
| 102 | */ |
#define SlruRecentlyUsed(shared, slotno) \
	do { \
		/* snapshot of global counter; stale reads are harmless (see above) */ \
		int		new_lru_count = (shared)->cur_lru_count; \
		if (new_lru_count != (shared)->page_lru_count[slotno]) { \
			/* advance the clock and stamp this slot as most recently used */ \
			(shared)->cur_lru_count = ++new_lru_count; \
			(shared)->page_lru_count[slotno] = new_lru_count; \
		} \
	} while (0)
| 111 | |
/* Saved info for SlruReportIOError */
typedef enum
{
	SLRU_OPEN_FAILED,
	SLRU_SEEK_FAILED,
	SLRU_READ_FAILED,
	SLRU_WRITE_FAILED,
	SLRU_FSYNC_FAILED,
	SLRU_CLOSE_FAILED
} SlruErrorCause;

/*
 * Set by the physical-I/O routines when they fail, then consumed by
 * SlruReportIOError once it is safe to ereport (i.e. after shared state
 * has been restored to a sane condition).
 */
static SlruErrorCause slru_errcause;
static int	slru_errno;


static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
								  SlruFlush fdata);
static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
static int	SlruSelectLRUPage(SlruCtl ctl, int pageno);

static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
									  int segpage, void *data);
static void SlruInternalDeleteSegment(SlruCtl ctl, char *filename);
| 139 | |
| 140 | /* |
| 141 | * Initialization of shared memory |
| 142 | */ |
| 143 | |
| 144 | Size |
| 145 | SimpleLruShmemSize(int nslots, int nlsns) |
| 146 | { |
| 147 | Size sz; |
| 148 | |
| 149 | /* we assume nslots isn't so large as to risk overflow */ |
| 150 | sz = MAXALIGN(sizeof(SlruSharedData)); |
| 151 | sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */ |
| 152 | sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */ |
| 153 | sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */ |
| 154 | sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */ |
| 155 | sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */ |
| 156 | sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */ |
| 157 | |
| 158 | if (nlsns > 0) |
| 159 | sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ |
| 160 | |
| 161 | return BUFFERALIGN(sz) + BLCKSZ * nslots; |
| 162 | } |
| 163 | |
/*
 * Initialize, or attach to, a simple LRU cache in shared memory.
 *
 * name: name of SLRU (used as the shmem chunk name and LWLock tranche name).
 * nslots: number of page slots to use.
 * nlsns: number of group LSNs per page (zero if async-commit LSN tracking
 *		is not needed for this SLRU).
 * ctllock: LWLock to use to control access to the shared control structure.
 * subdir: PGDATA-relative subdirectory that will contain the on-disk files.
 * tranche_id: LWLock tranche ID for the per-buffer LWLocks.
 */
void
SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
			  LWLock *ctllock, const char *subdir, int tranche_id)
{
	SlruShared	shared;
	bool		found;

	shared = (SlruShared) ShmemInitStruct(name,
										  SimpleLruShmemSize(nslots, nlsns),
										  &found);

	if (!IsUnderPostmaster)
	{
		/* Initialize locks and shared memory area */
		char	   *ptr;
		Size		offset;
		int			slotno;

		Assert(!found);

		memset(shared, 0, sizeof(SlruSharedData));

		shared->ControlLock = ctllock;

		shared->num_slots = nslots;
		shared->lsn_groups_per_page = nlsns;

		shared->cur_lru_count = 0;

		/* shared->latest_page_number will be set later */

		/*
		 * Carve the per-slot bookkeeping arrays out of the space immediately
		 * following the control struct; this must track the layout assumed
		 * by SimpleLruShmemSize.
		 */
		ptr = (char *) shared;
		offset = MAXALIGN(sizeof(SlruSharedData));
		shared->page_buffer = (char **) (ptr + offset);
		offset += MAXALIGN(nslots * sizeof(char *));
		shared->page_status = (SlruPageStatus *) (ptr + offset);
		offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
		shared->page_dirty = (bool *) (ptr + offset);
		offset += MAXALIGN(nslots * sizeof(bool));
		shared->page_number = (int *) (ptr + offset);
		offset += MAXALIGN(nslots * sizeof(int));
		shared->page_lru_count = (int *) (ptr + offset);
		offset += MAXALIGN(nslots * sizeof(int));

		/* Initialize LWLocks */
		shared->buffer_locks = (LWLockPadded *) (ptr + offset);
		offset += MAXALIGN(nslots * sizeof(LWLockPadded));

		if (nlsns > 0)
		{
			shared->group_lsn = (XLogRecPtr *) (ptr + offset);
			offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
		}

		Assert(strlen(name) + 1 < SLRU_MAX_NAME_LENGTH);
		strlcpy(shared->lwlock_tranche_name, name, SLRU_MAX_NAME_LENGTH);
		shared->lwlock_tranche_id = tranche_id;

		/* The page buffers follow the bookkeeping arrays, buffer-aligned */
		ptr += BUFFERALIGN(offset);
		for (slotno = 0; slotno < nslots; slotno++)
		{
			LWLockInitialize(&shared->buffer_locks[slotno].lock,
							 shared->lwlock_tranche_id);

			shared->page_buffer[slotno] = ptr;
			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
			shared->page_dirty[slotno] = false;
			shared->page_lru_count[slotno] = 0;
			ptr += BLCKSZ;
		}

		/* Should fit to estimated shmem size */
		Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
	}
	else
		Assert(found);

	/* Register SLRU tranche in the main tranches array */
	LWLockRegisterTranche(shared->lwlock_tranche_id,
						  shared->lwlock_tranche_name);

	/*
	 * Initialize the unshared control struct, including directory path. We
	 * assume caller set PagePrecedes.
	 */
	ctl->shared = shared;
	ctl->do_fsync = true;		/* default behavior */
	StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir));
}
| 253 | |
| 254 | /* |
| 255 | * Initialize (or reinitialize) a page to zeroes. |
| 256 | * |
| 257 | * The page is not actually written, just set up in shared memory. |
| 258 | * The slot number of the new page is returned. |
| 259 | * |
| 260 | * Control lock must be held at entry, and will be held at exit. |
| 261 | */ |
int
SimpleLruZeroPage(SlruCtl ctl, int pageno)
{
	SlruShared	shared = ctl->shared;
	int			slotno;

	/* Find a suitable buffer slot for the page */
	slotno = SlruSelectLRUPage(ctl, pageno);
	/* The chosen slot must be free, clean, or already holding this page */
	Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
		   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
			!shared->page_dirty[slotno]) ||
		   shared->page_number[slotno] == pageno);

	/*
	 * Mark the slot as containing this page; dirty from the start, since
	 * the zeroed contents have never been written out.
	 */
	shared->page_number[slotno] = pageno;
	shared->page_status[slotno] = SLRU_PAGE_VALID;
	shared->page_dirty[slotno] = true;
	SlruRecentlyUsed(shared, slotno);

	/* Set the buffer to zeroes */
	MemSet(shared->page_buffer[slotno], 0, BLCKSZ);

	/* Set the LSNs for this new page to zero */
	SimpleLruZeroLSNs(ctl, slotno);

	/* Assume this page is now the latest active page */
	shared->latest_page_number = pageno;

	return slotno;
}
| 292 | |
| 293 | /* |
| 294 | * Zero all the LSNs we store for this slru page. |
| 295 | * |
| 296 | * This should be called each time we create a new page, and each time we read |
| 297 | * in a page from disk into an existing buffer. (Such an old page cannot |
| 298 | * have any interesting LSNs, since we'd have flushed them before writing |
| 299 | * the page in the first place.) |
| 300 | * |
| 301 | * This assumes that InvalidXLogRecPtr is bitwise-all-0. |
| 302 | */ |
| 303 | static void |
| 304 | SimpleLruZeroLSNs(SlruCtl ctl, int slotno) |
| 305 | { |
| 306 | SlruShared shared = ctl->shared; |
| 307 | |
| 308 | if (shared->lsn_groups_per_page > 0) |
| 309 | MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0, |
| 310 | shared->lsn_groups_per_page * sizeof(XLogRecPtr)); |
| 311 | } |
| 312 | |
| 313 | /* |
| 314 | * Wait for any active I/O on a page slot to finish. (This does not |
| 315 | * guarantee that new I/O hasn't been started before we return, though. |
| 316 | * In fact the slot might not even contain the same page anymore.) |
| 317 | * |
| 318 | * Control lock must be held at entry, and will be held at exit. |
| 319 | */ |
static void
SimpleLruWaitIO(SlruCtl ctl, int slotno)
{
	SlruShared	shared = ctl->shared;

	/*
	 * See notes at top of file: we wait by momentarily taking the per-buffer
	 * lock in shared mode, since the process doing the I/O holds it
	 * exclusively for the duration.  The control lock must be dropped first
	 * to avoid deadlocking against that process.
	 */
	LWLockRelease(shared->ControlLock);
	LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
	LWLockRelease(&shared->buffer_locks[slotno].lock);
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	/*
	 * If the slot is still in an io-in-progress state, then either someone
	 * already started a new I/O on the slot, or a previous I/O failed and
	 * neglected to reset the page state.  That shouldn't happen, really, but
	 * it seems worth a few extra cycles to check and recover from it. We can
	 * cheaply test for failure by seeing if the buffer lock is still held (we
	 * assume that transaction abort would release the lock).
	 */
	if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
		shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
	{
		if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
		{
			/* indeed, the I/O must have failed */
			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
				shared->page_status[slotno] = SLRU_PAGE_EMPTY;
			else				/* write_in_progress */
			{
				/* a failed write leaves the page valid but still dirty */
				shared->page_status[slotno] = SLRU_PAGE_VALID;
				shared->page_dirty[slotno] = true;
			}
			LWLockRelease(&shared->buffer_locks[slotno].lock);
		}
	}
}
| 356 | |
| 357 | /* |
| 358 | * Find a page in a shared buffer, reading it in if necessary. |
| 359 | * The page number must correspond to an already-initialized page. |
| 360 | * |
| 361 | * If write_ok is true then it is OK to return a page that is in |
| 362 | * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure |
| 363 | * that modification of the page is safe. If write_ok is false then we |
| 364 | * will not return the page until it is not undergoing active I/O. |
| 365 | * |
| 366 | * The passed-in xid is used only for error reporting, and may be |
| 367 | * InvalidTransactionId if no specific xid is associated with the action. |
| 368 | * |
| 369 | * Return value is the shared-buffer slot number now holding the page. |
| 370 | * The buffer's LRU access info is updated. |
| 371 | * |
| 372 | * Control lock must be held at entry, and will be held at exit. |
| 373 | */ |
int
SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
				  TransactionId xid)
{
	SlruShared	shared = ctl->shared;

	/* Outer loop handles restart if we must wait for someone else's I/O */
	for (;;)
	{
		int			slotno;
		bool		ok;

		/* See if page already is in memory; if not, pick victim slot */
		slotno = SlruSelectLRUPage(ctl, pageno);

		/* Did we find the page in memory? */
		if (shared->page_number[slotno] == pageno &&
			shared->page_status[slotno] != SLRU_PAGE_EMPTY)
		{
			/*
			 * If page is still being read in, we must wait for I/O.  Likewise
			 * if the page is being written and the caller said that's not OK.
			 */
			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
				(shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
				 !write_ok))
			{
				SimpleLruWaitIO(ctl, slotno);
				/* Now we must recheck state from the top */
				continue;
			}
			/* Otherwise, it's ready to use */
			SlruRecentlyUsed(shared, slotno);
			return slotno;
		}

		/* We found no match; assert we selected a freeable slot */
		Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
			   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
				!shared->page_dirty[slotno]));

		/* Mark the slot read-busy */
		shared->page_number[slotno] = pageno;
		shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
		shared->page_dirty[slotno] = false;

		/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
		LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);

		/* Release control lock while doing I/O */
		LWLockRelease(shared->ControlLock);

		/* Do the read */
		ok = SlruPhysicalReadPage(ctl, pageno, slotno);

		/*
		 * Set the LSNs for this newly read-in page to zero (an old page
		 * cannot carry interesting LSNs; see SimpleLruZeroLSNs comments)
		 */
		SimpleLruZeroLSNs(ctl, slotno);

		/* Re-acquire control lock and update page state */
		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

		/* nobody else can have touched a read-busy slot in the meantime */
		Assert(shared->page_number[slotno] == pageno &&
			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
			   !shared->page_dirty[slotno]);

		/* On failure, revert the slot to EMPTY so it can be reused */
		shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;

		LWLockRelease(&shared->buffer_locks[slotno].lock);

		/* Now it's okay to ereport if we failed */
		if (!ok)
			SlruReportIOError(ctl, pageno, xid);

		SlruRecentlyUsed(shared, slotno);
		return slotno;
	}
}
| 451 | |
| 452 | /* |
| 453 | * Find a page in a shared buffer, reading it in if necessary. |
| 454 | * The page number must correspond to an already-initialized page. |
| 455 | * The caller must intend only read-only access to the page. |
| 456 | * |
| 457 | * The passed-in xid is used only for error reporting, and may be |
| 458 | * InvalidTransactionId if no specific xid is associated with the action. |
| 459 | * |
| 460 | * Return value is the shared-buffer slot number now holding the page. |
| 461 | * The buffer's LRU access info is updated. |
| 462 | * |
| 463 | * Control lock must NOT be held at entry, but will be held at exit. |
| 464 | * It is unspecified whether the lock will be shared or exclusive. |
| 465 | */ |
int
SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
{
	SlruShared	shared = ctl->shared;
	int			slotno;

	/* Try to find the page while holding only shared lock */
	LWLockAcquire(shared->ControlLock, LW_SHARED);

	/* See if page is already in a buffer */
	for (slotno = 0; slotno < shared->num_slots; slotno++)
	{
		if (shared->page_number[slotno] == pageno &&
			shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
		{
			/* See comments for SlruRecentlyUsed macro */
			SlruRecentlyUsed(shared, slotno);
			/* Found it; return still holding the control lock (shared) */
			return slotno;
		}
	}

	/* No luck, so switch to normal exclusive lock and do regular read */
	LWLockRelease(shared->ControlLock);
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	/* write_ok = true: a write-busy page is fine for read-only access */
	return SimpleLruReadPage(ctl, pageno, true, xid);
}
| 494 | |
| 495 | /* |
| 496 | * Write a page from a shared buffer, if necessary. |
| 497 | * Does nothing if the specified slot is not dirty. |
| 498 | * |
| 499 | * NOTE: only one write attempt is made here. Hence, it is possible that |
| 500 | * the page is still dirty at exit (if someone else re-dirtied it during |
| 501 | * the write). However, we *do* attempt a fresh write even if the page |
| 502 | * is already being written; this is for checkpoints. |
| 503 | * |
| 504 | * Control lock must be held at entry, and will be held at exit. |
| 505 | */ |
static void
SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			pageno = shared->page_number[slotno];
	bool		ok;

	/*
	 * If a write is in progress, wait for it to finish.  We must re-test
	 * page_number each time because SimpleLruWaitIO releases the control
	 * lock, so the slot could have been recycled for another page.
	 */
	while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
		   shared->page_number[slotno] == pageno)
	{
		SimpleLruWaitIO(ctl, slotno);
	}

	/*
	 * Do nothing if page is not dirty, or if buffer no longer contains the
	 * same page we were called for.
	 */
	if (!shared->page_dirty[slotno] ||
		shared->page_status[slotno] != SLRU_PAGE_VALID ||
		shared->page_number[slotno] != pageno)
		return;

	/*
	 * Mark the slot write-busy, and clear the dirtybit.  After this point, a
	 * transaction status update on this page will mark it dirty again.
	 */
	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
	shared->page_dirty[slotno] = false;

	/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
	LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);

	/* Release control lock while doing I/O */
	LWLockRelease(shared->ControlLock);

	/* Do the write */
	ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);

	/* If we failed, and we're in a flush, better close the files */
	if (!ok && fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
			CloseTransientFile(fdata->fd[i]);
	}

	/* Re-acquire control lock and update page state */
	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);

	/* nobody else can have changed a write-busy slot's identity meanwhile */
	Assert(shared->page_number[slotno] == pageno &&
		   shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);

	/* If we failed to write, mark the page dirty again */
	if (!ok)
		shared->page_dirty[slotno] = true;

	shared->page_status[slotno] = SLRU_PAGE_VALID;

	LWLockRelease(&shared->buffer_locks[slotno].lock);

	/* Now it's okay to ereport if we failed */
	if (!ok)
		SlruReportIOError(ctl, pageno, InvalidTransactionId);
}
| 572 | |
| 573 | /* |
| 574 | * Wrapper of SlruInternalWritePage, for external callers. |
| 575 | * fdata is always passed a NULL here. |
| 576 | */ |
| 577 | void |
| 578 | SimpleLruWritePage(SlruCtl ctl, int slotno) |
| 579 | { |
| 580 | SlruInternalWritePage(ctl, slotno, NULL); |
| 581 | } |
| 582 | |
| 583 | /* |
| 584 | * Return whether the given page exists on disk. |
| 585 | * |
| 586 | * A false return means that either the file does not exist, or that it's not |
| 587 | * large enough to contain the given page. |
| 588 | */ |
| 589 | bool |
| 590 | SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) |
| 591 | { |
| 592 | int segno = pageno / SLRU_PAGES_PER_SEGMENT; |
| 593 | int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; |
| 594 | int offset = rpageno * BLCKSZ; |
| 595 | char path[MAXPGPATH]; |
| 596 | int fd; |
| 597 | bool result; |
| 598 | off_t endpos; |
| 599 | |
| 600 | SlruFileName(ctl, path, segno); |
| 601 | |
| 602 | fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); |
| 603 | if (fd < 0) |
| 604 | { |
| 605 | /* expected: file doesn't exist */ |
| 606 | if (errno == ENOENT) |
| 607 | return false; |
| 608 | |
| 609 | /* report error normally */ |
| 610 | slru_errcause = SLRU_OPEN_FAILED; |
| 611 | slru_errno = errno; |
| 612 | SlruReportIOError(ctl, pageno, 0); |
| 613 | } |
| 614 | |
| 615 | if ((endpos = lseek(fd, 0, SEEK_END)) < 0) |
| 616 | { |
| 617 | slru_errcause = SLRU_SEEK_FAILED; |
| 618 | slru_errno = errno; |
| 619 | SlruReportIOError(ctl, pageno, 0); |
| 620 | } |
| 621 | |
| 622 | result = endpos >= (off_t) (offset + BLCKSZ); |
| 623 | |
| 624 | if (CloseTransientFile(fd)) |
| 625 | { |
| 626 | slru_errcause = SLRU_CLOSE_FAILED; |
| 627 | slru_errno = errno; |
| 628 | return false; |
| 629 | } |
| 630 | |
| 631 | return result; |
| 632 | } |
| 633 | |
| 634 | /* |
| 635 | * Physical read of a (previously existing) page into a buffer slot |
| 636 | * |
| 637 | * On failure, we cannot just ereport(ERROR) since caller has put state in |
| 638 | * shared memory that must be undone. So, we return false and save enough |
| 639 | * info in static variables to let SlruReportIOError make the report. |
| 640 | * |
| 641 | * For now, assume it's not worth keeping a file pointer open across |
| 642 | * read/write operations. We could cache one virtual file pointer ... |
| 643 | */ |
static bool
SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd;

	SlruFileName(ctl, path, segno);

	/*
	 * In a crash-and-restart situation, it's possible for us to receive
	 * commands to set the commit status of transactions whose bits are in
	 * already-truncated segments of the commit log (see notes in
	 * SlruPhysicalWritePage).  Hence, if we are InRecovery, allow the case
	 * where the file doesn't exist, and return zeroes instead.
	 */
	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		if (errno != ENOENT || !InRecovery)
		{
			/* save details for SlruReportIOError; we can't ereport here */
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		ereport(LOG,
				(errmsg("file \"%s\" doesn't exist, reading as zeroes",
						path)));
		MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
		return true;
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		CloseTransientFile(fd);
		return false;
	}

	/* clear errno so a short read isn't confused with a stale error code */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
	if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		pgstat_report_wait_end();
		slru_errcause = SLRU_READ_FAILED;
		slru_errno = errno;
		CloseTransientFile(fd);
		return false;
	}
	pgstat_report_wait_end();

	if (CloseTransientFile(fd))
	{
		slru_errcause = SLRU_CLOSE_FAILED;
		slru_errno = errno;
		return false;
	}

	return true;
}
| 709 | |
| 710 | /* |
| 711 | * Physical write of a page from a buffer slot |
| 712 | * |
| 713 | * On failure, we cannot just ereport(ERROR) since caller has put state in |
| 714 | * shared memory that must be undone. So, we return false and save enough |
| 715 | * info in static variables to let SlruReportIOError make the report. |
| 716 | * |
| 717 | * For now, assume it's not worth keeping a file pointer open across |
| 718 | * independent read/write operations. We do batch operations during |
| 719 | * SimpleLruFlush, though. |
| 720 | * |
| 721 | * fdata is NULL for a standalone write, pointer to open-file info during |
| 722 | * SimpleLruFlush. |
| 723 | */ |
| 724 | static bool |
| 725 | SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) |
| 726 | { |
| 727 | SlruShared shared = ctl->shared; |
| 728 | int segno = pageno / SLRU_PAGES_PER_SEGMENT; |
| 729 | int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; |
| 730 | int offset = rpageno * BLCKSZ; |
| 731 | char path[MAXPGPATH]; |
| 732 | int fd = -1; |
| 733 | |
| 734 | /* |
| 735 | * Honor the write-WAL-before-data rule, if appropriate, so that we do not |
| 736 | * write out data before associated WAL records. This is the same action |
| 737 | * performed during FlushBuffer() in the main buffer manager. |
| 738 | */ |
| 739 | if (shared->group_lsn != NULL) |
| 740 | { |
| 741 | /* |
| 742 | * We must determine the largest async-commit LSN for the page. This |
| 743 | * is a bit tedious, but since this entire function is a slow path |
| 744 | * anyway, it seems better to do this here than to maintain a per-page |
| 745 | * LSN variable (which'd need an extra comparison in the |
| 746 | * transaction-commit path). |
| 747 | */ |
| 748 | XLogRecPtr max_lsn; |
| 749 | int lsnindex, |
| 750 | lsnoff; |
| 751 | |
| 752 | lsnindex = slotno * shared->lsn_groups_per_page; |
| 753 | max_lsn = shared->group_lsn[lsnindex++]; |
| 754 | for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) |
| 755 | { |
| 756 | XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; |
| 757 | |
| 758 | if (max_lsn < this_lsn) |
| 759 | max_lsn = this_lsn; |
| 760 | } |
| 761 | |
| 762 | if (!XLogRecPtrIsInvalid(max_lsn)) |
| 763 | { |
| 764 | /* |
| 765 | * As noted above, elog(ERROR) is not acceptable here, so if |
| 766 | * XLogFlush were to fail, we must PANIC. This isn't much of a |
| 767 | * restriction because XLogFlush is just about all critical |
| 768 | * section anyway, but let's make sure. |
| 769 | */ |
| 770 | START_CRIT_SECTION(); |
| 771 | XLogFlush(max_lsn); |
| 772 | END_CRIT_SECTION(); |
| 773 | } |
| 774 | } |
| 775 | |
| 776 | /* |
| 777 | * During a Flush, we may already have the desired file open. |
| 778 | */ |
| 779 | if (fdata) |
| 780 | { |
| 781 | int i; |
| 782 | |
| 783 | for (i = 0; i < fdata->num_files; i++) |
| 784 | { |
| 785 | if (fdata->segno[i] == segno) |
| 786 | { |
| 787 | fd = fdata->fd[i]; |
| 788 | break; |
| 789 | } |
| 790 | } |
| 791 | } |
| 792 | |
| 793 | if (fd < 0) |
| 794 | { |
| 795 | /* |
| 796 | * If the file doesn't already exist, we should create it. It is |
| 797 | * possible for this to need to happen when writing a page that's not |
| 798 | * first in its segment; we assume the OS can cope with that. (Note: |
| 799 | * it might seem that it'd be okay to create files only when |
| 800 | * SimpleLruZeroPage is called for the first page of a segment. |
| 801 | * However, if after a crash and restart the REDO logic elects to |
| 802 | * replay the log from a checkpoint before the latest one, then it's |
| 803 | * possible that we will get commands to set transaction status of |
| 804 | * transactions that have already been truncated from the commit log. |
| 805 | * Easiest way to deal with that is to accept references to |
| 806 | * nonexistent files here and in SlruPhysicalReadPage.) |
| 807 | * |
| 808 | * Note: it is possible for more than one backend to be executing this |
| 809 | * code simultaneously for different pages of the same file. Hence, |
| 810 | * don't use O_EXCL or O_TRUNC or anything like that. |
| 811 | */ |
| 812 | SlruFileName(ctl, path, segno); |
| 813 | fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); |
| 814 | if (fd < 0) |
| 815 | { |
| 816 | slru_errcause = SLRU_OPEN_FAILED; |
| 817 | slru_errno = errno; |
| 818 | return false; |
| 819 | } |
| 820 | |
| 821 | if (fdata) |
| 822 | { |
| 823 | if (fdata->num_files < MAX_FLUSH_BUFFERS) |
| 824 | { |
| 825 | fdata->fd[fdata->num_files] = fd; |
| 826 | fdata->segno[fdata->num_files] = segno; |
| 827 | fdata->num_files++; |
| 828 | } |
| 829 | else |
| 830 | { |
| 831 | /* |
| 832 | * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, |
| 833 | * fall back to treating it as a standalone write. |
| 834 | */ |
| 835 | fdata = NULL; |
| 836 | } |
| 837 | } |
| 838 | } |
| 839 | |
| 840 | if (lseek(fd, (off_t) offset, SEEK_SET) < 0) |
| 841 | { |
| 842 | slru_errcause = SLRU_SEEK_FAILED; |
| 843 | slru_errno = errno; |
| 844 | if (!fdata) |
| 845 | CloseTransientFile(fd); |
| 846 | return false; |
| 847 | } |
| 848 | |
| 849 | errno = 0; |
| 850 | pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); |
| 851 | if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) |
| 852 | { |
| 853 | pgstat_report_wait_end(); |
| 854 | /* if write didn't set errno, assume problem is no disk space */ |
| 855 | if (errno == 0) |
| 856 | errno = ENOSPC; |
| 857 | slru_errcause = SLRU_WRITE_FAILED; |
| 858 | slru_errno = errno; |
| 859 | if (!fdata) |
| 860 | CloseTransientFile(fd); |
| 861 | return false; |
| 862 | } |
| 863 | pgstat_report_wait_end(); |
| 864 | |
| 865 | /* |
| 866 | * If not part of Flush, need to fsync now. We assume this happens |
| 867 | * infrequently enough that it's not a performance issue. |
| 868 | */ |
| 869 | if (!fdata) |
| 870 | { |
| 871 | pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); |
| 872 | if (ctl->do_fsync && pg_fsync(fd)) |
| 873 | { |
| 874 | pgstat_report_wait_end(); |
| 875 | slru_errcause = SLRU_FSYNC_FAILED; |
| 876 | slru_errno = errno; |
| 877 | CloseTransientFile(fd); |
| 878 | return false; |
| 879 | } |
| 880 | pgstat_report_wait_end(); |
| 881 | |
| 882 | if (CloseTransientFile(fd)) |
| 883 | { |
| 884 | slru_errcause = SLRU_CLOSE_FAILED; |
| 885 | slru_errno = errno; |
| 886 | return false; |
| 887 | } |
| 888 | } |
| 889 | |
| 890 | return true; |
| 891 | } |
| 892 | |
| 893 | /* |
| 894 | * Issue the error message after failure of SlruPhysicalReadPage or |
| 895 | * SlruPhysicalWritePage. Call this after cleaning up shared-memory state. |
| 896 | */ |
| 897 | static void |
| 898 | SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) |
| 899 | { |
| 900 | int segno = pageno / SLRU_PAGES_PER_SEGMENT; |
| 901 | int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; |
| 902 | int offset = rpageno * BLCKSZ; |
| 903 | char path[MAXPGPATH]; |
| 904 | |
| 905 | SlruFileName(ctl, path, segno); |
| 906 | errno = slru_errno; |
| 907 | switch (slru_errcause) |
| 908 | { |
| 909 | case SLRU_OPEN_FAILED: |
| 910 | ereport(ERROR, |
| 911 | (errcode_for_file_access(), |
| 912 | errmsg("could not access status of transaction %u" , xid), |
| 913 | errdetail("Could not open file \"%s\": %m." , path))); |
| 914 | break; |
| 915 | case SLRU_SEEK_FAILED: |
| 916 | ereport(ERROR, |
| 917 | (errcode_for_file_access(), |
| 918 | errmsg("could not access status of transaction %u" , xid), |
| 919 | errdetail("Could not seek in file \"%s\" to offset %u: %m." , |
| 920 | path, offset))); |
| 921 | break; |
| 922 | case SLRU_READ_FAILED: |
| 923 | if (errno) |
| 924 | ereport(ERROR, |
| 925 | (errcode_for_file_access(), |
| 926 | errmsg("could not access status of transaction %u" , xid), |
| 927 | errdetail("Could not read from file \"%s\" at offset %u: %m." , |
| 928 | path, offset))); |
| 929 | else |
| 930 | ereport(ERROR, |
| 931 | (errmsg("could not access status of transaction %u" , xid), |
| 932 | errdetail("Could not read from file \"%s\" at offset %u: read too few bytes." , path, offset))); |
| 933 | break; |
| 934 | case SLRU_WRITE_FAILED: |
| 935 | if (errno) |
| 936 | ereport(ERROR, |
| 937 | (errcode_for_file_access(), |
| 938 | errmsg("could not access status of transaction %u" , xid), |
| 939 | errdetail("Could not write to file \"%s\" at offset %u: %m." , |
| 940 | path, offset))); |
| 941 | else |
| 942 | ereport(ERROR, |
| 943 | (errmsg("could not access status of transaction %u" , xid), |
| 944 | errdetail("Could not write to file \"%s\" at offset %u: wrote too few bytes." , |
| 945 | path, offset))); |
| 946 | break; |
| 947 | case SLRU_FSYNC_FAILED: |
| 948 | ereport(data_sync_elevel(ERROR), |
| 949 | (errcode_for_file_access(), |
| 950 | errmsg("could not access status of transaction %u" , xid), |
| 951 | errdetail("Could not fsync file \"%s\": %m." , |
| 952 | path))); |
| 953 | break; |
| 954 | case SLRU_CLOSE_FAILED: |
| 955 | ereport(ERROR, |
| 956 | (errcode_for_file_access(), |
| 957 | errmsg("could not access status of transaction %u" , xid), |
| 958 | errdetail("Could not close file \"%s\": %m." , |
| 959 | path))); |
| 960 | break; |
| 961 | default: |
| 962 | /* can't get here, we trust */ |
| 963 | elog(ERROR, "unrecognized SimpleLru error cause: %d" , |
| 964 | (int) slru_errcause); |
| 965 | break; |
| 966 | } |
| 967 | } |
| 968 | |
| 969 | /* |
| 970 | * Select the slot to re-use when we need a free slot. |
| 971 | * |
| 972 | * The target page number is passed because we need to consider the |
| 973 | * possibility that some other process reads in the target page while |
| 974 | * we are doing I/O to free a slot. Hence, check or recheck to see if |
| 975 | * any slot already holds the target page, and return that slot if so. |
| 976 | * Thus, the returned slot is *either* a slot already holding the pageno |
| 977 | * (could be any state except EMPTY), *or* a freeable slot (state EMPTY |
| 978 | * or CLEAN). |
| 979 | * |
| 980 | * Control lock must be held at entry, and will be held at exit. |
| 981 | */ |
| 982 | static int |
| 983 | SlruSelectLRUPage(SlruCtl ctl, int pageno) |
| 984 | { |
| 985 | SlruShared shared = ctl->shared; |
| 986 | |
| 987 | /* Outer loop handles restart after I/O */ |
| 988 | for (;;) |
| 989 | { |
| 990 | int slotno; |
| 991 | int cur_count; |
| 992 | int bestvalidslot = 0; /* keep compiler quiet */ |
| 993 | int best_valid_delta = -1; |
| 994 | int best_valid_page_number = 0; /* keep compiler quiet */ |
| 995 | int bestinvalidslot = 0; /* keep compiler quiet */ |
| 996 | int best_invalid_delta = -1; |
| 997 | int best_invalid_page_number = 0; /* keep compiler quiet */ |
| 998 | |
| 999 | /* See if page already has a buffer assigned */ |
| 1000 | for (slotno = 0; slotno < shared->num_slots; slotno++) |
| 1001 | { |
| 1002 | if (shared->page_number[slotno] == pageno && |
| 1003 | shared->page_status[slotno] != SLRU_PAGE_EMPTY) |
| 1004 | return slotno; |
| 1005 | } |
| 1006 | |
| 1007 | /* |
| 1008 | * If we find any EMPTY slot, just select that one. Else choose a |
| 1009 | * victim page to replace. We normally take the least recently used |
| 1010 | * valid page, but we will never take the slot containing |
| 1011 | * latest_page_number, even if it appears least recently used. We |
| 1012 | * will select a slot that is already I/O busy only if there is no |
| 1013 | * other choice: a read-busy slot will not be least recently used once |
| 1014 | * the read finishes, and waiting for an I/O on a write-busy slot is |
| 1015 | * inferior to just picking some other slot. Testing shows the slot |
| 1016 | * we pick instead will often be clean, allowing us to begin a read at |
| 1017 | * once. |
| 1018 | * |
| 1019 | * Normally the page_lru_count values will all be different and so |
| 1020 | * there will be a well-defined LRU page. But since we allow |
| 1021 | * concurrent execution of SlruRecentlyUsed() within |
| 1022 | * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages |
| 1023 | * acquire the same lru_count values. In that case we break ties by |
| 1024 | * choosing the furthest-back page. |
| 1025 | * |
| 1026 | * Notice that this next line forcibly advances cur_lru_count to a |
| 1027 | * value that is certainly beyond any value that will be in the |
| 1028 | * page_lru_count array after the loop finishes. This ensures that |
| 1029 | * the next execution of SlruRecentlyUsed will mark the page newly |
| 1030 | * used, even if it's for a page that has the current counter value. |
| 1031 | * That gets us back on the path to having good data when there are |
| 1032 | * multiple pages with the same lru_count. |
| 1033 | */ |
| 1034 | cur_count = (shared->cur_lru_count)++; |
| 1035 | for (slotno = 0; slotno < shared->num_slots; slotno++) |
| 1036 | { |
| 1037 | int this_delta; |
| 1038 | int this_page_number; |
| 1039 | |
| 1040 | if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) |
| 1041 | return slotno; |
| 1042 | this_delta = cur_count - shared->page_lru_count[slotno]; |
| 1043 | if (this_delta < 0) |
| 1044 | { |
| 1045 | /* |
| 1046 | * Clean up in case shared updates have caused cur_count |
| 1047 | * increments to get "lost". We back off the page counts, |
| 1048 | * rather than trying to increase cur_count, to avoid any |
| 1049 | * question of infinite loops or failure in the presence of |
| 1050 | * wrapped-around counts. |
| 1051 | */ |
| 1052 | shared->page_lru_count[slotno] = cur_count; |
| 1053 | this_delta = 0; |
| 1054 | } |
| 1055 | this_page_number = shared->page_number[slotno]; |
| 1056 | if (this_page_number == shared->latest_page_number) |
| 1057 | continue; |
| 1058 | if (shared->page_status[slotno] == SLRU_PAGE_VALID) |
| 1059 | { |
| 1060 | if (this_delta > best_valid_delta || |
| 1061 | (this_delta == best_valid_delta && |
| 1062 | ctl->PagePrecedes(this_page_number, |
| 1063 | best_valid_page_number))) |
| 1064 | { |
| 1065 | bestvalidslot = slotno; |
| 1066 | best_valid_delta = this_delta; |
| 1067 | best_valid_page_number = this_page_number; |
| 1068 | } |
| 1069 | } |
| 1070 | else |
| 1071 | { |
| 1072 | if (this_delta > best_invalid_delta || |
| 1073 | (this_delta == best_invalid_delta && |
| 1074 | ctl->PagePrecedes(this_page_number, |
| 1075 | best_invalid_page_number))) |
| 1076 | { |
| 1077 | bestinvalidslot = slotno; |
| 1078 | best_invalid_delta = this_delta; |
| 1079 | best_invalid_page_number = this_page_number; |
| 1080 | } |
| 1081 | } |
| 1082 | } |
| 1083 | |
| 1084 | /* |
| 1085 | * If all pages (except possibly the latest one) are I/O busy, we'll |
| 1086 | * have to wait for an I/O to complete and then retry. In that |
| 1087 | * unhappy case, we choose to wait for the I/O on the least recently |
| 1088 | * used slot, on the assumption that it was likely initiated first of |
| 1089 | * all the I/Os in progress and may therefore finish first. |
| 1090 | */ |
| 1091 | if (best_valid_delta < 0) |
| 1092 | { |
| 1093 | SimpleLruWaitIO(ctl, bestinvalidslot); |
| 1094 | continue; |
| 1095 | } |
| 1096 | |
| 1097 | /* |
| 1098 | * If the selected page is clean, we're set. |
| 1099 | */ |
| 1100 | if (!shared->page_dirty[bestvalidslot]) |
| 1101 | return bestvalidslot; |
| 1102 | |
| 1103 | /* |
| 1104 | * Write the page. |
| 1105 | */ |
| 1106 | SlruInternalWritePage(ctl, bestvalidslot, NULL); |
| 1107 | |
| 1108 | /* |
| 1109 | * Now loop back and try again. This is the easiest way of dealing |
| 1110 | * with corner cases such as the victim page being re-dirtied while we |
| 1111 | * wrote it. |
| 1112 | */ |
| 1113 | } |
| 1114 | } |
| 1115 | |
| 1116 | /* |
| 1117 | * Flush dirty pages to disk during checkpoint or database shutdown |
| 1118 | */ |
| 1119 | void |
| 1120 | SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) |
| 1121 | { |
| 1122 | SlruShared shared = ctl->shared; |
| 1123 | SlruFlushData fdata; |
| 1124 | int slotno; |
| 1125 | int pageno = 0; |
| 1126 | int i; |
| 1127 | bool ok; |
| 1128 | |
| 1129 | /* |
| 1130 | * Find and write dirty pages |
| 1131 | */ |
| 1132 | fdata.num_files = 0; |
| 1133 | |
| 1134 | LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); |
| 1135 | |
| 1136 | for (slotno = 0; slotno < shared->num_slots; slotno++) |
| 1137 | { |
| 1138 | SlruInternalWritePage(ctl, slotno, &fdata); |
| 1139 | |
| 1140 | /* |
| 1141 | * In some places (e.g. checkpoints), we cannot assert that the slot |
| 1142 | * is clean now, since another process might have re-dirtied it |
| 1143 | * already. That's okay. |
| 1144 | */ |
| 1145 | Assert(allow_redirtied || |
| 1146 | shared->page_status[slotno] == SLRU_PAGE_EMPTY || |
| 1147 | (shared->page_status[slotno] == SLRU_PAGE_VALID && |
| 1148 | !shared->page_dirty[slotno])); |
| 1149 | } |
| 1150 | |
| 1151 | LWLockRelease(shared->ControlLock); |
| 1152 | |
| 1153 | /* |
| 1154 | * Now fsync and close any files that were open |
| 1155 | */ |
| 1156 | ok = true; |
| 1157 | for (i = 0; i < fdata.num_files; i++) |
| 1158 | { |
| 1159 | pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); |
| 1160 | if (ctl->do_fsync && pg_fsync(fdata.fd[i])) |
| 1161 | { |
| 1162 | slru_errcause = SLRU_FSYNC_FAILED; |
| 1163 | slru_errno = errno; |
| 1164 | pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; |
| 1165 | ok = false; |
| 1166 | } |
| 1167 | pgstat_report_wait_end(); |
| 1168 | |
| 1169 | if (CloseTransientFile(fdata.fd[i])) |
| 1170 | { |
| 1171 | slru_errcause = SLRU_CLOSE_FAILED; |
| 1172 | slru_errno = errno; |
| 1173 | pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; |
| 1174 | ok = false; |
| 1175 | } |
| 1176 | } |
| 1177 | if (!ok) |
| 1178 | SlruReportIOError(ctl, pageno, InvalidTransactionId); |
| 1179 | } |
| 1180 | |
| 1181 | /* |
| 1182 | * Remove all segments before the one holding the passed page number |
| 1183 | */ |
| 1184 | void |
| 1185 | SimpleLruTruncate(SlruCtl ctl, int cutoffPage) |
| 1186 | { |
| 1187 | SlruShared shared = ctl->shared; |
| 1188 | int slotno; |
| 1189 | |
| 1190 | /* |
| 1191 | * The cutoff point is the start of the segment containing cutoffPage. |
| 1192 | */ |
| 1193 | cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT; |
| 1194 | |
| 1195 | /* |
| 1196 | * Scan shared memory and remove any pages preceding the cutoff page, to |
| 1197 | * ensure we won't rewrite them later. (Since this is normally called in |
| 1198 | * or just after a checkpoint, any dirty pages should have been flushed |
| 1199 | * already ... we're just being extra careful here.) |
| 1200 | */ |
| 1201 | LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); |
| 1202 | |
| 1203 | restart:; |
| 1204 | |
| 1205 | /* |
| 1206 | * While we are holding the lock, make an important safety check: the |
| 1207 | * planned cutoff point must be <= the current endpoint page. Otherwise we |
| 1208 | * have already wrapped around, and proceeding with the truncation would |
| 1209 | * risk removing the current segment. |
| 1210 | */ |
| 1211 | if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) |
| 1212 | { |
| 1213 | LWLockRelease(shared->ControlLock); |
| 1214 | ereport(LOG, |
| 1215 | (errmsg("could not truncate directory \"%s\": apparent wraparound" , |
| 1216 | ctl->Dir))); |
| 1217 | return; |
| 1218 | } |
| 1219 | |
| 1220 | for (slotno = 0; slotno < shared->num_slots; slotno++) |
| 1221 | { |
| 1222 | if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) |
| 1223 | continue; |
| 1224 | if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) |
| 1225 | continue; |
| 1226 | |
| 1227 | /* |
| 1228 | * If page is clean, just change state to EMPTY (expected case). |
| 1229 | */ |
| 1230 | if (shared->page_status[slotno] == SLRU_PAGE_VALID && |
| 1231 | !shared->page_dirty[slotno]) |
| 1232 | { |
| 1233 | shared->page_status[slotno] = SLRU_PAGE_EMPTY; |
| 1234 | continue; |
| 1235 | } |
| 1236 | |
| 1237 | /* |
| 1238 | * Hmm, we have (or may have) I/O operations acting on the page, so |
| 1239 | * we've got to wait for them to finish and then start again. This is |
| 1240 | * the same logic as in SlruSelectLRUPage. (XXX if page is dirty, |
| 1241 | * wouldn't it be OK to just discard it without writing it? For now, |
| 1242 | * keep the logic the same as it was.) |
| 1243 | */ |
| 1244 | if (shared->page_status[slotno] == SLRU_PAGE_VALID) |
| 1245 | SlruInternalWritePage(ctl, slotno, NULL); |
| 1246 | else |
| 1247 | SimpleLruWaitIO(ctl, slotno); |
| 1248 | goto restart; |
| 1249 | } |
| 1250 | |
| 1251 | LWLockRelease(shared->ControlLock); |
| 1252 | |
| 1253 | /* Now we can remove the old segment(s) */ |
| 1254 | (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); |
| 1255 | } |
| 1256 | |
| 1257 | /* |
| 1258 | * Delete an individual SLRU segment, identified by the filename. |
| 1259 | * |
| 1260 | * NB: This does not touch the SLRU buffers themselves, callers have to ensure |
| 1261 | * they either can't yet contain anything, or have already been cleaned out. |
| 1262 | */ |
| 1263 | static void |
| 1264 | SlruInternalDeleteSegment(SlruCtl ctl, char *filename) |
| 1265 | { |
| 1266 | char path[MAXPGPATH]; |
| 1267 | |
| 1268 | snprintf(path, MAXPGPATH, "%s/%s" , ctl->Dir, filename); |
| 1269 | ereport(DEBUG2, |
| 1270 | (errmsg("removing file \"%s\"" , path))); |
| 1271 | unlink(path); |
| 1272 | } |
| 1273 | |
| 1274 | /* |
| 1275 | * Delete an individual SLRU segment, identified by the segment number. |
| 1276 | */ |
| 1277 | void |
| 1278 | SlruDeleteSegment(SlruCtl ctl, int segno) |
| 1279 | { |
| 1280 | SlruShared shared = ctl->shared; |
| 1281 | int slotno; |
| 1282 | char path[MAXPGPATH]; |
| 1283 | bool did_write; |
| 1284 | |
| 1285 | /* Clean out any possibly existing references to the segment. */ |
| 1286 | LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); |
| 1287 | restart: |
| 1288 | did_write = false; |
| 1289 | for (slotno = 0; slotno < shared->num_slots; slotno++) |
| 1290 | { |
| 1291 | int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; |
| 1292 | |
| 1293 | if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) |
| 1294 | continue; |
| 1295 | |
| 1296 | /* not the segment we're looking for */ |
| 1297 | if (pagesegno != segno) |
| 1298 | continue; |
| 1299 | |
| 1300 | /* If page is clean, just change state to EMPTY (expected case). */ |
| 1301 | if (shared->page_status[slotno] == SLRU_PAGE_VALID && |
| 1302 | !shared->page_dirty[slotno]) |
| 1303 | { |
| 1304 | shared->page_status[slotno] = SLRU_PAGE_EMPTY; |
| 1305 | continue; |
| 1306 | } |
| 1307 | |
| 1308 | /* Same logic as SimpleLruTruncate() */ |
| 1309 | if (shared->page_status[slotno] == SLRU_PAGE_VALID) |
| 1310 | SlruInternalWritePage(ctl, slotno, NULL); |
| 1311 | else |
| 1312 | SimpleLruWaitIO(ctl, slotno); |
| 1313 | |
| 1314 | did_write = true; |
| 1315 | } |
| 1316 | |
| 1317 | /* |
| 1318 | * Be extra careful and re-check. The IO functions release the control |
| 1319 | * lock, so new pages could have been read in. |
| 1320 | */ |
| 1321 | if (did_write) |
| 1322 | goto restart; |
| 1323 | |
| 1324 | snprintf(path, MAXPGPATH, "%s/%04X" , ctl->Dir, segno); |
| 1325 | ereport(DEBUG2, |
| 1326 | (errmsg("removing file \"%s\"" , path))); |
| 1327 | unlink(path); |
| 1328 | |
| 1329 | LWLockRelease(shared->ControlLock); |
| 1330 | } |
| 1331 | |
| 1332 | /* |
| 1333 | * SlruScanDirectory callback |
| 1334 | * This callback reports true if there's any segment prior to the one |
| 1335 | * containing the page passed as "data". |
| 1336 | */ |
| 1337 | bool |
| 1338 | SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data) |
| 1339 | { |
| 1340 | int cutoffPage = *(int *) data; |
| 1341 | |
| 1342 | cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT; |
| 1343 | |
| 1344 | if (ctl->PagePrecedes(segpage, cutoffPage)) |
| 1345 | return true; /* found one; don't iterate any more */ |
| 1346 | |
| 1347 | return false; /* keep going */ |
| 1348 | } |
| 1349 | |
| 1350 | /* |
| 1351 | * SlruScanDirectory callback. |
| 1352 | * This callback deletes segments prior to the one passed in as "data". |
| 1353 | */ |
| 1354 | static bool |
| 1355 | SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) |
| 1356 | { |
| 1357 | int cutoffPage = *(int *) data; |
| 1358 | |
| 1359 | if (ctl->PagePrecedes(segpage, cutoffPage)) |
| 1360 | SlruInternalDeleteSegment(ctl, filename); |
| 1361 | |
| 1362 | return false; /* keep going */ |
| 1363 | } |
| 1364 | |
| 1365 | /* |
| 1366 | * SlruScanDirectory callback. |
| 1367 | * This callback deletes all segments. |
| 1368 | */ |
| 1369 | bool |
| 1370 | SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) |
| 1371 | { |
| 1372 | SlruInternalDeleteSegment(ctl, filename); |
| 1373 | |
| 1374 | return false; /* keep going */ |
| 1375 | } |
| 1376 | |
| 1377 | /* |
| 1378 | * Scan the SimpleLRU directory and apply a callback to each file found in it. |
| 1379 | * |
| 1380 | * If the callback returns true, the scan is stopped. The last return value |
| 1381 | * from the callback is returned. |
| 1382 | * |
| 1383 | * The callback receives the following arguments: 1. the SlruCtl struct for the |
| 1384 | * slru being truncated; 2. the filename being considered; 3. the page number |
| 1385 | * for the first page of that file; 4. a pointer to the opaque data given to us |
| 1386 | * by the caller. |
| 1387 | * |
| 1388 | * Note that the ordering in which the directory is scanned is not guaranteed. |
| 1389 | * |
| 1390 | * Note that no locking is applied. |
| 1391 | */ |
| 1392 | bool |
| 1393 | SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) |
| 1394 | { |
| 1395 | bool retval = false; |
| 1396 | DIR *cldir; |
| 1397 | struct dirent *clde; |
| 1398 | int segno; |
| 1399 | int segpage; |
| 1400 | |
| 1401 | cldir = AllocateDir(ctl->Dir); |
| 1402 | while ((clde = ReadDir(cldir, ctl->Dir)) != NULL) |
| 1403 | { |
| 1404 | size_t len; |
| 1405 | |
| 1406 | len = strlen(clde->d_name); |
| 1407 | |
| 1408 | if ((len == 4 || len == 5 || len == 6) && |
| 1409 | strspn(clde->d_name, "0123456789ABCDEF" ) == len) |
| 1410 | { |
| 1411 | segno = (int) strtol(clde->d_name, NULL, 16); |
| 1412 | segpage = segno * SLRU_PAGES_PER_SEGMENT; |
| 1413 | |
| 1414 | elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s" , |
| 1415 | ctl->Dir, clde->d_name); |
| 1416 | retval = callback(ctl, clde->d_name, segpage, data); |
| 1417 | if (retval) |
| 1418 | break; |
| 1419 | } |
| 1420 | } |
| 1421 | FreeDir(cldir); |
| 1422 | |
| 1423 | return retval; |
| 1424 | } |
| 1425 | |