1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * xlogutils.c |
4 | * |
5 | * PostgreSQL write-ahead log manager utility routines |
6 | * |
7 | * This file contains support routines that are used by XLOG replay functions. |
8 | * None of this code is used during normal system operation. |
9 | * |
10 | * |
11 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
12 | * Portions Copyright (c) 1994, Regents of the University of California |
13 | * |
14 | * src/backend/access/transam/xlogutils.c |
15 | * |
16 | *------------------------------------------------------------------------- |
17 | */ |
18 | #include "postgres.h" |
19 | |
20 | #include <unistd.h> |
21 | |
22 | #include "access/timeline.h" |
23 | #include "access/xlog.h" |
24 | #include "access/xlog_internal.h" |
25 | #include "access/xlogutils.h" |
26 | #include "miscadmin.h" |
27 | #include "pgstat.h" |
28 | #include "storage/smgr.h" |
29 | #include "utils/guc.h" |
30 | #include "utils/hsearch.h" |
31 | #include "utils/rel.h" |
32 | |
33 | |
34 | /* |
35 | * During XLOG replay, we may see XLOG records for incremental updates of |
36 | * pages that no longer exist, because their relation was later dropped or |
37 | * truncated. (Note: this is only possible when full_page_writes = OFF, |
38 | * since when it's ON, the first reference we see to a page should always |
39 | * be a full-page rewrite not an incremental update.) Rather than simply |
40 | * ignoring such records, we make a note of the referenced page, and then |
41 | * complain if we don't actually see a drop or truncate covering the page |
42 | * later in replay. |
43 | */ |
44 | typedef struct xl_invalid_page_key |
45 | { |
46 | RelFileNode node; /* the relation */ |
47 | ForkNumber forkno; /* the fork number */ |
48 | BlockNumber blkno; /* the page */ |
49 | } xl_invalid_page_key; |
50 | |
51 | typedef struct xl_invalid_page |
52 | { |
53 | xl_invalid_page_key key; /* hash key ... must be first */ |
54 | bool present; /* page existed but contained zeroes */ |
55 | } xl_invalid_page; |
56 | |
57 | static HTAB *invalid_page_tab = NULL; |
58 | |
59 | |
60 | /* Report a reference to an invalid page */ |
61 | static void |
62 | report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno, |
63 | BlockNumber blkno, bool present) |
64 | { |
65 | char *path = relpathperm(node, forkno); |
66 | |
67 | if (present) |
elog(elevel, "page %u of relation %s is uninitialized",
blkno, path);
70 | else |
elog(elevel, "page %u of relation %s does not exist",
blkno, path);
73 | pfree(path); |
74 | } |
75 | |
76 | /* Log a reference to an invalid page */ |
77 | static void |
78 | log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno, |
79 | bool present) |
80 | { |
81 | xl_invalid_page_key key; |
82 | xl_invalid_page *hentry; |
83 | bool found; |
84 | |
85 | /* |
86 | * Once recovery has reached a consistent state, the invalid-page table |
87 | * should be empty and remain so. If a reference to an invalid page is |
88 | * found after consistency is reached, PANIC immediately. This might seem |
89 | * aggressive, but it's better than letting the invalid reference linger |
90 | * in the hash table until the end of recovery and PANIC there, which |
91 | * might come only much later if this is a standby server. |
92 | */ |
93 | if (reachedConsistency) |
94 | { |
95 | report_invalid_page(WARNING, node, forkno, blkno, present); |
elog(PANIC, "WAL contains references to invalid pages");
97 | } |
98 | |
99 | /* |
100 | * Log references to invalid pages at DEBUG1 level. This allows some |
101 | * tracing of the cause (note the elog context mechanism will tell us |
102 | * something about the XLOG record that generated the reference). |
103 | */ |
104 | if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1) |
105 | report_invalid_page(DEBUG1, node, forkno, blkno, present); |
106 | |
107 | if (invalid_page_tab == NULL) |
108 | { |
109 | /* create hash table when first needed */ |
110 | HASHCTL ctl; |
111 | |
112 | memset(&ctl, 0, sizeof(ctl)); |
113 | ctl.keysize = sizeof(xl_invalid_page_key); |
114 | ctl.entrysize = sizeof(xl_invalid_page); |
115 | |
invalid_page_tab = hash_create("XLOG invalid-page table",
100,
&ctl,
HASH_ELEM | HASH_BLOBS);
120 | } |
121 | |
122 | /* we currently assume xl_invalid_page_key contains no padding */ |
123 | key.node = node; |
124 | key.forkno = forkno; |
125 | key.blkno = blkno; |
126 | hentry = (xl_invalid_page *) |
127 | hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found); |
128 | |
129 | if (!found) |
130 | { |
131 | /* hash_search already filled in the key */ |
132 | hentry->present = present; |
133 | } |
134 | else |
135 | { |
136 | /* repeat reference ... leave "present" as it was */ |
137 | } |
138 | } |
139 | |
140 | /* Forget any invalid pages >= minblkno, because they've been dropped */ |
141 | static void |
142 | forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno) |
143 | { |
144 | HASH_SEQ_STATUS status; |
145 | xl_invalid_page *hentry; |
146 | |
147 | if (invalid_page_tab == NULL) |
148 | return; /* nothing to do */ |
149 | |
150 | hash_seq_init(&status, invalid_page_tab); |
151 | |
152 | while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) |
153 | { |
154 | if (RelFileNodeEquals(hentry->key.node, node) && |
155 | hentry->key.forkno == forkno && |
156 | hentry->key.blkno >= minblkno) |
157 | { |
158 | if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2) |
159 | { |
160 | char *path = relpathperm(hentry->key.node, forkno); |
161 | |
elog(DEBUG2, "page %u of relation %s has been dropped",
hentry->key.blkno, path);
164 | pfree(path); |
165 | } |
166 | |
167 | if (hash_search(invalid_page_tab, |
168 | (void *) &hentry->key, |
169 | HASH_REMOVE, NULL) == NULL) |
elog(ERROR, "hash table corrupted");
171 | } |
172 | } |
173 | } |
174 | |
175 | /* Forget any invalid pages in a whole database */ |
176 | static void |
177 | forget_invalid_pages_db(Oid dbid) |
178 | { |
179 | HASH_SEQ_STATUS status; |
180 | xl_invalid_page *hentry; |
181 | |
182 | if (invalid_page_tab == NULL) |
183 | return; /* nothing to do */ |
184 | |
185 | hash_seq_init(&status, invalid_page_tab); |
186 | |
187 | while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) |
188 | { |
189 | if (hentry->key.node.dbNode == dbid) |
190 | { |
191 | if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2) |
192 | { |
193 | char *path = relpathperm(hentry->key.node, hentry->key.forkno); |
194 | |
elog(DEBUG2, "page %u of relation %s has been dropped",
hentry->key.blkno, path);
197 | pfree(path); |
198 | } |
199 | |
200 | if (hash_search(invalid_page_tab, |
201 | (void *) &hentry->key, |
202 | HASH_REMOVE, NULL) == NULL) |
elog(ERROR, "hash table corrupted");
204 | } |
205 | } |
206 | } |
207 | |
208 | /* Are there any unresolved references to invalid pages? */ |
209 | bool |
210 | XLogHaveInvalidPages(void) |
211 | { |
212 | if (invalid_page_tab != NULL && |
213 | hash_get_num_entries(invalid_page_tab) > 0) |
214 | return true; |
215 | return false; |
216 | } |
217 | |
218 | /* Complain about any remaining invalid-page entries */ |
219 | void |
220 | XLogCheckInvalidPages(void) |
221 | { |
222 | HASH_SEQ_STATUS status; |
223 | xl_invalid_page *hentry; |
224 | bool foundone = false; |
225 | |
226 | if (invalid_page_tab == NULL) |
227 | return; /* nothing to do */ |
228 | |
229 | hash_seq_init(&status, invalid_page_tab); |
230 | |
231 | /* |
232 | * Our strategy is to emit WARNING messages for all remaining entries and |
233 | * only PANIC after we've dumped all the available info. |
234 | */ |
235 | while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) |
236 | { |
237 | report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno, |
238 | hentry->key.blkno, hentry->present); |
239 | foundone = true; |
240 | } |
241 | |
242 | if (foundone) |
elog(PANIC, "WAL contains references to invalid pages");
244 | |
245 | hash_destroy(invalid_page_tab); |
246 | invalid_page_tab = NULL; |
247 | } |
248 | |
249 | |
250 | /* |
251 | * XLogReadBufferForRedo |
252 | * Read a page during XLOG replay |
253 | * |
254 | * Reads a block referenced by a WAL record into shared buffer cache, and |
255 | * determines what needs to be done to redo the changes to it. If the WAL |
256 | * record includes a full-page image of the page, it is restored. |
257 | * |
258 | * 'lsn' is the LSN of the record being replayed. It is compared with the |
259 | * page's LSN to determine if the record has already been replayed. |
260 | * 'block_id' is the ID number the block was registered with, when the WAL |
261 | * record was created. |
262 | * |
263 | * Returns one of the following: |
264 | * |
265 | * BLK_NEEDS_REDO - changes from the WAL record need to be applied |
266 | * BLK_DONE - block doesn't need replaying |
267 | * BLK_RESTORED - block was restored from a full-page image included in |
268 | * the record |
269 | * BLK_NOTFOUND - block was not found (because it was truncated away by |
270 | * an operation later in the WAL stream) |
271 | * |
272 | * On return, the buffer is locked in exclusive-mode, and returned in *buf. |
273 | * Note that the buffer is locked and returned even if it doesn't need |
274 | * replaying. (Getting the buffer lock is not really necessary during |
275 | * single-process crash recovery, but some subroutines such as MarkBufferDirty |
276 | * will complain if we don't have the lock. In hot standby mode it's |
277 | * definitely necessary.) |
278 | * |
279 | * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag |
280 | * set, we restore it, even if the page in the database appears newer. This |
281 | * is to protect ourselves against database pages that were partially or |
282 | * incorrectly written during a crash. We assume that the XLOG data must be |
283 | * good because it has passed a CRC check, while the database page might not |
284 | * be. This will force us to replay all subsequent modifications of the page |
285 | * that appear in XLOG, rather than possibly ignoring them as already |
286 | * applied, but that's not a huge drawback. |
287 | */ |
288 | XLogRedoAction |
289 | XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, |
290 | Buffer *buf) |
291 | { |
292 | return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL, |
293 | false, buf); |
294 | } |
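
/*
 * A minimal usage sketch: a typical redo routine for a record that updates
 * one registered block might look roughly like this (block id 0 and the
 * "apply the change" step are placeholders, not taken from any particular
 * redo routine):
 *
 *		XLogRecPtr	lsn = record->EndRecPtr;
 *		Buffer		buf;
 *
 *		if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
 *		{
 *			Page		page = BufferGetPage(buf);
 *
 *			... apply the change described by the WAL record to page ...
 *			PageSetLSN(page, lsn);
 *			MarkBufferDirty(buf);
 *		}
 *		if (BufferIsValid(buf))
 *			UnlockReleaseBuffer(buf);
 */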
295 | |
296 | /* |
297 | * Pin and lock a buffer referenced by a WAL record, for the purpose of |
298 | * re-initializing it. |
299 | */ |
300 | Buffer |
301 | XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) |
302 | { |
303 | Buffer buf; |
304 | |
305 | XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false, |
306 | &buf); |
307 | return buf; |
308 | } |
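
/*
 * A minimal sketch of the companion pattern for records that rewrite a page
 * from scratch (block id 0 and the PageInit() call are placeholders; real
 * callers initialize the page according to their own layout):
 *
 *		buf = XLogInitBufferForRedo(record, 0);
 *		page = (Page) BufferGetPage(buf);
 *		PageInit(page, BufferGetPageSize(buf), 0);
 *		... fill in the new page contents from the WAL record ...
 *		PageSetLSN(page, record->EndRecPtr);
 *		MarkBufferDirty(buf);
 *		UnlockReleaseBuffer(buf);
 */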
309 | |
310 | /* |
311 | * XLogReadBufferForRedoExtended |
312 | * Like XLogReadBufferForRedo, but with extra options. |
313 | * |
314 | * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended |
315 | * with all-zeroes pages up to the referenced block number. In |
316 | * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value |
317 | * is always BLK_NEEDS_REDO. |
318 | * |
319 | * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock |
320 | * parameter. Do not use an inconsistent combination!) |
321 | * |
322 | * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer |
323 | * using LockBufferForCleanup(), instead of a regular exclusive lock. |
324 | */ |
325 | XLogRedoAction |
326 | XLogReadBufferForRedoExtended(XLogReaderState *record, |
327 | uint8 block_id, |
328 | ReadBufferMode mode, bool get_cleanup_lock, |
329 | Buffer *buf) |
330 | { |
331 | XLogRecPtr lsn = record->EndRecPtr; |
332 | RelFileNode rnode; |
333 | ForkNumber forknum; |
334 | BlockNumber blkno; |
335 | Page page; |
336 | bool zeromode; |
337 | bool willinit; |
338 | |
339 | if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) |
340 | { |
341 | /* Caller specified a bogus block_id */ |
elog(PANIC, "failed to locate backup block with ID %d", block_id);
343 | } |
344 | |
345 | /* |
346 | * Make sure that if the block is marked with WILL_INIT, the caller is |
347 | * going to initialize it. And vice versa. |
348 | */ |
349 | zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); |
350 | willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; |
351 | if (willinit && !zeromode) |
elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
353 | if (!willinit && zeromode) |
elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
355 | |
356 | /* If it has a full-page image and it should be restored, do it. */ |
357 | if (XLogRecBlockImageApply(record, block_id)) |
358 | { |
359 | Assert(XLogRecHasBlockImage(record, block_id)); |
360 | *buf = XLogReadBufferExtended(rnode, forknum, blkno, |
361 | get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); |
362 | page = BufferGetPage(*buf); |
363 | if (!RestoreBlockImage(record, block_id, page)) |
elog(ERROR, "failed to restore block image");
365 | |
366 | /* |
367 | * The page may be uninitialized. If so, we can't set the LSN because |
368 | * that would corrupt the page. |
369 | */ |
370 | if (!PageIsNew(page)) |
371 | { |
372 | PageSetLSN(page, lsn); |
373 | } |
374 | |
375 | MarkBufferDirty(*buf); |
376 | |
377 | /* |
378 | * At the end of crash recovery the init forks of unlogged relations |
379 | * are copied, without going through shared buffers. So we need to |
380 | * force the on-disk state of init forks to always be in sync with the |
381 | * state in shared buffers. |
382 | */ |
383 | if (forknum == INIT_FORKNUM) |
384 | FlushOneBuffer(*buf); |
385 | |
386 | return BLK_RESTORED; |
387 | } |
388 | else |
389 | { |
390 | *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode); |
391 | if (BufferIsValid(*buf)) |
392 | { |
393 | if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) |
394 | { |
395 | if (get_cleanup_lock) |
396 | LockBufferForCleanup(*buf); |
397 | else |
398 | LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); |
399 | } |
400 | if (lsn <= PageGetLSN(BufferGetPage(*buf))) |
401 | return BLK_DONE; |
402 | else |
403 | return BLK_NEEDS_REDO; |
404 | } |
405 | else |
406 | return BLK_NOTFOUND; |
407 | } |
408 | } |
409 | |
410 | /* |
411 | * XLogReadBufferExtended |
412 | * Read a page during XLOG replay |
413 | * |
* This is functionally comparable to ReadBufferExtended. There are some
* differences in the behavior with respect to the "mode" argument:
416 | * |
417 | * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we |
418 | * return InvalidBuffer. In this case the caller should silently skip the |
419 | * update on this page. (In this situation, we expect that the page was later |
420 | * dropped or truncated. If we don't see evidence of that later in the WAL |
421 | * sequence, we'll complain at the end of WAL replay.) |
422 | * |
423 | * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended |
424 | * with all-zeroes pages up to the given block number. |
425 | * |
426 | * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't |
427 | * exist, and we don't check for all-zeroes. Thus, no log entry is made |
428 | * to imply that the page should be dropped or truncated later. |
429 | * |
430 | * NB: A redo function should normally not call this directly. To get a page |
431 | * to modify, use XLogReadBufferForRedoExtended instead. It is important that |
432 | * all pages modified by a WAL record are registered in the WAL records, or |
* they will be invisible to tools that need to know which pages are
434 | * modified. |
435 | */ |
436 | Buffer |
437 | XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, |
438 | BlockNumber blkno, ReadBufferMode mode) |
439 | { |
440 | BlockNumber lastblock; |
441 | Buffer buffer; |
442 | SMgrRelation smgr; |
443 | |
444 | Assert(blkno != P_NEW); |
445 | |
446 | /* Open the relation at smgr level */ |
447 | smgr = smgropen(rnode, InvalidBackendId); |
448 | |
449 | /* |
450 | * Create the target file if it doesn't already exist. This lets us cope |
451 | * if the replay sequence contains writes to a relation that is later |
452 | * deleted. (The original coding of this routine would instead suppress |
453 | * the writes, but that seems like it risks losing valuable data if the |
454 | * filesystem loses an inode during a crash. Better to write the data |
455 | * until we are actually told to delete the file.) |
456 | */ |
457 | smgrcreate(smgr, forknum, true); |
458 | |
459 | lastblock = smgrnblocks(smgr, forknum); |
460 | |
461 | if (blkno < lastblock) |
462 | { |
463 | /* page exists in file */ |
464 | buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, |
465 | mode, NULL); |
466 | } |
467 | else |
468 | { |
469 | /* hm, page doesn't exist in file */ |
470 | if (mode == RBM_NORMAL) |
471 | { |
472 | log_invalid_page(rnode, forknum, blkno, false); |
473 | return InvalidBuffer; |
474 | } |
475 | if (mode == RBM_NORMAL_NO_LOG) |
476 | return InvalidBuffer; |
477 | /* OK to extend the file */ |
478 | /* we do this in recovery only - no rel-extension lock needed */ |
479 | Assert(InRecovery); |
480 | buffer = InvalidBuffer; |
481 | do |
482 | { |
483 | if (buffer != InvalidBuffer) |
484 | { |
485 | if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) |
486 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
487 | ReleaseBuffer(buffer); |
488 | } |
489 | buffer = ReadBufferWithoutRelcache(rnode, forknum, |
490 | P_NEW, mode, NULL); |
491 | } |
492 | while (BufferGetBlockNumber(buffer) < blkno); |
493 | /* Handle the corner case that P_NEW returns non-consecutive pages */ |
494 | if (BufferGetBlockNumber(buffer) != blkno) |
495 | { |
496 | if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) |
497 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
498 | ReleaseBuffer(buffer); |
499 | buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, |
500 | mode, NULL); |
501 | } |
502 | } |
503 | |
504 | if (mode == RBM_NORMAL) |
505 | { |
506 | /* check that page has been initialized */ |
507 | Page page = (Page) BufferGetPage(buffer); |
508 | |
509 | /* |
510 | * We assume that PageIsNew is safe without a lock. During recovery, |
511 | * there should be no other backends that could modify the buffer at |
512 | * the same time. |
513 | */ |
514 | if (PageIsNew(page)) |
515 | { |
516 | ReleaseBuffer(buffer); |
517 | log_invalid_page(rnode, forknum, blkno, true); |
518 | return InvalidBuffer; |
519 | } |
520 | } |
521 | |
522 | return buffer; |
523 | } |
524 | |
525 | /* |
* Struct actually returned by CreateFakeRelcacheEntry, though the declared
527 | * return type is Relation. |
528 | */ |
529 | typedef struct |
530 | { |
531 | RelationData reldata; /* Note: this must be first */ |
532 | FormData_pg_class pgc; |
533 | } FakeRelCacheEntryData; |
534 | |
535 | typedef FakeRelCacheEntryData *FakeRelCacheEntry; |
536 | |
537 | /* |
538 | * Create a fake relation cache entry for a physical relation |
539 | * |
540 | * It's often convenient to use the same functions in XLOG replay as in the |
541 | * main codepath, but those functions typically work with a relcache entry. |
542 | * We don't have a working relation cache during XLOG replay, but this |
543 | * function can be used to create a fake relcache entry instead. Only the |
544 | * fields related to physical storage, like rd_rel, are initialized, so the |
545 | * fake entry is only usable in low-level operations like ReadBuffer(). |
546 | * |
547 | * Caller must free the returned entry with FreeFakeRelcacheEntry(). |
548 | */ |
549 | Relation |
550 | CreateFakeRelcacheEntry(RelFileNode rnode) |
551 | { |
552 | FakeRelCacheEntry fakeentry; |
553 | Relation rel; |
554 | |
555 | Assert(InRecovery); |
556 | |
557 | /* Allocate the Relation struct and all related space in one block. */ |
558 | fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); |
559 | rel = (Relation) fakeentry; |
560 | |
561 | rel->rd_rel = &fakeentry->pgc; |
562 | rel->rd_node = rnode; |
563 | /* We will never be working with temp rels during recovery */ |
564 | rel->rd_backend = InvalidBackendId; |
565 | |
566 | /* It must be a permanent table if we're in recovery. */ |
567 | rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; |
568 | |
569 | /* We don't know the name of the relation; use relfilenode instead */ |
sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
571 | |
572 | /* |
573 | * We set up the lockRelId in case anything tries to lock the dummy |
574 | * relation. Note that this is fairly bogus since relNode may be |
575 | * different from the relation's OID. It shouldn't really matter though, |
576 | * since we are presumably running by ourselves and can't have any lock |
577 | * conflicts ... |
578 | */ |
579 | rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; |
580 | rel->rd_lockInfo.lockRelId.relId = rnode.relNode; |
581 | |
582 | rel->rd_smgr = NULL; |
583 | |
584 | return rel; |
585 | } |
586 | |
587 | /* |
588 | * Free a fake relation cache entry. |
589 | */ |
590 | void |
591 | FreeFakeRelcacheEntry(Relation fakerel) |
592 | { |
593 | /* make sure the fakerel is not referenced by the SmgrRelation anymore */ |
594 | if (fakerel->rd_smgr != NULL) |
595 | smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr); |
596 | pfree(fakerel); |
597 | } |
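
/*
 * A minimal sketch of the intended usage during replay (the smgr calls are
 * placeholders for whatever low-level work the caller actually needs; they
 * are not taken from a specific redo routine):
 *
 *		Relation	rel = CreateFakeRelcacheEntry(rnode);
 *
 *		RelationOpenSmgr(rel);
 *		... use rel with low-level routines such as ReadBuffer() or
 *		smgrtruncate(rel->rd_smgr, ...) ...
 *		FreeFakeRelcacheEntry(rel);
 */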
598 | |
599 | /* |
600 | * Drop a relation during XLOG replay |
601 | * |
602 | * This is called when the relation is about to be deleted; we need to remove |
603 | * any open "invalid-page" records for the relation. |
604 | */ |
605 | void |
606 | XLogDropRelation(RelFileNode rnode, ForkNumber forknum) |
607 | { |
608 | forget_invalid_pages(rnode, forknum, 0); |
609 | } |
610 | |
611 | /* |
612 | * Drop a whole database during XLOG replay |
613 | * |
614 | * As above, but for DROP DATABASE instead of dropping a single rel |
615 | */ |
616 | void |
617 | XLogDropDatabase(Oid dbid) |
618 | { |
619 | /* |
620 | * This is unnecessarily heavy-handed, as it will close SMgrRelation |
621 | * objects for other databases as well. DROP DATABASE occurs seldom enough |
622 | * that it's not worth introducing a variant of smgrclose for just this |
623 | * purpose. XXX: Or should we rather leave the smgr entries dangling? |
624 | */ |
625 | smgrcloseall(); |
626 | |
627 | forget_invalid_pages_db(dbid); |
628 | } |
629 | |
630 | /* |
631 | * Truncate a relation during XLOG replay |
632 | * |
633 | * We need to clean up any open "invalid-page" records for the dropped pages. |
634 | */ |
635 | void |
636 | XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, |
637 | BlockNumber nblocks) |
638 | { |
639 | forget_invalid_pages(rnode, forkNum, nblocks); |
640 | } |
641 | |
642 | /* |
643 | * Read 'count' bytes from WAL into 'buf', starting at location 'startptr' |
644 | * in timeline 'tli'. |
645 | * |
646 | * Will open, and keep open, one WAL segment stored in the static file |
647 | * descriptor 'sendFile'. This means if XLogRead is used once, there will |
648 | * always be one descriptor left open until the process ends, but never |
649 | * more than one. |
650 | * |
651 | * XXX This is very similar to pg_waldump's XLogDumpXLogRead and to XLogRead |
652 | * in walsender.c but for small differences (such as lack of elog() in |
653 | * frontend). Probably these should be merged at some point. |
654 | */ |
655 | static void |
656 | XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, |
657 | Size count) |
658 | { |
659 | char *p; |
660 | XLogRecPtr recptr; |
661 | Size nbytes; |
662 | |
663 | /* state maintained across calls */ |
664 | static int sendFile = -1; |
665 | static XLogSegNo sendSegNo = 0; |
666 | static TimeLineID sendTLI = 0; |
667 | static uint32 sendOff = 0; |
668 | |
669 | Assert(segsize == wal_segment_size); |
670 | |
671 | p = buf; |
672 | recptr = startptr; |
673 | nbytes = count; |
674 | |
675 | while (nbytes > 0) |
676 | { |
677 | uint32 startoff; |
678 | int segbytes; |
679 | int readbytes; |
680 | |
681 | startoff = XLogSegmentOffset(recptr, segsize); |
682 | |
683 | /* Do we need to switch to a different xlog segment? */ |
684 | if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo, segsize) || |
685 | sendTLI != tli) |
686 | { |
687 | char path[MAXPGPATH]; |
688 | |
689 | if (sendFile >= 0) |
690 | close(sendFile); |
691 | |
692 | XLByteToSeg(recptr, sendSegNo, segsize); |
693 | |
694 | XLogFilePath(path, tli, sendSegNo, segsize); |
695 | |
696 | sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY); |
697 | |
698 | if (sendFile < 0) |
699 | { |
700 | if (errno == ENOENT) |
701 | ereport(ERROR, |
702 | (errcode_for_file_access(), |
703 | errmsg("requested WAL segment %s has already been removed" , |
704 | path))); |
705 | else |
706 | ereport(ERROR, |
707 | (errcode_for_file_access(), |
708 | errmsg("could not open file \"%s\": %m" , |
709 | path))); |
710 | } |
711 | sendOff = 0; |
712 | sendTLI = tli; |
713 | } |
714 | |
715 | /* Need to seek in the file? */ |
716 | if (sendOff != startoff) |
717 | { |
718 | if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) |
719 | { |
720 | char path[MAXPGPATH]; |
721 | int save_errno = errno; |
722 | |
723 | XLogFilePath(path, tli, sendSegNo, segsize); |
724 | errno = save_errno; |
725 | ereport(ERROR, |
726 | (errcode_for_file_access(), |
727 | errmsg("could not seek in log segment %s to offset %u: %m" , |
728 | path, startoff))); |
729 | } |
730 | sendOff = startoff; |
731 | } |
732 | |
733 | /* How many bytes are within this segment? */ |
734 | if (nbytes > (segsize - startoff)) |
735 | segbytes = segsize - startoff; |
736 | else |
737 | segbytes = nbytes; |
738 | |
739 | pgstat_report_wait_start(WAIT_EVENT_WAL_READ); |
740 | readbytes = read(sendFile, p, segbytes); |
741 | pgstat_report_wait_end(); |
742 | if (readbytes <= 0) |
743 | { |
744 | char path[MAXPGPATH]; |
745 | int save_errno = errno; |
746 | |
747 | XLogFilePath(path, tli, sendSegNo, segsize); |
748 | errno = save_errno; |
749 | ereport(ERROR, |
750 | (errcode_for_file_access(), |
751 | errmsg("could not read from log segment %s, offset %u, length %lu: %m" , |
752 | path, sendOff, (unsigned long) segbytes))); |
753 | } |
754 | |
755 | /* Update state for read */ |
756 | recptr += readbytes; |
757 | |
758 | sendOff += readbytes; |
759 | nbytes -= readbytes; |
760 | p += readbytes; |
761 | } |
762 | } |
763 | |
764 | /* |
765 | * Determine which timeline to read an xlog page from and set the |
766 | * XLogReaderState's currTLI to that timeline ID. |
767 | * |
768 | * We care about timelines in xlogreader when we might be reading xlog |
769 | * generated prior to a promotion, either if we're currently a standby in |
770 | * recovery or if we're a promoted master reading xlogs generated by the old |
771 | * master before our promotion. |
772 | * |
773 | * wantPage must be set to the start address of the page to read and |
774 | * wantLength to the amount of the page that will be read, up to |
775 | * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ. |
776 | * |
777 | * We switch to an xlog segment from the new timeline eagerly when on a |
778 | * historical timeline, as soon as we reach the start of the xlog segment |
779 | * containing the timeline switch. The server copied the segment to the new |
780 | * timeline so all the data up to the switch point is the same, but there's no |
781 | * guarantee the old segment will still exist. It may have been deleted or |
782 | * renamed with a .partial suffix so we can't necessarily keep reading from |
783 | * the old TLI even though tliSwitchPoint says it's OK. |
784 | * |
785 | * We can't just check the timeline when we read a page on a different segment |
786 | * to the last page. We could've received a timeline switch from a cascading |
787 | * upstream, so the current segment ends abruptly (possibly getting renamed to |
788 | * .partial) and we have to switch to a new one. Even in the middle of reading |
789 | * a page we could have to dump the cached page and switch to a new TLI. |
790 | * |
791 | * Because of this, callers MAY NOT assume that currTLI is the timeline that |
792 | * will be in a page's xlp_tli; the page may begin on an older timeline or we |
793 | * might be reading from historical timeline data on a segment that's been |
794 | * copied to a new timeline. |
795 | * |
796 | * The caller must also make sure it doesn't read past the current replay |
797 | * position (using GetWalRcvWriteRecPtr) if executing in recovery, so it |
798 | * doesn't fail to notice that the current timeline became historical. The |
799 | * caller must also update ThisTimeLineID with the result of |
800 | * GetWalRcvWriteRecPtr and must check RecoveryInProgress(). |
801 | */ |
802 | void |
803 | XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength) |
804 | { |
805 | const XLogRecPtr lastReadPage = state->readSegNo * |
806 | state->wal_segment_size + state->readOff; |
807 | |
808 | Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); |
809 | Assert(wantLength <= XLOG_BLCKSZ); |
810 | Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); |
811 | |
812 | /* |
813 | * If the desired page is currently read in and valid, we have nothing to |
814 | * do. |
815 | * |
816 | * The caller should've ensured that it didn't previously advance readOff |
817 | * past the valid limit of this timeline, so it doesn't matter if the |
818 | * current TLI has since become historical. |
819 | */ |
820 | if (lastReadPage == wantPage && |
821 | state->readLen != 0 && |
822 | lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) |
823 | return; |
824 | |
825 | /* |
826 | * If we're reading from the current timeline, it hasn't become historical |
827 | * and the page we're reading is after the last page read, we can again |
828 | * just carry on. (Seeking backwards requires a check to make sure the |
829 | * older page isn't on a prior timeline). |
830 | * |
831 | * ThisTimeLineID might've become historical since we last looked, but the |
832 | * caller is required not to read past the flush limit it saw at the time |
833 | * it looked up the timeline. There's nothing we can do about it if |
834 | * StartupXLOG() renames it to .partial concurrently. |
835 | */ |
836 | if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage) |
837 | { |
838 | Assert(state->currTLIValidUntil == InvalidXLogRecPtr); |
839 | return; |
840 | } |
841 | |
842 | /* |
843 | * If we're just reading pages from a previously validated historical |
844 | * timeline and the timeline we're reading from is valid until the end of |
845 | * the current segment we can just keep reading. |
846 | */ |
847 | if (state->currTLIValidUntil != InvalidXLogRecPtr && |
848 | state->currTLI != ThisTimeLineID && |
849 | state->currTLI != 0 && |
850 | ((wantPage + wantLength) / state->wal_segment_size) < |
851 | (state->currTLIValidUntil / state->wal_segment_size)) |
852 | return; |
853 | |
854 | /* |
855 | * If we reach this point we're either looking up a page for random |
856 | * access, the current timeline just became historical, or we're reading |
857 | * from a new segment containing a timeline switch. In all cases we need |
858 | * to determine the newest timeline on the segment. |
859 | * |
860 | * If it's the current timeline we can just keep reading from here unless |
861 | * we detect a timeline switch that makes the current timeline historical. |
* If it's a historical timeline we can read the whole segment from the newest
863 | * timeline because it contains all the old timelines' data too. So only |
864 | * one switch check is required. |
865 | */ |
866 | { |
867 | /* |
868 | * We need to re-read the timeline history in case it's been changed |
869 | * by a promotion or replay from a cascaded replica. |
870 | */ |
871 | List *timelineHistory = readTimeLineHistory(ThisTimeLineID); |
872 | |
873 | XLogRecPtr endOfSegment = (((wantPage / state->wal_segment_size) + 1) |
874 | * state->wal_segment_size) - 1; |
875 | |
876 | Assert(wantPage / state->wal_segment_size == |
877 | endOfSegment / state->wal_segment_size); |
878 | |
879 | /* |
880 | * Find the timeline of the last LSN on the segment containing |
881 | * wantPage. |
882 | */ |
883 | state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); |
884 | state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, |
885 | &state->nextTLI); |
886 | |
887 | Assert(state->currTLIValidUntil == InvalidXLogRecPtr || |
888 | wantPage + wantLength < state->currTLIValidUntil); |
889 | |
890 | list_free_deep(timelineHistory); |
891 | |
elog(DEBUG3, "switched to timeline %u valid until %X/%X",
state->currTLI,
(uint32) (state->currTLIValidUntil >> 32),
(uint32) (state->currTLIValidUntil));
896 | } |
897 | } |
898 | |
899 | /* |
900 | * read_page callback for reading local xlog files |
901 | * |
902 | * Public because it would likely be very helpful for someone writing another |
903 | * output method outside walsender, e.g. in a bgworker. |
904 | * |
905 | * TODO: The walsender has its own version of this, but it relies on the |
906 | * walsender's latch being set whenever WAL is flushed. No such infrastructure |
907 | * exists for normal backends, so we have to do a check/sleep/repeat style of |
908 | * loop for now. |
909 | */ |
910 | int |
911 | read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, |
912 | int reqLen, XLogRecPtr targetRecPtr, char *cur_page, |
913 | TimeLineID *pageTLI) |
914 | { |
915 | XLogRecPtr read_upto, |
916 | loc; |
917 | int count; |
918 | |
919 | loc = targetPagePtr + reqLen; |
920 | |
921 | /* Loop waiting for xlog to be available if necessary */ |
922 | while (1) |
923 | { |
924 | /* |
925 | * Determine the limit of xlog we can currently read to, and what the |
926 | * most recent timeline is. |
927 | * |
928 | * RecoveryInProgress() will update ThisTimeLineID when it first |
929 | * notices recovery finishes, so we only have to maintain it for the |
930 | * local process until recovery ends. |
931 | */ |
932 | if (!RecoveryInProgress()) |
933 | read_upto = GetFlushRecPtr(); |
934 | else |
935 | read_upto = GetXLogReplayRecPtr(&ThisTimeLineID); |
936 | |
937 | *pageTLI = ThisTimeLineID; |
938 | |
939 | /* |
940 | * Check which timeline to get the record from. |
941 | * |
942 | * We have to do it each time through the loop because if we're in |
943 | * recovery as a cascading standby, the current timeline might've |
944 | * become historical. We can't rely on RecoveryInProgress() because in |
945 | * a standby configuration like |
946 | * |
947 | * A => B => C |
948 | * |
949 | * if we're a logical decoding session on C, and B gets promoted, our |
950 | * timeline will change while we remain in recovery. |
951 | * |
952 | * We can't just keep reading from the old timeline as the last WAL |
953 | * archive in the timeline will get renamed to .partial by |
954 | * StartupXLOG(). |
955 | * |
956 | * If that happens after our caller updated ThisTimeLineID but before |
957 | * we actually read the xlog page, we might still try to read from the |
958 | * old (now renamed) segment and fail. There's not much we can do |
959 | * about this, but it can only happen when we're a leaf of a cascading |
960 | * standby whose master gets promoted while we're decoding, so a |
961 | * one-off ERROR isn't too bad. |
962 | */ |
963 | XLogReadDetermineTimeline(state, targetPagePtr, reqLen); |
964 | |
965 | if (state->currTLI == ThisTimeLineID) |
966 | { |
967 | |
968 | if (loc <= read_upto) |
969 | break; |
970 | |
971 | CHECK_FOR_INTERRUPTS(); |
972 | pg_usleep(1000L); |
973 | } |
974 | else |
975 | { |
976 | /* |
977 | * We're on a historical timeline, so limit reading to the switch |
978 | * point where we moved to the next timeline. |
979 | * |
980 | * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know |
981 | * about the new timeline, so we must've received past the end of |
982 | * it. |
983 | */ |
984 | read_upto = state->currTLIValidUntil; |
985 | |
986 | /* |
987 | * Setting pageTLI to our wanted record's TLI is slightly wrong; |
988 | * the page might begin on an older timeline if it contains a |
989 | * timeline switch, since its xlog segment will have been copied |
990 | * from the prior timeline. This is pretty harmless though, as |
991 | * nothing cares so long as the timeline doesn't go backwards. We |
992 | * should read the page header instead; FIXME someday. |
993 | */ |
994 | *pageTLI = state->currTLI; |
995 | |
996 | /* No need to wait on a historical timeline */ |
997 | break; |
998 | } |
999 | } |
1000 | |
1001 | if (targetPagePtr + XLOG_BLCKSZ <= read_upto) |
1002 | { |
1003 | /* |
1004 | * more than one block available; read only that block, have caller |
1005 | * come back if they need more. |
1006 | */ |
1007 | count = XLOG_BLCKSZ; |
1008 | } |
1009 | else if (targetPagePtr + reqLen > read_upto) |
1010 | { |
1011 | /* not enough data there */ |
1012 | return -1; |
1013 | } |
1014 | else |
1015 | { |
1016 | /* enough bytes available to satisfy the request */ |
1017 | count = read_upto - targetPagePtr; |
1018 | } |
1019 | |
1020 | /* |
1021 | * Even though we just determined how much of the page can be validly read |
1022 | * as 'count', read the whole page anyway. It's guaranteed to be |
1023 | * zero-padded up to the page boundary if it's incomplete. |
1024 | */ |
1025 | XLogRead(cur_page, state->wal_segment_size, *pageTLI, targetPagePtr, |
1026 | XLOG_BLCKSZ); |
1027 | |
1028 | /* number of valid bytes in the buffer */ |
1029 | return count; |
1030 | } |
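
/*
 * A minimal sketch of how a backend (e.g. a bgworker doing logical decoding
 * or WAL inspection) might drive an XLogReaderState with this callback.
 * "startptr" is a caller-supplied LSN of a valid record, and error handling
 * is omitted; both are assumptions for illustration only.
 *
 *		XLogReaderState *reader;
 *		XLogRecord *record;
 *		char	   *errormsg;
 *
 *		reader = XLogReaderAllocate(wal_segment_size,
 *									&read_local_xlog_page, NULL);
 *		record = XLogReadRecord(reader, startptr, &errormsg);
 *		while (record != NULL)
 *		{
 *			... decode the record, e.g. via XLogRecGetInfo() ...
 *			record = XLogReadRecord(reader, InvalidXLogRecPtr, &errormsg);
 *		}
 *		XLogReaderFree(reader);
 */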
1031 | |