| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * xlogreader.c |
| 4 | * Generic XLog reading facility |
| 5 | * |
| 6 | * Portions Copyright (c) 2013-2019, PostgreSQL Global Development Group |
| 7 | * |
| 8 | * IDENTIFICATION |
| 9 | * src/backend/access/transam/xlogreader.c |
| 10 | * |
| 11 | * NOTES |
| 12 | * See xlogreader.h for more notes on this facility. |
| 13 | * |
| 14 | * This file is compiled as both front-end and backend code, so it |
| 15 | * may not use ereport, server-defined static variables, etc. |
| 16 | *------------------------------------------------------------------------- |
| 17 | */ |
| 18 | #include "postgres.h" |
| 19 | |
| 20 | #include "access/transam.h" |
| 21 | #include "access/xlogrecord.h" |
| 22 | #include "access/xlog_internal.h" |
| 23 | #include "access/xlogreader.h" |
| 24 | #include "catalog/pg_control.h" |
| 25 | #include "common/pg_lzcompress.h" |
| 26 | #include "replication/origin.h" |
| 27 | |
| 28 | #ifndef FRONTEND |
| 29 | #include "utils/memutils.h" |
| 30 | #endif |
| 31 | |
| 32 | static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); |
| 33 | |
| 34 | static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, |
| 35 | XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess); |
| 36 | static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, |
| 37 | XLogRecPtr recptr); |
| 38 | static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, |
| 39 | int reqLen); |
| 40 | static void report_invalid_record(XLogReaderState *state, const char *fmt,...) pg_attribute_printf(2, 3); |
| 41 | |
| 42 | static void ResetDecoder(XLogReaderState *state); |
| 43 | |
| 44 | /* size of the buffer allocated for error message. */ |
| 45 | #define MAX_ERRORMSG_LEN 1000 |
| 46 | |
| 47 | /* |
| 48 | * Construct a string in state->errormsg_buf explaining what's wrong with |
| 49 | * the current record being read. |
| 50 | */ |
| 51 | static void |
| 52 | report_invalid_record(XLogReaderState *state, const char *fmt,...) |
| 53 | { |
| 54 | va_list args; |
| 55 | |
| 56 | fmt = _(fmt); |
| 57 | |
| 58 | va_start(args, fmt); |
| 59 | vsnprintf(state->errormsg_buf, MAX_ERRORMSG_LEN, fmt, args); |
| 60 | va_end(args); |
| 61 | } |
| 62 | |
| 63 | /* |
| 64 | * Allocate and initialize a new XLogReader. |
| 65 | * |
| 66 | * Returns NULL if the xlogreader couldn't be allocated. |
| 67 | */ |
| 68 | XLogReaderState * |
| 69 | XLogReaderAllocate(int wal_segment_size, XLogPageReadCB , |
| 70 | void *private_data) |
| 71 | { |
| 72 | XLogReaderState *state; |
| 73 | |
| 74 | state = (XLogReaderState *) |
| 75 | palloc_extended(sizeof(XLogReaderState), |
| 76 | MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); |
| 77 | if (!state) |
| 78 | return NULL; |
| 79 | |
| 80 | state->max_block_id = -1; |
| 81 | |
| 82 | /* |
| 83 | * Permanently allocate readBuf. We do it this way, rather than just |
| 84 | * making a static array, for two reasons: (1) no need to waste the |
| 85 | * storage in most instantiations of the backend; (2) a static char array |
| 86 | * isn't guaranteed to have any particular alignment, whereas |
| 87 | * palloc_extended() will provide MAXALIGN'd storage. |
| 88 | */ |
| 89 | state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ, |
| 90 | MCXT_ALLOC_NO_OOM); |
| 91 | if (!state->readBuf) |
| 92 | { |
| 93 | pfree(state); |
| 94 | return NULL; |
| 95 | } |
| 96 | |
| 97 | state->wal_segment_size = wal_segment_size; |
| 98 | state->read_page = pagereadfunc; |
| 99 | /* system_identifier initialized to zeroes above */ |
| 100 | state->private_data = private_data; |
| 101 | /* ReadRecPtr and EndRecPtr initialized to zeroes above */ |
| 102 | /* readSegNo, readOff, readLen, readPageTLI initialized to zeroes above */ |
| 103 | state->errormsg_buf = palloc_extended(MAX_ERRORMSG_LEN + 1, |
| 104 | MCXT_ALLOC_NO_OOM); |
| 105 | if (!state->errormsg_buf) |
| 106 | { |
| 107 | pfree(state->readBuf); |
| 108 | pfree(state); |
| 109 | return NULL; |
| 110 | } |
| 111 | state->errormsg_buf[0] = '\0'; |
| 112 | |
| 113 | /* |
| 114 | * Allocate an initial readRecordBuf of minimal size, which can later be |
| 115 | * enlarged if necessary. |
| 116 | */ |
| 117 | if (!allocate_recordbuf(state, 0)) |
| 118 | { |
| 119 | pfree(state->errormsg_buf); |
| 120 | pfree(state->readBuf); |
| 121 | pfree(state); |
| 122 | return NULL; |
| 123 | } |
| 124 | |
| 125 | return state; |
| 126 | } |
| 127 | |
| 128 | void |
| 129 | XLogReaderFree(XLogReaderState *state) |
| 130 | { |
| 131 | int block_id; |
| 132 | |
| 133 | for (block_id = 0; block_id <= XLR_MAX_BLOCK_ID; block_id++) |
| 134 | { |
| 135 | if (state->blocks[block_id].data) |
| 136 | pfree(state->blocks[block_id].data); |
| 137 | } |
| 138 | if (state->main_data) |
| 139 | pfree(state->main_data); |
| 140 | |
| 141 | pfree(state->errormsg_buf); |
| 142 | if (state->readRecordBuf) |
| 143 | pfree(state->readRecordBuf); |
| 144 | pfree(state->readBuf); |
| 145 | pfree(state); |
| 146 | } |
| 147 | |
| 148 | /* |
| 149 | * Allocate readRecordBuf to fit a record of at least the given length. |
| 150 | * Returns true if successful, false if out of memory. |
| 151 | * |
| 152 | * readRecordBufSize is set to the new buffer size. |
| 153 | * |
| 154 | * To avoid useless small increases, round its size to a multiple of |
| 155 | * XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start |
| 156 | * with. (That is enough for all "normal" records, but very large commit or |
| 157 | * abort records might need more space.) |
| 158 | */ |
| 159 | static bool |
| 160 | allocate_recordbuf(XLogReaderState *state, uint32 reclength) |
| 161 | { |
| 162 | uint32 newSize = reclength; |
| 163 | |
| 164 | newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ); |
| 165 | newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ)); |
| 166 | |
| 167 | #ifndef FRONTEND |
| 168 | |
| 169 | /* |
| 170 | * Note that in much unlucky circumstances, the random data read from a |
| 171 | * recycled segment can cause this routine to be called with a size |
| 172 | * causing a hard failure at allocation. For a standby, this would cause |
| 173 | * the instance to stop suddenly with a hard failure, preventing it to |
| 174 | * retry fetching WAL from one of its sources which could allow it to move |
| 175 | * on with replay without a manual restart. If the data comes from a past |
| 176 | * recycled segment and is still valid, then the allocation may succeed |
| 177 | * but record checks are going to fail so this would be short-lived. If |
| 178 | * the allocation fails because of a memory shortage, then this is not a |
| 179 | * hard failure either per the guarantee given by MCXT_ALLOC_NO_OOM. |
| 180 | */ |
| 181 | if (!AllocSizeIsValid(newSize)) |
| 182 | return false; |
| 183 | |
| 184 | #endif |
| 185 | |
| 186 | if (state->readRecordBuf) |
| 187 | pfree(state->readRecordBuf); |
| 188 | state->readRecordBuf = |
| 189 | (char *) palloc_extended(newSize, MCXT_ALLOC_NO_OOM); |
| 190 | if (state->readRecordBuf == NULL) |
| 191 | { |
| 192 | state->readRecordBufSize = 0; |
| 193 | return false; |
| 194 | } |
| 195 | state->readRecordBufSize = newSize; |
| 196 | return true; |
| 197 | } |
| 198 | |
| 199 | /* |
| 200 | * Attempt to read an XLOG record. |
| 201 | * |
| 202 | * If RecPtr is valid, try to read a record at that position. Otherwise |
| 203 | * try to read a record just after the last one previously read. |
| 204 | * |
| 205 | * If the read_page callback fails to read the requested data, NULL is |
| 206 | * returned. The callback is expected to have reported the error; errormsg |
| 207 | * is set to NULL. |
| 208 | * |
| 209 | * If the reading fails for some other reason, NULL is also returned, and |
| 210 | * *errormsg is set to a string with details of the failure. |
| 211 | * |
| 212 | * The returned pointer (or *errormsg) points to an internal buffer that's |
| 213 | * valid until the next call to XLogReadRecord. |
| 214 | */ |
| 215 | XLogRecord * |
| 216 | XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) |
| 217 | { |
| 218 | XLogRecord *record; |
| 219 | XLogRecPtr targetPagePtr; |
| 220 | bool randAccess; |
| 221 | uint32 len, |
| 222 | total_len; |
| 223 | uint32 targetRecOff; |
| 224 | uint32 ; |
| 225 | bool ; |
| 226 | int readOff; |
| 227 | |
| 228 | /* |
| 229 | * randAccess indicates whether to verify the previous-record pointer of |
| 230 | * the record we're reading. We only do this if we're reading |
| 231 | * sequentially, which is what we initially assume. |
| 232 | */ |
| 233 | randAccess = false; |
| 234 | |
| 235 | /* reset error state */ |
| 236 | *errormsg = NULL; |
| 237 | state->errormsg_buf[0] = '\0'; |
| 238 | |
| 239 | ResetDecoder(state); |
| 240 | |
| 241 | if (RecPtr == InvalidXLogRecPtr) |
| 242 | { |
| 243 | /* No explicit start point; read the record after the one we just read */ |
| 244 | RecPtr = state->EndRecPtr; |
| 245 | |
| 246 | if (state->ReadRecPtr == InvalidXLogRecPtr) |
| 247 | randAccess = true; |
| 248 | |
| 249 | /* |
| 250 | * RecPtr is pointing to end+1 of the previous WAL record. If we're |
| 251 | * at a page boundary, no more records can fit on the current page. We |
| 252 | * must skip over the page header, but we can't do that until we've |
| 253 | * read in the page, since the header size is variable. |
| 254 | */ |
| 255 | } |
| 256 | else |
| 257 | { |
| 258 | /* |
| 259 | * Caller supplied a position to start at. |
| 260 | * |
| 261 | * In this case, the passed-in record pointer should already be |
| 262 | * pointing to a valid record starting position. |
| 263 | */ |
| 264 | Assert(XRecOffIsValid(RecPtr)); |
| 265 | randAccess = true; |
| 266 | } |
| 267 | |
| 268 | state->currRecPtr = RecPtr; |
| 269 | |
| 270 | targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ); |
| 271 | targetRecOff = RecPtr % XLOG_BLCKSZ; |
| 272 | |
| 273 | /* |
| 274 | * Read the page containing the record into state->readBuf. Request enough |
| 275 | * byte to cover the whole record header, or at least the part of it that |
| 276 | * fits on the same page. |
| 277 | */ |
| 278 | readOff = ReadPageInternal(state, |
| 279 | targetPagePtr, |
| 280 | Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); |
| 281 | if (readOff < 0) |
| 282 | goto err; |
| 283 | |
| 284 | /* |
| 285 | * ReadPageInternal always returns at least the page header, so we can |
| 286 | * examine it now. |
| 287 | */ |
| 288 | pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); |
| 289 | if (targetRecOff == 0) |
| 290 | { |
| 291 | /* |
| 292 | * At page start, so skip over page header. |
| 293 | */ |
| 294 | RecPtr += pageHeaderSize; |
| 295 | targetRecOff = pageHeaderSize; |
| 296 | } |
| 297 | else if (targetRecOff < pageHeaderSize) |
| 298 | { |
| 299 | report_invalid_record(state, "invalid record offset at %X/%X" , |
| 300 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 301 | goto err; |
| 302 | } |
| 303 | |
| 304 | if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && |
| 305 | targetRecOff == pageHeaderSize) |
| 306 | { |
| 307 | report_invalid_record(state, "contrecord is requested by %X/%X" , |
| 308 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 309 | goto err; |
| 310 | } |
| 311 | |
| 312 | /* ReadPageInternal has verified the page header */ |
| 313 | Assert(pageHeaderSize <= readOff); |
| 314 | |
| 315 | /* |
| 316 | * Read the record length. |
| 317 | * |
| 318 | * NB: Even though we use an XLogRecord pointer here, the whole record |
| 319 | * header might not fit on this page. xl_tot_len is the first field of the |
| 320 | * struct, so it must be on this page (the records are MAXALIGNed), but we |
| 321 | * cannot access any other fields until we've verified that we got the |
| 322 | * whole header. |
| 323 | */ |
| 324 | record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); |
| 325 | total_len = record->xl_tot_len; |
| 326 | |
| 327 | /* |
| 328 | * If the whole record header is on this page, validate it immediately. |
| 329 | * Otherwise do just a basic sanity check on xl_tot_len, and validate the |
| 330 | * rest of the header after reading it from the next page. The xl_tot_len |
| 331 | * check is necessary here to ensure that we enter the "Need to reassemble |
| 332 | * record" code path below; otherwise we might fail to apply |
| 333 | * ValidXLogRecordHeader at all. |
| 334 | */ |
| 335 | if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) |
| 336 | { |
| 337 | if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, |
| 338 | randAccess)) |
| 339 | goto err; |
| 340 | gotheader = true; |
| 341 | } |
| 342 | else |
| 343 | { |
| 344 | /* XXX: more validation should be done here */ |
| 345 | if (total_len < SizeOfXLogRecord) |
| 346 | { |
| 347 | report_invalid_record(state, |
| 348 | "invalid record length at %X/%X: wanted %u, got %u" , |
| 349 | (uint32) (RecPtr >> 32), (uint32) RecPtr, |
| 350 | (uint32) SizeOfXLogRecord, total_len); |
| 351 | goto err; |
| 352 | } |
| 353 | gotheader = false; |
| 354 | } |
| 355 | |
| 356 | len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; |
| 357 | if (total_len > len) |
| 358 | { |
| 359 | /* Need to reassemble record */ |
| 360 | char *contdata; |
| 361 | XLogPageHeader ; |
| 362 | char *buffer; |
| 363 | uint32 gotlen; |
| 364 | |
| 365 | /* |
| 366 | * Enlarge readRecordBuf as needed. |
| 367 | */ |
| 368 | if (total_len > state->readRecordBufSize && |
| 369 | !allocate_recordbuf(state, total_len)) |
| 370 | { |
| 371 | /* We treat this as a "bogus data" condition */ |
| 372 | report_invalid_record(state, "record length %u at %X/%X too long" , |
| 373 | total_len, |
| 374 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 375 | goto err; |
| 376 | } |
| 377 | |
| 378 | /* Copy the first fragment of the record from the first page. */ |
| 379 | memcpy(state->readRecordBuf, |
| 380 | state->readBuf + RecPtr % XLOG_BLCKSZ, len); |
| 381 | buffer = state->readRecordBuf + len; |
| 382 | gotlen = len; |
| 383 | |
| 384 | do |
| 385 | { |
| 386 | /* Calculate pointer to beginning of next page */ |
| 387 | targetPagePtr += XLOG_BLCKSZ; |
| 388 | |
| 389 | /* Wait for the next page to become available */ |
| 390 | readOff = ReadPageInternal(state, targetPagePtr, |
| 391 | Min(total_len - gotlen + SizeOfXLogShortPHD, |
| 392 | XLOG_BLCKSZ)); |
| 393 | |
| 394 | if (readOff < 0) |
| 395 | goto err; |
| 396 | |
| 397 | Assert(SizeOfXLogShortPHD <= readOff); |
| 398 | |
| 399 | /* Check that the continuation on next page looks valid */ |
| 400 | pageHeader = (XLogPageHeader) state->readBuf; |
| 401 | if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) |
| 402 | { |
| 403 | report_invalid_record(state, |
| 404 | "there is no contrecord flag at %X/%X" , |
| 405 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 406 | goto err; |
| 407 | } |
| 408 | |
| 409 | /* |
| 410 | * Cross-check that xlp_rem_len agrees with how much of the record |
| 411 | * we expect there to be left. |
| 412 | */ |
| 413 | if (pageHeader->xlp_rem_len == 0 || |
| 414 | total_len != (pageHeader->xlp_rem_len + gotlen)) |
| 415 | { |
| 416 | report_invalid_record(state, |
| 417 | "invalid contrecord length %u at %X/%X" , |
| 418 | pageHeader->xlp_rem_len, |
| 419 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 420 | goto err; |
| 421 | } |
| 422 | |
| 423 | /* Append the continuation from this page to the buffer */ |
| 424 | pageHeaderSize = XLogPageHeaderSize(pageHeader); |
| 425 | |
| 426 | if (readOff < pageHeaderSize) |
| 427 | readOff = ReadPageInternal(state, targetPagePtr, |
| 428 | pageHeaderSize); |
| 429 | |
| 430 | Assert(pageHeaderSize <= readOff); |
| 431 | |
| 432 | contdata = (char *) state->readBuf + pageHeaderSize; |
| 433 | len = XLOG_BLCKSZ - pageHeaderSize; |
| 434 | if (pageHeader->xlp_rem_len < len) |
| 435 | len = pageHeader->xlp_rem_len; |
| 436 | |
| 437 | if (readOff < pageHeaderSize + len) |
| 438 | readOff = ReadPageInternal(state, targetPagePtr, |
| 439 | pageHeaderSize + len); |
| 440 | |
| 441 | memcpy(buffer, (char *) contdata, len); |
| 442 | buffer += len; |
| 443 | gotlen += len; |
| 444 | |
| 445 | /* If we just reassembled the record header, validate it. */ |
| 446 | if (!gotheader) |
| 447 | { |
| 448 | record = (XLogRecord *) state->readRecordBuf; |
| 449 | if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, |
| 450 | record, randAccess)) |
| 451 | goto err; |
| 452 | gotheader = true; |
| 453 | } |
| 454 | } while (gotlen < total_len); |
| 455 | |
| 456 | Assert(gotheader); |
| 457 | |
| 458 | record = (XLogRecord *) state->readRecordBuf; |
| 459 | if (!ValidXLogRecord(state, record, RecPtr)) |
| 460 | goto err; |
| 461 | |
| 462 | pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); |
| 463 | state->ReadRecPtr = RecPtr; |
| 464 | state->EndRecPtr = targetPagePtr + pageHeaderSize |
| 465 | + MAXALIGN(pageHeader->xlp_rem_len); |
| 466 | } |
| 467 | else |
| 468 | { |
| 469 | /* Wait for the record data to become available */ |
| 470 | readOff = ReadPageInternal(state, targetPagePtr, |
| 471 | Min(targetRecOff + total_len, XLOG_BLCKSZ)); |
| 472 | if (readOff < 0) |
| 473 | goto err; |
| 474 | |
| 475 | /* Record does not cross a page boundary */ |
| 476 | if (!ValidXLogRecord(state, record, RecPtr)) |
| 477 | goto err; |
| 478 | |
| 479 | state->EndRecPtr = RecPtr + MAXALIGN(total_len); |
| 480 | |
| 481 | state->ReadRecPtr = RecPtr; |
| 482 | } |
| 483 | |
| 484 | /* |
| 485 | * Special processing if it's an XLOG SWITCH record |
| 486 | */ |
| 487 | if (record->xl_rmid == RM_XLOG_ID && |
| 488 | (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) |
| 489 | { |
| 490 | /* Pretend it extends to end of segment */ |
| 491 | state->EndRecPtr += state->wal_segment_size - 1; |
| 492 | state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->wal_segment_size); |
| 493 | } |
| 494 | |
| 495 | if (DecodeXLogRecord(state, record, errormsg)) |
| 496 | return record; |
| 497 | else |
| 498 | return NULL; |
| 499 | |
| 500 | err: |
| 501 | |
| 502 | /* |
| 503 | * Invalidate the read state. We might read from a different source after |
| 504 | * failure. |
| 505 | */ |
| 506 | XLogReaderInvalReadState(state); |
| 507 | |
| 508 | if (state->errormsg_buf[0] != '\0') |
| 509 | *errormsg = state->errormsg_buf; |
| 510 | |
| 511 | return NULL; |
| 512 | } |
| 513 | |
| 514 | /* |
| 515 | * Read a single xlog page including at least [pageptr, reqLen] of valid data |
| 516 | * via the read_page() callback. |
| 517 | * |
| 518 | * Returns -1 if the required page cannot be read for some reason; errormsg_buf |
| 519 | * is set in that case (unless the error occurs in the read_page callback). |
| 520 | * |
| 521 | * We fetch the page from a reader-local cache if we know we have the required |
| 522 | * data and if there hasn't been any error since caching the data. |
| 523 | */ |
| 524 | static int |
| 525 | ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) |
| 526 | { |
| 527 | int readLen; |
| 528 | uint32 targetPageOff; |
| 529 | XLogSegNo targetSegNo; |
| 530 | XLogPageHeader hdr; |
| 531 | |
| 532 | Assert((pageptr % XLOG_BLCKSZ) == 0); |
| 533 | |
| 534 | XLByteToSeg(pageptr, targetSegNo, state->wal_segment_size); |
| 535 | targetPageOff = XLogSegmentOffset(pageptr, state->wal_segment_size); |
| 536 | |
| 537 | /* check whether we have all the requested data already */ |
| 538 | if (targetSegNo == state->readSegNo && targetPageOff == state->readOff && |
| 539 | reqLen <= state->readLen) |
| 540 | return state->readLen; |
| 541 | |
| 542 | /* |
| 543 | * Data is not in our buffer. |
| 544 | * |
| 545 | * Every time we actually read the page, even if we looked at parts of it |
| 546 | * before, we need to do verification as the read_page callback might now |
| 547 | * be rereading data from a different source. |
| 548 | * |
| 549 | * Whenever switching to a new WAL segment, we read the first page of the |
| 550 | * file and validate its header, even if that's not where the target |
| 551 | * record is. This is so that we can check the additional identification |
| 552 | * info that is present in the first page's "long" header. |
| 553 | */ |
| 554 | if (targetSegNo != state->readSegNo && targetPageOff != 0) |
| 555 | { |
| 556 | XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; |
| 557 | |
| 558 | readLen = state->read_page(state, targetSegmentPtr, XLOG_BLCKSZ, |
| 559 | state->currRecPtr, |
| 560 | state->readBuf, &state->readPageTLI); |
| 561 | if (readLen < 0) |
| 562 | goto err; |
| 563 | |
| 564 | /* we can be sure to have enough WAL available, we scrolled back */ |
| 565 | Assert(readLen == XLOG_BLCKSZ); |
| 566 | |
| 567 | if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, |
| 568 | state->readBuf)) |
| 569 | goto err; |
| 570 | } |
| 571 | |
| 572 | /* |
| 573 | * First, read the requested data length, but at least a short page header |
| 574 | * so that we can validate it. |
| 575 | */ |
| 576 | readLen = state->read_page(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), |
| 577 | state->currRecPtr, |
| 578 | state->readBuf, &state->readPageTLI); |
| 579 | if (readLen < 0) |
| 580 | goto err; |
| 581 | |
| 582 | Assert(readLen <= XLOG_BLCKSZ); |
| 583 | |
| 584 | /* Do we have enough data to check the header length? */ |
| 585 | if (readLen <= SizeOfXLogShortPHD) |
| 586 | goto err; |
| 587 | |
| 588 | Assert(readLen >= reqLen); |
| 589 | |
| 590 | hdr = (XLogPageHeader) state->readBuf; |
| 591 | |
| 592 | /* still not enough */ |
| 593 | if (readLen < XLogPageHeaderSize(hdr)) |
| 594 | { |
| 595 | readLen = state->read_page(state, pageptr, XLogPageHeaderSize(hdr), |
| 596 | state->currRecPtr, |
| 597 | state->readBuf, &state->readPageTLI); |
| 598 | if (readLen < 0) |
| 599 | goto err; |
| 600 | } |
| 601 | |
| 602 | /* |
| 603 | * Now that we know we have the full header, validate it. |
| 604 | */ |
| 605 | if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) |
| 606 | goto err; |
| 607 | |
| 608 | /* update read state information */ |
| 609 | state->readSegNo = targetSegNo; |
| 610 | state->readOff = targetPageOff; |
| 611 | state->readLen = readLen; |
| 612 | |
| 613 | return readLen; |
| 614 | |
| 615 | err: |
| 616 | XLogReaderInvalReadState(state); |
| 617 | return -1; |
| 618 | } |
| 619 | |
| 620 | /* |
| 621 | * Invalidate the xlogreader's read state to force a re-read. |
| 622 | */ |
| 623 | void |
| 624 | XLogReaderInvalReadState(XLogReaderState *state) |
| 625 | { |
| 626 | state->readSegNo = 0; |
| 627 | state->readOff = 0; |
| 628 | state->readLen = 0; |
| 629 | } |
| 630 | |
| 631 | /* |
| 632 | * Validate an XLOG record header. |
| 633 | * |
| 634 | * This is just a convenience subroutine to avoid duplicated code in |
| 635 | * XLogReadRecord. It's not intended for use from anywhere else. |
| 636 | */ |
| 637 | static bool |
| 638 | (XLogReaderState *state, XLogRecPtr RecPtr, |
| 639 | XLogRecPtr PrevRecPtr, XLogRecord *record, |
| 640 | bool randAccess) |
| 641 | { |
| 642 | if (record->xl_tot_len < SizeOfXLogRecord) |
| 643 | { |
| 644 | report_invalid_record(state, |
| 645 | "invalid record length at %X/%X: wanted %u, got %u" , |
| 646 | (uint32) (RecPtr >> 32), (uint32) RecPtr, |
| 647 | (uint32) SizeOfXLogRecord, record->xl_tot_len); |
| 648 | return false; |
| 649 | } |
| 650 | if (record->xl_rmid > RM_MAX_ID) |
| 651 | { |
| 652 | report_invalid_record(state, |
| 653 | "invalid resource manager ID %u at %X/%X" , |
| 654 | record->xl_rmid, (uint32) (RecPtr >> 32), |
| 655 | (uint32) RecPtr); |
| 656 | return false; |
| 657 | } |
| 658 | if (randAccess) |
| 659 | { |
| 660 | /* |
| 661 | * We can't exactly verify the prev-link, but surely it should be less |
| 662 | * than the record's own address. |
| 663 | */ |
| 664 | if (!(record->xl_prev < RecPtr)) |
| 665 | { |
| 666 | report_invalid_record(state, |
| 667 | "record with incorrect prev-link %X/%X at %X/%X" , |
| 668 | (uint32) (record->xl_prev >> 32), |
| 669 | (uint32) record->xl_prev, |
| 670 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 671 | return false; |
| 672 | } |
| 673 | } |
| 674 | else |
| 675 | { |
| 676 | /* |
| 677 | * Record's prev-link should exactly match our previous location. This |
| 678 | * check guards against torn WAL pages where a stale but valid-looking |
| 679 | * WAL record starts on a sector boundary. |
| 680 | */ |
| 681 | if (record->xl_prev != PrevRecPtr) |
| 682 | { |
| 683 | report_invalid_record(state, |
| 684 | "record with incorrect prev-link %X/%X at %X/%X" , |
| 685 | (uint32) (record->xl_prev >> 32), |
| 686 | (uint32) record->xl_prev, |
| 687 | (uint32) (RecPtr >> 32), (uint32) RecPtr); |
| 688 | return false; |
| 689 | } |
| 690 | } |
| 691 | |
| 692 | return true; |
| 693 | } |
| 694 | |
| 695 | |
| 696 | /* |
| 697 | * CRC-check an XLOG record. We do not believe the contents of an XLOG |
| 698 | * record (other than to the minimal extent of computing the amount of |
| 699 | * data to read in) until we've checked the CRCs. |
| 700 | * |
| 701 | * We assume all of the record (that is, xl_tot_len bytes) has been read |
| 702 | * into memory at *record. Also, ValidXLogRecordHeader() has accepted the |
| 703 | * record's header, which means in particular that xl_tot_len is at least |
| 704 | * SizeOfXlogRecord. |
| 705 | */ |
| 706 | static bool |
| 707 | ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) |
| 708 | { |
| 709 | pg_crc32c crc; |
| 710 | |
| 711 | /* Calculate the CRC */ |
| 712 | INIT_CRC32C(crc); |
| 713 | COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); |
| 714 | /* include the record header last */ |
| 715 | COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); |
| 716 | FIN_CRC32C(crc); |
| 717 | |
| 718 | if (!EQ_CRC32C(record->xl_crc, crc)) |
| 719 | { |
| 720 | report_invalid_record(state, |
| 721 | "incorrect resource manager data checksum in record at %X/%X" , |
| 722 | (uint32) (recptr >> 32), (uint32) recptr); |
| 723 | return false; |
| 724 | } |
| 725 | |
| 726 | return true; |
| 727 | } |
| 728 | |
| 729 | /* |
| 730 | * Validate a page header. |
| 731 | * |
| 732 | * Check if 'phdr' is valid as the header of the XLog page at position |
| 733 | * 'recptr'. |
| 734 | */ |
| 735 | bool |
| 736 | (XLogReaderState *state, XLogRecPtr recptr, |
| 737 | char *phdr) |
| 738 | { |
| 739 | XLogRecPtr recaddr; |
| 740 | XLogSegNo segno; |
| 741 | int32 offset; |
| 742 | XLogPageHeader hdr = (XLogPageHeader) phdr; |
| 743 | |
| 744 | Assert((recptr % XLOG_BLCKSZ) == 0); |
| 745 | |
| 746 | XLByteToSeg(recptr, segno, state->wal_segment_size); |
| 747 | offset = XLogSegmentOffset(recptr, state->wal_segment_size); |
| 748 | |
| 749 | XLogSegNoOffsetToRecPtr(segno, offset, state->wal_segment_size, recaddr); |
| 750 | |
| 751 | if (hdr->xlp_magic != XLOG_PAGE_MAGIC) |
| 752 | { |
| 753 | char fname[MAXFNAMELEN]; |
| 754 | |
| 755 | XLogFileName(fname, state->readPageTLI, segno, state->wal_segment_size); |
| 756 | |
| 757 | report_invalid_record(state, |
| 758 | "invalid magic number %04X in log segment %s, offset %u" , |
| 759 | hdr->xlp_magic, |
| 760 | fname, |
| 761 | offset); |
| 762 | return false; |
| 763 | } |
| 764 | |
| 765 | if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0) |
| 766 | { |
| 767 | char fname[MAXFNAMELEN]; |
| 768 | |
| 769 | XLogFileName(fname, state->readPageTLI, segno, state->wal_segment_size); |
| 770 | |
| 771 | report_invalid_record(state, |
| 772 | "invalid info bits %04X in log segment %s, offset %u" , |
| 773 | hdr->xlp_info, |
| 774 | fname, |
| 775 | offset); |
| 776 | return false; |
| 777 | } |
| 778 | |
| 779 | if (hdr->xlp_info & XLP_LONG_HEADER) |
| 780 | { |
| 781 | XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr; |
| 782 | |
| 783 | if (state->system_identifier && |
| 784 | longhdr->xlp_sysid != state->system_identifier) |
| 785 | { |
| 786 | char fhdrident_str[32]; |
| 787 | char sysident_str[32]; |
| 788 | |
| 789 | /* |
| 790 | * Format sysids separately to keep platform-dependent format code |
| 791 | * out of the translatable message string. |
| 792 | */ |
| 793 | snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT, |
| 794 | longhdr->xlp_sysid); |
| 795 | snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT, |
| 796 | state->system_identifier); |
| 797 | report_invalid_record(state, |
| 798 | "WAL file is from different database system: WAL file database system identifier is %s, pg_control database system identifier is %s" , |
| 799 | fhdrident_str, sysident_str); |
| 800 | return false; |
| 801 | } |
| 802 | else if (longhdr->xlp_seg_size != state->wal_segment_size) |
| 803 | { |
| 804 | report_invalid_record(state, |
| 805 | "WAL file is from different database system: incorrect segment size in page header" ); |
| 806 | return false; |
| 807 | } |
| 808 | else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ) |
| 809 | { |
| 810 | report_invalid_record(state, |
| 811 | "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header" ); |
| 812 | return false; |
| 813 | } |
| 814 | } |
| 815 | else if (offset == 0) |
| 816 | { |
| 817 | char fname[MAXFNAMELEN]; |
| 818 | |
| 819 | XLogFileName(fname, state->readPageTLI, segno, state->wal_segment_size); |
| 820 | |
| 821 | /* hmm, first page of file doesn't have a long header? */ |
| 822 | report_invalid_record(state, |
| 823 | "invalid info bits %04X in log segment %s, offset %u" , |
| 824 | hdr->xlp_info, |
| 825 | fname, |
| 826 | offset); |
| 827 | return false; |
| 828 | } |
| 829 | |
| 830 | /* |
| 831 | * Check that the address on the page agrees with what we expected. This |
| 832 | * check typically fails when an old WAL segment is recycled, and hasn't |
| 833 | * yet been overwritten with new data yet. |
| 834 | */ |
| 835 | if (hdr->xlp_pageaddr != recaddr) |
| 836 | { |
| 837 | char fname[MAXFNAMELEN]; |
| 838 | |
| 839 | XLogFileName(fname, state->readPageTLI, segno, state->wal_segment_size); |
| 840 | |
| 841 | report_invalid_record(state, |
| 842 | "unexpected pageaddr %X/%X in log segment %s, offset %u" , |
| 843 | (uint32) (hdr->xlp_pageaddr >> 32), (uint32) hdr->xlp_pageaddr, |
| 844 | fname, |
| 845 | offset); |
| 846 | return false; |
| 847 | } |
| 848 | |
| 849 | /* |
| 850 | * Since child timelines are always assigned a TLI greater than their |
| 851 | * immediate parent's TLI, we should never see TLI go backwards across |
| 852 | * successive pages of a consistent WAL sequence. |
| 853 | * |
| 854 | * Sometimes we re-read a segment that's already been (partially) read. So |
| 855 | * we only verify TLIs for pages that are later than the last remembered |
| 856 | * LSN. |
| 857 | */ |
| 858 | if (recptr > state->latestPagePtr) |
| 859 | { |
| 860 | if (hdr->xlp_tli < state->latestPageTLI) |
| 861 | { |
| 862 | char fname[MAXFNAMELEN]; |
| 863 | |
| 864 | XLogFileName(fname, state->readPageTLI, segno, state->wal_segment_size); |
| 865 | |
| 866 | report_invalid_record(state, |
| 867 | "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u" , |
| 868 | hdr->xlp_tli, |
| 869 | state->latestPageTLI, |
| 870 | fname, |
| 871 | offset); |
| 872 | return false; |
| 873 | } |
| 874 | } |
| 875 | state->latestPagePtr = recptr; |
| 876 | state->latestPageTLI = hdr->xlp_tli; |
| 877 | |
| 878 | return true; |
| 879 | } |
| 880 | |
| 881 | #ifdef FRONTEND |
| 882 | /* |
| 883 | * Functions that are currently not needed in the backend, but are better |
| 884 | * implemented inside xlogreader.c because of the internal facilities available |
| 885 | * here. |
| 886 | */ |
| 887 | |
| 888 | /* |
| 889 | * Find the first record with an lsn >= RecPtr. |
| 890 | * |
| 891 | * Useful for checking whether RecPtr is a valid xlog address for reading, and |
| 892 | * to find the first valid address after some address when dumping records for |
| 893 | * debugging purposes. |
| 894 | */ |
| 895 | XLogRecPtr |
| 896 | XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) |
| 897 | { |
| 898 | XLogReaderState saved_state = *state; |
| 899 | XLogRecPtr tmpRecPtr; |
| 900 | XLogRecPtr found = InvalidXLogRecPtr; |
| 901 | XLogPageHeader header; |
| 902 | char *errormsg; |
| 903 | |
| 904 | Assert(!XLogRecPtrIsInvalid(RecPtr)); |
| 905 | |
| 906 | /* |
| 907 | * skip over potential continuation data, keeping in mind that it may span |
| 908 | * multiple pages |
| 909 | */ |
| 910 | tmpRecPtr = RecPtr; |
| 911 | while (true) |
| 912 | { |
| 913 | XLogRecPtr targetPagePtr; |
| 914 | int targetRecOff; |
| 915 | uint32 pageHeaderSize; |
| 916 | int readLen; |
| 917 | |
| 918 | /* |
| 919 | * Compute targetRecOff. It should typically be equal or greater than |
| 920 | * short page-header since a valid record can't start anywhere before |
| 921 | * that, except when caller has explicitly specified the offset that |
| 922 | * falls somewhere there or when we are skipping multi-page |
| 923 | * continuation record. It doesn't matter though because |
| 924 | * ReadPageInternal() is prepared to handle that and will read at |
| 925 | * least short page-header worth of data |
| 926 | */ |
| 927 | targetRecOff = tmpRecPtr % XLOG_BLCKSZ; |
| 928 | |
| 929 | /* scroll back to page boundary */ |
| 930 | targetPagePtr = tmpRecPtr - targetRecOff; |
| 931 | |
| 932 | /* Read the page containing the record */ |
| 933 | readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); |
| 934 | if (readLen < 0) |
| 935 | goto err; |
| 936 | |
| 937 | header = (XLogPageHeader) state->readBuf; |
| 938 | |
| 939 | pageHeaderSize = XLogPageHeaderSize(header); |
| 940 | |
| 941 | /* make sure we have enough data for the page header */ |
| 942 | readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); |
| 943 | if (readLen < 0) |
| 944 | goto err; |
| 945 | |
| 946 | /* skip over potential continuation data */ |
| 947 | if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) |
| 948 | { |
| 949 | /* |
| 950 | * If the length of the remaining continuation data is more than |
| 951 | * what can fit in this page, the continuation record crosses over |
| 952 | * this page. Read the next page and try again. xlp_rem_len in the |
| 953 | * next page header will contain the remaining length of the |
| 954 | * continuation data |
| 955 | * |
| 956 | * Note that record headers are MAXALIGN'ed |
| 957 | */ |
| 958 | if (MAXALIGN(header->xlp_rem_len) > (XLOG_BLCKSZ - pageHeaderSize)) |
| 959 | tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; |
| 960 | else |
| 961 | { |
| 962 | /* |
| 963 | * The previous continuation record ends in this page. Set |
| 964 | * tmpRecPtr to point to the first valid record |
| 965 | */ |
| 966 | tmpRecPtr = targetPagePtr + pageHeaderSize |
| 967 | + MAXALIGN(header->xlp_rem_len); |
| 968 | break; |
| 969 | } |
| 970 | } |
| 971 | else |
| 972 | { |
| 973 | tmpRecPtr = targetPagePtr + pageHeaderSize; |
| 974 | break; |
| 975 | } |
| 976 | } |
| 977 | |
| 978 | /* |
| 979 | * we know now that tmpRecPtr is an address pointing to a valid XLogRecord |
| 980 | * because either we're at the first record after the beginning of a page |
| 981 | * or we just jumped over the remaining data of a continuation. |
| 982 | */ |
| 983 | while (XLogReadRecord(state, tmpRecPtr, &errormsg) != NULL) |
| 984 | { |
| 985 | /* continue after the record */ |
| 986 | tmpRecPtr = InvalidXLogRecPtr; |
| 987 | |
| 988 | /* past the record we've found, break out */ |
| 989 | if (RecPtr <= state->ReadRecPtr) |
| 990 | { |
| 991 | found = state->ReadRecPtr; |
| 992 | goto out; |
| 993 | } |
| 994 | } |
| 995 | |
| 996 | err: |
| 997 | out: |
| 998 | /* Reset state to what we had before finding the record */ |
| 999 | state->ReadRecPtr = saved_state.ReadRecPtr; |
| 1000 | state->EndRecPtr = saved_state.EndRecPtr; |
| 1001 | XLogReaderInvalReadState(state); |
| 1002 | |
| 1003 | return found; |
| 1004 | } |
| 1005 | |
| 1006 | #endif /* FRONTEND */ |
| 1007 | |
| 1008 | |
| 1009 | /* ---------------------------------------- |
| 1010 | * Functions for decoding the data and block references in a record. |
| 1011 | * ---------------------------------------- |
| 1012 | */ |
| 1013 | |
| 1014 | /* private function to reset the state between records */ |
| 1015 | static void |
| 1016 | ResetDecoder(XLogReaderState *state) |
| 1017 | { |
| 1018 | int block_id; |
| 1019 | |
| 1020 | state->decoded_record = NULL; |
| 1021 | |
| 1022 | state->main_data_len = 0; |
| 1023 | |
| 1024 | for (block_id = 0; block_id <= state->max_block_id; block_id++) |
| 1025 | { |
| 1026 | state->blocks[block_id].in_use = false; |
| 1027 | state->blocks[block_id].has_image = false; |
| 1028 | state->blocks[block_id].has_data = false; |
| 1029 | state->blocks[block_id].apply_image = false; |
| 1030 | } |
| 1031 | state->max_block_id = -1; |
| 1032 | } |
| 1033 | |
| 1034 | /* |
| 1035 | * Decode the previously read record. |
| 1036 | * |
| 1037 | * On error, a human-readable error message is returned in *errormsg, and |
| 1038 | * the return value is false. |
| 1039 | */ |
| 1040 | bool |
| 1041 | DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) |
| 1042 | { |
| 1043 | /* |
| 1044 | * read next _size bytes from record buffer, but check for overrun first. |
| 1045 | */ |
| 1046 | #define (_dst, _size) \ |
| 1047 | do { \ |
| 1048 | if (remaining < _size) \ |
| 1049 | goto shortdata_err; \ |
| 1050 | memcpy(_dst, ptr, _size); \ |
| 1051 | ptr += _size; \ |
| 1052 | remaining -= _size; \ |
| 1053 | } while(0) |
| 1054 | |
| 1055 | char *ptr; |
| 1056 | uint32 remaining; |
| 1057 | uint32 datatotal; |
| 1058 | RelFileNode *rnode = NULL; |
| 1059 | uint8 block_id; |
| 1060 | |
| 1061 | ResetDecoder(state); |
| 1062 | |
| 1063 | state->decoded_record = record; |
| 1064 | state->record_origin = InvalidRepOriginId; |
| 1065 | |
| 1066 | ptr = (char *) record; |
| 1067 | ptr += SizeOfXLogRecord; |
| 1068 | remaining = record->xl_tot_len - SizeOfXLogRecord; |
| 1069 | |
| 1070 | /* Decode the headers */ |
| 1071 | datatotal = 0; |
| 1072 | while (remaining > datatotal) |
| 1073 | { |
| 1074 | COPY_HEADER_FIELD(&block_id, sizeof(uint8)); |
| 1075 | |
| 1076 | if (block_id == XLR_BLOCK_ID_DATA_SHORT) |
| 1077 | { |
| 1078 | /* XLogRecordDataHeaderShort */ |
| 1079 | uint8 main_data_len; |
| 1080 | |
| 1081 | COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); |
| 1082 | |
| 1083 | state->main_data_len = main_data_len; |
| 1084 | datatotal += main_data_len; |
| 1085 | break; /* by convention, the main data fragment is |
| 1086 | * always last */ |
| 1087 | } |
| 1088 | else if (block_id == XLR_BLOCK_ID_DATA_LONG) |
| 1089 | { |
| 1090 | /* XLogRecordDataHeaderLong */ |
| 1091 | uint32 main_data_len; |
| 1092 | |
| 1093 | COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); |
| 1094 | state->main_data_len = main_data_len; |
| 1095 | datatotal += main_data_len; |
| 1096 | break; /* by convention, the main data fragment is |
| 1097 | * always last */ |
| 1098 | } |
| 1099 | else if (block_id == XLR_BLOCK_ID_ORIGIN) |
| 1100 | { |
| 1101 | COPY_HEADER_FIELD(&state->record_origin, sizeof(RepOriginId)); |
| 1102 | } |
| 1103 | else if (block_id <= XLR_MAX_BLOCK_ID) |
| 1104 | { |
| 1105 | /* XLogRecordBlockHeader */ |
| 1106 | DecodedBkpBlock *blk; |
| 1107 | uint8 fork_flags; |
| 1108 | |
| 1109 | if (block_id <= state->max_block_id) |
| 1110 | { |
| 1111 | report_invalid_record(state, |
| 1112 | "out-of-order block_id %u at %X/%X" , |
| 1113 | block_id, |
| 1114 | (uint32) (state->ReadRecPtr >> 32), |
| 1115 | (uint32) state->ReadRecPtr); |
| 1116 | goto err; |
| 1117 | } |
| 1118 | state->max_block_id = block_id; |
| 1119 | |
| 1120 | blk = &state->blocks[block_id]; |
| 1121 | blk->in_use = true; |
| 1122 | blk->apply_image = false; |
| 1123 | |
| 1124 | COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); |
| 1125 | blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; |
| 1126 | blk->flags = fork_flags; |
| 1127 | blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); |
| 1128 | blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); |
| 1129 | |
| 1130 | COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); |
| 1131 | /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ |
| 1132 | if (blk->has_data && blk->data_len == 0) |
| 1133 | { |
| 1134 | report_invalid_record(state, |
| 1135 | "BKPBLOCK_HAS_DATA set, but no data included at %X/%X" , |
| 1136 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1137 | goto err; |
| 1138 | } |
| 1139 | if (!blk->has_data && blk->data_len != 0) |
| 1140 | { |
| 1141 | report_invalid_record(state, |
| 1142 | "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X" , |
| 1143 | (unsigned int) blk->data_len, |
| 1144 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1145 | goto err; |
| 1146 | } |
| 1147 | datatotal += blk->data_len; |
| 1148 | |
| 1149 | if (blk->has_image) |
| 1150 | { |
| 1151 | COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); |
| 1152 | COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); |
| 1153 | COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); |
| 1154 | |
| 1155 | blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0); |
| 1156 | |
| 1157 | if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) |
| 1158 | { |
| 1159 | if (blk->bimg_info & BKPIMAGE_HAS_HOLE) |
| 1160 | COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); |
| 1161 | else |
| 1162 | blk->hole_length = 0; |
| 1163 | } |
| 1164 | else |
| 1165 | blk->hole_length = BLCKSZ - blk->bimg_len; |
| 1166 | datatotal += blk->bimg_len; |
| 1167 | |
| 1168 | /* |
| 1169 | * cross-check that hole_offset > 0, hole_length > 0 and |
| 1170 | * bimg_len < BLCKSZ if the HAS_HOLE flag is set. |
| 1171 | */ |
| 1172 | if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && |
| 1173 | (blk->hole_offset == 0 || |
| 1174 | blk->hole_length == 0 || |
| 1175 | blk->bimg_len == BLCKSZ)) |
| 1176 | { |
| 1177 | report_invalid_record(state, |
| 1178 | "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X" , |
| 1179 | (unsigned int) blk->hole_offset, |
| 1180 | (unsigned int) blk->hole_length, |
| 1181 | (unsigned int) blk->bimg_len, |
| 1182 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1183 | goto err; |
| 1184 | } |
| 1185 | |
| 1186 | /* |
| 1187 | * cross-check that hole_offset == 0 and hole_length == 0 if |
| 1188 | * the HAS_HOLE flag is not set. |
| 1189 | */ |
| 1190 | if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && |
| 1191 | (blk->hole_offset != 0 || blk->hole_length != 0)) |
| 1192 | { |
| 1193 | report_invalid_record(state, |
| 1194 | "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X" , |
| 1195 | (unsigned int) blk->hole_offset, |
| 1196 | (unsigned int) blk->hole_length, |
| 1197 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1198 | goto err; |
| 1199 | } |
| 1200 | |
| 1201 | /* |
| 1202 | * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED |
| 1203 | * flag is set. |
| 1204 | */ |
| 1205 | if ((blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && |
| 1206 | blk->bimg_len == BLCKSZ) |
| 1207 | { |
| 1208 | report_invalid_record(state, |
| 1209 | "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X" , |
| 1210 | (unsigned int) blk->bimg_len, |
| 1211 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1212 | goto err; |
| 1213 | } |
| 1214 | |
| 1215 | /* |
| 1216 | * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor |
| 1217 | * IS_COMPRESSED flag is set. |
| 1218 | */ |
| 1219 | if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && |
| 1220 | !(blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && |
| 1221 | blk->bimg_len != BLCKSZ) |
| 1222 | { |
| 1223 | report_invalid_record(state, |
| 1224 | "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X" , |
| 1225 | (unsigned int) blk->data_len, |
| 1226 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1227 | goto err; |
| 1228 | } |
| 1229 | } |
| 1230 | if (!(fork_flags & BKPBLOCK_SAME_REL)) |
| 1231 | { |
| 1232 | COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode)); |
| 1233 | rnode = &blk->rnode; |
| 1234 | } |
| 1235 | else |
| 1236 | { |
| 1237 | if (rnode == NULL) |
| 1238 | { |
| 1239 | report_invalid_record(state, |
| 1240 | "BKPBLOCK_SAME_REL set but no previous rel at %X/%X" , |
| 1241 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1242 | goto err; |
| 1243 | } |
| 1244 | |
| 1245 | blk->rnode = *rnode; |
| 1246 | } |
| 1247 | COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber)); |
| 1248 | } |
| 1249 | else |
| 1250 | { |
| 1251 | report_invalid_record(state, |
| 1252 | "invalid block_id %u at %X/%X" , |
| 1253 | block_id, |
| 1254 | (uint32) (state->ReadRecPtr >> 32), |
| 1255 | (uint32) state->ReadRecPtr); |
| 1256 | goto err; |
| 1257 | } |
| 1258 | } |
| 1259 | |
| 1260 | if (remaining != datatotal) |
| 1261 | goto shortdata_err; |
| 1262 | |
| 1263 | /* |
| 1264 | * Ok, we've parsed the fragment headers, and verified that the total |
| 1265 | * length of the payload in the fragments is equal to the amount of data |
| 1266 | * left. Copy the data of each fragment to a separate buffer. |
| 1267 | * |
| 1268 | * We could just set up pointers into readRecordBuf, but we want to align |
| 1269 | * the data for the convenience of the callers. Backup images are not |
| 1270 | * copied, however; they don't need alignment. |
| 1271 | */ |
| 1272 | |
| 1273 | /* block data first */ |
| 1274 | for (block_id = 0; block_id <= state->max_block_id; block_id++) |
| 1275 | { |
| 1276 | DecodedBkpBlock *blk = &state->blocks[block_id]; |
| 1277 | |
| 1278 | if (!blk->in_use) |
| 1279 | continue; |
| 1280 | |
| 1281 | Assert(blk->has_image || !blk->apply_image); |
| 1282 | |
| 1283 | if (blk->has_image) |
| 1284 | { |
| 1285 | blk->bkp_image = ptr; |
| 1286 | ptr += blk->bimg_len; |
| 1287 | } |
| 1288 | if (blk->has_data) |
| 1289 | { |
| 1290 | if (!blk->data || blk->data_len > blk->data_bufsz) |
| 1291 | { |
| 1292 | if (blk->data) |
| 1293 | pfree(blk->data); |
| 1294 | |
| 1295 | /* |
| 1296 | * Force the initial request to be BLCKSZ so that we don't |
| 1297 | * waste time with lots of trips through this stanza as a |
| 1298 | * result of WAL compression. |
| 1299 | */ |
| 1300 | blk->data_bufsz = MAXALIGN(Max(blk->data_len, BLCKSZ)); |
| 1301 | blk->data = palloc(blk->data_bufsz); |
| 1302 | } |
| 1303 | memcpy(blk->data, ptr, blk->data_len); |
| 1304 | ptr += blk->data_len; |
| 1305 | } |
| 1306 | } |
| 1307 | |
| 1308 | /* and finally, the main data */ |
| 1309 | if (state->main_data_len > 0) |
| 1310 | { |
| 1311 | if (!state->main_data || state->main_data_len > state->main_data_bufsz) |
| 1312 | { |
| 1313 | if (state->main_data) |
| 1314 | pfree(state->main_data); |
| 1315 | |
| 1316 | /* |
| 1317 | * main_data_bufsz must be MAXALIGN'ed. In many xlog record |
| 1318 | * types, we omit trailing struct padding on-disk to save a few |
| 1319 | * bytes; but compilers may generate accesses to the xlog struct |
| 1320 | * that assume that padding bytes are present. If the palloc |
| 1321 | * request is not large enough to include such padding bytes then |
| 1322 | * we'll get valgrind complaints due to otherwise-harmless fetches |
| 1323 | * of the padding bytes. |
| 1324 | * |
| 1325 | * In addition, force the initial request to be reasonably large |
| 1326 | * so that we don't waste time with lots of trips through this |
| 1327 | * stanza. BLCKSZ / 2 seems like a good compromise choice. |
| 1328 | */ |
| 1329 | state->main_data_bufsz = MAXALIGN(Max(state->main_data_len, |
| 1330 | BLCKSZ / 2)); |
| 1331 | state->main_data = palloc(state->main_data_bufsz); |
| 1332 | } |
| 1333 | memcpy(state->main_data, ptr, state->main_data_len); |
| 1334 | ptr += state->main_data_len; |
| 1335 | } |
| 1336 | |
| 1337 | return true; |
| 1338 | |
| 1339 | shortdata_err: |
| 1340 | report_invalid_record(state, |
| 1341 | "record with invalid length at %X/%X" , |
| 1342 | (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); |
| 1343 | err: |
| 1344 | *errormsg = state->errormsg_buf; |
| 1345 | |
| 1346 | return false; |
| 1347 | } |
| 1348 | |
| 1349 | /* |
| 1350 | * Returns information about the block that a block reference refers to. |
| 1351 | * |
| 1352 | * If the WAL record contains a block reference with the given ID, *rnode, |
| 1353 | * *forknum, and *blknum are filled in (if not NULL), and returns true. |
| 1354 | * Otherwise returns false. |
| 1355 | */ |
| 1356 | bool |
| 1357 | XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, |
| 1358 | RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum) |
| 1359 | { |
| 1360 | DecodedBkpBlock *bkpb; |
| 1361 | |
| 1362 | if (!record->blocks[block_id].in_use) |
| 1363 | return false; |
| 1364 | |
| 1365 | bkpb = &record->blocks[block_id]; |
| 1366 | if (rnode) |
| 1367 | *rnode = bkpb->rnode; |
| 1368 | if (forknum) |
| 1369 | *forknum = bkpb->forknum; |
| 1370 | if (blknum) |
| 1371 | *blknum = bkpb->blkno; |
| 1372 | return true; |
| 1373 | } |
| 1374 | |
| 1375 | /* |
| 1376 | * Returns the data associated with a block reference, or NULL if there is |
| 1377 | * no data (e.g. because a full-page image was taken instead). The returned |
| 1378 | * pointer points to a MAXALIGNed buffer. |
| 1379 | */ |
| 1380 | char * |
| 1381 | XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len) |
| 1382 | { |
| 1383 | DecodedBkpBlock *bkpb; |
| 1384 | |
| 1385 | if (!record->blocks[block_id].in_use) |
| 1386 | return NULL; |
| 1387 | |
| 1388 | bkpb = &record->blocks[block_id]; |
| 1389 | |
| 1390 | if (!bkpb->has_data) |
| 1391 | { |
| 1392 | if (len) |
| 1393 | *len = 0; |
| 1394 | return NULL; |
| 1395 | } |
| 1396 | else |
| 1397 | { |
| 1398 | if (len) |
| 1399 | *len = bkpb->data_len; |
| 1400 | return bkpb->data; |
| 1401 | } |
| 1402 | } |
| 1403 | |
| 1404 | /* |
| 1405 | * Restore a full-page image from a backup block attached to an XLOG record. |
| 1406 | * |
| 1407 | * Returns the buffer number containing the page. |
| 1408 | */ |
| 1409 | bool |
| 1410 | RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) |
| 1411 | { |
| 1412 | DecodedBkpBlock *bkpb; |
| 1413 | char *ptr; |
| 1414 | PGAlignedBlock tmp; |
| 1415 | |
| 1416 | if (!record->blocks[block_id].in_use) |
| 1417 | return false; |
| 1418 | if (!record->blocks[block_id].has_image) |
| 1419 | return false; |
| 1420 | |
| 1421 | bkpb = &record->blocks[block_id]; |
| 1422 | ptr = bkpb->bkp_image; |
| 1423 | |
| 1424 | if (bkpb->bimg_info & BKPIMAGE_IS_COMPRESSED) |
| 1425 | { |
| 1426 | /* If a backup block image is compressed, decompress it */ |
| 1427 | if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data, |
| 1428 | BLCKSZ - bkpb->hole_length, true) < 0) |
| 1429 | { |
| 1430 | report_invalid_record(record, "invalid compressed image at %X/%X, block %d" , |
| 1431 | (uint32) (record->ReadRecPtr >> 32), |
| 1432 | (uint32) record->ReadRecPtr, |
| 1433 | block_id); |
| 1434 | return false; |
| 1435 | } |
| 1436 | ptr = tmp.data; |
| 1437 | } |
| 1438 | |
| 1439 | /* generate page, taking into account hole if necessary */ |
| 1440 | if (bkpb->hole_length == 0) |
| 1441 | { |
| 1442 | memcpy(page, ptr, BLCKSZ); |
| 1443 | } |
| 1444 | else |
| 1445 | { |
| 1446 | memcpy(page, ptr, bkpb->hole_offset); |
| 1447 | /* must zero-fill the hole */ |
| 1448 | MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); |
| 1449 | memcpy(page + (bkpb->hole_offset + bkpb->hole_length), |
| 1450 | ptr + bkpb->hole_offset, |
| 1451 | BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); |
| 1452 | } |
| 1453 | |
| 1454 | return true; |
| 1455 | } |
| 1456 | |