1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * generic_xlog.c |
4 | * Implementation of generic xlog records. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * src/backend/access/transam/generic_xlog.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | #include "postgres.h" |
15 | |
16 | #include "access/bufmask.h" |
17 | #include "access/generic_xlog.h" |
18 | #include "access/xlogutils.h" |
19 | #include "miscadmin.h" |
20 | #include "utils/memutils.h" |
21 | |
22 | /*------------------------------------------------------------------------- |
23 | * Internally, a delta between pages consists of a set of fragments. Each |
24 | * fragment represents changes made in a given region of a page. A fragment |
25 | * is made up as follows: |
26 | * |
27 | * - offset of page region (OffsetNumber) |
28 | * - length of page region (OffsetNumber) |
29 | * - data - the data to place into the region ('length' number of bytes) |
30 | * |
31 | * Unchanged regions of a page are not represented in its delta. As a result, |
32 | * a delta can be more compact than the full page image. But having an |
33 | * unchanged region between two fragments that is smaller than the fragment |
34 | * header (offset+length) does not pay off in terms of the overall size of |
35 | * the delta. For this reason, we merge adjacent fragments if the unchanged |
36 | * region between them is <= MATCH_THRESHOLD bytes. |
37 | * |
38 | * We do not bother to merge fragments across the "lower" and "upper" parts |
39 | * of a page; it's very seldom the case that pd_lower and pd_upper are within |
40 | * MATCH_THRESHOLD bytes of each other, and handling that infrequent case |
41 | * would complicate and slow down the delta-computation code unduly. |
42 | * Therefore, the worst-case delta size includes two fragment headers plus |
43 | * a full page's worth of data. |
44 | *------------------------------------------------------------------------- |
45 | */ |
46 | #define (2 * sizeof(OffsetNumber)) |
47 | #define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE |
48 | #define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE) |
49 | |
50 | /* Struct of generic xlog data for single page */ |
51 | typedef struct |
52 | { |
53 | Buffer buffer; /* registered buffer */ |
54 | int flags; /* flags for this buffer */ |
55 | int deltaLen; /* space consumed in delta field */ |
56 | char *image; /* copy of page image for modification, do not |
57 | * do it in-place to have aligned memory chunk */ |
58 | char delta[MAX_DELTA_SIZE]; /* delta between page images */ |
59 | } PageData; |
60 | |
61 | /* State of generic xlog record construction */ |
62 | struct GenericXLogState |
63 | { |
64 | /* Info about each page, see above */ |
65 | PageData pages[MAX_GENERIC_XLOG_PAGES]; |
66 | bool isLogged; |
67 | /* Page images (properly aligned) */ |
68 | PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; |
69 | }; |
70 | |
71 | static void writeFragment(PageData *pageData, OffsetNumber offset, |
72 | OffsetNumber len, const char *data); |
73 | static void computeRegionDelta(PageData *pageData, |
74 | const char *curpage, const char *targetpage, |
75 | int targetStart, int targetEnd, |
76 | int validStart, int validEnd); |
77 | static void computeDelta(PageData *pageData, Page curpage, Page targetpage); |
78 | static void applyPageRedo(Page page, const char *delta, Size deltaSize); |
79 | |
80 | |
81 | /* |
82 | * Write next fragment into pageData's delta. |
83 | * |
84 | * The fragment has the given offset and length, and data points to the |
85 | * actual data (of length length). |
86 | */ |
87 | static void |
88 | writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length, |
89 | const char *data) |
90 | { |
91 | char *ptr = pageData->delta + pageData->deltaLen; |
92 | |
93 | /* Verify we have enough space */ |
94 | Assert(pageData->deltaLen + sizeof(offset) + |
95 | sizeof(length) + length <= sizeof(pageData->delta)); |
96 | |
97 | /* Write fragment data */ |
98 | memcpy(ptr, &offset, sizeof(offset)); |
99 | ptr += sizeof(offset); |
100 | memcpy(ptr, &length, sizeof(length)); |
101 | ptr += sizeof(length); |
102 | memcpy(ptr, data, length); |
103 | ptr += length; |
104 | |
105 | pageData->deltaLen = ptr - pageData->delta; |
106 | } |
107 | |
108 | /* |
109 | * Compute the XLOG fragments needed to transform a region of curpage into the |
110 | * corresponding region of targetpage, and append them to pageData's delta |
111 | * field. The region to transform runs from targetStart to targetEnd-1. |
112 | * Bytes in curpage outside the range validStart to validEnd-1 should be |
113 | * considered invalid, and always overwritten with target data. |
114 | * |
115 | * This function is a hot spot, so it's worth being as tense as possible |
116 | * about the data-matching loops. |
117 | */ |
118 | static void |
119 | computeRegionDelta(PageData *pageData, |
120 | const char *curpage, const char *targetpage, |
121 | int targetStart, int targetEnd, |
122 | int validStart, int validEnd) |
123 | { |
124 | int i, |
125 | loopEnd, |
126 | fragmentBegin = -1, |
127 | fragmentEnd = -1; |
128 | |
129 | /* Deal with any invalid start region by including it in first fragment */ |
130 | if (validStart > targetStart) |
131 | { |
132 | fragmentBegin = targetStart; |
133 | targetStart = validStart; |
134 | } |
135 | |
136 | /* We'll deal with any invalid end region after the main loop */ |
137 | loopEnd = Min(targetEnd, validEnd); |
138 | |
139 | /* Examine all the potentially matchable bytes */ |
140 | i = targetStart; |
141 | while (i < loopEnd) |
142 | { |
143 | if (curpage[i] != targetpage[i]) |
144 | { |
145 | /* On unmatched byte, start new fragment if not already in one */ |
146 | if (fragmentBegin < 0) |
147 | fragmentBegin = i; |
148 | /* Mark unmatched-data endpoint as uncertain */ |
149 | fragmentEnd = -1; |
150 | /* Extend the fragment as far as possible in a tight loop */ |
151 | i++; |
152 | while (i < loopEnd && curpage[i] != targetpage[i]) |
153 | i++; |
154 | if (i >= loopEnd) |
155 | break; |
156 | } |
157 | |
158 | /* Found a matched byte, so remember end of unmatched fragment */ |
159 | fragmentEnd = i; |
160 | |
161 | /* |
162 | * Extend the match as far as possible in a tight loop. (On typical |
163 | * workloads, this inner loop is the bulk of this function's runtime.) |
164 | */ |
165 | i++; |
166 | while (i < loopEnd && curpage[i] == targetpage[i]) |
167 | i++; |
168 | |
169 | /* |
170 | * There are several possible cases at this point: |
171 | * |
172 | * 1. We have no unwritten fragment (fragmentBegin < 0). There's |
173 | * nothing to write; and it doesn't matter what fragmentEnd is. |
174 | * |
175 | * 2. We found more than MATCH_THRESHOLD consecutive matching bytes. |
176 | * Dump out the unwritten fragment, stopping at fragmentEnd. |
177 | * |
178 | * 3. The match extends to loopEnd. We'll do nothing here, exit the |
179 | * loop, and then dump the unwritten fragment, after merging it with |
180 | * the invalid end region if any. If we don't so merge, fragmentEnd |
181 | * establishes how much the final writeFragment call needs to write. |
182 | * |
183 | * 4. We found an unmatched byte before loopEnd. The loop will repeat |
184 | * and will enter the unmatched-byte stanza above. So in this case |
185 | * also, it doesn't matter what fragmentEnd is. The matched bytes |
186 | * will get merged into the continuing unmatched fragment. |
187 | * |
188 | * Only in case 3 do we reach the bottom of the loop with a meaningful |
189 | * fragmentEnd value, which is why it's OK that we unconditionally |
190 | * assign "fragmentEnd = i" above. |
191 | */ |
192 | if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD) |
193 | { |
194 | writeFragment(pageData, fragmentBegin, |
195 | fragmentEnd - fragmentBegin, |
196 | targetpage + fragmentBegin); |
197 | fragmentBegin = -1; |
198 | fragmentEnd = -1; /* not really necessary */ |
199 | } |
200 | } |
201 | |
202 | /* Deal with any invalid end region by including it in final fragment */ |
203 | if (loopEnd < targetEnd) |
204 | { |
205 | if (fragmentBegin < 0) |
206 | fragmentBegin = loopEnd; |
207 | fragmentEnd = targetEnd; |
208 | } |
209 | |
210 | /* Write final fragment if any */ |
211 | if (fragmentBegin >= 0) |
212 | { |
213 | if (fragmentEnd < 0) |
214 | fragmentEnd = targetEnd; |
215 | writeFragment(pageData, fragmentBegin, |
216 | fragmentEnd - fragmentBegin, |
217 | targetpage + fragmentBegin); |
218 | } |
219 | } |
220 | |
221 | /* |
222 | * Compute the XLOG delta record needed to transform curpage into targetpage, |
223 | * and store it in pageData's delta field. |
224 | */ |
225 | static void |
226 | computeDelta(PageData *pageData, Page curpage, Page targetpage) |
227 | { |
228 | int targetLower = ((PageHeader) targetpage)->pd_lower, |
229 | targetUpper = ((PageHeader) targetpage)->pd_upper, |
230 | curLower = ((PageHeader) curpage)->pd_lower, |
231 | curUpper = ((PageHeader) curpage)->pd_upper; |
232 | |
233 | pageData->deltaLen = 0; |
234 | |
235 | /* Compute delta records for lower part of page ... */ |
236 | computeRegionDelta(pageData, curpage, targetpage, |
237 | 0, targetLower, |
238 | 0, curLower); |
239 | /* ... and for upper part, ignoring what's between */ |
240 | computeRegionDelta(pageData, curpage, targetpage, |
241 | targetUpper, BLCKSZ, |
242 | curUpper, BLCKSZ); |
243 | |
244 | /* |
245 | * If xlog debug is enabled, then check produced delta. Result of delta |
246 | * application to curpage should be equivalent to targetpage. |
247 | */ |
248 | #ifdef WAL_DEBUG |
249 | if (XLOG_DEBUG) |
250 | { |
251 | PGAlignedBlock tmp; |
252 | |
253 | memcpy(tmp.data, curpage, BLCKSZ); |
254 | applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen); |
255 | if (memcmp(tmp.data, targetpage, targetLower) != 0 || |
256 | memcmp(tmp.data + targetUpper, targetpage + targetUpper, |
257 | BLCKSZ - targetUpper) != 0) |
258 | elog(ERROR, "result of generic xlog apply does not match" ); |
259 | } |
260 | #endif |
261 | } |
262 | |
263 | /* |
264 | * Start new generic xlog record for modifications to specified relation. |
265 | */ |
266 | GenericXLogState * |
267 | GenericXLogStart(Relation relation) |
268 | { |
269 | GenericXLogState *state; |
270 | int i; |
271 | |
272 | state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); |
273 | state->isLogged = RelationNeedsWAL(relation); |
274 | |
275 | for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) |
276 | { |
277 | state->pages[i].image = state->images[i].data; |
278 | state->pages[i].buffer = InvalidBuffer; |
279 | } |
280 | |
281 | return state; |
282 | } |
283 | |
284 | /* |
285 | * Register new buffer for generic xlog record. |
286 | * |
287 | * Returns pointer to the page's image in the GenericXLogState, which |
288 | * is what the caller should modify. |
289 | * |
290 | * If the buffer is already registered, just return its existing entry. |
291 | * (It's not very clear what to do with the flags in such a case, but |
292 | * for now we stay with the original flags.) |
293 | */ |
294 | Page |
295 | GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags) |
296 | { |
297 | int block_id; |
298 | |
299 | /* Search array for existing entry or first unused slot */ |
300 | for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++) |
301 | { |
302 | PageData *page = &state->pages[block_id]; |
303 | |
304 | if (BufferIsInvalid(page->buffer)) |
305 | { |
306 | /* Empty slot, so use it (there cannot be a match later) */ |
307 | page->buffer = buffer; |
308 | page->flags = flags; |
309 | memcpy(page->image, BufferGetPage(buffer), BLCKSZ); |
310 | return (Page) page->image; |
311 | } |
312 | else if (page->buffer == buffer) |
313 | { |
314 | /* |
315 | * Buffer is already registered. Just return the image, which is |
316 | * already prepared. |
317 | */ |
318 | return (Page) page->image; |
319 | } |
320 | } |
321 | |
322 | elog(ERROR, "maximum number %d of generic xlog buffers is exceeded" , |
323 | MAX_GENERIC_XLOG_PAGES); |
324 | /* keep compiler quiet */ |
325 | return NULL; |
326 | } |
327 | |
328 | /* |
329 | * Apply changes represented by GenericXLogState to the actual buffers, |
330 | * and emit a generic xlog record. |
331 | */ |
332 | XLogRecPtr |
333 | GenericXLogFinish(GenericXLogState *state) |
334 | { |
335 | XLogRecPtr lsn; |
336 | int i; |
337 | |
338 | if (state->isLogged) |
339 | { |
340 | /* Logged relation: make xlog record in critical section. */ |
341 | XLogBeginInsert(); |
342 | |
343 | START_CRIT_SECTION(); |
344 | |
345 | for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) |
346 | { |
347 | PageData *pageData = &state->pages[i]; |
348 | Page page; |
349 | PageHeader ; |
350 | |
351 | if (BufferIsInvalid(pageData->buffer)) |
352 | continue; |
353 | |
354 | page = BufferGetPage(pageData->buffer); |
355 | pageHeader = (PageHeader) pageData->image; |
356 | |
357 | if (pageData->flags & GENERIC_XLOG_FULL_IMAGE) |
358 | { |
359 | /* |
360 | * A full-page image does not require us to supply any xlog |
361 | * data. Just apply the image, being careful to zero the |
362 | * "hole" between pd_lower and pd_upper in order to avoid |
363 | * divergence between actual page state and what replay would |
364 | * produce. |
365 | */ |
366 | memcpy(page, pageData->image, pageHeader->pd_lower); |
367 | memset(page + pageHeader->pd_lower, 0, |
368 | pageHeader->pd_upper - pageHeader->pd_lower); |
369 | memcpy(page + pageHeader->pd_upper, |
370 | pageData->image + pageHeader->pd_upper, |
371 | BLCKSZ - pageHeader->pd_upper); |
372 | |
373 | XLogRegisterBuffer(i, pageData->buffer, |
374 | REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
375 | } |
376 | else |
377 | { |
378 | /* |
379 | * In normal mode, calculate delta and write it as xlog data |
380 | * associated with this page. |
381 | */ |
382 | computeDelta(pageData, page, (Page) pageData->image); |
383 | |
384 | /* Apply the image, with zeroed "hole" as above */ |
385 | memcpy(page, pageData->image, pageHeader->pd_lower); |
386 | memset(page + pageHeader->pd_lower, 0, |
387 | pageHeader->pd_upper - pageHeader->pd_lower); |
388 | memcpy(page + pageHeader->pd_upper, |
389 | pageData->image + pageHeader->pd_upper, |
390 | BLCKSZ - pageHeader->pd_upper); |
391 | |
392 | XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD); |
393 | XLogRegisterBufData(i, pageData->delta, pageData->deltaLen); |
394 | } |
395 | } |
396 | |
397 | /* Insert xlog record */ |
398 | lsn = XLogInsert(RM_GENERIC_ID, 0); |
399 | |
400 | /* Set LSN and mark buffers dirty */ |
401 | for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) |
402 | { |
403 | PageData *pageData = &state->pages[i]; |
404 | |
405 | if (BufferIsInvalid(pageData->buffer)) |
406 | continue; |
407 | PageSetLSN(BufferGetPage(pageData->buffer), lsn); |
408 | MarkBufferDirty(pageData->buffer); |
409 | } |
410 | END_CRIT_SECTION(); |
411 | } |
412 | else |
413 | { |
414 | /* Unlogged relation: skip xlog-related stuff */ |
415 | START_CRIT_SECTION(); |
416 | for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) |
417 | { |
418 | PageData *pageData = &state->pages[i]; |
419 | |
420 | if (BufferIsInvalid(pageData->buffer)) |
421 | continue; |
422 | memcpy(BufferGetPage(pageData->buffer), |
423 | pageData->image, |
424 | BLCKSZ); |
425 | /* We don't worry about zeroing the "hole" in this case */ |
426 | MarkBufferDirty(pageData->buffer); |
427 | } |
428 | END_CRIT_SECTION(); |
429 | /* We don't have a LSN to return, in this case */ |
430 | lsn = InvalidXLogRecPtr; |
431 | } |
432 | |
433 | pfree(state); |
434 | |
435 | return lsn; |
436 | } |
437 | |
438 | /* |
439 | * Abort generic xlog record construction. No changes are applied to buffers. |
440 | * |
441 | * Note: caller is responsible for releasing locks/pins on buffers, if needed. |
442 | */ |
443 | void |
444 | GenericXLogAbort(GenericXLogState *state) |
445 | { |
446 | pfree(state); |
447 | } |
448 | |
449 | /* |
450 | * Apply delta to given page image. |
451 | */ |
452 | static void |
453 | (Page page, const char *delta, Size deltaSize) |
454 | { |
455 | const char *ptr = delta; |
456 | const char *end = delta + deltaSize; |
457 | |
458 | while (ptr < end) |
459 | { |
460 | OffsetNumber offset, |
461 | length; |
462 | |
463 | memcpy(&offset, ptr, sizeof(offset)); |
464 | ptr += sizeof(offset); |
465 | memcpy(&length, ptr, sizeof(length)); |
466 | ptr += sizeof(length); |
467 | |
468 | memcpy(page + offset, ptr, length); |
469 | |
470 | ptr += length; |
471 | } |
472 | } |
473 | |
474 | /* |
475 | * Redo function for generic xlog record. |
476 | */ |
477 | void |
478 | generic_redo(XLogReaderState *record) |
479 | { |
480 | XLogRecPtr lsn = record->EndRecPtr; |
481 | Buffer buffers[MAX_GENERIC_XLOG_PAGES]; |
482 | uint8 block_id; |
483 | |
484 | /* Protect limited size of buffers[] array */ |
485 | Assert(record->max_block_id < MAX_GENERIC_XLOG_PAGES); |
486 | |
487 | /* Iterate over blocks */ |
488 | for (block_id = 0; block_id <= record->max_block_id; block_id++) |
489 | { |
490 | XLogRedoAction action; |
491 | |
492 | if (!XLogRecHasBlockRef(record, block_id)) |
493 | { |
494 | buffers[block_id] = InvalidBuffer; |
495 | continue; |
496 | } |
497 | |
498 | action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]); |
499 | |
500 | /* Apply redo to given block if needed */ |
501 | if (action == BLK_NEEDS_REDO) |
502 | { |
503 | Page page; |
504 | PageHeader ; |
505 | char *blockDelta; |
506 | Size blockDeltaSize; |
507 | |
508 | page = BufferGetPage(buffers[block_id]); |
509 | blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize); |
510 | applyPageRedo(page, blockDelta, blockDeltaSize); |
511 | |
512 | /* |
513 | * Since the delta contains no information about what's in the |
514 | * "hole" between pd_lower and pd_upper, set that to zero to |
515 | * ensure we produce the same page state that application of the |
516 | * logged action by GenericXLogFinish did. |
517 | */ |
518 | pageHeader = (PageHeader) page; |
519 | memset(page + pageHeader->pd_lower, 0, |
520 | pageHeader->pd_upper - pageHeader->pd_lower); |
521 | |
522 | PageSetLSN(page, lsn); |
523 | MarkBufferDirty(buffers[block_id]); |
524 | } |
525 | } |
526 | |
527 | /* Changes are done: unlock and release all buffers */ |
528 | for (block_id = 0; block_id <= record->max_block_id; block_id++) |
529 | { |
530 | if (BufferIsValid(buffers[block_id])) |
531 | UnlockReleaseBuffer(buffers[block_id]); |
532 | } |
533 | } |
534 | |
535 | /* |
536 | * Mask a generic page before performing consistency checks on it. |
537 | */ |
538 | void |
539 | generic_mask(char *page, BlockNumber blkno) |
540 | { |
541 | mask_page_lsn_and_checksum(page); |
542 | |
543 | mask_unused_space(page); |
544 | } |
545 | |