1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * clog.c |
4 | * PostgreSQL transaction-commit-log manager |
5 | * |
6 | * This module replaces the old "pg_log" access code, which treated pg_log |
7 | * essentially like a relation, in that it went through the regular buffer |
8 | * manager. The problem with that was that there wasn't any good way to |
9 | * recycle storage space for transactions so old that they'll never be |
10 | * looked up again. Now we use specialized access code so that the commit |
11 | * log can be broken into relatively small, independent segments. |
12 | * |
13 | * XLOG interactions: this module generates an XLOG record whenever a new |
14 | * CLOG page is initialized to zeroes. Other writes of CLOG come from |
15 | * recording of transaction commit or abort in xact.c, which generates its |
16 | * own XLOG records for these events and will re-perform the status update |
17 | * on redo; so we need make no additional XLOG entry here. For synchronous |
18 | * transaction commits, the XLOG is guaranteed flushed through the XLOG commit |
19 | * record before we are called to log a commit, so the WAL rule "write xlog |
20 | * before data" is satisfied automatically. However, for async commits we |
21 | * must track the latest LSN affecting each CLOG page, so that we can flush |
22 | * XLOG that far and satisfy the WAL rule. We don't have to worry about this |
23 | * for aborts (whether sync or async), since the post-crash assumption would |
24 | * be that such transactions failed anyway. |
25 | * |
26 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
27 | * Portions Copyright (c) 1994, Regents of the University of California |
28 | * |
29 | * src/backend/access/transam/clog.c |
30 | * |
31 | *------------------------------------------------------------------------- |
32 | */ |
33 | #include "postgres.h" |
34 | |
35 | #include "access/clog.h" |
36 | #include "access/slru.h" |
37 | #include "access/transam.h" |
38 | #include "access/xlog.h" |
39 | #include "access/xloginsert.h" |
40 | #include "access/xlogutils.h" |
41 | #include "miscadmin.h" |
42 | #include "pgstat.h" |
43 | #include "pg_trace.h" |
44 | #include "storage/proc.h" |
45 | |
46 | /* |
47 | * Defines for CLOG page sizes. A page is the same BLCKSZ as is used |
48 | * everywhere else in Postgres. |
49 | * |
50 | * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, |
51 | * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE, |
52 | * and CLOG segment numbering at |
53 | * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no |
54 | * explicit notice of that fact in this module, except when comparing segment |
55 | * and page numbers in TruncateCLOG (see CLOGPagePrecedes). |
56 | */ |
57 | |
58 | /* We need two bits per xact, so four xacts fit in a byte */ |
59 | #define CLOG_BITS_PER_XACT 2 |
60 | #define CLOG_XACTS_PER_BYTE 4 |
61 | #define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) |
62 | #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) |
63 | |
64 | #define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) |
65 | #define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) |
66 | #define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) |
67 | #define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) |
68 | |
69 | /* We store the latest async LSN for each group of transactions */ |
70 | #define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */ |
71 | #define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP) |
72 | |
73 | #define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ |
74 | ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) |
75 | |
76 | /* |
77 | * The number of subtransactions below which we consider to apply clog group |
78 | * update optimization. Testing reveals that the number higher than this can |
79 | * hurt performance. |
80 | */ |
81 | #define THRESHOLD_SUBTRANS_CLOG_OPT 5 |
82 | |
83 | /* |
84 | * Link to shared-memory data structures for CLOG control |
85 | */ |
86 | static SlruCtlData ClogCtlData; |
87 | |
88 | #define ClogCtl (&ClogCtlData) |
89 | |
90 | |
91 | static int ZeroCLOGPage(int pageno, bool writeXlog); |
92 | static bool CLOGPagePrecedes(int page1, int page2); |
93 | static void WriteZeroPageXlogRec(int pageno); |
94 | static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, |
95 | Oid oldestXidDb); |
96 | static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, |
97 | TransactionId *subxids, XidStatus status, |
98 | XLogRecPtr lsn, int pageno, |
99 | bool all_xact_same_page); |
100 | static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, |
101 | XLogRecPtr lsn, int slotno); |
102 | static void set_status_by_pages(int nsubxids, TransactionId *subxids, |
103 | XidStatus status, XLogRecPtr lsn); |
104 | static bool TransactionGroupUpdateXidStatus(TransactionId xid, |
105 | XidStatus status, XLogRecPtr lsn, int pageno); |
106 | static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, |
107 | TransactionId *subxids, XidStatus status, |
108 | XLogRecPtr lsn, int pageno); |
109 | |
110 | |
111 | /* |
112 | * TransactionIdSetTreeStatus |
113 | * |
114 | * Record the final state of transaction entries in the commit log for |
115 | * a transaction and its subtransaction tree. Take care to ensure this is |
116 | * efficient, and as atomic as possible. |
117 | * |
118 | * xid is a single xid to set status for. This will typically be |
119 | * the top level transactionid for a top level commit or abort. It can |
120 | * also be a subtransaction when we record transaction aborts. |
121 | * |
122 | * subxids is an array of xids of length nsubxids, representing subtransactions |
123 | * in the tree of xid. In various cases nsubxids may be zero. |
124 | * |
125 | * lsn must be the WAL location of the commit record when recording an async |
126 | * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the |
127 | * caller guarantees the commit record is already flushed in that case. It |
128 | * should be InvalidXLogRecPtr for abort cases, too. |
129 | * |
130 | * In the commit case, atomicity is limited by whether all the subxids are in |
131 | * the same CLOG page as xid. If they all are, then the lock will be grabbed |
132 | * only once, and the status will be set to committed directly. Otherwise |
133 | * we must |
134 | * 1. set sub-committed all subxids that are not on the same page as the |
135 | * main xid |
136 | * 2. atomically set committed the main xid and the subxids on the same page |
137 | * 3. go over the first bunch again and set them committed |
138 | * Note that as far as concurrent checkers are concerned, main transaction |
139 | * commit as a whole is still atomic. |
140 | * |
141 | * Example: |
142 | * TransactionId t commits and has subxids t1, t2, t3, t4 |
143 | * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 |
144 | * 1. update pages2-3: |
145 | * page2: set t2,t3 as sub-committed |
146 | * page3: set t4 as sub-committed |
147 | * 2. update page1: |
148 | * set t1 as sub-committed, |
149 | * then set t as committed, |
 *			then set t1 as committed
151 | * 3. update pages2-3: |
152 | * page2: set t2,t3 as committed |
153 | * page3: set t4 as committed |
154 | * |
155 | * NB: this is a low-level routine and is NOT the preferred entry point |
156 | * for most uses; functions in transam.c are the intended callers. |
157 | * |
158 | * XXX Think about issuing FADVISE_WILLNEED on pages that we will need, |
159 | * but aren't yet in cache, as well as hinting pages not to fall out of |
160 | * cache yet. |
161 | */ |
162 | void |
163 | TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, |
164 | TransactionId *subxids, XidStatus status, XLogRecPtr lsn) |
165 | { |
166 | int pageno = TransactionIdToPage(xid); /* get page of parent */ |
167 | int i; |
168 | |
169 | Assert(status == TRANSACTION_STATUS_COMMITTED || |
170 | status == TRANSACTION_STATUS_ABORTED); |
171 | |
172 | /* |
173 | * See how many subxids, if any, are on the same page as the parent, if |
174 | * any. |
175 | */ |
176 | for (i = 0; i < nsubxids; i++) |
177 | { |
178 | if (TransactionIdToPage(subxids[i]) != pageno) |
179 | break; |
180 | } |
181 | |
182 | /* |
183 | * Do all items fit on a single page? |
184 | */ |
185 | if (i == nsubxids) |
186 | { |
187 | /* |
188 | * Set the parent and all subtransactions in a single call |
189 | */ |
190 | TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, |
191 | pageno, true); |
192 | } |
193 | else |
194 | { |
195 | int nsubxids_on_first_page = i; |
196 | |
197 | /* |
198 | * If this is a commit then we care about doing this correctly (i.e. |
199 | * using the subcommitted intermediate status). By here, we know |
200 | * we're updating more than one page of clog, so we must mark entries |
201 | * that are *not* on the first page so that they show as subcommitted |
202 | * before we then return to update the status to fully committed. |
203 | * |
204 | * To avoid touching the first page twice, skip marking subcommitted |
205 | * for the subxids on that first page. |
206 | */ |
207 | if (status == TRANSACTION_STATUS_COMMITTED) |
208 | set_status_by_pages(nsubxids - nsubxids_on_first_page, |
209 | subxids + nsubxids_on_first_page, |
210 | TRANSACTION_STATUS_SUB_COMMITTED, lsn); |
211 | |
212 | /* |
213 | * Now set the parent and subtransactions on same page as the parent, |
214 | * if any |
215 | */ |
216 | pageno = TransactionIdToPage(xid); |
217 | TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, |
218 | lsn, pageno, false); |
219 | |
220 | /* |
221 | * Now work through the rest of the subxids one clog page at a time, |
222 | * starting from the second page onwards, like we did above. |
223 | */ |
224 | set_status_by_pages(nsubxids - nsubxids_on_first_page, |
225 | subxids + nsubxids_on_first_page, |
226 | status, lsn); |
227 | } |
228 | } |
229 | |
230 | /* |
231 | * Helper for TransactionIdSetTreeStatus: set the status for a bunch of |
232 | * transactions, chunking in the separate CLOG pages involved. We never |
233 | * pass the whole transaction tree to this function, only subtransactions |
234 | * that are on different pages to the top level transaction id. |
235 | */ |
236 | static void |
237 | set_status_by_pages(int nsubxids, TransactionId *subxids, |
238 | XidStatus status, XLogRecPtr lsn) |
239 | { |
240 | int pageno = TransactionIdToPage(subxids[0]); |
241 | int offset = 0; |
242 | int i = 0; |
243 | |
244 | Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */ |
245 | |
246 | while (i < nsubxids) |
247 | { |
248 | int num_on_page = 0; |
249 | int nextpageno; |
250 | |
251 | do |
252 | { |
253 | nextpageno = TransactionIdToPage(subxids[i]); |
254 | if (nextpageno != pageno) |
255 | break; |
256 | num_on_page++; |
257 | i++; |
258 | } while (i < nsubxids); |
259 | |
260 | TransactionIdSetPageStatus(InvalidTransactionId, |
261 | num_on_page, subxids + offset, |
262 | status, lsn, pageno, false); |
263 | offset = i; |
264 | pageno = nextpageno; |
265 | } |
266 | } |
267 | |
/*
 * Record the final state of transaction entries in the commit log for all
 * entries on a single page.  Atomic only on this page.
 *
 * Caller need not hold CLogControlLock; it is acquired here, either
 * directly or via the group-update mechanism.
 */
static void
TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
						   TransactionId *subxids, XidStatus status,
						   XLogRecPtr lsn, int pageno,
						   bool all_xact_same_page)
{
	/* Can't use group update when PGPROC overflows. */
	StaticAssertStmt(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
					 "group clog threshold less than PGPROC cached subxids");

	/*
	 * When there is contention on CLogControlLock, we try to group multiple
	 * updates; a single leader process will perform transaction status
	 * updates for multiple backends so that the number of times
	 * CLogControlLock needs to be acquired is reduced.
	 *
	 * For this optimization to be safe, the XID in MyPgXact and the subxids
	 * in MyProc must be the same as the ones for which we're setting the
	 * status.  Check that this is the case.
	 *
	 * For this optimization to be efficient, we shouldn't have too many
	 * sub-XIDs and all of the XIDs for which we're adjusting clog should be
	 * on the same page.  Check those conditions, too.
	 */
	if (all_xact_same_page && xid == MyPgXact->xid &&
		nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
		nsubxids == MyPgXact->nxids &&
		memcmp(subxids, MyProc->subxids.xids,
			   nsubxids * sizeof(TransactionId)) == 0)
	{
		/*
		 * We don't try to do group update optimization if a process has
		 * overflowed the subxids array in its PGPROC, since in that case we
		 * don't have a complete list of XIDs for it.  (The nsubxids ==
		 * MyPgXact->nxids check above already rules that out, given this
		 * static relationship.)
		 */
		Assert(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS);

		/*
		 * If we can immediately acquire CLogControlLock, we update the
		 * status of our own XID and release the lock.  If not, try to use
		 * the group XID update.  If that doesn't work out, fall back to
		 * waiting for the lock to perform an update for this transaction
		 * only.
		 */
		if (LWLockConditionalAcquire(CLogControlLock, LW_EXCLUSIVE))
		{
			/* Got the lock without waiting!  Do the update. */
			TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
											   lsn, pageno);
			LWLockRelease(CLogControlLock);
			return;
		}
		else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
		{
			/* Group update mechanism has done the work. */
			return;
		}

		/* Fall through only if update isn't done yet. */
	}

	/* Group update not applicable, or couldn't accept this page number. */
	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
	TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
									   lsn, pageno);
	LWLockRelease(CLogControlLock);
}
338 | |
/*
 * Record the final state of transaction entries in the commit log.
 *
 * We don't do any locking here; caller must hold CLogControlLock in
 * exclusive mode (asserted below).
 */
static void
TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
								   TransactionId *subxids, XidStatus status,
								   XLogRecPtr lsn, int pageno)
{
	int			slotno;
	int			i;

	/* Sub-commit with a valid xid is not a state this routine handles. */
	Assert(status == TRANSACTION_STATUS_COMMITTED ||
		   status == TRANSACTION_STATUS_ABORTED ||
		   (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
	Assert(LWLockHeldByMeInMode(CLogControlLock, LW_EXCLUSIVE));

	/*
	 * If we're doing an async commit (ie, lsn is valid), then we must wait
	 * for any active write on the page slot to complete.  Otherwise our
	 * update could reach disk in that write, which will not do since we
	 * mustn't let it reach disk until we've done the appropriate WAL flush.
	 * But when lsn is invalid, it's OK to scribble on a page while it is
	 * write-busy, since we don't care if the update reaches disk sooner than
	 * we think.
	 */
	slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);

	/*
	 * Set the main transaction id, if any.
	 *
	 * If we update more than one xid on this page while it is being written
	 * out, we might find that some of the bits go to disk and others don't.
	 * If we are updating commits on the page with the top-level xid that
	 * could break atomicity, so we subcommit the subxids first before we
	 * mark the top-level commit.
	 */
	if (TransactionIdIsValid(xid))
	{
		/* Subtransactions first, if needed ... */
		if (status == TRANSACTION_STATUS_COMMITTED)
		{
			for (i = 0; i < nsubxids; i++)
			{
				/* caller guarantees all subxids are on this same page */
				Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
				TransactionIdSetStatusBit(subxids[i],
										  TRANSACTION_STATUS_SUB_COMMITTED,
										  lsn, slotno);
			}
		}

		/* ... then the main transaction */
		TransactionIdSetStatusBit(xid, status, lsn, slotno);
	}

	/* Set the subtransactions (no-op transition if already sub-committed above) */
	for (i = 0; i < nsubxids; i++)
	{
		Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
		TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
	}

	/* Mark the page dirty so a checkpoint / eviction will write it out */
	ClogCtl->shared->page_dirty[slotno] = true;
}
404 | |
405 | /* |
406 | * When we cannot immediately acquire CLogControlLock in exclusive mode at |
407 | * commit time, add ourselves to a list of processes that need their XIDs |
408 | * status update. The first process to add itself to the list will acquire |
409 | * CLogControlLock in exclusive mode and set transaction status as required |
410 | * on behalf of all group members. This avoids a great deal of contention |
411 | * around CLogControlLock when many processes are trying to commit at once, |
412 | * since the lock need not be repeatedly handed off from one committing |
413 | * process to the next. |
414 | * |
415 | * Returns true when transaction status has been updated in clog; returns |
416 | * false if we decided against applying the optimization because the page |
417 | * number we need to update differs from those processes already waiting. |
418 | */ |
419 | static bool |
420 | TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, |
421 | XLogRecPtr lsn, int pageno) |
422 | { |
423 | volatile PROC_HDR *procglobal = ProcGlobal; |
424 | PGPROC *proc = MyProc; |
425 | uint32 nextidx; |
426 | uint32 wakeidx; |
427 | |
428 | /* We should definitely have an XID whose status needs to be updated. */ |
429 | Assert(TransactionIdIsValid(xid)); |
430 | |
431 | /* |
432 | * Add ourselves to the list of processes needing a group XID status |
433 | * update. |
434 | */ |
435 | proc->clogGroupMember = true; |
436 | proc->clogGroupMemberXid = xid; |
437 | proc->clogGroupMemberXidStatus = status; |
438 | proc->clogGroupMemberPage = pageno; |
439 | proc->clogGroupMemberLsn = lsn; |
440 | |
441 | nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst); |
442 | |
443 | while (true) |
444 | { |
445 | /* |
446 | * Add the proc to list, if the clog page where we need to update the |
447 | * current transaction status is same as group leader's clog page. |
448 | * |
449 | * There is a race condition here, which is that after doing the below |
450 | * check and before adding this proc's clog update to a group, the |
451 | * group leader might have already finished the group update for this |
452 | * page and becomes group leader of another group. This will lead to a |
453 | * situation where a single group can have different clog page |
454 | * updates. This isn't likely and will still work, just maybe a bit |
455 | * less efficiently. |
456 | */ |
457 | if (nextidx != INVALID_PGPROCNO && |
458 | ProcGlobal->allProcs[nextidx].clogGroupMemberPage != proc->clogGroupMemberPage) |
459 | { |
460 | proc->clogGroupMember = false; |
461 | return false; |
462 | } |
463 | |
464 | pg_atomic_write_u32(&proc->clogGroupNext, nextidx); |
465 | |
466 | if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst, |
467 | &nextidx, |
468 | (uint32) proc->pgprocno)) |
469 | break; |
470 | } |
471 | |
472 | /* |
473 | * If the list was not empty, the leader will update the status of our |
474 | * XID. It is impossible to have followers without a leader because the |
475 | * first process that has added itself to the list will always have |
476 | * nextidx as INVALID_PGPROCNO. |
477 | */ |
478 | if (nextidx != INVALID_PGPROCNO) |
479 | { |
480 | int = 0; |
481 | |
482 | /* Sleep until the leader updates our XID status. */ |
483 | pgstat_report_wait_start(WAIT_EVENT_CLOG_GROUP_UPDATE); |
484 | for (;;) |
485 | { |
486 | /* acts as a read barrier */ |
487 | PGSemaphoreLock(proc->sem); |
488 | if (!proc->clogGroupMember) |
489 | break; |
490 | extraWaits++; |
491 | } |
492 | pgstat_report_wait_end(); |
493 | |
494 | Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PGPROCNO); |
495 | |
496 | /* Fix semaphore count for any absorbed wakeups */ |
497 | while (extraWaits-- > 0) |
498 | PGSemaphoreUnlock(proc->sem); |
499 | return true; |
500 | } |
501 | |
502 | /* We are the leader. Acquire the lock on behalf of everyone. */ |
503 | LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); |
504 | |
505 | /* |
506 | * Now that we've got the lock, clear the list of processes waiting for |
507 | * group XID status update, saving a pointer to the head of the list. |
508 | * Trying to pop elements one at a time could lead to an ABA problem. |
509 | */ |
510 | nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst, |
511 | INVALID_PGPROCNO); |
512 | |
513 | /* Remember head of list so we can perform wakeups after dropping lock. */ |
514 | wakeidx = nextidx; |
515 | |
516 | /* Walk the list and update the status of all XIDs. */ |
517 | while (nextidx != INVALID_PGPROCNO) |
518 | { |
519 | PGPROC *proc = &ProcGlobal->allProcs[nextidx]; |
520 | PGXACT *pgxact = &ProcGlobal->allPgXact[nextidx]; |
521 | |
522 | /* |
523 | * Overflowed transactions should not use group XID status update |
524 | * mechanism. |
525 | */ |
526 | Assert(!pgxact->overflowed); |
527 | |
528 | TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, |
529 | pgxact->nxids, |
530 | proc->subxids.xids, |
531 | proc->clogGroupMemberXidStatus, |
532 | proc->clogGroupMemberLsn, |
533 | proc->clogGroupMemberPage); |
534 | |
535 | /* Move to next proc in list. */ |
536 | nextidx = pg_atomic_read_u32(&proc->clogGroupNext); |
537 | } |
538 | |
539 | /* We're done with the lock now. */ |
540 | LWLockRelease(CLogControlLock); |
541 | |
542 | /* |
543 | * Now that we've released the lock, go back and wake everybody up. We |
544 | * don't do this under the lock so as to keep lock hold times to a |
545 | * minimum. |
546 | */ |
547 | while (wakeidx != INVALID_PGPROCNO) |
548 | { |
549 | PGPROC *proc = &ProcGlobal->allProcs[wakeidx]; |
550 | |
551 | wakeidx = pg_atomic_read_u32(&proc->clogGroupNext); |
552 | pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); |
553 | |
554 | /* ensure all previous writes are visible before follower continues. */ |
555 | pg_write_barrier(); |
556 | |
557 | proc->clogGroupMember = false; |
558 | |
559 | if (proc != MyProc) |
560 | PGSemaphoreUnlock(proc->sem); |
561 | } |
562 | |
563 | return true; |
564 | } |
565 | |
566 | /* |
567 | * Sets the commit status of a single transaction. |
568 | * |
569 | * Must be called with CLogControlLock held |
570 | */ |
571 | static void |
572 | TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) |
573 | { |
574 | int byteno = TransactionIdToByte(xid); |
575 | int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; |
576 | char *byteptr; |
577 | char byteval; |
578 | char curval; |
579 | |
580 | byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; |
581 | curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK; |
582 | |
583 | /* |
584 | * When replaying transactions during recovery we still need to perform |
585 | * the two phases of subcommit and then commit. However, some transactions |
586 | * are already correctly marked, so we just treat those as a no-op which |
587 | * allows us to keep the following Assert as restrictive as possible. |
588 | */ |
589 | if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED && |
590 | curval == TRANSACTION_STATUS_COMMITTED) |
591 | return; |
592 | |
593 | /* |
594 | * Current state change should be from 0 or subcommitted to target state |
595 | * or we should already be there when replaying changes during recovery. |
596 | */ |
597 | Assert(curval == 0 || |
598 | (curval == TRANSACTION_STATUS_SUB_COMMITTED && |
599 | status != TRANSACTION_STATUS_IN_PROGRESS) || |
600 | curval == status); |
601 | |
602 | /* note this assumes exclusive access to the clog page */ |
603 | byteval = *byteptr; |
604 | byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); |
605 | byteval |= (status << bshift); |
606 | *byteptr = byteval; |
607 | |
608 | /* |
609 | * Update the group LSN if the transaction completion LSN is higher. |
610 | * |
611 | * Note: lsn will be invalid when supplied during InRecovery processing, |
612 | * so we don't need to do anything special to avoid LSN updates during |
613 | * recovery. After recovery completes the next clog change will set the |
614 | * LSN correctly. |
615 | */ |
616 | if (!XLogRecPtrIsInvalid(lsn)) |
617 | { |
618 | int lsnindex = GetLSNIndex(slotno, xid); |
619 | |
620 | if (ClogCtl->shared->group_lsn[lsnindex] < lsn) |
621 | ClogCtl->shared->group_lsn[lsnindex] = lsn; |
622 | } |
623 | } |
624 | |
625 | /* |
626 | * Interrogate the state of a transaction in the commit log. |
627 | * |
628 | * Aside from the actual commit status, this function returns (into *lsn) |
629 | * an LSN that is late enough to be able to guarantee that if we flush up to |
630 | * that LSN then we will have flushed the transaction's commit record to disk. |
631 | * The result is not necessarily the exact LSN of the transaction's commit |
632 | * record! For example, for long-past transactions (those whose clog pages |
633 | * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because |
634 | * we group transactions on the same clog page to conserve storage, we might |
635 | * return the LSN of a later transaction that falls into the same group. |
636 | * |
637 | * NB: this is a low-level routine and is NOT the preferred entry point |
638 | * for most uses; TransactionLogFetch() in transam.c is the intended caller. |
639 | */ |
640 | XidStatus |
641 | TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) |
642 | { |
643 | int pageno = TransactionIdToPage(xid); |
644 | int byteno = TransactionIdToByte(xid); |
645 | int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; |
646 | int slotno; |
647 | int lsnindex; |
648 | char *byteptr; |
649 | XidStatus status; |
650 | |
651 | /* lock is acquired by SimpleLruReadPage_ReadOnly */ |
652 | |
653 | slotno = SimpleLruReadPage_ReadOnly(ClogCtl, pageno, xid); |
654 | byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; |
655 | |
656 | status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; |
657 | |
658 | lsnindex = GetLSNIndex(slotno, xid); |
659 | *lsn = ClogCtl->shared->group_lsn[lsnindex]; |
660 | |
661 | LWLockRelease(CLogControlLock); |
662 | |
663 | return status; |
664 | } |
665 | |
666 | /* |
667 | * Number of shared CLOG buffers. |
668 | * |
669 | * On larger multi-processor systems, it is possible to have many CLOG page |
670 | * requests in flight at one time which could lead to disk access for CLOG |
671 | * page if the required page is not found in memory. Testing revealed that we |
672 | * can get the best performance by having 128 CLOG buffers, more than that it |
673 | * doesn't improve performance. |
674 | * |
675 | * Unconditionally keeping the number of CLOG buffers to 128 did not seem like |
676 | * a good idea, because it would increase the minimum amount of shared memory |
677 | * required to start, which could be a problem for people running very small |
678 | * configurations. The following formula seems to represent a reasonable |
679 | * compromise: people with very low values for shared_buffers will get fewer |
680 | * CLOG buffers as well, and everyone else will get 128. |
681 | */ |
682 | Size |
683 | CLOGShmemBuffers(void) |
684 | { |
685 | return Min(128, Max(4, NBuffers / 512)); |
686 | } |
687 | |
/*
 * Compute the amount of shared memory needed for CLOG.
 */
Size
CLOGShmemSize(void)
{
	/* One SLRU area, with one group-LSN slot per CLOG_XACTS_PER_LSN_GROUP xids */
	return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
}
696 | |
/*
 * Initialize the CLOG SLRU area in shared memory.
 */
void
CLOGShmemInit(void)
{
	/* Install the page-ordering callback before initializing the SLRU */
	ClogCtl->PagePrecedes = CLOGPagePrecedes;
	/* CLOG segments live in the pg_xact directory */
	SimpleLruInit(ClogCtl, "clog", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
				  CLogControlLock, "pg_xact", LWTRANCHE_CLOG_BUFFERS);
}
704 | |
/*
 * This func must be called ONCE on system install.  It creates
 * the initial CLOG segment.  (The CLOG directory is assumed to
 * have been created by initdb, and CLOGShmemInit must have been
 * called already.)
 */
void
BootStrapCLOG(void)
{
	int			slotno;

	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);

	/* Create and zero the first page of the commit log (no XLOG at bootstrap) */
	slotno = ZeroCLOGPage(0, false);

	/* Make sure it's written out */
	SimpleLruWritePage(ClogCtl, slotno);
	Assert(!ClogCtl->shared->page_dirty[slotno]);

	LWLockRelease(CLogControlLock);
}
727 | |
/*
 * Initialize (or reinitialize) a page of CLOG to zeroes.
 * If writeXlog is true, also emit an XLOG record saying we did this.
 *
 * The page is not actually written, just set up in shared memory.
 * The slot number of the new page is returned.
 *
 * Control lock must be held at entry, and will be held at exit.
 */
static int
ZeroCLOGPage(int pageno, bool writeXlog)
{
	int			slotno;

	/* Claim a buffer slot and zero it; marks the page dirty */
	slotno = SimpleLruZeroPage(ClogCtl, pageno);

	if (writeXlog)
		WriteZeroPageXlogRec(pageno);

	return slotno;
}
749 | |
/*
 * This must be called ONCE during postmaster or standalone-backend startup,
 * after StartupXLOG has initialized ShmemVariableCache->nextFullXid.
 */
void
StartupCLOG(void)
{
	/* Derive the current CLOG page from the next XID to be assigned */
	TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
	int			pageno = TransactionIdToPage(xid);

	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);

	/*
	 * Initialize our idea of the latest page number.
	 */
	ClogCtl->shared->latest_page_number = pageno;

	LWLockRelease(CLogControlLock);
}
769 | |
/*
 * This must be called ONCE at the end of startup/recovery.
 */
void
TrimCLOG(void)
{
	TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
	int			pageno = TransactionIdToPage(xid);

	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);

	/*
	 * Re-Initialize our idea of the latest page number.
	 */
	ClogCtl->shared->latest_page_number = pageno;

	/*
	 * Zero out the remainder of the current clog page.  Under normal
	 * circumstances it should be zeroes already, but it seems at least
	 * theoretically possible that XLOG replay will have settled on a nextXID
	 * value that is less than the last XID actually used and marked by the
	 * previous database lifecycle (since subtransaction commit writes clog
	 * but makes no WAL entry).  Let's just be safe.  (We need not worry
	 * about pages beyond the current one, since those will be zeroed when
	 * first used.  For the same reason, there is no need to do anything when
	 * nextFullXid is exactly at a page boundary; and it's likely that the
	 * "current" page doesn't exist yet in that case.)
	 */
	if (TransactionIdToPgIndex(xid) != 0)
	{
		int			byteno = TransactionIdToByte(xid);
		int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
		int			slotno;
		char	   *byteptr;

		slotno = SimpleLruReadPage(ClogCtl, pageno, false, xid);
		byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;

		/* Zero so-far-unused positions in the current byte (keep bits below bshift) */
		*byteptr &= (1 << bshift) - 1;
		/* Zero the rest of the page */
		MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);

		ClogCtl->shared->page_dirty[slotno] = true;
	}

	LWLockRelease(CLogControlLock);
}
818 | |
819 | /* |
820 | * This must be called ONCE during postmaster or standalone-backend shutdown |
821 | */ |
822 | void |
823 | ShutdownCLOG(void) |
824 | { |
825 | /* Flush dirty CLOG pages to disk */ |
826 | TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false); |
827 | SimpleLruFlush(ClogCtl, false); |
828 | |
829 | /* |
830 | * fsync pg_xact to ensure that any files flushed previously are durably |
831 | * on disk. |
832 | */ |
833 | fsync_fname("pg_xact" , true); |
834 | |
835 | TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false); |
836 | } |
837 | |
838 | /* |
839 | * Perform a checkpoint --- either during shutdown, or on-the-fly |
840 | */ |
841 | void |
842 | CheckPointCLOG(void) |
843 | { |
844 | /* Flush dirty CLOG pages to disk */ |
845 | TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); |
846 | SimpleLruFlush(ClogCtl, true); |
847 | |
848 | /* |
849 | * fsync pg_xact to ensure that any files flushed previously are durably |
850 | * on disk. |
851 | */ |
852 | fsync_fname("pg_xact" , true); |
853 | |
854 | TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); |
855 | } |
856 | |
857 | |
858 | /* |
859 | * Make sure that CLOG has room for a newly-allocated XID. |
860 | * |
861 | * NB: this is called while holding XidGenLock. We want it to be very fast |
862 | * most of the time; even when it's not so fast, no actual I/O need happen |
863 | * unless we're forced to write out a dirty clog or xlog page to make room |
864 | * in shared memory. |
865 | */ |
866 | void |
867 | ExtendCLOG(TransactionId newestXact) |
868 | { |
869 | int pageno; |
870 | |
871 | /* |
872 | * No work except at first XID of a page. But beware: just after |
873 | * wraparound, the first XID of page zero is FirstNormalTransactionId. |
874 | */ |
875 | if (TransactionIdToPgIndex(newestXact) != 0 && |
876 | !TransactionIdEquals(newestXact, FirstNormalTransactionId)) |
877 | return; |
878 | |
879 | pageno = TransactionIdToPage(newestXact); |
880 | |
881 | LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); |
882 | |
883 | /* Zero the page and make an XLOG entry about it */ |
884 | ZeroCLOGPage(pageno, true); |
885 | |
886 | LWLockRelease(CLogControlLock); |
887 | } |
888 | |
889 | |
890 | /* |
891 | * Remove all CLOG segments before the one holding the passed transaction ID |
892 | * |
893 | * Before removing any CLOG data, we must flush XLOG to disk, to ensure |
894 | * that any recently-emitted HEAP_FREEZE records have reached disk; otherwise |
895 | * a crash and restart might leave us with some unfrozen tuples referencing |
896 | * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too. |
897 | * Replaying the deletion from XLOG is not critical, since the files could |
898 | * just as well be removed later, but doing so prevents a long-running hot |
899 | * standby server from acquiring an unreasonably bloated CLOG directory. |
900 | * |
901 | * Since CLOG segments hold a large number of transactions, the opportunity to |
902 | * actually remove a segment is fairly rare, and so it seems best not to do |
903 | * the XLOG flush unless we have confirmed that there is a removable segment. |
904 | */ |
905 | void |
906 | TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) |
907 | { |
908 | int cutoffPage; |
909 | |
910 | /* |
911 | * The cutoff point is the start of the segment containing oldestXact. We |
912 | * pass the *page* containing oldestXact to SimpleLruTruncate. |
913 | */ |
914 | cutoffPage = TransactionIdToPage(oldestXact); |
915 | |
916 | /* Check to see if there's any files that could be removed */ |
917 | if (!SlruScanDirectory(ClogCtl, SlruScanDirCbReportPresence, &cutoffPage)) |
918 | return; /* nothing to remove */ |
919 | |
920 | /* |
921 | * Advance oldestClogXid before truncating clog, so concurrent xact status |
922 | * lookups can ensure they don't attempt to access truncated-away clog. |
923 | * |
924 | * It's only necessary to do this if we will actually truncate away clog |
925 | * pages. |
926 | */ |
927 | AdvanceOldestClogXid(oldestXact); |
928 | |
929 | /* |
930 | * Write XLOG record and flush XLOG to disk. We record the oldest xid |
931 | * we're keeping information about here so we can ensure that it's always |
932 | * ahead of clog truncation in case we crash, and so a standby finds out |
933 | * the new valid xid before the next checkpoint. |
934 | */ |
935 | WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); |
936 | |
937 | /* Now we can remove the old CLOG segment(s) */ |
938 | SimpleLruTruncate(ClogCtl, cutoffPage); |
939 | } |
940 | |
941 | |
942 | /* |
943 | * Decide which of two CLOG page numbers is "older" for truncation purposes. |
944 | * |
945 | * We need to use comparison of TransactionIds here in order to do the right |
946 | * thing with wraparound XID arithmetic. However, if we are asked about |
947 | * page number zero, we don't want to hand InvalidTransactionId to |
948 | * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So, |
949 | * offset both xids by FirstNormalTransactionId to avoid that. |
950 | */ |
951 | static bool |
952 | CLOGPagePrecedes(int page1, int page2) |
953 | { |
954 | TransactionId xid1; |
955 | TransactionId xid2; |
956 | |
957 | xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE; |
958 | xid1 += FirstNormalTransactionId; |
959 | xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE; |
960 | xid2 += FirstNormalTransactionId; |
961 | |
962 | return TransactionIdPrecedes(xid1, xid2); |
963 | } |
964 | |
965 | |
/*
 * Write a ZEROPAGE xlog record
 *
 * The record's only payload is the number of the CLOG page being
 * initialized.  Note: unlike WriteTruncateXlogRec, this does not flush the
 * record; redo will simply re-zero the page (see clog_redo).
 */
static void
WriteZeroPageXlogRec(int pageno)
{
	XLogBeginInsert();
	XLogRegisterData((char *) (&pageno), sizeof(int));
	/* Insertion LSN is not needed here, hence the (void) cast */
	(void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
976 | |
977 | /* |
978 | * Write a TRUNCATE xlog record |
979 | * |
980 | * We must flush the xlog record to disk before returning --- see notes |
981 | * in TruncateCLOG(). |
982 | */ |
983 | static void |
984 | WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb) |
985 | { |
986 | XLogRecPtr recptr; |
987 | xl_clog_truncate xlrec; |
988 | |
989 | xlrec.pageno = pageno; |
990 | xlrec.oldestXact = oldestXact; |
991 | xlrec.oldestXactDb = oldestXactDb; |
992 | |
993 | XLogBeginInsert(); |
994 | XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate)); |
995 | recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); |
996 | XLogFlush(recptr); |
997 | } |
998 | |
/*
 * CLOG resource manager's routines
 *
 * Redo handler for CLOG WAL records.  Only page initialization
 * (CLOG_ZEROPAGE) and truncation (CLOG_TRUNCATE) are replayed here; the
 * status-bit updates themselves are redone by xact.c's redo routines (see
 * the file header comment).
 */
void
clog_redo(XLogReaderState *record)
{
	uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	/* Backup blocks are not used in clog records */
	Assert(!XLogRecHasAnyBlockRefs(record));

	if (info == CLOG_ZEROPAGE)
	{
		int pageno;
		int slotno;

		/* Payload is just the page number (see WriteZeroPageXlogRec) */
		memcpy(&pageno, XLogRecGetData(record), sizeof(int));

		LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);

		/* Re-create the zeroed page and write it out immediately */
		slotno = ZeroCLOGPage(pageno, false);
		SimpleLruWritePage(ClogCtl, slotno);
		Assert(!ClogCtl->shared->page_dirty[slotno]);

		LWLockRelease(CLogControlLock);
	}
	else if (info == CLOG_TRUNCATE)
	{
		xl_clog_truncate xlrec;

		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));

		/*
		 * During XLOG replay, latest_page_number isn't set up yet; insert a
		 * suitable value to bypass the sanity test in SimpleLruTruncate.
		 */
		ClogCtl->shared->latest_page_number = xlrec.pageno;

		/* Advance oldestClogXid before removing clog, as TruncateCLOG does */
		AdvanceOldestClogXid(xlrec.oldestXact);

		SimpleLruTruncate(ClogCtl, xlrec.pageno);
	}
	else
		elog(PANIC, "clog_redo: unknown op code %u" , info);
}
1044 | |