1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * predicate_internals.h |
4 | * POSTGRES internal predicate locking definitions. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * src/include/storage/predicate_internals.h |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | #ifndef PREDICATE_INTERNALS_H |
15 | #define PREDICATE_INTERNALS_H |
16 | |
17 | #include "storage/lock.h" |
18 | #include "storage/lwlock.h" |
19 | |
20 | /* |
21 | * Commit number. |
22 | */ |
23 | typedef uint64 SerCommitSeqNo; |
24 | |
25 | /* |
26 | * Reserved commit sequence numbers: |
27 | * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be |
28 | * used as a SerCommitSeqNo, even an invalid one |
29 | * - InvalidSerCommitSeqNo is used to indicate a transaction that |
30 | * hasn't committed yet, so use a number greater than all valid |
31 | * ones to make comparison do the expected thing |
32 | * - RecoverySerCommitSeqNo is used to refer to transactions that |
33 | * happened before a crash/recovery, since we restart the sequence |
34 | * at that point. It's earlier than all normal sequence numbers, |
35 | * and is only used by recovered prepared transactions |
36 | */ |
37 | #define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX) |
38 | #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1) |
39 | #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2) |
40 | |
41 | /* |
42 | * The SERIALIZABLEXACT struct contains information needed for each |
43 | * serializable database transaction to support SSI techniques. |
44 | * |
45 | * A home-grown list is maintained in shared memory to manage these. |
46 | * An entry is used when the serializable transaction acquires a snapshot. |
47 | * Unless the transaction is rolled back, this entry must generally remain |
48 | * until all concurrent transactions have completed. (There are special |
49 | * optimizations for READ ONLY transactions which often allow them to be |
50 | * cleaned up earlier.) A transaction which is rolled back is cleaned up |
51 | * as soon as possible. |
52 | * |
53 | * Eligibility for cleanup of committed transactions is generally determined |
54 | * by comparing the transaction's finishedBefore field to |
55 | * SerializableGlobalXmin. |
56 | */ |
57 | typedef struct SERIALIZABLEXACT |
58 | { |
59 | VirtualTransactionId vxid; /* The executing process always has one of |
60 | * these. */ |
61 | |
62 | /* |
63 | * We use two numbers to track the order that transactions commit. Before |
64 | * commit, a transaction is marked as prepared, and prepareSeqNo is set. |
65 | * Shortly after commit, it's marked as committed, and commitSeqNo is set. |
66 | * This doesn't give a strict commit order, but these two values together |
67 | * are good enough for us, as we can always err on the safe side and |
68 | * assume that there's a conflict, if we can't be sure of the exact |
69 | * ordering of two commits. |
70 | * |
71 | * Note that a transaction is marked as prepared for a short period during |
72 | * commit processing, even if two-phase commit is not used. But with |
73 | * two-phase commit, a transaction can stay in prepared state for some |
74 | * time. |
75 | */ |
76 | SerCommitSeqNo prepareSeqNo; |
77 | SerCommitSeqNo commitSeqNo; |
78 | |
79 | /* these values are not both interesting at the same time */ |
80 | union |
81 | { |
82 | SerCommitSeqNo earliestOutConflictCommit; /* when committed with |
83 | * conflict out */ |
84 | SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or |
85 | * no conflict out */ |
86 | } SeqNo; |
87 | SHM_QUEUE outConflicts; /* list of write transactions whose data we |
88 | * couldn't read. */ |
89 | SHM_QUEUE inConflicts; /* list of read transactions which couldn't |
90 | * see our write. */ |
91 | SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ |
92 | SHM_QUEUE finishedLink; /* list link in |
93 | * FinishedSerializableTransactions */ |
94 | |
95 | LWLock predicateLockListLock; /* protects predicateLocks in parallel |
96 | * mode */ |
97 | |
98 | /* |
99 | * for r/o transactions: list of concurrent r/w transactions that we could |
100 | * potentially have conflicts with, and vice versa for r/w transactions |
101 | */ |
102 | SHM_QUEUE possibleUnsafeConflicts; |
103 | |
104 | TransactionId topXid; /* top level xid for the transaction, if one |
105 | * exists; else invalid */ |
106 | TransactionId finishedBefore; /* invalid means still running; else the |
107 | * struct expires when no serializable |
108 | * xids are before this. */ |
109 | TransactionId xmin; /* the transaction's snapshot xmin */ |
110 | uint32 flags; /* OR'd combination of values defined below */ |
111 | int pid; /* pid of associated process */ |
112 | } SERIALIZABLEXACT; |
113 | |
114 | #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */ |
115 | #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */ |
116 | #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */ |
117 | #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */ |
118 | /* |
119 | * The following flag actually means that the flagged transaction has a |
120 | * conflict out *to a transaction which committed ahead of it*. It's hard |
121 | * to get that into a name of a reasonable length. |
122 | */ |
123 | #define SXACT_FLAG_CONFLICT_OUT 0x00000010 |
124 | #define SXACT_FLAG_READ_ONLY 0x00000020 |
125 | #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040 |
126 | #define SXACT_FLAG_RO_SAFE 0x00000080 |
127 | #define SXACT_FLAG_RO_UNSAFE 0x00000100 |
128 | #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200 |
129 | #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400 |
130 | /* |
131 | * The following flag means the transaction has been partially released |
132 | * already, but is being preserved because parallel workers might have a |
133 | * reference to it. It'll be recycled by the leader at end-of-transaction. |
134 | */ |
135 | #define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800 |
136 | |
137 | /* |
138 | * The following types are used to provide an ad hoc list for holding |
139 | * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to |
140 | * access these by key -- there are direct pointers to these objects where |
141 | * needed. If a shared memory list is created, these types can probably be |
142 | * eliminated in favor of using the general solution. |
143 | */ |
144 | typedef struct PredXactListElementData |
145 | { |
146 | SHM_QUEUE link; |
147 | SERIALIZABLEXACT sxact; |
148 | } PredXactListElementData; |
149 | |
150 | typedef struct PredXactListElementData *PredXactListElement; |
151 | |
152 | #define PredXactListElementDataSize \ |
153 | ((Size)MAXALIGN(sizeof(PredXactListElementData))) |
154 | |
155 | typedef struct PredXactListData |
156 | { |
157 | SHM_QUEUE availableList; |
158 | SHM_QUEUE activeList; |
159 | |
160 | /* |
161 | * These global variables are maintained when registering and cleaning up |
162 | * serializable transactions. They must be global across all backends, |
163 | * but are not needed outside the predicate.c source file. Protected by |
164 | * SerializableXactHashLock. |
165 | */ |
166 | TransactionId SxactGlobalXmin; /* global xmin for active serializable |
167 | * transactions */ |
168 | int SxactGlobalXminCount; /* how many active serializable |
169 | * transactions have this xmin */ |
170 | int WritableSxactCount; /* how many non-read-only serializable |
171 | * transactions are active */ |
172 | SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically |
173 | * increasing number for commits |
174 | * of serializable transactions */ |
175 | /* Protected by SerializableXactHashLock. */ |
176 | SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and |
177 | * inConflicts for committed |
178 | * transactions through this seq |
179 | * no */ |
180 | /* Protected by SerializableFinishedListLock. */ |
181 | SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this |
182 | * seq no */ |
183 | SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ |
184 | |
185 | PredXactListElement element; |
186 | } PredXactListData; |
187 | |
188 | typedef struct PredXactListData *PredXactList; |
189 | |
190 | #define PredXactListDataSize \ |
191 | ((Size)MAXALIGN(sizeof(PredXactListData))) |
192 | |
193 | |
194 | /* |
195 | * The following types are used to provide lists of rw-conflicts between |
196 | * pairs of transactions. Since exactly the same information is needed, |
197 | * they are also used to record possible unsafe transaction relationships |
198 | * for purposes of identifying safe snapshots for read-only transactions. |
199 | * |
200 | * When a RWConflictData is not in use to record either type of relationship |
201 | * between a pair of transactions, it is kept on an "available" list. The |
202 | * outLink field is used for maintaining that list. |
203 | */ |
204 | typedef struct RWConflictData |
205 | { |
206 | SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */ |
207 | SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */ |
208 | SERIALIZABLEXACT *sxactOut; |
209 | SERIALIZABLEXACT *sxactIn; |
210 | } RWConflictData; |
211 | |
212 | typedef struct RWConflictData *RWConflict; |
213 | |
214 | #define RWConflictDataSize \ |
215 | ((Size)MAXALIGN(sizeof(RWConflictData))) |
216 | |
217 | typedef struct |
218 | { |
219 | SHM_QUEUE ; |
220 | RWConflict ; |
221 | } ; |
222 | |
223 | typedef struct RWConflictPoolHeaderData *; |
224 | |
225 | #define \ |
226 | ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData))) |
227 | |
228 | |
229 | /* |
230 | * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable |
231 | * transaction or any of its subtransactions. |
232 | */ |
233 | typedef struct SERIALIZABLEXIDTAG |
234 | { |
235 | TransactionId xid; |
236 | } SERIALIZABLEXIDTAG; |
237 | |
238 | /* |
239 | * The SERIALIZABLEXID struct provides a link from a TransactionId for a |
240 | * serializable transaction to the related SERIALIZABLEXACT record, even if |
241 | * the transaction has completed and its connection has been closed. |
242 | * |
243 | * These are created as new top level transaction IDs are first assigned to |
244 | * transactions which are participating in predicate locking. This may |
245 | * never happen for a particular transaction if it doesn't write anything. |
246 | * They are removed with their related serializable transaction objects. |
247 | * |
248 | * The SubTransGetTopmostTransaction method is used where necessary to get |
249 | * from an XID which might be from a subtransaction to the top level XID. |
250 | */ |
251 | typedef struct SERIALIZABLEXID |
252 | { |
253 | /* hash key */ |
254 | SERIALIZABLEXIDTAG tag; |
255 | |
256 | /* data */ |
257 | SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ |
258 | } SERIALIZABLEXID; |
259 | |
260 | |
261 | /* |
262 | * The PREDICATELOCKTARGETTAG struct identifies a database object which can |
263 | * be the target of predicate locks. |
264 | * |
265 | * Note that the hash function being used doesn't properly respect tag |
266 | * length -- if the length of the structure isn't a multiple of four bytes it |
267 | * will go to a four byte boundary past the end of the tag. If you change |
268 | * this struct, make sure any slack space is initialized, so that any random |
269 | * bytes in the middle or at the end are not included in the hash. |
270 | * |
271 | * TODO SSI: If we always use the same fields for the same type of value, we |
272 | * should rename these. Holding off until it's clear there are no exceptions. |
273 | * Since indexes are relations with blocks and tuples, it's looking likely that |
274 | * the rename will be possible. If not, we may need to divide the last field |
275 | * and use part of it for a target type, so that we know how to interpret the |
276 | * data.. |
277 | */ |
278 | typedef struct PREDICATELOCKTARGETTAG |
279 | { |
280 | uint32 locktag_field1; /* a 32-bit ID field */ |
281 | uint32 locktag_field2; /* a 32-bit ID field */ |
282 | uint32 locktag_field3; /* a 32-bit ID field */ |
283 | uint32 locktag_field4; /* a 32-bit ID field */ |
284 | } PREDICATELOCKTARGETTAG; |
285 | |
286 | /* |
287 | * The PREDICATELOCKTARGET struct represents a database object on which there |
288 | * are predicate locks. |
289 | * |
290 | * A hash list of these objects is maintained in shared memory. An entry is |
291 | * added when a predicate lock is requested on an object which doesn't |
292 | * already have one. An entry is removed when the last lock is removed from |
293 | * its list. |
294 | */ |
295 | typedef struct PREDICATELOCKTARGET |
296 | { |
297 | /* hash key */ |
298 | PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ |
299 | |
300 | /* data */ |
301 | SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with |
302 | * predicate lock target */ |
303 | } PREDICATELOCKTARGET; |
304 | |
305 | |
306 | /* |
307 | * The PREDICATELOCKTAG struct identifies an individual predicate lock. |
308 | * |
309 | * It is the combination of predicate lock target (which is a lockable |
310 | * object) and a serializable transaction which has acquired a lock on that |
311 | * target. |
312 | */ |
313 | typedef struct PREDICATELOCKTAG |
314 | { |
315 | PREDICATELOCKTARGET *myTarget; |
316 | SERIALIZABLEXACT *myXact; |
317 | } PREDICATELOCKTAG; |
318 | |
319 | /* |
320 | * The PREDICATELOCK struct represents an individual lock. |
321 | * |
322 | * An entry can be created here when the related database object is read, or |
323 | * by promotion of multiple finer-grained targets. All entries related to a |
324 | * serializable transaction are removed when that serializable transaction is |
325 | * cleaned up. Entries can also be removed when they are combined into a |
326 | * single coarser-grained lock entry. |
327 | */ |
328 | typedef struct PREDICATELOCK |
329 | { |
330 | /* hash key */ |
331 | PREDICATELOCKTAG tag; /* unique identifier of lock */ |
332 | |
333 | /* data */ |
334 | SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of |
335 | * predicate locks */ |
336 | SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of |
337 | * predicate locks */ |
338 | SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */ |
339 | } PREDICATELOCK; |
340 | |
341 | |
342 | /* |
343 | * The LOCALPREDICATELOCK struct represents a local copy of data which is |
344 | * also present in the PREDICATELOCK table, organized for fast access without |
345 | * needing to acquire a LWLock. It is strictly for optimization. |
346 | * |
347 | * Each serializable transaction creates its own local hash table to hold a |
348 | * collection of these. This information is used to determine when a number |
349 | * of fine-grained locks should be promoted to a single coarser-grained lock. |
350 | * The information is maintained more-or-less in parallel to the |
351 | * PREDICATELOCK data, but because this data is not protected by locks and is |
352 | * only used in an optimization heuristic, it is allowed to drift in a few |
353 | * corner cases where maintaining exact data would be expensive. |
354 | * |
355 | * The hash table is created when the serializable transaction acquires its |
356 | * snapshot, and its memory is released upon completion of the transaction. |
357 | */ |
358 | typedef struct LOCALPREDICATELOCK |
359 | { |
360 | /* hash key */ |
361 | PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ |
362 | |
363 | /* data */ |
364 | bool held; /* is lock held, or just its children? */ |
365 | int childLocks; /* number of child locks currently held */ |
366 | } LOCALPREDICATELOCK; |
367 | |
368 | |
369 | /* |
370 | * The types of predicate locks which can be acquired. |
371 | */ |
372 | typedef enum PredicateLockTargetType |
373 | { |
374 | PREDLOCKTAG_RELATION, |
375 | PREDLOCKTAG_PAGE, |
376 | PREDLOCKTAG_TUPLE |
377 | /* TODO SSI: Other types may be needed for index locking */ |
378 | } PredicateLockTargetType; |
379 | |
380 | |
381 | /* |
382 | * This structure is used to quickly capture a copy of all predicate |
383 | * locks. This is currently used only by the pg_lock_status function, |
384 | * which in turn is used by the pg_locks view. |
385 | */ |
386 | typedef struct PredicateLockData |
387 | { |
388 | int nelements; |
389 | PREDICATELOCKTARGETTAG *locktags; |
390 | SERIALIZABLEXACT *xacts; |
391 | } PredicateLockData; |
392 | |
393 | |
394 | /* |
395 | * These macros define how we map logical IDs of lockable objects into the |
396 | * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, |
397 | * rather than accessing the fields directly. Note multiple eval of target! |
398 | */ |
399 | #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ |
400 | ((locktag).locktag_field1 = (dboid), \ |
401 | (locktag).locktag_field2 = (reloid), \ |
402 | (locktag).locktag_field3 = InvalidBlockNumber, \ |
403 | (locktag).locktag_field4 = InvalidOffsetNumber) |
404 | |
405 | #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ |
406 | ((locktag).locktag_field1 = (dboid), \ |
407 | (locktag).locktag_field2 = (reloid), \ |
408 | (locktag).locktag_field3 = (blocknum), \ |
409 | (locktag).locktag_field4 = InvalidOffsetNumber) |
410 | |
411 | #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ |
412 | ((locktag).locktag_field1 = (dboid), \ |
413 | (locktag).locktag_field2 = (reloid), \ |
414 | (locktag).locktag_field3 = (blocknum), \ |
415 | (locktag).locktag_field4 = (offnum)) |
416 | |
417 | #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ |
418 | ((Oid) (locktag).locktag_field1) |
419 | #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ |
420 | ((Oid) (locktag).locktag_field2) |
421 | #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ |
422 | ((BlockNumber) (locktag).locktag_field3) |
423 | #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ |
424 | ((OffsetNumber) (locktag).locktag_field4) |
425 | #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ |
426 | (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ |
427 | (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \ |
428 | PREDLOCKTAG_RELATION)) |
429 | |
430 | /* |
431 | * Two-phase commit statefile records. There are two types: for each |
432 | * transaction, we generate one per-transaction record and a variable |
433 | * number of per-predicate-lock records. |
434 | */ |
435 | typedef enum TwoPhasePredicateRecordType |
436 | { |
437 | TWOPHASEPREDICATERECORD_XACT, |
438 | TWOPHASEPREDICATERECORD_LOCK |
439 | } TwoPhasePredicateRecordType; |
440 | |
441 | /* |
442 | * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not |
443 | * much is needed because most of it not meaningful for a recovered |
444 | * prepared transaction. |
445 | * |
446 | * In particular, we do not record the in and out conflict lists for a |
447 | * prepared transaction because the associated SERIALIZABLEXACTs will |
448 | * not be available after recovery. Instead, we simply record the |
449 | * existence of each type of conflict by setting the transaction's |
450 | * summary conflict in/out flag. |
451 | */ |
452 | typedef struct TwoPhasePredicateXactRecord |
453 | { |
454 | TransactionId xmin; |
455 | uint32 flags; |
456 | } TwoPhasePredicateXactRecord; |
457 | |
458 | /* Per-lock state */ |
459 | typedef struct TwoPhasePredicateLockRecord |
460 | { |
461 | PREDICATELOCKTARGETTAG target; |
462 | uint32 filler; /* to avoid length change in back-patched fix */ |
463 | } TwoPhasePredicateLockRecord; |
464 | |
465 | typedef struct TwoPhasePredicateRecord |
466 | { |
467 | TwoPhasePredicateRecordType type; |
468 | union |
469 | { |
470 | TwoPhasePredicateXactRecord xactRecord; |
471 | TwoPhasePredicateLockRecord lockRecord; |
472 | } data; |
473 | } TwoPhasePredicateRecord; |
474 | |
475 | /* |
476 | * Define a macro to use for an "empty" SERIALIZABLEXACT reference. |
477 | */ |
478 | #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL) |
479 | |
480 | |
481 | /* |
482 | * Function definitions for functions needing awareness of predicate |
483 | * locking internals. |
484 | */ |
485 | extern PredicateLockData *GetPredicateLockStatusData(void); |
486 | extern int GetSafeSnapshotBlockingPids(int blocked_pid, |
487 | int *output, int output_size); |
488 | |
489 | #endif /* PREDICATE_INTERNALS_H */ |
490 | |