1/*-------------------------------------------------------------------------
2 *
3 * predicate_internals.h
4 * POSTGRES internal predicate locking definitions.
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/include/storage/predicate_internals.h
11 *
12 *-------------------------------------------------------------------------
13 */
14#ifndef PREDICATE_INTERNALS_H
15#define PREDICATE_INTERNALS_H
16
17#include "storage/lock.h"
18#include "storage/lwlock.h"
19
20/*
21 * Commit number.
22 */
23typedef uint64 SerCommitSeqNo;
24
25/*
26 * Reserved commit sequence numbers:
27 * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
28 * used as a SerCommitSeqNo, even an invalid one
29 * - InvalidSerCommitSeqNo is used to indicate a transaction that
30 * hasn't committed yet, so use a number greater than all valid
31 * ones to make comparison do the expected thing
32 * - RecoverySerCommitSeqNo is used to refer to transactions that
33 * happened before a crash/recovery, since we restart the sequence
34 * at that point. It's earlier than all normal sequence numbers,
35 * and is only used by recovered prepared transactions
36 */
37#define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX)
38#define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
39#define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
40
41/*
42 * The SERIALIZABLEXACT struct contains information needed for each
43 * serializable database transaction to support SSI techniques.
44 *
45 * A home-grown list is maintained in shared memory to manage these.
46 * An entry is used when the serializable transaction acquires a snapshot.
47 * Unless the transaction is rolled back, this entry must generally remain
48 * until all concurrent transactions have completed. (There are special
49 * optimizations for READ ONLY transactions which often allow them to be
50 * cleaned up earlier.) A transaction which is rolled back is cleaned up
51 * as soon as possible.
52 *
53 * Eligibility for cleanup of committed transactions is generally determined
54 * by comparing the transaction's finishedBefore field to
55 * SerializableGlobalXmin.
56 */
57typedef struct SERIALIZABLEXACT
58{
59 VirtualTransactionId vxid; /* The executing process always has one of
60 * these. */
61
62 /*
63 * We use two numbers to track the order that transactions commit. Before
64 * commit, a transaction is marked as prepared, and prepareSeqNo is set.
65 * Shortly after commit, it's marked as committed, and commitSeqNo is set.
66 * This doesn't give a strict commit order, but these two values together
67 * are good enough for us, as we can always err on the safe side and
68 * assume that there's a conflict, if we can't be sure of the exact
69 * ordering of two commits.
70 *
71 * Note that a transaction is marked as prepared for a short period during
72 * commit processing, even if two-phase commit is not used. But with
73 * two-phase commit, a transaction can stay in prepared state for some
74 * time.
75 */
76 SerCommitSeqNo prepareSeqNo;
77 SerCommitSeqNo commitSeqNo;
78
79 /* these values are not both interesting at the same time */
80 union
81 {
82 SerCommitSeqNo earliestOutConflictCommit; /* when committed with
83 * conflict out */
84 SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or
85 * no conflict out */
86 } SeqNo;
87 SHM_QUEUE outConflicts; /* list of write transactions whose data we
88 * couldn't read. */
89 SHM_QUEUE inConflicts; /* list of read transactions which couldn't
90 * see our write. */
91 SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
92 SHM_QUEUE finishedLink; /* list link in
93 * FinishedSerializableTransactions */
94
95 LWLock predicateLockListLock; /* protects predicateLocks in parallel
96 * mode */
97
98 /*
99 * for r/o transactions: list of concurrent r/w transactions that we could
100 * potentially have conflicts with, and vice versa for r/w transactions
101 */
102 SHM_QUEUE possibleUnsafeConflicts;
103
104 TransactionId topXid; /* top level xid for the transaction, if one
105 * exists; else invalid */
106 TransactionId finishedBefore; /* invalid means still running; else the
107 * struct expires when no serializable
108 * xids are before this. */
109 TransactionId xmin; /* the transaction's snapshot xmin */
110 uint32 flags; /* OR'd combination of values defined below */
111 int pid; /* pid of associated process */
112} SERIALIZABLEXACT;
113
114#define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
115#define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
116#define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
117#define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
118/*
119 * The following flag actually means that the flagged transaction has a
120 * conflict out *to a transaction which committed ahead of it*. It's hard
121 * to get that into a name of a reasonable length.
122 */
123#define SXACT_FLAG_CONFLICT_OUT 0x00000010
124#define SXACT_FLAG_READ_ONLY 0x00000020
125#define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
126#define SXACT_FLAG_RO_SAFE 0x00000080
127#define SXACT_FLAG_RO_UNSAFE 0x00000100
128#define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
129#define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
130/*
131 * The following flag means the transaction has been partially released
132 * already, but is being preserved because parallel workers might have a
133 * reference to it. It'll be recycled by the leader at end-of-transaction.
134 */
135#define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800
136
137/*
138 * The following types are used to provide an ad hoc list for holding
139 * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
140 * access these by key -- there are direct pointers to these objects where
141 * needed. If a shared memory list is created, these types can probably be
142 * eliminated in favor of using the general solution.
143 */
144typedef struct PredXactListElementData
145{
146 SHM_QUEUE link;
147 SERIALIZABLEXACT sxact;
148} PredXactListElementData;
149
150typedef struct PredXactListElementData *PredXactListElement;
151
152#define PredXactListElementDataSize \
153 ((Size)MAXALIGN(sizeof(PredXactListElementData)))
154
155typedef struct PredXactListData
156{
157 SHM_QUEUE availableList;
158 SHM_QUEUE activeList;
159
160 /*
161 * These global variables are maintained when registering and cleaning up
162 * serializable transactions. They must be global across all backends,
163 * but are not needed outside the predicate.c source file. Protected by
164 * SerializableXactHashLock.
165 */
166 TransactionId SxactGlobalXmin; /* global xmin for active serializable
167 * transactions */
168 int SxactGlobalXminCount; /* how many active serializable
169 * transactions have this xmin */
170 int WritableSxactCount; /* how many non-read-only serializable
171 * transactions are active */
172 SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically
173 * increasing number for commits
174 * of serializable transactions */
175 /* Protected by SerializableXactHashLock. */
176 SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and
177 * inConflicts for committed
178 * transactions through this seq
179 * no */
180 /* Protected by SerializableFinishedListLock. */
181 SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
182 * seq no */
183 SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
184
185 PredXactListElement element;
186} PredXactListData;
187
188typedef struct PredXactListData *PredXactList;
189
190#define PredXactListDataSize \
191 ((Size)MAXALIGN(sizeof(PredXactListData)))
192
193
194/*
195 * The following types are used to provide lists of rw-conflicts between
196 * pairs of transactions. Since exactly the same information is needed,
197 * they are also used to record possible unsafe transaction relationships
198 * for purposes of identifying safe snapshots for read-only transactions.
199 *
200 * When a RWConflictData is not in use to record either type of relationship
201 * between a pair of transactions, it is kept on an "available" list. The
202 * outLink field is used for maintaining that list.
203 */
204typedef struct RWConflictData
205{
206 SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */
207 SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */
208 SERIALIZABLEXACT *sxactOut;
209 SERIALIZABLEXACT *sxactIn;
210} RWConflictData;
211
212typedef struct RWConflictData *RWConflict;
213
214#define RWConflictDataSize \
215 ((Size)MAXALIGN(sizeof(RWConflictData)))
216
217typedef struct RWConflictPoolHeaderData
218{
219 SHM_QUEUE availableList;
220 RWConflict element;
221} RWConflictPoolHeaderData;
222
223typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
224
225#define RWConflictPoolHeaderDataSize \
226 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
227
228
229/*
230 * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
231 * transaction or any of its subtransactions.
232 */
233typedef struct SERIALIZABLEXIDTAG
234{
235 TransactionId xid;
236} SERIALIZABLEXIDTAG;
237
238/*
239 * The SERIALIZABLEXID struct provides a link from a TransactionId for a
240 * serializable transaction to the related SERIALIZABLEXACT record, even if
241 * the transaction has completed and its connection has been closed.
242 *
243 * These are created as new top level transaction IDs are first assigned to
244 * transactions which are participating in predicate locking. This may
245 * never happen for a particular transaction if it doesn't write anything.
246 * They are removed with their related serializable transaction objects.
247 *
248 * The SubTransGetTopmostTransaction method is used where necessary to get
249 * from an XID which might be from a subtransaction to the top level XID.
250 */
251typedef struct SERIALIZABLEXID
252{
253 /* hash key */
254 SERIALIZABLEXIDTAG tag;
255
256 /* data */
257 SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
258} SERIALIZABLEXID;
259
260
261/*
262 * The PREDICATELOCKTARGETTAG struct identifies a database object which can
263 * be the target of predicate locks.
264 *
265 * Note that the hash function being used doesn't properly respect tag
266 * length -- if the length of the structure isn't a multiple of four bytes it
267 * will go to a four byte boundary past the end of the tag. If you change
268 * this struct, make sure any slack space is initialized, so that any random
269 * bytes in the middle or at the end are not included in the hash.
270 *
271 * TODO SSI: If we always use the same fields for the same type of value, we
272 * should rename these. Holding off until it's clear there are no exceptions.
273 * Since indexes are relations with blocks and tuples, it's looking likely that
274 * the rename will be possible. If not, we may need to divide the last field
275 * and use part of it for a target type, so that we know how to interpret the
276 * data..
277 */
278typedef struct PREDICATELOCKTARGETTAG
279{
280 uint32 locktag_field1; /* a 32-bit ID field */
281 uint32 locktag_field2; /* a 32-bit ID field */
282 uint32 locktag_field3; /* a 32-bit ID field */
283 uint32 locktag_field4; /* a 32-bit ID field */
284} PREDICATELOCKTARGETTAG;
285
286/*
287 * The PREDICATELOCKTARGET struct represents a database object on which there
288 * are predicate locks.
289 *
290 * A hash list of these objects is maintained in shared memory. An entry is
291 * added when a predicate lock is requested on an object which doesn't
292 * already have one. An entry is removed when the last lock is removed from
293 * its list.
294 */
295typedef struct PREDICATELOCKTARGET
296{
297 /* hash key */
298 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
299
300 /* data */
301 SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
302 * predicate lock target */
303} PREDICATELOCKTARGET;
304
305
306/*
307 * The PREDICATELOCKTAG struct identifies an individual predicate lock.
308 *
309 * It is the combination of predicate lock target (which is a lockable
310 * object) and a serializable transaction which has acquired a lock on that
311 * target.
312 */
313typedef struct PREDICATELOCKTAG
314{
315 PREDICATELOCKTARGET *myTarget;
316 SERIALIZABLEXACT *myXact;
317} PREDICATELOCKTAG;
318
319/*
320 * The PREDICATELOCK struct represents an individual lock.
321 *
322 * An entry can be created here when the related database object is read, or
323 * by promotion of multiple finer-grained targets. All entries related to a
324 * serializable transaction are removed when that serializable transaction is
325 * cleaned up. Entries can also be removed when they are combined into a
326 * single coarser-grained lock entry.
327 */
328typedef struct PREDICATELOCK
329{
330 /* hash key */
331 PREDICATELOCKTAG tag; /* unique identifier of lock */
332
333 /* data */
334 SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
335 * predicate locks */
336 SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
337 * predicate locks */
338 SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
339} PREDICATELOCK;
340
341
342/*
343 * The LOCALPREDICATELOCK struct represents a local copy of data which is
344 * also present in the PREDICATELOCK table, organized for fast access without
345 * needing to acquire a LWLock. It is strictly for optimization.
346 *
347 * Each serializable transaction creates its own local hash table to hold a
348 * collection of these. This information is used to determine when a number
349 * of fine-grained locks should be promoted to a single coarser-grained lock.
350 * The information is maintained more-or-less in parallel to the
351 * PREDICATELOCK data, but because this data is not protected by locks and is
352 * only used in an optimization heuristic, it is allowed to drift in a few
353 * corner cases where maintaining exact data would be expensive.
354 *
355 * The hash table is created when the serializable transaction acquires its
356 * snapshot, and its memory is released upon completion of the transaction.
357 */
358typedef struct LOCALPREDICATELOCK
359{
360 /* hash key */
361 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
362
363 /* data */
364 bool held; /* is lock held, or just its children? */
365 int childLocks; /* number of child locks currently held */
366} LOCALPREDICATELOCK;
367
368
369/*
370 * The types of predicate locks which can be acquired.
371 */
372typedef enum PredicateLockTargetType
373{
374 PREDLOCKTAG_RELATION,
375 PREDLOCKTAG_PAGE,
376 PREDLOCKTAG_TUPLE
377 /* TODO SSI: Other types may be needed for index locking */
378} PredicateLockTargetType;
379
380
381/*
382 * This structure is used to quickly capture a copy of all predicate
383 * locks. This is currently used only by the pg_lock_status function,
384 * which in turn is used by the pg_locks view.
385 */
386typedef struct PredicateLockData
387{
388 int nelements;
389 PREDICATELOCKTARGETTAG *locktags;
390 SERIALIZABLEXACT *xacts;
391} PredicateLockData;
392
393
394/*
395 * These macros define how we map logical IDs of lockable objects into the
396 * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
397 * rather than accessing the fields directly. Note multiple eval of target!
398 */
399#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
400 ((locktag).locktag_field1 = (dboid), \
401 (locktag).locktag_field2 = (reloid), \
402 (locktag).locktag_field3 = InvalidBlockNumber, \
403 (locktag).locktag_field4 = InvalidOffsetNumber)
404
405#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
406 ((locktag).locktag_field1 = (dboid), \
407 (locktag).locktag_field2 = (reloid), \
408 (locktag).locktag_field3 = (blocknum), \
409 (locktag).locktag_field4 = InvalidOffsetNumber)
410
411#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
412 ((locktag).locktag_field1 = (dboid), \
413 (locktag).locktag_field2 = (reloid), \
414 (locktag).locktag_field3 = (blocknum), \
415 (locktag).locktag_field4 = (offnum))
416
417#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
418 ((Oid) (locktag).locktag_field1)
419#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
420 ((Oid) (locktag).locktag_field2)
421#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
422 ((BlockNumber) (locktag).locktag_field3)
423#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
424 ((OffsetNumber) (locktag).locktag_field4)
425#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
426 (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
427 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
428 PREDLOCKTAG_RELATION))
429
430/*
431 * Two-phase commit statefile records. There are two types: for each
432 * transaction, we generate one per-transaction record and a variable
433 * number of per-predicate-lock records.
434 */
435typedef enum TwoPhasePredicateRecordType
436{
437 TWOPHASEPREDICATERECORD_XACT,
438 TWOPHASEPREDICATERECORD_LOCK
439} TwoPhasePredicateRecordType;
440
441/*
442 * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
443 * much is needed because most of it not meaningful for a recovered
444 * prepared transaction.
445 *
446 * In particular, we do not record the in and out conflict lists for a
447 * prepared transaction because the associated SERIALIZABLEXACTs will
448 * not be available after recovery. Instead, we simply record the
449 * existence of each type of conflict by setting the transaction's
450 * summary conflict in/out flag.
451 */
452typedef struct TwoPhasePredicateXactRecord
453{
454 TransactionId xmin;
455 uint32 flags;
456} TwoPhasePredicateXactRecord;
457
458/* Per-lock state */
459typedef struct TwoPhasePredicateLockRecord
460{
461 PREDICATELOCKTARGETTAG target;
462 uint32 filler; /* to avoid length change in back-patched fix */
463} TwoPhasePredicateLockRecord;
464
465typedef struct TwoPhasePredicateRecord
466{
467 TwoPhasePredicateRecordType type;
468 union
469 {
470 TwoPhasePredicateXactRecord xactRecord;
471 TwoPhasePredicateLockRecord lockRecord;
472 } data;
473} TwoPhasePredicateRecord;
474
475/*
476 * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
477 */
478#define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
479
480
481/*
482 * Function definitions for functions needing awareness of predicate
483 * locking internals.
484 */
485extern PredicateLockData *GetPredicateLockStatusData(void);
486extern int GetSafeSnapshotBlockingPids(int blocked_pid,
487 int *output, int output_size);
488
489#endif /* PREDICATE_INTERNALS_H */
490