| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * lwlock.c |
| 4 | * Lightweight lock manager |
| 5 | * |
| 6 | * Lightweight locks are intended primarily to provide mutual exclusion of |
| 7 | * access to shared-memory data structures. Therefore, they offer both |
| 8 | * exclusive and shared lock modes (to support read/write and read-only |
| 9 | * access to a shared object). There are few other frammishes. User-level |
| 10 | * locking should be done with the full lock manager --- which depends on |
| 11 | * LWLocks to protect its shared state. |
| 12 | * |
| 13 | * In addition to exclusive and shared modes, lightweight locks can be used to |
| 14 | * wait until a variable changes value. The variable is initially not set |
| 15 | * when the lock is acquired with LWLockAcquire, i.e. it keeps whatever |
| 16 | * value it had when the lock was last released, and can be updated |
| 17 | * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar |
| 18 | * waits for the variable to be updated, or until the lock is free. When |
| 19 | * releasing the lock with LWLockReleaseClearVar() the value can be set to an |
| 20 | * appropriate value for a free lock. The meaning of the variable is up to |
| 21 | * the caller, the lightweight lock code just assigns and compares it. |
| 22 | * |
| 23 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 24 | * Portions Copyright (c) 1994, Regents of the University of California |
| 25 | * |
| 26 | * IDENTIFICATION |
| 27 | * src/backend/storage/lmgr/lwlock.c |
| 28 | * |
| 29 | * NOTES: |
| 30 | * |
| 31 | * This used to be a pretty straightforward reader-writer lock |
| 32 | * implementation, in which the internal state was protected by a |
| 33 | * spinlock. Unfortunately the overhead of taking the spinlock proved to be |
| 34 | * too high for workloads/locks that were taken in shared mode very |
| 35 | * frequently. Often we were spinning in the (obviously exclusive) spinlock, |
| 36 | * while trying to acquire a shared lock that was actually free. |
| 37 | * |
| 38 | * Thus a new implementation was devised that provides wait-free shared lock |
| 39 | * acquisition for locks that aren't exclusively locked. |
| 40 | * |
| 41 | * The basic idea is to have a single atomic variable 'lockcount' instead of |
| 42 | * the formerly separate shared and exclusive counters and to use atomic |
| 43 | * operations to acquire the lock. That's fairly easy to do for plain |
| 44 | * rw-spinlocks, but a lot harder for something like LWLocks that want to wait |
| 45 | * in the OS. |
| 46 | * |
| 47 | * For lock acquisition we use an atomic compare-and-exchange on the lockcount |
| 48 | * variable. For exclusive lock we swap in a sentinel value |
| 49 | * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders. |
| 50 | * |
| 51 | * To release the lock we use an atomic decrement. If the new value is |
| 52 | * zero (we get that atomically), we know we can/have to release |
| 53 | * waiters. |
| 54 | * |
| 55 | * Obviously it is important that the sentinel value for exclusive locks |
| 56 | * doesn't conflict with the maximum number of possible share lockers - |
| 57 | * luckily MAX_BACKENDS makes that easily possible. |
| 58 | * |
| 59 | * |
| 60 | * The attentive reader might have noticed that naively doing the above has a |
| 61 | * glaring race condition: We try to lock using the atomic operations and |
| 62 | * notice that we have to wait. Unfortunately by the time we have finished |
| 63 | * queuing, the former locker very well might have already finished its |
| 64 | * work. That's problematic because we're now stuck waiting inside the OS. |
| 65 | * |
| 66 | * To mitigate those races we use a two phased attempt at locking: |
| 67 | * Phase 1: Try to do it atomically, if we succeed, nice |
| 68 | * Phase 2: Add ourselves to the waitqueue of the lock |
| 69 | * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from |
| 70 | * the queue |
| 71 | * Phase 4: Sleep till wake-up, goto Phase 1 |
| 72 | * |
| 73 | * This protects us against the problem from above as nobody can release too |
| 74 | * quickly, before we're queued, since after Phase 2 we're already queued. |
| 75 | * ------------------------------------------------------------------------- |
| 76 | */ |
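/*
 * As a rough illustration of the protocol above, the slow path of
 * LWLockAcquire() further down in this file reduces to the following
 * simplified sketch (interrupt holdoff, statistics and held_lwlocks
 * bookkeeping omitted):
 *
 *     for (;;)
 *     {
 *         if (!LWLockAttemptLock(lock, mode))
 *             break;                          (Phase 1: got it atomically)
 *         LWLockQueueSelf(lock, mode);        (Phase 2: enqueue ourselves)
 *         if (!LWLockAttemptLock(lock, mode))
 *         {
 *             LWLockDequeueSelf(lock);        (Phase 3: got it after all)
 *             break;
 *         }
 *         ... sleep on MyProc->sem ...        (Phase 4: wait, then retry)
 *     }
 */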
| 77 | #include "postgres.h" |
| 78 | |
| 79 | #include "miscadmin.h" |
| 80 | #include "pgstat.h" |
| 81 | #include "pg_trace.h" |
| 82 | #include "postmaster/postmaster.h" |
| 83 | #include "replication/slot.h" |
| 84 | #include "storage/ipc.h" |
| 85 | #include "storage/predicate.h" |
| 86 | #include "storage/proc.h" |
| 87 | #include "storage/proclist.h" |
| 88 | #include "storage/spin.h" |
| 89 | #include "utils/memutils.h" |
| 90 | |
| 91 | #ifdef LWLOCK_STATS |
| 92 | #include "utils/hsearch.h" |
| 93 | #endif |
| 94 | |
| 95 | |
| 96 | /* We use the ShmemLock spinlock to protect LWLockCounter */ |
| 97 | extern slock_t *ShmemLock; |
| 98 | |
| 99 | #define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30) |
| 100 | #define LW_FLAG_RELEASE_OK ((uint32) 1 << 29) |
| 101 | #define LW_FLAG_LOCKED ((uint32) 1 << 28) |
| 102 | |
| 103 | #define LW_VAL_EXCLUSIVE ((uint32) 1 << 24) |
| 104 | #define LW_VAL_SHARED 1 |
| 105 | |
| 106 | #define LW_LOCK_MASK ((uint32) ((1 << 25)-1)) |
| 107 | /* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */ |
| 108 | #define LW_SHARED_MASK ((uint32) ((1 << 24)-1)) |
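/*
 * Taken together, the state word of a lock is laid out as follows:
 *
 *     bit  31       unused
 *     bit  30       LW_FLAG_HAS_WAITERS
 *     bit  29       LW_FLAG_RELEASE_OK
 *     bit  28       LW_FLAG_LOCKED (protects the wait list)
 *     bits 27-25    unused
 *     bit  24       LW_VAL_EXCLUSIVE
 *     bits 23-0     number of shared holders, each adding LW_VAL_SHARED
 *
 * Since MAX_BACKENDS is 2^23-1, the shared-holder count can never overflow
 * into the exclusive bit.
 */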
| 109 | |
| 110 | /* |
| 111 | * This is indexed by tranche ID and stores the names of all tranches known |
| 112 | * to the current backend. |
| 113 | */ |
| 114 | static const char **LWLockTrancheArray = NULL; |
| 115 | static int LWLockTranchesAllocated = 0; |
| 116 | |
| 117 | #define T_NAME(lock) \ |
| 118 | (LWLockTrancheArray[(lock)->tranche]) |
| 119 | |
| 120 | /* |
| 121 | * This points to the main array of LWLocks in shared memory. Backends inherit |
| 122 | * the pointer by fork from the postmaster (except in the EXEC_BACKEND case, |
| 123 | * where we have special measures to pass it down). |
| 124 | */ |
| 125 | LWLockPadded *MainLWLockArray = NULL; |
| 126 | |
| 127 | /* |
| 128 | * We use this structure to keep track of locked LWLocks for release |
| 129 | * during error recovery. Normally, only a few will be held at once, but |
| 130 | * occasionally the number can be much higher; for example, the pg_buffercache |
| 131 | * extension locks all buffer partitions simultaneously. |
| 132 | */ |
| 133 | #define MAX_SIMUL_LWLOCKS 200 |
| 134 | |
| 135 | /* struct representing the LWLocks we're holding */ |
| 136 | typedef struct LWLockHandle |
| 137 | { |
| 138 | LWLock *lock; |
| 139 | LWLockMode mode; |
| 140 | } LWLockHandle; |
| 141 | |
| 142 | static int num_held_lwlocks = 0; |
| 143 | static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS]; |
| 144 | |
| 145 | /* struct representing the LWLock tranche request for named tranche */ |
| 146 | typedef struct NamedLWLockTrancheRequest |
| 147 | { |
| 148 | char tranche_name[NAMEDATALEN]; |
| 149 | int num_lwlocks; |
| 150 | } NamedLWLockTrancheRequest; |
| 151 | |
| 152 | NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; |
| 153 | static int NamedLWLockTrancheRequestsAllocated = 0; |
| 154 | int NamedLWLockTrancheRequests = 0; |
| 155 | |
| 156 | NamedLWLockTranche *NamedLWLockTrancheArray = NULL; |
| 157 | |
| 158 | static bool lock_named_request_allowed = true; |
| 159 | |
| 160 | static void InitializeLWLocks(void); |
| 161 | static void RegisterLWLockTranches(void); |
| 162 | |
| 163 | static inline void LWLockReportWaitStart(LWLock *lock); |
| 164 | static inline void LWLockReportWaitEnd(void); |
| 165 | |
| 166 | #ifdef LWLOCK_STATS |
| 167 | typedef struct lwlock_stats_key |
| 168 | { |
| 169 | int tranche; |
| 170 | void *instance; |
| 171 | } lwlock_stats_key; |
| 172 | |
| 173 | typedef struct lwlock_stats |
| 174 | { |
| 175 | lwlock_stats_key key; |
| 176 | int sh_acquire_count; |
| 177 | int ex_acquire_count; |
| 178 | int block_count; |
| 179 | int dequeue_self_count; |
| 180 | int spin_delay_count; |
| 181 | } lwlock_stats; |
| 182 | |
| 183 | static HTAB *lwlock_stats_htab; |
| 184 | static lwlock_stats lwlock_stats_dummy; |
| 185 | #endif |
| 186 | |
| 187 | #ifdef LOCK_DEBUG |
| 188 | bool Trace_lwlocks = false; |
| 189 | |
| 190 | inline static void |
| 191 | PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode) |
| 192 | { |
| 193 | /* hide statement & context here, otherwise the log is just too verbose */ |
| 194 | if (Trace_lwlocks) |
| 195 | { |
| 196 | uint32 state = pg_atomic_read_u32(&lock->state); |
| 197 | |
| 198 | ereport(LOG, |
| 199 | (errhidestmt(true), |
| 200 | errhidecontext(true), |
| 201 | errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d" , |
| 202 | MyProcPid, |
| 203 | where, T_NAME(lock), lock, |
| 204 | (state & LW_VAL_EXCLUSIVE) != 0, |
| 205 | state & LW_SHARED_MASK, |
| 206 | (state & LW_FLAG_HAS_WAITERS) != 0, |
| 207 | pg_atomic_read_u32(&lock->nwaiters), |
| 208 | (state & LW_FLAG_RELEASE_OK) != 0))); |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | inline static void |
| 213 | LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg) |
| 214 | { |
| 215 | /* hide statement & context here, otherwise the log is just too verbose */ |
| 216 | if (Trace_lwlocks) |
| 217 | { |
| 218 | ereport(LOG, |
| 219 | (errhidestmt(true), |
| 220 | errhidecontext(true), |
| 221 | errmsg_internal("%s(%s %p): %s" , where, |
| 222 | T_NAME(lock), lock, msg))); |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | #else /* not LOCK_DEBUG */ |
| 227 | #define PRINT_LWDEBUG(a,b,c) ((void)0) |
| 228 | #define LOG_LWDEBUG(a,b,c) ((void)0) |
| 229 | #endif /* LOCK_DEBUG */ |
| 230 | |
| 231 | #ifdef LWLOCK_STATS |
| 232 | |
| 233 | static void init_lwlock_stats(void); |
| 234 | static void print_lwlock_stats(int code, Datum arg); |
| 235 | static lwlock_stats * get_lwlock_stats_entry(LWLock *lockid); |
| 236 | |
| 237 | static void |
| 238 | init_lwlock_stats(void) |
| 239 | { |
| 240 | HASHCTL ctl; |
| 241 | static MemoryContext lwlock_stats_cxt = NULL; |
| 242 | static bool exit_registered = false; |
| 243 | |
| 244 | if (lwlock_stats_cxt != NULL) |
| 245 | MemoryContextDelete(lwlock_stats_cxt); |
| 246 | |
| 247 | /* |
| 248 | * The LWLock stats will be updated within a critical section, which |
| 249 | * requires allocating new hash entries. Allocations within a critical |
| 250 | * section are normally not allowed because running out of memory would |
| 251 | * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally |
| 252 | * turned on in production, so that's an acceptable risk. The hash entries |
| 253 | * are small, so the risk of running out of memory is minimal in practice. |
| 254 | */ |
| 255 | lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext, |
| 256 | "LWLock stats" , |
| 257 | ALLOCSET_DEFAULT_SIZES); |
| 258 | MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true); |
| 259 | |
| 260 | MemSet(&ctl, 0, sizeof(ctl)); |
| 261 | ctl.keysize = sizeof(lwlock_stats_key); |
| 262 | ctl.entrysize = sizeof(lwlock_stats); |
| 263 | ctl.hcxt = lwlock_stats_cxt; |
| 264 | lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl, |
| 265 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| 266 | if (!exit_registered) |
| 267 | { |
| 268 | on_shmem_exit(print_lwlock_stats, 0); |
| 269 | exit_registered = true; |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | static void |
| 274 | print_lwlock_stats(int code, Datum arg) |
| 275 | { |
| 276 | HASH_SEQ_STATUS scan; |
| 277 | lwlock_stats *lwstats; |
| 278 | |
| 279 | hash_seq_init(&scan, lwlock_stats_htab); |
| 280 | |
| 281 | /* Grab an LWLock to keep different backends from mixing reports */ |
| 282 | LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE); |
| 283 | |
| 284 | while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL) |
| 285 | { |
| 286 | fprintf(stderr, |
| 287 | "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n" , |
| 288 | MyProcPid, LWLockTrancheArray[lwstats->key.tranche], |
| 289 | lwstats->key.instance, lwstats->sh_acquire_count, |
| 290 | lwstats->ex_acquire_count, lwstats->block_count, |
| 291 | lwstats->spin_delay_count, lwstats->dequeue_self_count); |
| 292 | } |
| 293 | |
| 294 | LWLockRelease(&MainLWLockArray[0].lock); |
| 295 | } |
| 296 | |
| 297 | static lwlock_stats * |
| 298 | get_lwlock_stats_entry(LWLock *lock) |
| 299 | { |
| 300 | lwlock_stats_key key; |
| 301 | lwlock_stats *lwstats; |
| 302 | bool found; |
| 303 | |
| 304 | /* |
| 305 | * During shared memory initialization, the hash table doesn't exist yet. |
| 306 | * Stats of that phase aren't very interesting, so just collect operations |
| 307 | * on all locks in a single dummy entry. |
| 308 | */ |
| 309 | if (lwlock_stats_htab == NULL) |
| 310 | return &lwlock_stats_dummy; |
| 311 | |
| 312 | /* Fetch or create the entry. */ |
| 313 | key.tranche = lock->tranche; |
| 314 | key.instance = lock; |
| 315 | lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found); |
| 316 | if (!found) |
| 317 | { |
| 318 | lwstats->sh_acquire_count = 0; |
| 319 | lwstats->ex_acquire_count = 0; |
| 320 | lwstats->block_count = 0; |
| 321 | lwstats->dequeue_self_count = 0; |
| 322 | lwstats->spin_delay_count = 0; |
| 323 | } |
| 324 | return lwstats; |
| 325 | } |
| 326 | #endif /* LWLOCK_STATS */ |
| 327 | |
| 328 | |
| 329 | /* |
| 330 | * Compute number of LWLocks required by named tranches. These will be |
| 331 | * allocated in the main array. |
| 332 | */ |
| 333 | static int |
| 334 | NumLWLocksByNamedTranches(void) |
| 335 | { |
| 336 | int numLocks = 0; |
| 337 | int i; |
| 338 | |
| 339 | for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| 340 | numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks; |
| 341 | |
| 342 | return numLocks; |
| 343 | } |
| 344 | |
| 345 | /* |
| 346 | * Compute shmem space needed for LWLocks and named tranches. |
| 347 | */ |
| 348 | Size |
| 349 | LWLockShmemSize(void) |
| 350 | { |
| 351 | Size size; |
| 352 | int i; |
| 353 | int numLocks = NUM_FIXED_LWLOCKS; |
| 354 | |
| 355 | numLocks += NumLWLocksByNamedTranches(); |
| 356 | |
| 357 | /* Space for the LWLock array. */ |
| 358 | size = mul_size(numLocks, sizeof(LWLockPadded)); |
| 359 | |
| 360 | /* Space for dynamic allocation counter, plus room for alignment. */ |
| 361 | size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); |
| 362 | |
| 363 | /* space for named tranches. */ |
| 364 | size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); |
| 365 | |
| 366 | /* space for name of each tranche. */ |
| 367 | for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| 368 | size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); |
| 369 | |
| 370 | /* Disallow named LWLocks' requests after startup */ |
| 371 | lock_named_request_allowed = false; |
| 372 | |
| 373 | return size; |
| 374 | } |
| 375 | |
| 376 | /* |
| 377 | * Allocate shmem space for the main LWLock array and all tranches and |
| 378 | * initialize it. We also register all the LWLock tranches here. |
| 379 | */ |
| 380 | void |
| 381 | CreateLWLocks(void) |
| 382 | { |
| 383 | StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS, |
| 384 | "MAX_BACKENDS too big for lwlock.c" ); |
| 385 | |
| 386 | StaticAssertStmt(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE && |
| 387 | sizeof(LWLock) <= LWLOCK_PADDED_SIZE, |
| 388 | "Miscalculated LWLock padding" ); |
| 389 | |
| 390 | if (!IsUnderPostmaster) |
| 391 | { |
| 392 | Size spaceLocks = LWLockShmemSize(); |
| 393 | int *LWLockCounter; |
| 394 | char *ptr; |
| 395 | |
| 396 | /* Allocate space */ |
| 397 | ptr = (char *) ShmemAlloc(spaceLocks); |
| 398 | |
| 399 | /* Leave room for dynamic allocation of tranches */ |
| 400 | ptr += sizeof(int); |
| 401 | |
| 402 | /* Ensure desired alignment of LWLock array */ |
| 403 | ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; |
| 404 | |
| 405 | MainLWLockArray = (LWLockPadded *) ptr; |
| 406 | |
| 407 | /* |
| 408 | * Initialize the dynamic-allocation counter for tranches, which is |
| 409 | * stored just before the first LWLock. |
| 410 | */ |
| 411 | LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); |
| 412 | *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; |
| 413 | |
| 414 | /* Initialize all LWLocks */ |
| 415 | InitializeLWLocks(); |
| 416 | } |
| 417 | |
| 418 | /* Register all LWLock tranches */ |
| 419 | RegisterLWLockTranches(); |
| 420 | } |
| 421 | |
| 422 | /* |
| 423 | * Initialize LWLocks that are fixed and those belonging to named tranches. |
| 424 | */ |
| 425 | static void |
| 426 | InitializeLWLocks(void) |
| 427 | { |
| 428 | int numNamedLocks = NumLWLocksByNamedTranches(); |
| 429 | int id; |
| 430 | int i; |
| 431 | int j; |
| 432 | LWLockPadded *lock; |
| 433 | |
| 434 | /* Initialize all individual LWLocks in main array */ |
| 435 | for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++) |
| 436 | LWLockInitialize(&lock->lock, id); |
| 437 | |
| 438 | /* Initialize buffer mapping LWLocks in main array */ |
| 439 | lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS; |
| 440 | for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++) |
| 441 | LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING); |
| 442 | |
| 443 | /* Initialize lmgrs' LWLocks in main array */ |
| 444 | lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS + NUM_BUFFER_PARTITIONS; |
| 445 | for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++) |
| 446 | LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER); |
| 447 | |
| 448 | /* Initialize predicate lmgrs' LWLocks in main array */ |
| 449 | lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS + |
| 450 | NUM_BUFFER_PARTITIONS + NUM_LOCK_PARTITIONS; |
| 451 | for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) |
| 452 | LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); |
| 453 | |
| 454 | /* Initialize named tranches. */ |
| 455 | if (NamedLWLockTrancheRequests > 0) |
| 456 | { |
| 457 | char *trancheNames; |
| 458 | |
| 459 | NamedLWLockTrancheArray = (NamedLWLockTranche *) |
| 460 | &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; |
| 461 | |
| 462 | trancheNames = (char *) NamedLWLockTrancheArray + |
| 463 | (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); |
| 464 | lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; |
| 465 | |
| 466 | for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| 467 | { |
| 468 | NamedLWLockTrancheRequest *request; |
| 469 | NamedLWLockTranche *tranche; |
| 470 | char *name; |
| 471 | |
| 472 | request = &NamedLWLockTrancheRequestArray[i]; |
| 473 | tranche = &NamedLWLockTrancheArray[i]; |
| 474 | |
| 475 | name = trancheNames; |
| 476 | trancheNames += strlen(request->tranche_name) + 1; |
| 477 | strcpy(name, request->tranche_name); |
| 478 | tranche->trancheId = LWLockNewTrancheId(); |
| 479 | tranche->trancheName = name; |
| 480 | |
| 481 | for (j = 0; j < request->num_lwlocks; j++, lock++) |
| 482 | LWLockInitialize(&lock->lock, tranche->trancheId); |
| 483 | } |
| 484 | } |
| 485 | } |
| 486 | |
| 487 | /* |
| 488 | * Register named tranches and tranches for fixed LWLocks. |
| 489 | */ |
| 490 | static void |
| 491 | RegisterLWLockTranches(void) |
| 492 | { |
| 493 | int i; |
| 494 | |
| 495 | if (LWLockTrancheArray == NULL) |
| 496 | { |
| 497 | LWLockTranchesAllocated = 128; |
| 498 | LWLockTrancheArray = (const char **) |
| 499 | MemoryContextAllocZero(TopMemoryContext, |
| 500 | LWLockTranchesAllocated * sizeof(char *)); |
| 501 | Assert(LWLockTranchesAllocated >= LWTRANCHE_FIRST_USER_DEFINED); |
| 502 | } |
| 503 | |
| 504 | for (i = 0; i < NUM_INDIVIDUAL_LWLOCKS; ++i) |
| 505 | LWLockRegisterTranche(i, MainLWLockNames[i]); |
| 506 | |
| 507 | LWLockRegisterTranche(LWTRANCHE_BUFFER_MAPPING, "buffer_mapping"); |
| 508 | LWLockRegisterTranche(LWTRANCHE_LOCK_MANAGER, "lock_manager"); |
| 509 | LWLockRegisterTranche(LWTRANCHE_PREDICATE_LOCK_MANAGER, |
| 510 | "predicate_lock_manager"); |
| 511 | LWLockRegisterTranche(LWTRANCHE_PARALLEL_QUERY_DSA, |
| 512 | "parallel_query_dsa"); |
| 513 | LWLockRegisterTranche(LWTRANCHE_SESSION_DSA, |
| 514 | "session_dsa"); |
| 515 | LWLockRegisterTranche(LWTRANCHE_SESSION_RECORD_TABLE, |
| 516 | "session_record_table"); |
| 517 | LWLockRegisterTranche(LWTRANCHE_SESSION_TYPMOD_TABLE, |
| 518 | "session_typmod_table"); |
| 519 | LWLockRegisterTranche(LWTRANCHE_SHARED_TUPLESTORE, |
| 520 | "shared_tuplestore"); |
| 521 | LWLockRegisterTranche(LWTRANCHE_TBM, "tbm"); |
| 522 | LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append"); |
| 523 | LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join"); |
| 524 | LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact"); |
| 525 | |
| 526 | /* Register named tranches. */ |
| 527 | for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| 528 | LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, |
| 529 | NamedLWLockTrancheArray[i].trancheName); |
| 530 | } |
| 531 | |
| 532 | /* |
| 533 | * InitLWLockAccess - initialize backend-local state needed to hold LWLocks |
| 534 | */ |
| 535 | void |
| 536 | InitLWLockAccess(void) |
| 537 | { |
| 538 | #ifdef LWLOCK_STATS |
| 539 | init_lwlock_stats(); |
| 540 | #endif |
| 541 | } |
| 542 | |
| 543 | /* |
| 544 | * GetNamedLWLockTranche - returns the base address of LWLock from the |
| 545 | * specified tranche. |
| 546 | * |
| 547 | * Caller needs to retrieve the requested number of LWLocks starting from |
| 548 | * the base lock address returned by this API. This can be used for |
| 549 | * tranches that are requested by using RequestNamedLWLockTranche() API. |
| 550 | */ |
| 551 | LWLockPadded * |
| 552 | GetNamedLWLockTranche(const char *tranche_name) |
| 553 | { |
| 554 | int lock_pos; |
| 555 | int i; |
| 556 | |
| 557 | /* |
| 558 | * Obtain the position of base address of LWLock belonging to requested |
| 559 | * tranche_name in MainLWLockArray. LWLocks for named tranches are placed |
| 560 | * in MainLWLockArray after fixed locks. |
| 561 | */ |
| 562 | lock_pos = NUM_FIXED_LWLOCKS; |
| 563 | for (i = 0; i < NamedLWLockTrancheRequests; i++) |
| 564 | { |
| 565 | if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name, |
| 566 | tranche_name) == 0) |
| 567 | return &MainLWLockArray[lock_pos]; |
| 568 | |
| 569 | lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks; |
| 570 | } |
| 571 | |
| 572 | if (i >= NamedLWLockTrancheRequests) |
| 573 | elog(ERROR, "requested tranche is not registered" ); |
| 574 | |
| 575 | /* just to keep compiler quiet */ |
| 576 | return NULL; |
| 577 | } |
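/*
 * For example, an extension that requested a tranche with
 * RequestNamedLWLockTranche() (see below) would typically fetch and use its
 * locks from its shared-memory startup hook along these lines (a sketch;
 * "my_tranche" and my_locks are hypothetical names):
 *
 *     LWLockPadded *my_locks = GetNamedLWLockTranche("my_tranche");
 *
 *     LWLockAcquire(&my_locks[0].lock, LW_EXCLUSIVE);
 *     ... touch the extension's shared state ...
 *     LWLockRelease(&my_locks[0].lock);
 */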
| 578 | |
| 579 | /* |
| 580 | * Allocate a new tranche ID. |
| 581 | */ |
| 582 | int |
| 583 | LWLockNewTrancheId(void) |
| 584 | { |
| 585 | int result; |
| 586 | int *LWLockCounter; |
| 587 | |
| 588 | LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); |
| 589 | SpinLockAcquire(ShmemLock); |
| 590 | result = (*LWLockCounter)++; |
| 591 | SpinLockRelease(ShmemLock); |
| 592 | |
| 593 | return result; |
| 594 | } |
| 595 | |
| 596 | /* |
| 597 | * Register a tranche ID in the lookup table for the current process. This |
| 598 | * routine will save a pointer to the tranche name passed as an argument, |
| 599 | * so the name should be allocated in a backend-lifetime context |
| 600 | * (TopMemoryContext, static variable, or similar). |
| 601 | */ |
| 602 | void |
| 603 | LWLockRegisterTranche(int tranche_id, const char *tranche_name) |
| 604 | { |
| 605 | Assert(LWLockTrancheArray != NULL); |
| 606 | |
| 607 | if (tranche_id >= LWLockTranchesAllocated) |
| 608 | { |
| 609 | int i = LWLockTranchesAllocated; |
| 610 | int j = LWLockTranchesAllocated; |
| 611 | |
| 612 | while (i <= tranche_id) |
| 613 | i *= 2; |
| 614 | |
| 615 | LWLockTrancheArray = (const char **) |
| 616 | repalloc(LWLockTrancheArray, i * sizeof(char *)); |
| 617 | LWLockTranchesAllocated = i; |
| 618 | while (j < LWLockTranchesAllocated) |
| 619 | LWLockTrancheArray[j++] = NULL; |
| 620 | } |
| 621 | |
| 622 | LWLockTrancheArray[tranche_id] = tranche_name; |
| 623 | } |
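/*
 * Code that places LWLocks outside the main array (for instance in a DSM
 * segment) is expected to combine the two functions above with
 * LWLockInitialize(), roughly like this sketch (my_tranche_id and my_lock
 * are hypothetical names):
 *
 *     my_tranche_id = LWLockNewTrancheId();
 *     LWLockRegisterTranche(my_tranche_id, "my_tranche");
 *     LWLockInitialize(&shared_struct->my_lock, my_tranche_id);
 *
 * Any other backend that wants the lock reported under a readable name must
 * call LWLockRegisterTranche() with the same id itself; otherwise
 * GetLWLockIdentifier() falls back to "extension".
 */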
| 624 | |
| 625 | /* |
| 626 | * RequestNamedLWLockTranche |
| 627 | * Request that extra LWLocks be allocated during postmaster |
| 628 | * startup. |
| 629 | * |
| 630 | * This is only useful for extensions if called from the _PG_init hook |
| 631 | * of a library that is loaded into the postmaster via |
| 632 | * shared_preload_libraries. Once shared memory has been allocated, calls |
| 633 | * will be ignored. (We could raise an error, but it seems better to make |
| 634 | * it a no-op, so that libraries containing such calls can be reloaded if |
| 635 | * needed.) |
| 636 | */ |
| 637 | void |
| 638 | RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) |
| 639 | { |
| 640 | NamedLWLockTrancheRequest *request; |
| 641 | |
| 642 | if (IsUnderPostmaster || !lock_named_request_allowed) |
| 643 | return; /* too late */ |
| 644 | |
| 645 | if (NamedLWLockTrancheRequestArray == NULL) |
| 646 | { |
| 647 | NamedLWLockTrancheRequestsAllocated = 16; |
| 648 | NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) |
| 649 | MemoryContextAlloc(TopMemoryContext, |
| 650 | NamedLWLockTrancheRequestsAllocated |
| 651 | * sizeof(NamedLWLockTrancheRequest)); |
| 652 | } |
| 653 | |
| 654 | if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated) |
| 655 | { |
| 656 | int i = NamedLWLockTrancheRequestsAllocated; |
| 657 | |
| 658 | while (i <= NamedLWLockTrancheRequests) |
| 659 | i *= 2; |
| 660 | |
| 661 | NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) |
| 662 | repalloc(NamedLWLockTrancheRequestArray, |
| 663 | i * sizeof(NamedLWLockTrancheRequest)); |
| 664 | NamedLWLockTrancheRequestsAllocated = i; |
| 665 | } |
| 666 | |
| 667 | request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; |
| 668 | Assert(strlen(tranche_name) + 1 < NAMEDATALEN); |
| 669 | StrNCpy(request->tranche_name, tranche_name, NAMEDATALEN); |
| 670 | request->num_lwlocks = num_lwlocks; |
| 671 | NamedLWLockTrancheRequests++; |
| 672 | } |
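/*
 * A typical call from a library loaded via shared_preload_libraries might
 * look like this sketch (the tranche name and lock count are hypothetical):
 *
 *     void
 *     _PG_init(void)
 *     {
 *         RequestNamedLWLockTranche("my_tranche", 4);
 *     }
 *
 * The locks can later be fetched with GetNamedLWLockTranche("my_tranche"),
 * as shown earlier in this file.
 */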
| 673 | |
| 674 | /* |
| 675 | * LWLockInitialize - initialize a new lwlock; it's initially unlocked |
| 676 | */ |
| 677 | void |
| 678 | LWLockInitialize(LWLock *lock, int tranche_id) |
| 679 | { |
| 680 | pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| 681 | #ifdef LOCK_DEBUG |
| 682 | pg_atomic_init_u32(&lock->nwaiters, 0); |
| 683 | #endif |
| 684 | lock->tranche = tranche_id; |
| 685 | proclist_init(&lock->waiters); |
| 686 | } |
| 687 | |
| 688 | /* |
| 689 | * Report start of wait event for light-weight locks. |
| 690 | * |
| 691 | * This function will be used by all the light-weight lock calls which |
| 692 | * need to wait to acquire the lock. This function distinguishes wait |
| 693 | * event based on tranche and lock id. |
| 694 | */ |
| 695 | static inline void |
| 696 | LWLockReportWaitStart(LWLock *lock) |
| 697 | { |
| 698 | pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche); |
| 699 | } |
| 700 | |
| 701 | /* |
| 702 | * Report end of wait event for light-weight locks. |
| 703 | */ |
| 704 | static inline void |
| 705 | LWLockReportWaitEnd(void) |
| 706 | { |
| 707 | pgstat_report_wait_end(); |
| 708 | } |
| 709 | |
| 710 | /* |
| 711 | * Return an identifier for an LWLock based on the wait class and event. |
| 712 | */ |
| 713 | const char * |
| 714 | GetLWLockIdentifier(uint32 classId, uint16 eventId) |
| 715 | { |
| 716 | Assert(classId == PG_WAIT_LWLOCK); |
| 717 | |
| 718 | /* |
| 719 | * It is quite possible that a user has registered a tranche in one of the |
| 720 | * backends (e.g. by allocating lwlocks in dynamic shared memory) but not |
| 721 | * all of them, so we can't assume the tranche is registered here. |
| 722 | */ |
| 723 | if (eventId >= LWLockTranchesAllocated || |
| 724 | LWLockTrancheArray[eventId] == NULL) |
| 725 | return "extension" ; |
| 726 | |
| 727 | return LWLockTrancheArray[eventId]; |
| 728 | } |
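/*
 * The name returned here is what monitoring views such as pg_stat_activity
 * show as the wait_event (with wait_event_type "LWLock") for a backend that
 * is currently blocked on this lock; tranches that were never registered in
 * this backend therefore all surface as "extension".
 */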
| 729 | |
| 730 | /* |
| 731 | * Internal function that tries to atomically acquire the lwlock in the passed |
| 732 | * in mode. |
| 733 | * |
| 734 | * This function will not block waiting for a lock to become free - that's the |
| 735 | * caller's job. |
| 736 | * |
| 737 | * Returns true if the lock isn't free and we need to wait. |
| 738 | */ |
| 739 | static bool |
| 740 | LWLockAttemptLock(LWLock *lock, LWLockMode mode) |
| 741 | { |
| 742 | uint32 old_state; |
| 743 | |
| 744 | AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED); |
| 745 | |
| 746 | /* |
| 747 | * Read once outside the loop, later iterations will get the newer value |
| 748 | * via compare & exchange. |
| 749 | */ |
| 750 | old_state = pg_atomic_read_u32(&lock->state); |
| 751 | |
| 752 | /* loop until we've determined whether we could acquire the lock or not */ |
| 753 | while (true) |
| 754 | { |
| 755 | uint32 desired_state; |
| 756 | bool lock_free; |
| 757 | |
| 758 | desired_state = old_state; |
| 759 | |
| 760 | if (mode == LW_EXCLUSIVE) |
| 761 | { |
| 762 | lock_free = (old_state & LW_LOCK_MASK) == 0; |
| 763 | if (lock_free) |
| 764 | desired_state += LW_VAL_EXCLUSIVE; |
| 765 | } |
| 766 | else |
| 767 | { |
| 768 | lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0; |
| 769 | if (lock_free) |
| 770 | desired_state += LW_VAL_SHARED; |
| 771 | } |
| 772 | |
| 773 | /* |
| 774 | * Attempt to swap in the state we are expecting. If we didn't see |
| 775 | * the lock as free, that's just the old value. If we saw it as free, |
| 776 | * we'll attempt to mark it acquired. The reason that we always swap |
| 777 | * in the value is that this doubles as a memory barrier. We could try |
| 778 | * to be smarter and only swap in values if we saw the lock as free, |
| 779 | * but benchmarks haven't shown it as beneficial so far. |
| 780 | * |
| 781 | * Retry if the value changed since we last looked at it. |
| 782 | */ |
| 783 | if (pg_atomic_compare_exchange_u32(&lock->state, |
| 784 | &old_state, desired_state)) |
| 785 | { |
| 786 | if (lock_free) |
| 787 | { |
| 788 | /* Great! Got the lock. */ |
| 789 | #ifdef LOCK_DEBUG |
| 790 | if (mode == LW_EXCLUSIVE) |
| 791 | lock->owner = MyProc; |
| 792 | #endif |
| 793 | return false; |
| 794 | } |
| 795 | else |
| 796 | return true; /* somebody else has the lock */ |
| 797 | } |
| 798 | } |
| 799 | pg_unreachable(); |
| 800 | } |
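/*
 * To make the arithmetic above concrete: with three shared holders the low
 * 24 bits of state hold the value 3, so an exclusive attempt sees
 * (state & LW_LOCK_MASK) != 0 and must wait, while a fourth shared attempt
 * sees (state & LW_VAL_EXCLUSIVE) == 0 and simply bumps the count to 4.
 * An exclusive holder instead sets bit 24, which blocks both kinds of
 * acquisition until that value is subtracted again at release time.
 */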
| 801 | |
| 802 | /* |
| 803 | * Lock the LWLock's wait list against concurrent activity. |
| 804 | * |
| 805 | * NB: even though the wait list is locked, non-conflicting lock operations |
| 806 | * may still happen concurrently. |
| 807 | * |
| 808 | * Time spent holding mutex should be short! |
| 809 | */ |
| 810 | static void |
| 811 | LWLockWaitListLock(LWLock *lock) |
| 812 | { |
| 813 | uint32 old_state; |
| 814 | #ifdef LWLOCK_STATS |
| 815 | lwlock_stats *lwstats; |
| 816 | uint32 delays = 0; |
| 817 | |
| 818 | lwstats = get_lwlock_stats_entry(lock); |
| 819 | #endif |
| 820 | |
| 821 | while (true) |
| 822 | { |
| 823 | /* always try once to acquire lock directly */ |
| 824 | old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED); |
| 825 | if (!(old_state & LW_FLAG_LOCKED)) |
| 826 | break; /* got lock */ |
| 827 | |
| 828 | /* and then spin without atomic operations until lock is released */ |
| 829 | { |
| 830 | SpinDelayStatus delayStatus; |
| 831 | |
| 832 | init_local_spin_delay(&delayStatus); |
| 833 | |
| 834 | while (old_state & LW_FLAG_LOCKED) |
| 835 | { |
| 836 | perform_spin_delay(&delayStatus); |
| 837 | old_state = pg_atomic_read_u32(&lock->state); |
| 838 | } |
| 839 | #ifdef LWLOCK_STATS |
| 840 | delays += delayStatus.delays; |
| 841 | #endif |
| 842 | finish_spin_delay(&delayStatus); |
| 843 | } |
| 844 | |
| 845 | /* |
| 846 | * Retry. The lock might obviously already be re-acquired by the time |
| 847 | * we're attempting to get it again. |
| 848 | */ |
| 849 | } |
| 850 | |
| 851 | #ifdef LWLOCK_STATS |
| 852 | lwstats->spin_delay_count += delays; |
| 853 | #endif |
| 854 | } |
| 855 | |
| 856 | /* |
| 857 | * Unlock the LWLock's wait list. |
| 858 | * |
| 859 | * Note that it can be more efficient to manipulate flags and release the |
| 860 | * locks in a single atomic operation. |
| 861 | */ |
| 862 | static void |
| 863 | LWLockWaitListUnlock(LWLock *lock) |
| 864 | { |
| 865 | uint32 old_state PG_USED_FOR_ASSERTS_ONLY; |
| 866 | |
| 867 | old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED); |
| 868 | |
| 869 | Assert(old_state & LW_FLAG_LOCKED); |
| 870 | } |
| 871 | |
| 872 | /* |
| 873 | * Wake up all the lockers that currently have a chance to acquire the lock. |
| 874 | */ |
| 875 | static void |
| 876 | LWLockWakeup(LWLock *lock) |
| 877 | { |
| 878 | bool new_release_ok; |
| 879 | bool wokeup_somebody = false; |
| 880 | proclist_head wakeup; |
| 881 | proclist_mutable_iter iter; |
| 882 | |
| 883 | proclist_init(&wakeup); |
| 884 | |
| 885 | new_release_ok = true; |
| 886 | |
| 887 | /* lock wait list while collecting backends to wake up */ |
| 888 | LWLockWaitListLock(lock); |
| 889 | |
| 890 | proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) |
| 891 | { |
| 892 | PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| 893 | |
| 894 | if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) |
| 895 | continue; |
| 896 | |
| 897 | proclist_delete(&lock->waiters, iter.cur, lwWaitLink); |
| 898 | proclist_push_tail(&wakeup, iter.cur, lwWaitLink); |
| 899 | |
| 900 | if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) |
| 901 | { |
| 902 | /* |
| 903 | * Prevent additional wakeups until retryer gets to run. Backends |
| 904 | * that are just waiting for the lock to become free don't retry |
| 905 | * automatically. |
| 906 | */ |
| 907 | new_release_ok = false; |
| 908 | |
| 909 | /* |
| 910 | * Don't wake up (further) exclusive lockers. |
| 911 | */ |
| 912 | wokeup_somebody = true; |
| 913 | } |
| 914 | |
| 915 | /* |
| 916 | * Once we've woken up an exclusive locker, there's no point in waking |
| 917 | * up anybody else. |
| 918 | */ |
| 919 | if (waiter->lwWaitMode == LW_EXCLUSIVE) |
| 920 | break; |
| 921 | } |
| 922 | |
| 923 | Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS); |
| 924 | |
| 925 | /* unset required flags, and release lock, in one fell swoop */ |
| 926 | { |
| 927 | uint32 old_state; |
| 928 | uint32 desired_state; |
| 929 | |
| 930 | old_state = pg_atomic_read_u32(&lock->state); |
| 931 | while (true) |
| 932 | { |
| 933 | desired_state = old_state; |
| 934 | |
| 935 | /* compute desired flags */ |
| 936 | |
| 937 | if (new_release_ok) |
| 938 | desired_state |= LW_FLAG_RELEASE_OK; |
| 939 | else |
| 940 | desired_state &= ~LW_FLAG_RELEASE_OK; |
| 941 | |
| 942 | if (proclist_is_empty(&wakeup)) |
| 943 | desired_state &= ~LW_FLAG_HAS_WAITERS; |
| 944 | |
| 945 | desired_state &= ~LW_FLAG_LOCKED; /* release lock */ |
| 946 | |
| 947 | if (pg_atomic_compare_exchange_u32(&lock->state, &old_state, |
| 948 | desired_state)) |
| 949 | break; |
| 950 | } |
| 951 | } |
| 952 | |
| 953 | /* Awaken any waiters I removed from the queue. */ |
| 954 | proclist_foreach_modify(iter, &wakeup, lwWaitLink) |
| 955 | { |
| 956 | PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| 957 | |
| 958 | LOG_LWDEBUG("LWLockRelease" , lock, "release waiter" ); |
| 959 | proclist_delete(&wakeup, iter.cur, lwWaitLink); |
| 960 | |
| 961 | /* |
| 962 | * Guarantee that lwWaiting being unset only becomes visible once the |
| 963 | * unlink from the list has completed. Otherwise the target backend |
| 964 | * could be woken up for another reason and enqueue for a new lock - if |
| 965 | * that happens before the list unlink happens, the list would end up |
| 966 | * being corrupted. |
| 967 | * |
| 968 | * The barrier pairs with the LWLockWaitListLock() when enqueuing for |
| 969 | * another lock. |
| 970 | */ |
| 971 | pg_write_barrier(); |
| 972 | waiter->lwWaiting = false; |
| 973 | PGSemaphoreUnlock(waiter->sem); |
| 974 | } |
| 975 | } |
| 976 | |
| 977 | /* |
| 978 | * Add ourselves to the end of the queue. |
| 979 | * |
| 980 | * NB: Mode can be LW_WAIT_UNTIL_FREE here! |
| 981 | */ |
| 982 | static void |
| 983 | LWLockQueueSelf(LWLock *lock, LWLockMode mode) |
| 984 | { |
| 985 | /* |
| 986 | * If we don't have a PGPROC structure, there's no way to wait. This |
| 987 | * should never occur, since MyProc should only be null during shared |
| 988 | * memory initialization. |
| 989 | */ |
| 990 | if (MyProc == NULL) |
| 991 | elog(PANIC, "cannot wait without a PGPROC structure" ); |
| 992 | |
| 993 | if (MyProc->lwWaiting) |
| 994 | elog(PANIC, "queueing for lock while waiting on another one" ); |
| 995 | |
| 996 | LWLockWaitListLock(lock); |
| 997 | |
| 998 | /* setting the flag is protected by the spinlock */ |
| 999 | pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS); |
| 1000 | |
| 1001 | MyProc->lwWaiting = true; |
| 1002 | MyProc->lwWaitMode = mode; |
| 1003 | |
| 1004 | /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */ |
| 1005 | if (mode == LW_WAIT_UNTIL_FREE) |
| 1006 | proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink); |
| 1007 | else |
| 1008 | proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink); |
| 1009 | |
| 1010 | /* Can release the mutex now */ |
| 1011 | LWLockWaitListUnlock(lock); |
| 1012 | |
| 1013 | #ifdef LOCK_DEBUG |
| 1014 | pg_atomic_fetch_add_u32(&lock->nwaiters, 1); |
| 1015 | #endif |
| 1016 | |
| 1017 | } |
| 1018 | |
| 1019 | /* |
| 1020 | * Remove ourselves from the waitlist. |
| 1021 | * |
| 1022 | * This is used if we queued ourselves because we thought we needed to sleep |
| 1023 | * but, after further checking, we discovered that we don't actually need to |
| 1024 | * do so. |
| 1025 | */ |
| 1026 | static void |
| 1027 | LWLockDequeueSelf(LWLock *lock) |
| 1028 | { |
| 1029 | bool found = false; |
| 1030 | proclist_mutable_iter iter; |
| 1031 | |
| 1032 | #ifdef LWLOCK_STATS |
| 1033 | lwlock_stats *lwstats; |
| 1034 | |
| 1035 | lwstats = get_lwlock_stats_entry(lock); |
| 1036 | |
| 1037 | lwstats->dequeue_self_count++; |
| 1038 | #endif |
| 1039 | |
| 1040 | LWLockWaitListLock(lock); |
| 1041 | |
| 1042 | /* |
| 1043 | * We can't just remove ourselves from the list; we need to iterate over |
| 1044 | * all entries, as somebody else could have dequeued us already. |
| 1045 | */ |
| 1046 | proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) |
| 1047 | { |
| 1048 | if (iter.cur == MyProc->pgprocno) |
| 1049 | { |
| 1050 | found = true; |
| 1051 | proclist_delete(&lock->waiters, iter.cur, lwWaitLink); |
| 1052 | break; |
| 1053 | } |
| 1054 | } |
| 1055 | |
| 1056 | if (proclist_is_empty(&lock->waiters) && |
| 1057 | (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0) |
| 1058 | { |
| 1059 | pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS); |
| 1060 | } |
| 1061 | |
| 1062 | /* XXX: combine with fetch_and above? */ |
| 1063 | LWLockWaitListUnlock(lock); |
| 1064 | |
| 1065 | /* clear waiting state again, nice for debugging */ |
| 1066 | if (found) |
| 1067 | MyProc->lwWaiting = false; |
| 1068 | else |
| 1069 | { |
| 1070 | int extraWaits = 0; |
| 1071 | |
| 1072 | /* |
| 1073 | * Somebody else dequeued us and has or will wake us up. Deal with the |
| 1074 | * superfluous absorption of a wakeup. |
| 1075 | */ |
| 1076 | |
| 1077 | /* |
| 1078 | * Reset releaseOk if somebody woke us before we removed ourselves - |
| 1079 | * they'll have set it to false. |
| 1080 | */ |
| 1081 | pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| 1082 | |
| 1083 | /* |
| 1084 | * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would |
| 1085 | * get reset at some inconvenient point later. Most of the time this |
| 1086 | * will immediately return. |
| 1087 | */ |
| 1088 | for (;;) |
| 1089 | { |
| 1090 | PGSemaphoreLock(MyProc->sem); |
| 1091 | if (!MyProc->lwWaiting) |
| 1092 | break; |
| 1093 | extraWaits++; |
| 1094 | } |
| 1095 | |
| 1096 | /* |
| 1097 | * Fix the process wait semaphore's count for any absorbed wakeups. |
| 1098 | */ |
| 1099 | while (extraWaits-- > 0) |
| 1100 | PGSemaphoreUnlock(MyProc->sem); |
| 1101 | } |
| 1102 | |
| 1103 | #ifdef LOCK_DEBUG |
| 1104 | { |
| 1105 | /* not waiting anymore */ |
| 1106 | uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| 1107 | |
| 1108 | Assert(nwaiters < MAX_BACKENDS); |
| 1109 | } |
| 1110 | #endif |
| 1111 | } |
| 1112 | |
| 1113 | /* |
| 1114 | * LWLockAcquire - acquire a lightweight lock in the specified mode |
| 1115 | * |
| 1116 | * If the lock is not available, sleep until it is. Returns true if the lock |
| 1117 | * was available immediately, false if we had to sleep. |
| 1118 | * |
| 1119 | * Side effect: cancel/die interrupts are held off until lock release. |
| 1120 | */ |
| 1121 | bool |
| 1122 | LWLockAcquire(LWLock *lock, LWLockMode mode) |
| 1123 | { |
| 1124 | PGPROC *proc = MyProc; |
| 1125 | bool result = true; |
| 1126 | int extraWaits = 0; |
| 1127 | #ifdef LWLOCK_STATS |
| 1128 | lwlock_stats *lwstats; |
| 1129 | |
| 1130 | lwstats = get_lwlock_stats_entry(lock); |
| 1131 | #endif |
| 1132 | |
| 1133 | AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); |
| 1134 | |
| 1135 | PRINT_LWDEBUG("LWLockAcquire" , lock, mode); |
| 1136 | |
| 1137 | #ifdef LWLOCK_STATS |
| 1138 | /* Count lock acquisition attempts */ |
| 1139 | if (mode == LW_EXCLUSIVE) |
| 1140 | lwstats->ex_acquire_count++; |
| 1141 | else |
| 1142 | lwstats->sh_acquire_count++; |
| 1143 | #endif /* LWLOCK_STATS */ |
| 1144 | |
| 1145 | /* |
| 1146 | * We can't wait if we haven't got a PGPROC. This should only occur |
| 1147 | * during bootstrap or shared memory initialization. Put an Assert here |
| 1148 | * to catch unsafe coding practices. |
| 1149 | */ |
| 1150 | Assert(!(proc == NULL && IsUnderPostmaster)); |
| 1151 | |
| 1152 | /* Ensure we will have room to remember the lock */ |
| 1153 | if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) |
| 1154 | elog(ERROR, "too many LWLocks taken" ); |
| 1155 | |
| 1156 | /* |
| 1157 | * Lock out cancel/die interrupts until we exit the code section protected |
| 1158 | * by the LWLock. This ensures that interrupts will not interfere with |
| 1159 | * manipulations of data structures in shared memory. |
| 1160 | */ |
| 1161 | HOLD_INTERRUPTS(); |
| 1162 | |
| 1163 | /* |
| 1164 | * Loop here to try to acquire lock after each time we are signaled by |
| 1165 | * LWLockRelease. |
| 1166 | * |
| 1167 | * NOTE: it might seem better to have LWLockRelease actually grant us the |
| 1168 | * lock, rather than retrying and possibly having to go back to sleep. But |
| 1169 | * in practice that is no good because it means a process swap for every |
| 1170 | * lock acquisition when two or more processes are contending for the same |
| 1171 | * lock. Since LWLocks are normally used to protect not-very-long |
| 1172 | * sections of computation, a process needs to be able to acquire and |
| 1173 | * release the same lock many times during a single CPU time slice, even |
| 1174 | * in the presence of contention. The efficiency of being able to do that |
| 1175 | * outweighs the inefficiency of sometimes wasting a process dispatch |
| 1176 | * cycle because the lock is not free when a released waiter finally gets |
| 1177 | * to run. See pgsql-hackers archives for 29-Dec-01. |
| 1178 | */ |
| 1179 | for (;;) |
| 1180 | { |
| 1181 | bool mustwait; |
| 1182 | |
| 1183 | /* |
| 1184 | * Try to grab the lock the first time, we're not in the waitqueue |
| 1185 | * yet/anymore. |
| 1186 | */ |
| 1187 | mustwait = LWLockAttemptLock(lock, mode); |
| 1188 | |
| 1189 | if (!mustwait) |
| 1190 | { |
| 1191 | LOG_LWDEBUG("LWLockAcquire" , lock, "immediately acquired lock" ); |
| 1192 | break; /* got the lock */ |
| 1193 | } |
| 1194 | |
| 1195 | /* |
| 1196 | * Ok, at this point we couldn't grab the lock on the first try. We |
| 1197 | * cannot simply queue ourselves to the end of the list and wait to be |
| 1198 | * woken up because by now the lock could long have been released. |
| 1199 | * Instead add us to the queue and try to grab the lock again. If we |
| 1200 | * succeed we need to revert the queuing and be happy, otherwise we |
| 1201 | * recheck the lock. If we still couldn't grab it, we know that the |
| 1202 | * other locker will see our queue entries when releasing since they |
| 1203 | * existed before we checked for the lock. |
| 1204 | */ |
| 1205 | |
| 1206 | /* add to the queue */ |
| 1207 | LWLockQueueSelf(lock, mode); |
| 1208 | |
| 1209 | /* we're now guaranteed to be woken up if necessary */ |
| 1210 | mustwait = LWLockAttemptLock(lock, mode); |
| 1211 | |
| 1212 | /* ok, grabbed the lock the second time round, need to undo queueing */ |
| 1213 | if (!mustwait) |
| 1214 | { |
| 1215 | LOG_LWDEBUG("LWLockAcquire" , lock, "acquired, undoing queue" ); |
| 1216 | |
| 1217 | LWLockDequeueSelf(lock); |
| 1218 | break; |
| 1219 | } |
| 1220 | |
| 1221 | /* |
| 1222 | * Wait until awakened. |
| 1223 | * |
| 1224 | * Since we share the process wait semaphore with the regular lock |
| 1225 | * manager and ProcWaitForSignal, and we may need to acquire an LWLock |
| 1226 | * while one of those is pending, it is possible that we get awakened |
| 1227 | * for a reason other than being signaled by LWLockRelease. If so, |
| 1228 | * loop back and wait again. Once we've gotten the LWLock, |
| 1229 | * re-increment the sema by the number of additional signals received, |
| 1230 | * so that the lock manager or signal manager will see the received |
| 1231 | * signal when it next waits. |
| 1232 | */ |
| 1233 | LOG_LWDEBUG("LWLockAcquire" , lock, "waiting" ); |
| 1234 | |
| 1235 | #ifdef LWLOCK_STATS |
| 1236 | lwstats->block_count++; |
| 1237 | #endif |
| 1238 | |
| 1239 | LWLockReportWaitStart(lock); |
| 1240 | TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); |
| 1241 | |
| 1242 | for (;;) |
| 1243 | { |
| 1244 | PGSemaphoreLock(proc->sem); |
| 1245 | if (!proc->lwWaiting) |
| 1246 | break; |
| 1247 | extraWaits++; |
| 1248 | } |
| 1249 | |
| 1250 | /* Retrying, allow LWLockRelease to release waiters again. */ |
| 1251 | pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| 1252 | |
| 1253 | #ifdef LOCK_DEBUG |
| 1254 | { |
| 1255 | /* not waiting anymore */ |
| 1256 | uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| 1257 | |
| 1258 | Assert(nwaiters < MAX_BACKENDS); |
| 1259 | } |
| 1260 | #endif |
| 1261 | |
| 1262 | TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); |
| 1263 | LWLockReportWaitEnd(); |
| 1264 | |
| 1265 | LOG_LWDEBUG("LWLockAcquire" , lock, "awakened" ); |
| 1266 | |
| 1267 | /* Now loop back and try to acquire lock again. */ |
| 1268 | result = false; |
| 1269 | } |
| 1270 | |
| 1271 | TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode); |
| 1272 | |
| 1273 | /* Add lock to list of locks held by this backend */ |
| 1274 | held_lwlocks[num_held_lwlocks].lock = lock; |
| 1275 | held_lwlocks[num_held_lwlocks++].mode = mode; |
| 1276 | |
| 1277 | /* |
| 1278 | * Fix the process wait semaphore's count for any absorbed wakeups. |
| 1279 | */ |
| 1280 | while (extraWaits-- > 0) |
| 1281 | PGSemaphoreUnlock(proc->sem); |
| 1282 | |
| 1283 | return result; |
| 1284 | } |
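/*
 * For reference, the canonical calling pattern is simply (a sketch; MyLock
 * stands for whichever LWLock protects the data of interest):
 *
 *     LWLockAcquire(MyLock, LW_EXCLUSIVE);
 *     ... read and modify the shared structure ...
 *     LWLockRelease(MyLock);
 *
 * Callers that only read the structure can pass LW_SHARED instead and thus
 * run concurrently with one another.
 */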
| 1285 | |
| 1286 | /* |
| 1287 | * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode |
| 1288 | * |
| 1289 | * If the lock is not available, return false with no side-effects. |
| 1290 | * |
| 1291 | * If successful, cancel/die interrupts are held off until lock release. |
| 1292 | */ |
| 1293 | bool |
| 1294 | LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) |
| 1295 | { |
| 1296 | bool mustwait; |
| 1297 | |
| 1298 | AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); |
| 1299 | |
| 1300 | PRINT_LWDEBUG("LWLockConditionalAcquire" , lock, mode); |
| 1301 | |
| 1302 | /* Ensure we will have room to remember the lock */ |
| 1303 | if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) |
| 1304 | elog(ERROR, "too many LWLocks taken" ); |
| 1305 | |
| 1306 | /* |
| 1307 | * Lock out cancel/die interrupts until we exit the code section protected |
| 1308 | * by the LWLock. This ensures that interrupts will not interfere with |
| 1309 | * manipulations of data structures in shared memory. |
| 1310 | */ |
| 1311 | HOLD_INTERRUPTS(); |
| 1312 | |
| 1313 | /* Check for the lock */ |
| 1314 | mustwait = LWLockAttemptLock(lock, mode); |
| 1315 | |
| 1316 | if (mustwait) |
| 1317 | { |
| 1318 | /* Failed to get lock, so release interrupt holdoff */ |
| 1319 | RESUME_INTERRUPTS(); |
| 1320 | |
| 1321 | LOG_LWDEBUG("LWLockConditionalAcquire" , lock, "failed" ); |
| 1322 | TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode); |
| 1323 | } |
| 1324 | else |
| 1325 | { |
| 1326 | /* Add lock to list of locks held by this backend */ |
| 1327 | held_lwlocks[num_held_lwlocks].lock = lock; |
| 1328 | held_lwlocks[num_held_lwlocks++].mode = mode; |
| 1329 | TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode); |
| 1330 | } |
| 1331 | return !mustwait; |
| 1332 | } |
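/*
 * Typical use, sketched (do_something_else is a hypothetical fallback):
 *
 *     if (LWLockConditionalAcquire(MyLock, LW_EXCLUSIVE))
 *     {
 *         ... update the shared state ...
 *         LWLockRelease(MyLock);
 *     }
 *     else
 *         do_something_else();    instead of blocking on the lock
 */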
| 1333 | |
| 1334 | /* |
| 1335 | * LWLockAcquireOrWait - Acquire lock, or wait until it's free |
| 1336 | * |
| 1337 | * The semantics of this function are a bit funky. If the lock is currently |
| 1338 | * free, it is acquired in the given mode, and the function returns true. If |
| 1339 | * the lock isn't immediately free, the function waits until it is released |
| 1340 | * and returns false, but does not acquire the lock. |
| 1341 | * |
| 1342 | * This is currently used for WALWriteLock: when a backend flushes the WAL, |
| 1343 | * holding WALWriteLock, it can flush the commit records of many other |
| 1344 | * backends as a side-effect. Those other backends need to wait until the |
| 1345 | * flush finishes, but don't need to acquire the lock anymore. They can just |
| 1346 | * wake up, observe that their records have already been flushed, and return. |
| 1347 | */ |
| 1348 | bool |
| 1349 | LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) |
| 1350 | { |
| 1351 | PGPROC *proc = MyProc; |
| 1352 | bool mustwait; |
| 1353 | int extraWaits = 0; |
| 1354 | #ifdef LWLOCK_STATS |
| 1355 | lwlock_stats *lwstats; |
| 1356 | |
| 1357 | lwstats = get_lwlock_stats_entry(lock); |
| 1358 | #endif |
| 1359 | |
| 1360 | Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); |
| 1361 | |
| 1362 | PRINT_LWDEBUG("LWLockAcquireOrWait" , lock, mode); |
| 1363 | |
| 1364 | /* Ensure we will have room to remember the lock */ |
| 1365 | if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) |
| 1366 | elog(ERROR, "too many LWLocks taken" ); |
| 1367 | |
| 1368 | /* |
| 1369 | * Lock out cancel/die interrupts until we exit the code section protected |
| 1370 | * by the LWLock. This ensures that interrupts will not interfere with |
| 1371 | * manipulations of data structures in shared memory. |
| 1372 | */ |
| 1373 | HOLD_INTERRUPTS(); |
| 1374 | |
| 1375 | /* |
| 1376 | * NB: We're using nearly the same twice-in-a-row lock acquisition |
| 1377 | * protocol as LWLockAcquire(). Check its comments for details. |
| 1378 | */ |
| 1379 | mustwait = LWLockAttemptLock(lock, mode); |
| 1380 | |
| 1381 | if (mustwait) |
| 1382 | { |
| 1383 | LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); |
| 1384 | |
| 1385 | mustwait = LWLockAttemptLock(lock, mode); |
| 1386 | |
| 1387 | if (mustwait) |
| 1388 | { |
| 1389 | /* |
| 1390 | * Wait until awakened. Like in LWLockAcquire, be prepared for |
| 1391 | * bogus wakeups, because we share the semaphore with |
| 1392 | * ProcWaitForSignal. |
| 1393 | */ |
| 1394 | LOG_LWDEBUG("LWLockAcquireOrWait" , lock, "waiting" ); |
| 1395 | |
| 1396 | #ifdef LWLOCK_STATS |
| 1397 | lwstats->block_count++; |
| 1398 | #endif |
| 1399 | |
| 1400 | LWLockReportWaitStart(lock); |
| 1401 | TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); |
| 1402 | |
| 1403 | for (;;) |
| 1404 | { |
| 1405 | PGSemaphoreLock(proc->sem); |
| 1406 | if (!proc->lwWaiting) |
| 1407 | break; |
| 1408 | extraWaits++; |
| 1409 | } |
| 1410 | |
| 1411 | #ifdef LOCK_DEBUG |
| 1412 | { |
| 1413 | /* not waiting anymore */ |
| 1414 | uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| 1415 | |
| 1416 | Assert(nwaiters < MAX_BACKENDS); |
| 1417 | } |
| 1418 | #endif |
| 1419 | TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); |
| 1420 | LWLockReportWaitEnd(); |
| 1421 | |
| 1422 | LOG_LWDEBUG("LWLockAcquireOrWait" , lock, "awakened" ); |
| 1423 | } |
| 1424 | else |
| 1425 | { |
| 1426 | LOG_LWDEBUG("LWLockAcquireOrWait" , lock, "acquired, undoing queue" ); |
| 1427 | |
| 1428 | /* |
| 1429 | * Got lock in the second attempt, undo queueing. We need to treat |
| 1430 | * this as having successfully acquired the lock, otherwise we'd |
| 1431 | * not necessarily wake up people we've prevented from acquiring |
| 1432 | * the lock. |
| 1433 | */ |
| 1434 | LWLockDequeueSelf(lock); |
| 1435 | } |
| 1436 | } |
| 1437 | |
| 1438 | /* |
| 1439 | * Fix the process wait semaphore's count for any absorbed wakeups. |
| 1440 | */ |
| 1441 | while (extraWaits-- > 0) |
| 1442 | PGSemaphoreUnlock(proc->sem); |
| 1443 | |
| 1444 | if (mustwait) |
| 1445 | { |
| 1446 | /* Failed to get lock, so release interrupt holdoff */ |
| 1447 | RESUME_INTERRUPTS(); |
| 1448 | LOG_LWDEBUG("LWLockAcquireOrWait" , lock, "failed" ); |
| 1449 | TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode); |
| 1450 | } |
| 1451 | else |
| 1452 | { |
| 1453 | LOG_LWDEBUG("LWLockAcquireOrWait" , lock, "succeeded" ); |
| 1454 | /* Add lock to list of locks held by this backend */ |
| 1455 | held_lwlocks[num_held_lwlocks].lock = lock; |
| 1456 | held_lwlocks[num_held_lwlocks++].mode = mode; |
| 1457 | TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode); |
| 1458 | } |
| 1459 | |
| 1460 | return !mustwait; |
| 1461 | } |
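/*
 * The WAL-flush usage described above looks roughly like this sketch (the
 * real consumer is XLogFlush(); the bodies here are purely illustrative):
 *
 *     if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
 *     {
 *         ... we hold the lock: flush WAL up to the requested record ...
 *         LWLockRelease(WALWriteLock);
 *     }
 *     else
 *     {
 *         ... someone else held the lock and has since released it;
 *         ... recheck whether our record was already flushed and return
 *     }
 */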
| 1462 | |
| 1463 | /* |
| 1464 | * Does the lwlock in its current state need to wait for the variable value to |
| 1465 | * change? |
| 1466 | * |
| 1467 | * If we don't need to wait, and it's because the value of the variable has |
| 1468 | * changed, store the current value in newval. |
| 1469 | * |
| 1470 | * *result is set to true if the lock was free, and false otherwise. |
| 1471 | */ |
| 1472 | static bool |
| 1473 | LWLockConflictsWithVar(LWLock *lock, |
| 1474 | uint64 *valptr, uint64 oldval, uint64 *newval, |
| 1475 | bool *result) |
| 1476 | { |
| 1477 | bool mustwait; |
| 1478 | uint64 value; |
| 1479 | |
| 1480 | /* |
| 1481 | * Test first to see if the slot is free right now. |
| 1482 | * |
| 1483 | * XXX: the caller uses a spinlock before this, so we don't need a memory |
| 1484 | * barrier here as far as the current usage is concerned. But that might |
| 1485 | * not be safe in general. |
| 1486 | */ |
| 1487 | mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0; |
| 1488 | |
| 1489 | if (!mustwait) |
| 1490 | { |
| 1491 | *result = true; |
| 1492 | return false; |
| 1493 | } |
| 1494 | |
| 1495 | *result = false; |
| 1496 | |
| 1497 | /* |
| 1498 | * Read value using the lwlock's wait list lock, as we can't generally |
| 1499 | * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to |
| 1500 | * do atomic 64 bit reads/writes the spinlock should be optimized away. |
| 1501 | */ |
| 1502 | LWLockWaitListLock(lock); |
| 1503 | value = *valptr; |
| 1504 | LWLockWaitListUnlock(lock); |
| 1505 | |
| 1506 | if (value != oldval) |
| 1507 | { |
| 1508 | mustwait = false; |
| 1509 | *newval = value; |
| 1510 | } |
| 1511 | else |
| 1512 | { |
| 1513 | mustwait = true; |
| 1514 | } |
| 1515 | |
| 1516 | return mustwait; |
| 1517 | } |
| 1518 | |
| 1519 | /* |
| 1520 | * LWLockWaitForVar - Wait until lock is free, or a variable is updated. |
| 1521 | * |
| 1522 | * If the lock is held and *valptr equals oldval, waits until the lock is |
| 1523 | * either freed, or the lock holder updates *valptr by calling |
| 1524 | * LWLockUpdateVar. If the lock is free on exit (immediately or after |
| 1525 | * waiting), returns true. If the lock is still held, but *valptr no longer |
| 1526 | * matches oldval, returns false and sets *newval to the current value in |
| 1527 | * *valptr. |
| 1528 | * |
| 1529 | * Note: this function ignores shared lock holders; if the lock is held |
| 1530 | * in shared mode, returns 'true'. |
| 1531 | */ |
| 1532 | bool |
| 1533 | LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) |
| 1534 | { |
| 1535 | PGPROC *proc = MyProc; |
| 1536 | int extraWaits = 0; |
| 1537 | bool result = false; |
| 1538 | #ifdef LWLOCK_STATS |
| 1539 | lwlock_stats *lwstats; |
| 1540 | |
| 1541 | lwstats = get_lwlock_stats_entry(lock); |
| 1542 | #endif |
| 1543 | |
| 1544 | PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE); |
| 1545 | |
| 1546 | /* |
| 1547 | * Lock out cancel/die interrupts while we sleep on the lock. There is no |
| 1548 | * cleanup mechanism to remove us from the wait queue if we got |
| 1549 | * interrupted. |
| 1550 | */ |
| 1551 | HOLD_INTERRUPTS(); |
| 1552 | |
| 1553 | /* |
| 1554 | * Loop here to check the lock's status after each time we are signaled. |
| 1555 | */ |
| 1556 | for (;;) |
| 1557 | { |
| 1558 | bool mustwait; |
| 1559 | |
| 1560 | mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, |
| 1561 | &result); |
| 1562 | |
| 1563 | if (!mustwait) |
| 1564 | break; /* the lock was free or value didn't match */ |
| 1565 | |
| 1566 | /* |
| 1567 | * Add myself to the wait queue. Note that this is racy: somebody else |
| 1568 | * could wake us up before we're finished queuing. NB: We're using nearly |
| 1569 | * the same twice-in-a-row lock acquisition protocol as |
| 1570 | * LWLockAcquire(). Check its comments for details. The only |
| 1571 | * difference is that we also have to check the variable's value when |
| 1572 | * checking the state of the lock. |
| 1573 | */ |
| 1574 | LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); |
| 1575 | |
| 1576 | /* |
| 1577 | * Set RELEASE_OK flag, to make sure we get woken up as soon as the |
| 1578 | * lock is released. |
| 1579 | */ |
| 1580 | pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); |
| 1581 | |
| 1582 | /* |
| 1583 | * We're now guaranteed to be woken up if necessary. Recheck the lock's |
| 1584 | * and the variable's state. |
| 1585 | */ |
| 1586 | mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, |
| 1587 | &result); |
| 1588 | |
| 1589 | /* Ok, no conflict after we queued ourselves. Undo queueing. */ |
| 1590 | if (!mustwait) |
| 1591 | { |
| 1592 | LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue"); |
| 1593 | |
| 1594 | LWLockDequeueSelf(lock); |
| 1595 | break; |
| 1596 | } |
| 1597 | |
| 1598 | /* |
| 1599 | * Wait until awakened. |
| 1600 | * |
| 1601 | * Since we share the process wait semaphore with the regular lock |
| 1602 | * manager and ProcWaitForSignal, and we may need to acquire an LWLock |
| 1603 | * while one of those is pending, it is possible that we get awakened |
| 1604 | * for a reason other than being signaled by LWLockRelease. If so, |
| 1605 | * loop back and wait again. Once we've gotten the LWLock, |
| 1606 | * re-increment the sema by the number of additional signals received, |
| 1607 | * so that the lock manager or signal manager will see the received |
| 1608 | * signal when it next waits. |
| 1609 | */ |
| 1610 | LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting"); |
| 1611 | |
| 1612 | #ifdef LWLOCK_STATS |
| 1613 | lwstats->block_count++; |
| 1614 | #endif |
| 1615 | |
| 1616 | LWLockReportWaitStart(lock); |
| 1617 | TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE); |
| 1618 | |
| 1619 | for (;;) |
| 1620 | { |
| 1621 | PGSemaphoreLock(proc->sem); |
| 1622 | if (!proc->lwWaiting) |
| 1623 | break; |
| 1624 | extraWaits++; |
| 1625 | } |
| 1626 | |
| 1627 | #ifdef LOCK_DEBUG |
| 1628 | { |
| 1629 | /* not waiting anymore */ |
| 1630 | uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); |
| 1631 | |
| 1632 | Assert(nwaiters < MAX_BACKENDS); |
| 1633 | } |
| 1634 | #endif |
| 1635 | |
| 1636 | TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE); |
| 1637 | LWLockReportWaitEnd(); |
| 1638 | |
| 1639 | LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened"); |
| 1640 | |
| 1641 | /* Now loop back and check the status of the lock again. */ |
| 1642 | } |
| 1643 | |
| 1644 | TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), LW_EXCLUSIVE); |
| 1645 | |
| 1646 | /* |
| 1647 | * Fix the process wait semaphore's count for any absorbed wakeups. |
| 1648 | */ |
| 1649 | while (extraWaits-- > 0) |
| 1650 | PGSemaphoreUnlock(proc->sem); |
| 1651 | |
| 1652 | /* |
| 1653 | * Now okay to allow cancel/die interrupts. |
| 1654 | */ |
| 1655 | RESUME_INTERRUPTS(); |
| 1656 | |
| 1657 | return result; |
| 1658 | } |
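| | |
| | #ifdef NOT_USED |
| | /* |
| |  * Illustrative sketch only (guarded by NOT_USED, never compiled): a |
| |  * hypothetical caller of LWLockWaitForVar().  'progress' is assumed to be |
| |  * a shared counter protected by 'lock' and advanced by the lock holder via |
| |  * LWLockUpdateVar(); neither name refers to a real PostgreSQL object.  We |
| |  * wait until the holder either releases the lock or has advanced the |
| |  * counter past 'upto'. |
| |  */ |
| | static void |
| | example_wait_for_progress(LWLock *lock, uint64 *progress, uint64 upto) |
| | { |
| | 	uint64		seen = 0; |
| | |
| | 	/* returns true once the lock is free; false when the value changed */ |
| | 	while (!LWLockWaitForVar(lock, progress, seen, &seen)) |
| | 	{ |
| | 		if (seen >= upto) |
| | 			break;		/* holder has progressed far enough for us */ |
| | 	} |
| | } |
| | #endif							/* NOT_USED */ |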
| 1659 | |
| 1660 | |
| 1661 | /* |
| 1662 | * LWLockUpdateVar - Update a variable and wake up waiters atomically |
| 1663 | * |
| 1664 | * Sets *valptr to 'val', and wakes up all processes waiting for us with |
| 1665 | * LWLockWaitForVar(). Setting the value and waking up the processes happen |
| 1666 | * atomically so that any process calling LWLockWaitForVar() on the same lock |
| 1667 | * is guaranteed to see the new value, and act accordingly. |
| 1668 | * |
| 1669 | * The caller must be holding the lock in exclusive mode. |
| 1670 | */ |
| 1671 | void |
| 1672 | LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) |
| 1673 | { |
| 1674 | proclist_head wakeup; |
| 1675 | proclist_mutable_iter iter; |
| 1676 | |
| 1677 | PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE); |
| 1678 | |
| 1679 | proclist_init(&wakeup); |
| 1680 | |
| 1681 | LWLockWaitListLock(lock); |
| 1682 | |
| 1683 | Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE); |
| 1684 | |
| 1685 | /* Update the lock's value */ |
| 1686 | *valptr = val; |
| 1687 | |
| 1688 | /* |
| 1689 | * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken |
| 1690 | * up. They are always in the front of the queue. |
| 1691 | */ |
| 1692 | proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) |
| 1693 | { |
| 1694 | PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| 1695 | |
| 1696 | if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) |
| 1697 | break; |
| 1698 | |
| 1699 | proclist_delete(&lock->waiters, iter.cur, lwWaitLink); |
| 1700 | proclist_push_tail(&wakeup, iter.cur, lwWaitLink); |
| 1701 | } |
| 1702 | |
| 1703 | /* We are done updating shared state of the lock itself. */ |
| 1704 | LWLockWaitListUnlock(lock); |
| 1705 | |
| 1706 | /* |
| 1707 | * Awaken any waiters I removed from the queue. |
| 1708 | */ |
| 1709 | proclist_foreach_modify(iter, &wakeup, lwWaitLink) |
| 1710 | { |
| 1711 | PGPROC *waiter = GetPGProcByNumber(iter.cur); |
| 1712 | |
| 1713 | proclist_delete(&wakeup, iter.cur, lwWaitLink); |
| 1714 | /* check comment in LWLockWakeup() about this barrier */ |
| 1715 | pg_write_barrier(); |
| 1716 | waiter->lwWaiting = false; |
| 1717 | PGSemaphoreUnlock(waiter->sem); |
| 1718 | } |
| 1719 | } |
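| | |
| | #ifdef NOT_USED |
| | /* |
| |  * Illustrative sketch only (guarded by NOT_USED, never compiled): the |
| |  * holder-side counterpart of the LWLockWaitForVar() sketch above.  The |
| |  * exclusive holder publishes a new value of the assumed 'progress' counter |
| |  * without releasing the lock, waking any LWLockWaitForVar() waiters. |
| |  */ |
| | static void |
| | example_advance_progress(LWLock *lock, uint64 *progress, uint64 newpos) |
| | { |
| | 	/* LWLockUpdateVar() requires the lock to be held in exclusive mode */ |
| | 	Assert(LWLockHeldByMeInMode(lock, LW_EXCLUSIVE)); |
| | |
| | 	LWLockUpdateVar(lock, progress, newpos); |
| | } |
| | #endif							/* NOT_USED */ |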
| 1720 | |
| 1721 | |
| 1722 | /* |
| 1723 | * LWLockRelease - release a previously acquired lock |
| 1724 | */ |
| 1725 | void |
| 1726 | LWLockRelease(LWLock *lock) |
| 1727 | { |
| 1728 | LWLockMode mode; |
| 1729 | uint32 oldstate; |
| 1730 | bool check_waiters; |
| 1731 | int i; |
| 1732 | |
| 1733 | /* |
| 1734 | * Remove lock from list of locks held. Usually, but not always, it will |
| 1735 | * be the latest-acquired lock; so search array backwards. |
| 1736 | */ |
| 1737 | for (i = num_held_lwlocks; --i >= 0;) |
| 1738 | if (lock == held_lwlocks[i].lock) |
| 1739 | break; |
| 1740 | |
| 1741 | if (i < 0) |
| 1742 | elog(ERROR, "lock %s is not held", T_NAME(lock)); |
| 1743 | |
| 1744 | mode = held_lwlocks[i].mode; |
| 1745 | |
| 1746 | num_held_lwlocks--; |
| 1747 | for (; i < num_held_lwlocks; i++) |
| 1748 | held_lwlocks[i] = held_lwlocks[i + 1]; |
| 1749 | |
| 1750 | PRINT_LWDEBUG("LWLockRelease", lock, mode); |
| 1751 | |
| 1752 | /* |
| 1753 | * Release my hold on the lock; after that it can immediately be acquired |
| 1754 | * by others, even if we still have to wake up other waiters. |
| 1755 | */ |
| 1756 | if (mode == LW_EXCLUSIVE) |
| 1757 | oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE); |
| 1758 | else |
| 1759 | oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED); |
| 1760 | |
| 1761 | /* nobody else can have that kind of lock */ |
| 1762 | Assert(!(oldstate & LW_VAL_EXCLUSIVE)); |
| 1763 | |
| 1764 | |
| 1765 | /* |
| 1766 | * Only wake up waiters if someone is waiting, the lock is now free, and |
| 1767 | * RELEASE_OK is set; if it's clear, earlier wakeups are still pending. |
| 1768 | */ |
| 1769 | if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) == |
| 1770 | (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) && |
| 1771 | (oldstate & LW_LOCK_MASK) == 0) |
| 1772 | check_waiters = true; |
| 1773 | else |
| 1774 | check_waiters = false; |
| 1775 | |
| 1776 | /* |
| 1777 | * As waking up waiters requires the spinlock to be acquired, only do so |
| 1778 | * if necessary. |
| 1779 | */ |
| 1780 | if (check_waiters) |
| 1781 | { |
| 1782 | /* XXX: remove before commit? */ |
| 1783 | LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters"); |
| 1784 | LWLockWakeup(lock); |
| 1785 | } |
| 1786 | |
| 1787 | TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock)); |
| 1788 | |
| 1789 | /* |
| 1790 | * Now okay to allow cancel/die interrupts. |
| 1791 | */ |
| 1792 | RESUME_INTERRUPTS(); |
| 1793 | } |
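| | |
| | #ifdef NOT_USED |
| | /* |
| |  * Illustrative sketch only (guarded by NOT_USED, never compiled): the |
| |  * usual acquire/release bracket around access to shared state.  The |
| |  * 'shared_counter' argument stands in for any structure protected by |
| |  * 'lock'; it is not part of the lock manager. |
| |  */ |
| | static void |
| | example_locked_increment(LWLock *lock, uint64 *shared_counter) |
| | { |
| | 	/* writers need exclusive mode */ |
| | 	LWLockAcquire(lock, LW_EXCLUSIVE); |
| | 	(*shared_counter)++; |
| | 	LWLockRelease(lock); |
| | } |
| | |
| | static uint64 |
| | example_locked_read(LWLock *lock, uint64 *shared_counter) |
| | { |
| | 	uint64		val; |
| | |
| | 	/* read-only access can use shared mode */ |
| | 	LWLockAcquire(lock, LW_SHARED); |
| | 	val = *shared_counter; |
| | 	LWLockRelease(lock); |
| | |
| | 	return val; |
| | } |
| | #endif							/* NOT_USED */ |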
| 1794 | |
| 1795 | /* |
| 1796 | * LWLockReleaseClearVar - release a previously acquired lock, reset variable |
| 1797 | */ |
| 1798 | void |
| 1799 | LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val) |
| 1800 | { |
| 1801 | LWLockWaitListLock(lock); |
| 1802 | |
| 1803 | /* |
| 1804 | * Set the variable's value before releasing the lock; that prevents a |
| 1805 | * race condition wherein a new locker acquires the lock, but hasn't yet |
| 1806 | * set the variable's value. |
| 1807 | */ |
| 1808 | *valptr = val; |
| 1809 | LWLockWaitListUnlock(lock); |
| 1810 | |
| 1811 | LWLockRelease(lock); |
| 1812 | } |
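| | |
| | #ifdef NOT_USED |
| | /* |
| |  * Illustrative sketch only (guarded by NOT_USED, never compiled): release |
| |  * path for a lock/variable pair.  Resetting the assumed 'progress' counter |
| |  * to 0 (an arbitrary "no holder" value for this sketch) happens atomically |
| |  * with waking waiters, so LWLockWaitForVar() callers never see a stale |
| |  * value left behind by this holder. |
| |  */ |
| | static void |
| | example_release_progress(LWLock *lock, uint64 *progress) |
| | { |
| | 	LWLockReleaseClearVar(lock, progress, 0); |
| | } |
| | #endif							/* NOT_USED */ |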
| 1813 | |
| 1814 | |
| 1815 | /* |
| 1816 | * LWLockReleaseAll - release all currently-held locks |
| 1817 | * |
| 1818 | * Used to clean up after ereport(ERROR). An important difference between this |
| 1819 | * function and retail LWLockRelease calls is that InterruptHoldoffCount is |
| 1820 | * unchanged by this operation. This is necessary since InterruptHoldoffCount |
| 1821 | * has been set to an appropriate level earlier in error recovery. We could |
| 1822 | * decrement it below zero if we allow it to drop for each released lock! |
| 1823 | */ |
| 1824 | void |
| 1825 | LWLockReleaseAll(void) |
| 1826 | { |
| 1827 | while (num_held_lwlocks > 0) |
| 1828 | { |
| 1829 | HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ |
| 1830 | |
| 1831 | LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock); |
| 1832 | } |
| 1833 | } |
| 1834 | |
| 1835 | |
| 1836 | /* |
| 1837 | * LWLockHeldByMe - test whether my process holds a lock in any mode |
| 1838 | * |
| 1839 | * This is meant as debug support only. |
| 1840 | */ |
| 1841 | bool |
| 1842 | LWLockHeldByMe(LWLock *l) |
| 1843 | { |
| 1844 | int i; |
| 1845 | |
| 1846 | for (i = 0; i < num_held_lwlocks; i++) |
| 1847 | { |
| 1848 | if (held_lwlocks[i].lock == l) |
| 1849 | return true; |
| 1850 | } |
| 1851 | return false; |
| 1852 | } |
| 1853 | |
| 1854 | /* |
| 1855 | * LWLockHeldByMeInMode - test whether my process holds a lock in given mode |
| 1856 | * |
| 1857 | * This is meant as debug support only. |
| 1858 | */ |
| 1859 | bool |
| 1860 | LWLockHeldByMeInMode(LWLock *l, LWLockMode mode) |
| 1861 | { |
| 1862 | int i; |
| 1863 | |
| 1864 | for (i = 0; i < num_held_lwlocks; i++) |
| 1865 | { |
| 1866 | if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode) |
| 1867 | return true; |
| 1868 | } |
| 1869 | return false; |
| 1870 | } |
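| | |
| | #ifdef NOT_USED |
| | /* |
| |  * Illustrative sketch only (guarded by NOT_USED, never compiled): the |
| |  * typical debug-support use of the two checks above is to assert the |
| |  * expected lock state on entry to a function that relies on it.  The |
| |  * function name is a placeholder. |
| |  */ |
| | static void |
| | example_requires_exclusive(LWLock *lock) |
| | { |
| | 	Assert(LWLockHeldByMe(lock)); |
| | 	Assert(LWLockHeldByMeInMode(lock, LW_EXCLUSIVE)); |
| | |
| | 	/* ... manipulate the structure protected by 'lock' ... */ |
| | } |
| | #endif							/* NOT_USED */ |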
| 1871 | |