/* Copyright (C) 2003-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <endian.h>
#include <errno.h>
#include <sysdep.h>
#include <futex-internal.h>
#include <pthread.h>
#include <pthreadP.h>
#include <sys/time.h>
#include <atomic.h>
#include <stdint.h>
#include <stdbool.h>

#include <shlib-compat.h>
#include <stap-probe.h>
#include <time.h>

#include "pthread_cond_common.c"


struct _condvar_cleanup_buffer
{
  uint64_t wseq;
  pthread_cond_t *cond;
  pthread_mutex_t *mutex;
  int private;
};


/* Decrease the waiter reference count.  */
static void
__condvar_confirm_wakeup (pthread_cond_t *cond, int private)
{
  /* If destruction is pending (i.e., the wake-request flag is nonzero) and we
     are the last waiter (prior value of __wrefs was 1 << 3), then wake any
     threads waiting in pthread_cond_destroy.  Release MO to synchronize with
     these threads.  Don't bother clearing the wake-up request flag.  */
  if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3)
    futex_wake (&cond->__data.__wrefs, INT_MAX, private);
}
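
/* Worked example of the check above, for illustration only (assumption: the
   __wrefs layout documented in the big comment further below, i.e., the
   reference count lives in bits 3 and up, bit 2 is the destruction
   wake-request flag):

     prior = atomic_fetch_add_release (&cond->__data.__wrefs, -8);
     prior >> 2 == 3   is equivalent to   (prior >> 3) == 1 && (prior & 4) != 0

   That is, futex_wake is issued only if we were the last waiter (the
   reference count dropped from 1 to 0) and pthread_cond_destroy has asked to
   be woken.  The real check is just the single comparison above.  */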


/* Cancel waiting after having registered as a waiter previously.  SEQ is our
   position and G is our group index.
   The goal of cancellation is to make our group smaller if that is still
   possible.  If we are in a closed group, this is not possible anymore; in
   this case, we need to send a replacement signal for the one we effectively
   consumed because the signal should have gotten consumed by another waiter
   instead; we must not both cancel waiting and consume a signal.

   Must not be called while still holding a reference on the group.

   If we effectively consumed a signal, we send a replacement signal before
   returning (see below).

   On some kind of timeouts, we may be able to pretend that a signal we
   effectively consumed happened before the timeout (i.e., similarly to first
   spinning on signals before actually checking whether the timeout has
   passed already).  Doing this would allow us to skip sending a replacement
   signal, but this case might happen rarely because the end of the timeout
   must race with someone else sending a signal.  Therefore, we don't bother
   trying to optimize this.  */
static void
__condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
                          int private)
{
  bool consumed_signal = false;

  /* No deadlock with group switching is possible here because we do not
     hold a reference on the group.  */
  __condvar_acquire_lock (cond, private);

  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
  if (g1_start > seq)
    {
      /* Our group is closed, so someone provided enough signals for it.
         Thus, we effectively consumed a signal.  */
      consumed_signal = true;
    }
  else
    {
      if (g1_start + __condvar_get_orig_size (cond) <= seq)
        {
          /* We are in the current G2 and thus cannot have consumed a signal.
             Reduce its effective size or handle overflow.  Remember that in
             G2, the unsigned size is zero or an effectively negative
             value.  */
          if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0)
            {
              cond->__data.__g_size[g]--;
            }
          else
            {
              /* Cancellations would overflow the maximum group size.  Just
                 wake up everyone spuriously to create a clean state.  This
                 also means we do not consume a signal someone else sent.  */
              __condvar_release_lock (cond, private);
              __pthread_cond_broadcast (cond);
              return;
            }
        }
      else
        {
          /* We are in the current G1.  If the group's size is zero, someone
             put a signal in the group that nobody else but us can consume.  */
          if (cond->__data.__g_size[g] == 0)
            consumed_signal = true;
          else
            {
              /* Otherwise, we decrease the size of the group.  This is
                 equivalent to atomically putting in a signal just for us and
                 consuming it right away.  We do not consume a signal sent
                 by someone else.  We also cannot have consumed a futex
                 wake-up because if we were cancelled or timed out in a futex
                 call, the futex will wake another waiter.  */
              cond->__data.__g_size[g]--;
            }
        }
    }

  __condvar_release_lock (cond, private);

  if (consumed_signal)
    {
      /* We effectively consumed a signal even though we didn't want to.
         Therefore, we need to send a replacement signal.
         If we would want to optimize this, we could do what
         pthread_cond_signal does right in the critical section above.  */
      __pthread_cond_signal (cond);
    }
}

/* Wake up any signalers that might be waiting.  */
static void
__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
{
  /* Release MO to synchronize-with the acquire load in
     __condvar_quiesce_and_switch_g1.  */
  if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
    {
      /* Clear the wake-up request flag before waking up.  We do not need more
         than relaxed MO and it doesn't matter if we apply this for an aliased
         group because we wake all futex waiters right after clearing the
         flag.  */
      atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
      futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
    }
}
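
/* For illustration (not part of the implementation): each waiter holds one
   group reference, counted in increments of 2 because the LSB of __g_refs is
   the wake-request flag set by the group switcher.  So the test above,

     atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3

   fires exactly when the previous value was 2 + 1, i.e., we just dropped the
   last reference while a thread in __condvar_quiesce_and_switch_g1 is
   waiting to be woken.  */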

/* Clean-up for cancellation of waiters waiting for normal signals.  We cancel
   our registration as a waiter, confirm we have woken up, and re-acquire the
   mutex.  */
static void
__condvar_cleanup_waiting (void *arg)
{
  struct _condvar_cleanup_buffer *cbuffer =
    (struct _condvar_cleanup_buffer *) arg;
  pthread_cond_t *cond = cbuffer->cond;
  unsigned g = cbuffer->wseq & 1;

  __condvar_dec_grefs (cond, g, cbuffer->private);

  __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
  /* FIXME With the current cancellation implementation, it is possible that
     a thread is cancelled after it has returned from a syscall.  This could
     result in a cancelled waiter consuming a futex wake-up that is then
     causing another waiter in the same group to not wake up.  To work around
     this issue until we have fixed cancellation, just add a futex wake-up
     conservatively.  */
  futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private);

  __condvar_confirm_wakeup (cond, cbuffer->private);

  /* XXX If locking the mutex fails, should we just stop execution?  This
     might be better than silently ignoring the error.  */
  __pthread_mutex_cond_lock (cbuffer->mutex);
}

/* This condvar implementation guarantees that all calls to signal and
   broadcast and all of the three virtually atomic parts of each call to wait
   (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re-
   acquiring the mutex) happen in some total order that is consistent with the
   happens-before relations in the calling program.  However, this order does
   not necessarily result in additional happens-before relations being
   established (which aligns well with spurious wake-ups being allowed).

   All waiters acquire a certain position in a 64b waiter sequence (__wseq).
   This sequence determines which waiters are allowed to consume signals.
   A broadcast is equal to sending as many signals as there are unblocked
   waiters.  When a signal arrives, it samples the current value of __wseq
   with a relaxed-MO load (i.e., the position the next waiter would get).
   (This is sufficient because it is consistent with happens-before; the
   caller can enforce stronger ordering constraints by calling signal while
   holding the mutex.)  Only waiters with a position less than the __wseq
   value observed by the signal are eligible to consume this signal.

   This would be straightforward to implement if waiters would just spin, but
   we need to let them block using futexes.  Futexes give no guarantee of
   waking in FIFO order, so we cannot reliably wake eligible waiters if we
   just use a single futex.  Also, futex words are 32b in size, but we need
   to distinguish more than 1<<32 states because we need to represent the
   order of wake-up (and thus which waiters are eligible to consume signals);
   blocking in a futex is not atomic with a waiter determining its position in
   the waiter sequence, so we need the futex word to reliably notify waiters
   that they should not attempt to block anymore because they have been
   already signaled in the meantime.  While an ABA issue on a 32b value will
   be rare, ignoring it when we are aware of it is not the right thing to do
   either.

   Therefore, we use a 64b counter to represent the waiter sequence (on
   architectures which only support 32b atomics, we use a few bits less).
   To deal with the blocking using futexes, we maintain two groups of waiters:
   * Group G1 consists of waiters that are all eligible to consume signals;
     incoming signals will always signal waiters in this group until all
     waiters in G1 have been signaled.
   * Group G2 consists of waiters that arrive when a G1 is present and still
     contains waiters that have not been signaled.  When all waiters in G1
     are signaled and a new signal arrives, the new signal will convert G2
     into the new G1 and create a new G2 for future waiters.

   We cannot allocate new memory because of process-shared condvars, so we
   have just two slots of groups that change their role between G1 and G2.
   Each has a separate futex word, a number of signals available for
   consumption, a size (number of waiters in the group that have not been
   signaled), and a reference count.

   The group reference count is used to maintain the number of waiters that
   are using the group's futex.  Before a group can change its role, the
   reference count must show that no waiters are using the futex anymore; this
   prevents ABA issues on the futex word.

   To represent which intervals in the waiter sequence the groups cover (and
   thus also which group slot contains G1 or G2), we use a 64b counter to
   designate the start position of G1 (inclusive), and a single bit in the
   waiter sequence counter to represent which group slot currently contains
   G2.  This allows us to switch group roles atomically wrt. waiters obtaining
   a position in the waiter sequence.  The G1 start position allows waiters to
   figure out whether they are in a group that has already been completely
   signaled (i.e., if the current G1 starts at a later position than the
   waiter's position).  Waiters cannot determine whether they are currently
   in G2 or G1 -- but they do not have to because all they are interested in
   is whether there are available signals, and they always start in G2 (whose
   group slot they know because of the bit in the waiter sequence).  Signalers
   will simply fill the right group until it is completely signaled and can
   be closed (they do not switch group roles until they really have to, to
   decrease the likelihood of having to wait for waiters still holding a
   reference on the now-closed G1).

   Signalers maintain the initial size of G1 to be able to determine where
   G2 starts (G2 is always open-ended until it becomes G1).  They track the
   remaining size of a group; when waiters cancel waiting (due to PThreads
   cancellation or timeouts), they will decrease this remaining size as well.

   To implement condvar destruction requirements (i.e., that
   pthread_cond_destroy can be called as soon as all waiters have been
   signaled), waiters increment a reference count before starting to wait and
   decrement it after they stopped waiting but right before they acquire the
   mutex associated with the condvar.

   pthread_cond_t thus consists of the following fields (some bits are used
   for flags; they are not part of the primary value of a field but are
   necessary to make some things atomic, or there was no space for them
   elsewhere in the data structure):

   __wseq: Waiter sequence counter
     * LSB is index of current G2.
     * Waiters fetch-add while having acquired the mutex associated with the
       condvar.  Signalers load it and fetch-xor it concurrently.
   __g1_start: Starting position of G1 (inclusive)
     * LSB is index of current G2.
     * Modified by signalers while having acquired the condvar-internal lock
       and observed concurrently by waiters.
   __g1_orig_size: Initial size of G1
     * The two least-significant bits represent the condvar-internal lock.
     * Only accessed while having acquired the condvar-internal lock.
   __wrefs: Waiter reference counter.
     * Bit 2 is true if waiters should run futex_wake when they remove the
       last reference.  pthread_cond_destroy uses this as futex word.
     * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC).
     * Bit 0 is true iff this is a process-shared condvar.
     * Simple reference count used by both waiters and pthread_cond_destroy.
     (If the format of __wrefs is changed, update nptl_lock_constants.pysym
      and the pretty printers.)
   For each of the two groups, we have:
   __g_refs: Futex waiter reference count.
     * LSB is true if waiters should run futex_wake when they remove the
       last reference.
     * Reference count used by waiters concurrently with signalers that have
       acquired the condvar-internal lock.
   __g_signals: The number of signals that can still be consumed.
     * Used as a futex word by waiters.  Used concurrently by waiters and
       signalers.
     * LSB is true iff this group has been completely signaled (i.e., it is
       closed).
   __g_size: Waiters remaining in this group (i.e., which have not been
     signaled yet).
     * Accessed by signalers and waiters that cancel waiting (both do so only
       when having acquired the condvar-internal lock).
     * The size of G2 is always zero because it cannot be determined until
       the group becomes G1.
     * Although this is of unsigned type, we rely on using unsigned overflow
       rules to make this hold effectively negative values too (in
       particular, when waiters in G2 cancel waiting).

   A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields
   a condvar that has G2 starting at position 0 and a G1 that is closed.

   Because waiters do not claim ownership of a group right when obtaining a
   position in __wseq but only reference count the group when using futexes
   to block, it can happen that a group gets closed before a waiter can
   increment the reference count.  Therefore, waiters have to check whether
   their group is already closed using __g1_start.  They also have to perform
   this check when spinning when trying to grab a signal from __g_signals.
   Note that for these checks, using relaxed MO to load __g1_start is
   sufficient because if a waiter can see a sufficiently large value, it could
   have also consumed a signal in the waiter's group.

   Waiters try to grab a signal from __g_signals without holding a reference
   count, which can lead to stealing a signal from a more recent group after
   their own group was already closed.  They cannot always detect whether they
   in fact did so because they do not know when they stole, but they can
   conservatively add a signal back to the group they stole from; if they
   did so unnecessarily, all that happens is a spurious wake-up.  To make this
   even less likely, __g1_start contains the index of the current G2 too,
   which allows waiters to check if there is aliasing on the group slots; if
   there wasn't, they didn't steal from the current G1, which means that the
   G1 they stole from must have been already closed and they do not need to
   fix anything.

   It is essential that the last field in pthread_cond_t is __g_signals[1]:
   The previous condvar used a pointer-sized field in pthread_cond_t, so a
   PTHREAD_COND_INITIALIZER from that condvar implementation might only
   initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
   in total instead of the 48 we need).  __g_signals[1] is not accessed before
   the first group switch (G2 starts at index 0), which will set its value to
   zero after a harmless fetch-or whose return value is ignored.  This
   effectively completes initialization.


   Limitations:
   * This condvar isn't designed to allow for more than
     __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait.
   * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not
     supported.
   * Beyond what is allowed as errors by POSIX or documented, we can also
     return the following errors:
     * EPERM if MUTEX is a recursive mutex and the caller doesn't own it.
     * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes.  Unlike
       for other errors, this can happen when we re-acquire the mutex; this
       isn't allowed by POSIX (which requires all errors to virtually happen
       before we release the mutex or change the condvar state), but there's
       nothing we can do really.
     * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors
       returned by __pthread_tpp_change_priority.  We will already have
       released the mutex in such cases, so the caller cannot expect to own
       MUTEX.

   Other notes:
   * Instead of the normal mutex unlock / lock functions, we use
     __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m)
     because those will not change the mutex-internal users count, so that it
     can be detected when a condvar is still associated with a particular
     mutex because there is a waiter blocked on this condvar using this mutex.
*/
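
/* Illustration only (not used by this file): because the design above allows
   spurious wake-ups, a caller must always re-check its own predicate under
   the mutex, in the usual POSIX pattern.  The names m, cond, and ready below
   are placeholders:

     pthread_mutex_lock (&m);
     while (!ready)
       pthread_cond_wait (&cond, &m);
     ... use the state protected by m ...
     pthread_mutex_unlock (&m);

   If the signaling thread sets ready and calls pthread_cond_signal while
   holding m, the waiter either observes ready as true or has already
   registered its position in __wseq before the signal samples it, so the
   wake-up cannot be lost.  */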
static __always_inline int
__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
                            clockid_t clockid,
                            const struct timespec *abstime)
{
  const int maxspin = 0;
  int err;
  int result = 0;

  LIBC_PROBE (cond_wait, 2, cond, mutex);

  /* clockid will already have been checked by
     __pthread_cond_clockwait or pthread_condattr_setclock, or we
     don't use it if abstime is NULL, so we don't need to check it
     here.  */

  /* Acquire a position (SEQ) in the waiter sequence (WSEQ).  We use an
     atomic operation because signals and broadcasts may update the group
     switch without acquiring the mutex.  We do not need release MO here
     because we do not need to establish any happens-before relation with
     signalers (see __pthread_cond_signal); modification order alone
     establishes a total order of waiters/signals.  We do need acquire MO
     to synchronize with group reinitialization in
     __condvar_quiesce_and_switch_g1.  */
  uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
  /* Find our group's index.  We always go into what was G2 when we acquired
     our position.  */
  unsigned int g = wseq & 1;
  uint64_t seq = wseq >> 1;
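  /* For example (illustration only): if the fetch-add above returned
     wseq == 13, then g == 1 and seq == 6, i.e., we are the waiter at
     sequence position 6 and the group slot currently serving as G2 is
     slot 1.  */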

  /* Increase the waiter reference count.  Relaxed MO is sufficient because
     we only need to synchronize when decrementing the reference count.  */
  unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8);
  int private = __condvar_get_private (flags);

  /* Now that we are registered as a waiter, we can release the mutex.
     Waiting on the condvar must be atomic with releasing the mutex, so if
     the mutex is used to establish a happens-before relation with any
     signaler, the waiter must be visible to the latter; thus, we release the
     mutex after registering as waiter.
     If releasing the mutex fails, we just cancel our registration as a
     waiter and confirm that we have woken up.  */
  err = __pthread_mutex_unlock_usercnt (mutex, 0);
  if (__glibc_unlikely (err != 0))
    {
      __condvar_cancel_waiting (cond, seq, g, private);
      __condvar_confirm_wakeup (cond, private);
      return err;
    }

  /* Now wait until a signal is available in our group or it is closed.
     Acquire MO so that if we observe a value of zero written after group
     switching in __condvar_quiesce_and_switch_g1, we synchronize with that
     store and will see the prior update of __g1_start done while switching
     groups too.  */
  unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);

  do
    {
      while (1)
        {
          /* Spin-wait first.
             Note that spinning first without checking whether a timeout
             passed might lead to what looks like a spurious wake-up even
             though we should return ETIMEDOUT (e.g., if the caller provides
             an absolute timeout that is clearly in the past).  However,
             (1) spurious wake-ups are allowed, (2) it seems unlikely that a
             user will (ab)use pthread_cond_wait as a check for whether a
             point in time is in the past, and (3) spinning first without
             having to compare against the current time seems to be the right
             choice from a performance perspective for most use cases.  */
          unsigned int spin = maxspin;
          while (signals == 0 && spin > 0)
            {
              /* Check that we are not spinning on a group that's already
                 closed.  */
              if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
                goto done;

              /* TODO Back off.  */

              /* Reload signals.  See above for MO.  */
              signals = atomic_load_acquire (cond->__data.__g_signals + g);
              spin--;
            }

          /* If our group will be closed as indicated by the flag on signals,
             don't bother grabbing a signal.  */
          if (signals & 1)
            goto done;

          /* If there is an available signal, don't block.  */
          if (signals != 0)
            break;

          /* No signals available after spinning, so prepare to block.
             We first acquire a group reference and use acquire MO for that so
             that we synchronize with the dummy read-modify-write in
             __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
             in this case this will make us see the closed flag on __g_signals
             that designates a concurrent attempt to reuse the group's slot.
             We use acquire MO for the __g_signals check to make the
             __g1_start check work (see spinning above).
             Note that the group reference acquisition will not mask the
             release MO when decrementing the reference count because we use
             an atomic read-modify-write operation and thus extend the release
             sequence.  */
          atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
          if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
              || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
            {
              /* Our group is closed.  Wake up any signalers that might be
                 waiting.  */
              __condvar_dec_grefs (cond, g, private);
              goto done;
            }

          // Now block.
          struct _pthread_cleanup_buffer buffer;
          struct _condvar_cleanup_buffer cbuffer;
          cbuffer.wseq = wseq;
          cbuffer.cond = cond;
          cbuffer.mutex = mutex;
          cbuffer.private = private;
          __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);

          if (abstime == NULL)
            {
              /* Block without a timeout.  */
              err = futex_wait_cancelable (
                  cond->__data.__g_signals + g, 0, private);
            }
          else
            {
              /* Block, but with a timeout.
                 Work around the fact that the kernel rejects negative timeout
                 values despite them being valid.  */
              if (__glibc_unlikely (abstime->tv_sec < 0))
                err = ETIMEDOUT;
              else
                {
                  err = futex_abstimed_wait_cancelable
                    (cond->__data.__g_signals + g, 0, clockid, abstime,
                     private);
                }
            }

          __pthread_cleanup_pop (&buffer, 0);

          if (__glibc_unlikely (err == ETIMEDOUT))
            {
              __condvar_dec_grefs (cond, g, private);
              /* If we timed out, we effectively cancel waiting.  Note that
                 we have decremented __g_refs before cancellation, so that a
                 deadlock between waiting for quiescence of our group in
                 __condvar_quiesce_and_switch_g1 and us trying to acquire
                 the lock during cancellation is not possible.  */
              __condvar_cancel_waiting (cond, seq, g, private);
              result = ETIMEDOUT;
              goto done;
            }
          else
            __condvar_dec_grefs (cond, g, private);

          /* Reload signals.  See above for MO.  */
          signals = atomic_load_acquire (cond->__data.__g_signals + g);
        }

    }
  /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
     of __g1_start below (see spinning above for a similar case).  In
     particular, if we steal from a more recent group, we will also see a
     more recent __g1_start below.  */
  while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
                                                &signals, signals - 2));

  /* We consumed a signal but we could have consumed from a more recent group
     that aliased with ours due to being in the same group slot.  If this
     might be the case our group must be closed as visible through
     __g1_start.  */
  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
  if (seq < (g1_start >> 1))
    {
      /* We potentially stole a signal from a more recent group but we do not
         know which group we really consumed from.
         We do not care about groups older than current G1 because they are
         closed; we could have stolen from these, but then we just add a
         spurious wake-up for the current groups.
         We will never steal a signal from current G2 that was really intended
         for G2 because G2 never receives signals (until it becomes G1).  We
         could have stolen a signal from G2 that was conservatively added by a
         previous waiter that also thought it stole a signal -- but given that
         that signal was added unnecessarily, it's not a problem if we steal
         it.
         Thus, the remaining case is that we could have stolen from the current
         G1, where "current" means the __g1_start value we observed.  However,
         if the current G1 does not have the same slot index as we do, we did
         not steal from it and do not need to undo that.  This is the reason
         for putting a bit with G2's index into __g1_start as well.  */
      if (((g1_start & 1) ^ 1) == g)
        {
          /* We have to conservatively undo our potential mistake of stealing
             a signal.  We can stop trying to do that when the current G1
             changes because other spinning waiters will notice this too and
             __condvar_quiesce_and_switch_g1 has checked that there are no
             futex waiters anymore before switching G1.
             Relaxed MO is fine for the __g1_start load because we need to
             merely be able to observe this fact and not have to observe
             something else as well.
             ??? Would it help to spin for a little while to see whether the
             current G1 gets closed?  This might be worthwhile if the group is
             small or close to being closed.  */
          unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
          while (__condvar_load_g1_start_relaxed (cond) == g1_start)
            {
              /* Try to add a signal.  We don't need to acquire the lock
                 because at worst we can cause a spurious wake-up.  If the
                 group is in the process of being closed (LSB is true), this
                 has an effect similar to us adding a signal.  */
              if (((s & 1) != 0)
                  || atomic_compare_exchange_weak_relaxed
                     (cond->__data.__g_signals + g, &s, s + 2))
                {
                  /* If we added a signal, we also need to add a wake-up on
                     the futex.  We also need to do that if we skipped adding
                     a signal because the group is being closed because
                     while __condvar_quiesce_and_switch_g1 could have closed
                     the group, it might still be waiting for futex waiters to
                     leave (and one of those waiters might be the one we stole
                     the signal from, which causes it to block using the
                     futex).  */
                  futex_wake (cond->__data.__g_signals + g, 1, private);
                  break;
                }
              /* TODO Back off.  */
            }
        }
    }

 done:

  /* Confirm that we have been woken.  We do that before acquiring the mutex
     to allow for execution of pthread_cond_destroy while having acquired the
     mutex.  */
  __condvar_confirm_wakeup (cond, private);

  /* Woken up; now re-acquire the mutex.  If this doesn't fail, return RESULT,
     which is set to ETIMEDOUT if a timeout occurred, or zero otherwise.  */
  err = __pthread_mutex_cond_lock (mutex);
  /* XXX Abort on errors that are disallowed by POSIX?  */
  return (err != 0) ? err : result;
}


/* See __pthread_cond_wait_common.  */
int
__pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
{
  /* clockid is unused when abstime is NULL.  */
  return __pthread_cond_wait_common (cond, mutex, 0, NULL);
}

/* See __pthread_cond_wait_common.  */
int
__pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
                          const struct timespec *abstime)
{
  /* Check parameter validity.  This should also tell the compiler that
     it can assume that abstime is not NULL.  */
  if (! valid_nanoseconds (abstime->tv_nsec))
    return EINVAL;

  /* Relaxed MO is sufficient because the clock ID bit is only modified
     in condition creation.  */
  unsigned int flags = atomic_load_relaxed (&cond->__data.__wrefs);
  clockid_t clockid = (flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK)
                      ? CLOCK_MONOTONIC : CLOCK_REALTIME;
  return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
}
versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait,
                  GLIBC_2_3_2);
versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait,
                  GLIBC_2_3_2);

/* See __pthread_cond_wait_common.  */
int
__pthread_cond_clockwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
                          clockid_t clockid,
                          const struct timespec *abstime)
{
  /* Check parameter validity.  This should also tell the compiler that
     it can assume that abstime is not NULL.  */
  if (! valid_nanoseconds (abstime->tv_nsec))
    return EINVAL;

  if (!futex_abstimed_supported_clockid (clockid))
    return EINVAL;

  return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
}
weak_alias (__pthread_cond_clockwait, pthread_cond_clockwait);
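
/* Illustration only (not part of this file's interface): a caller that wants
   a timeout measured against CLOCK_MONOTONIC passes an absolute deadline on
   that clock.  The names m, cond, and ready are placeholders:

     struct timespec deadline;
     clock_gettime (CLOCK_MONOTONIC, &deadline);
     deadline.tv_sec += 5;
     pthread_mutex_lock (&m);
     while (!ready)
       if (pthread_cond_clockwait (&cond, &m, CLOCK_MONOTONIC, &deadline)
           == ETIMEDOUT)
         break;
     pthread_mutex_unlock (&m);

   pthread_cond_timedwait above behaves the same way, except that the clock
   is the one fixed at condvar creation via pthread_condattr_setclock
   (CLOCK_REALTIME by default), as recorded in the __wrefs clock ID bit.  */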