/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
   Copyright (c) 2011, 2013, Monty Program Ab.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */

/**
  @file

  "waiting threads" subsystem - a unified interface for threads to wait
  on each other, with built-in deadlock detection.

  Main concepts
  ^^^^^^^^^^^^^
  a thread - is represented by a WT_THD structure. One physical thread
  can have only one WT_THD descriptor at any given moment.

  a resource - a thread does not wait for other threads directly,
  instead it waits for a "resource", which is "owned" by other threads.
  It waits, exactly, for all "owners" to "release" the resource.
  It does not have to correspond to a physical resource. For example, it
  may be convenient in certain cases to force resource == thread.
  A resource is represented by a WT_RESOURCE structure.

  a resource identifier - a pair of {resource type, value}. A value is
  an ulonglong number. Represented by a WT_RESOURCE_ID structure.

  a resource type - a pointer to a statically defined instance of a
  WT_RESOURCE_TYPE structure. This structure contains a pointer to
  a function that knows how to compare values of this resource type.
  In the simple case it could be wt_resource_id_memcmp().

  a wait-for graph - a graph that represents "wait-for" relationships.
  It has two types of nodes - threads and resources. There are directed
  edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
  from a thread to resources that it "owns" (WT_THD::my_resources),
  and from a resource to threads that "own" it (WT_RESOURCE::owners).
  Graph completeness
  ^^^^^^^^^^^^^^^^^^

  For flawless deadlock detection the wait-for graph must be complete.
  This means that when a thread starts waiting it needs to know *all* its
  blockers, and call wt_thd_will_wait_for() for every one of them.
  Otherwise two phenomena should be expected:

  1. Fuzzy timeouts:

     thread A needs to get a lock, and is blocked by a thread B.
     it waits.
     Just before the timeout thread B releases the lock.
     thread A is ready to grab the lock but discovers that it is also
     blocked by a thread C.
     It waits and times out.

  As a result thread A has waited two timeout intervals, instead of one.

  2. Unreliable cycle detection:

     Thread A waits for threads B and C
     Thread C waits for D
     Thread D wants to start waiting for A

     one can see immediately that thread D creates a cycle, and thus
     a deadlock is detected.

     But if thread A were to wait only for B, and only start waiting for C
     when B unlocks, thread D would be allowed to wait, and the deadlock
     would only be detected when B unlocks or somebody times out.

  These two phenomena don't affect correctness, and, strictly speaking,
  the caller is not required to call wt_thd_will_wait_for() for *all*
  blockers - it may optimize wt_thd_will_wait_for() calls away. But they
  may be perceived as bugs by users, so it must be understood that such
  an optimization comes at a price.

  Usage
  ^^^^^

  First, the wt* subsystem must be initialized by calling
  wt_init(). In the server you don't need to do it, it's done
  in mysqld.cc.

  Similarly, wt_end() frees wt* structures; it should be called
  at the end, but in the server mysqld.cc takes care of that.

  Every WT_THD should be initialized with wt_thd_lazy_init().
  After that it can be used in other wt_thd_* calls.
  Before being discarded, a WT_THD should be freed with
  wt_thd_destroy(). In the server both are handled in sql_class.cc;
  it's an error to try to do it manually.

  To use the deadlock detection one needs to use this thread's WT_THD,
  call wt_thd_will_wait_for() for every thread it needs to wait on,
  then call wt_thd_cond_timedwait(). When a thread releases a resource
  it should call wt_thd_release() (or wt_thd_release_all()) - this will
  notify (send a signal to) threads waiting in wt_thd_cond_timedwait(),
  if appropriate.

  Just like with pthread's cond_wait, there can be spurious
  wake-ups from wt_thd_cond_timedwait(). A caller is expected to
  handle that (that is, to re-check the blocking criteria).

  wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
  WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
  WT_TIMEOUT. Out of memory and other fatal errors are reported as
  WT_DEADLOCK - and a transaction must be aborted just the same.
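
  A minimal sketch of the waiter side, under the caller's mutex (the
  names my_thd, mtx, n_blockers, blockers[], and resid below are
  illustrative, not part of this API):

    int res= WT_OK;
    uint i;
    mysql_mutex_lock(&mtx);
    for (i= 0; res == WT_OK && i < n_blockers; i++)
      res= wt_thd_will_wait_for(&my_thd, blockers[i], &resid);
    if (res == WT_OK)
      res= wt_thd_cond_timedwait(&my_thd, &mtx);
    mysql_mutex_unlock(&mtx);

  On WT_OK the caller re-checks the blocking criteria (spurious
  wake-ups are possible); on WT_TIMEOUT or WT_DEADLOCK it aborts the
  transaction. A blocker releasing its resources calls, under the same
  mutex, wt_thd_release_all(&my_thd).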

  Configuration
  ^^^^^^^^^^^^^
  There are four config variables. Two deadlock search depths - short and
  long - and two timeouts. Deadlock search is performed with the short
  depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
  waits with the short timeout, performs a deadlock search with the long
  depth, and waits with the long timeout. As most deadlock cycles are
  supposed to be short, most deadlocks will be detected at once, and waits
  will rarely be necessary.

  These config variables are thread-local. Different threads may have
  different search depth and timeout values.

  Also, the deadlock detector supports different killing strategies; the
  victim in a deadlock cycle is selected based on its "weight". See the
  "weight" description in waiting_threads.h for details. It's up to the
  caller to set weights accordingly.
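
  These are passed to wt_thd_lazy_init() as pointers, e.g. (the variable
  names and values here are illustrative only):

    static ulong depth_short= 4, depth_long= 15;
    static ulong timeout_short= 100, timeout_long= 10000;
    wt_thd_lazy_init(&my_thd, &depth_short, &timeout_short,
                     &depth_long, &timeout_long);

  and since WT_THD stores the pointers, changing these variables later
  affects existing threads too.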

  Status
  ^^^^^^
  We calculate the number of successful waits (WT_OK returned from
  wt_thd_cond_timedwait()), the number of timeouts, a deadlock cycle
  length distribution - the number of deadlocks with every length from
  1 to WT_CYCLE_STATS, and a wait time distribution - the number
  of waits with a time from 1 us to 1 min in WT_WAIT_STATS
  intervals on a log e scale.
*/

/*
  Note that if your lock system satisfies the following condition:

    there exist four lock levels A, B, C, D, such that
      A is compatible with B
      A is not compatible with C
      D is not compatible with B

    (example A=IX, B=IS, C=S, D=X)

  you need to include the lock level in the resource identifier - a
  thread waiting for a lock of the type A on resource R and another
  thread waiting for a lock of the type B on resource R should wait on
  different WT_RESOURCE structures, on different {lock, resource}
  pairs. Otherwise the following is possible:

    thread1> take S-lock on R
    thread2> take IS-lock on R
    thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
    thread3 is killed (or times out or whatever)
    WT_RESOURCE structure for R is still in the hash, as it has two owners
    thread4> wants an IX-lock on R
    WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
    !! now thread4 is waiting for both thread1 and thread2
    !! while, in fact, IX-lock and IS-lock are compatible and
    !! thread4 should not wait for thread2.
*/

#include <my_global.h>
#include <waiting_threads.h>
#include <m_string.h>

/* status variables */

/**
  preset table of wait intervals
*/
ulonglong wt_wait_table[WT_WAIT_STATS];
/**
  wait time distribution (log e scale)
*/
uint32 wt_wait_stats[WT_WAIT_STATS+1];
/**
  distribution of cycle lengths
  the first index tells whether this was during short or long detection
*/
uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
uint32 wt_success_stats;

#ifdef HAVE_PSI_INTERFACE
extern PSI_cond_key key_WT_RESOURCE_cond;
#endif

#ifdef SAFE_STATISTICS
#define incr(VAR, LOCK) do { my_atomic_add32(&(VAR), 1); } while(0)
#else
#define incr(VAR, LOCK) do { (VAR)++; } while(0)
#endif

static void increment_success_stats()
{
  incr(wt_success_stats, success_stats_lock);
}

static void increment_cycle_stats(uint depth, uint slot)
{
  if (depth >= WT_CYCLE_STATS)
    depth= WT_CYCLE_STATS;
  incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
}

static void increment_wait_stats(ulonglong waited, int ret)
{
  uint i;
  if ((ret) == ETIMEDOUT)
    i= WT_WAIT_STATS;
  else
    for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
  incr(wt_wait_stats[i], wait_stats_lock);
}

/*
  'lock' protects 'owners', 'state', and 'waiter_count'
  'id' is read-only

  a resource is picked up from a hash in a lock-free manner;
  it's returned pinned, so it cannot be freed at once,
  but it may be freed right after the pin is removed.
  to free a resource it should
    1. have no owners
    2. have no waiters

  two ways to access a resource:
    1. find it in a hash
       - it's returned pinned.
       a) take a lock in exclusive mode
       b) check the state, it should be ACTIVE to be usable
       c) unpin
    2. by a direct reference
       - can only be used if the resource cannot be freed.
       e.g. accessing a resource via thd->waiting_for is safe,
       as the resource cannot be freed while a thread is waiting for it
*/
struct st_wt_resource {
  WT_RESOURCE_ID id;
  uint waiter_count;
  enum { ACTIVE, FREE } state;
#ifndef DBUG_OFF
  mysql_mutex_t *cond_mutex; /* a mutex for the 'cond' below */
#endif

#ifdef WT_RWLOCKS_USE_MUTEXES
  /*
    we need a special rwlock-like 'lock' to allow readers to bypass
    waiting writers, otherwise readers can deadlock. For example:

    A waits on resource x, owned by B, B waits on resource y, owned
    by A, we have a cycle (A->x->B->y->A)
    Both A and B start deadlock detection:

      A locks x            B locks y
      A goes deeper        B goes deeper
      A locks y            B locks x

    with mutexes it would deadlock. With rwlocks it won't, as long
    as both A and B are taking read locks (and they do).
    But other threads may take write locks. Assume there's
    C who wants to start waiting on x, and D who wants to start
    waiting on y.

       A read-locks x                        B read-locks y
       A goes deeper                         B goes deeper
    => C write-locks x (to add a new edge)   D write-locks y
       .. C is blocked                       D is blocked
       A read-locks y                        B read-locks x

    Now, if a read lock can bypass a pending write lock request, we're fine.
    If it cannot, we have a deadlock.

    writer starvation is technically possible, but unlikely, because
    the contention is expected to be low.
  */
  struct {
    pthread_cond_t cond;
    pthread_mutex_t mutex;
    uint readers: 16;
    uint pending_writers: 15;
    uint write_locked: 1;
  } lock;
#else
  rw_lock_t lock;
#endif
  mysql_cond_t cond;  /* the corresponding mutex is provided by the caller */
  DYNAMIC_ARRAY owners;
};

#ifdef WT_RWLOCKS_USE_MUTEXES
static void rc_rwlock_init(WT_RESOURCE *rc)
{
  pthread_cond_init(&rc->lock.cond, 0);
  pthread_mutex_init(&rc->lock.mutex, MY_MUTEX_INIT_FAST);
}
static void rc_rwlock_destroy(WT_RESOURCE *rc)
{
  DBUG_ASSERT(rc->lock.write_locked == 0);
  DBUG_ASSERT(rc->lock.readers == 0);
  pthread_cond_destroy(&rc->lock.cond);
  pthread_mutex_destroy(&rc->lock.mutex);
}
static void rc_rdlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
  pthread_mutex_lock(&rc->lock.mutex);
  while (rc->lock.write_locked)
    pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
  rc->lock.readers++;
  pthread_mutex_unlock(&rc->lock.mutex);
  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
}
static void rc_wrlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
  pthread_mutex_lock(&rc->lock.mutex);
  while (rc->lock.write_locked || rc->lock.readers)
    pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
  rc->lock.write_locked= 1;
  pthread_mutex_unlock(&rc->lock.mutex);
  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
}
static void rc_unlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
  pthread_mutex_lock(&rc->lock.mutex);
  if (rc->lock.write_locked)
  {
    rc->lock.write_locked= 0;
    pthread_cond_broadcast(&rc->lock.cond);
  }
  else if (--rc->lock.readers == 0)
    pthread_cond_broadcast(&rc->lock.cond);
  pthread_mutex_unlock(&rc->lock.mutex);
}
#else
static void rc_rwlock_init(WT_RESOURCE *rc)
{
  my_rwlock_init(&rc->lock, 0);
}
static void rc_rwlock_destroy(WT_RESOURCE *rc)
{
  rwlock_destroy(&rc->lock);
}
static void rc_rdlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
  rw_rdlock(&rc->lock);
  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
}
static void rc_wrlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
  rw_wrlock(&rc->lock);
  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
}
static void rc_unlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
  rw_unlock(&rc->lock);
}
#endif

/*
  All resources are stored in a lock-free hash. Different threads
  may add new resources and perform deadlock detection concurrently.
*/
static LF_HASH reshash;

/**
  WT_RESOURCE constructor

  It's called from lf_hash and takes a pointer to an LF_SLIST instance.
  WT_RESOURCE is located at arg+sizeof(LF_SLIST)
*/
static void wt_resource_create(uchar *arg)
{
  WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
  DBUG_ENTER("wt_resource_create");

  bzero(rc, sizeof(*rc));
  rc_rwlock_init(rc);
  mysql_cond_init(key_WT_RESOURCE_cond, &rc->cond, 0);
  my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 0, 5, MYF(0));
  DBUG_VOID_RETURN;
}

/**
  WT_RESOURCE destructor

  It's called from lf_hash and takes a pointer to an LF_SLIST instance.
  WT_RESOURCE is located at arg+sizeof(LF_SLIST)
*/
static void wt_resource_destroy(uchar *arg)
{
  WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
  DBUG_ENTER("wt_resource_destroy");

  DBUG_ASSERT(rc->owners.elements == 0);
  rc_rwlock_destroy(rc);
  mysql_cond_destroy(&rc->cond);
  delete_dynamic(&rc->owners);
  DBUG_VOID_RETURN;
}

/**
  WT_RESOURCE initializer

  It's called from lf_hash when an element is inserted.
*/
static void wt_resource_init(LF_HASH *hash __attribute__((unused)),
                             WT_RESOURCE *rc, WT_RESOURCE_ID *id)
{
  DBUG_ENTER("wt_resource_init");
  rc->id= *id;
  rc->waiter_count= 0;
  rc->state= ACTIVE;
#ifndef DBUG_OFF
  rc->cond_mutex= 0;
#endif
  DBUG_VOID_RETURN;
}

static int wt_init_done;

void wt_init()
{
  DBUG_ENTER("wt_init");
  DBUG_ASSERT(reshash.alloc.constructor != wt_resource_create);

  lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
               sizeof_WT_RESOURCE_ID, 0, 0);
  reshash.alloc.constructor= wt_resource_create;
  reshash.alloc.destructor= wt_resource_destroy;
  reshash.initializer= (lf_hash_initializer) wt_resource_init;

  bzero(wt_wait_stats, sizeof(wt_wait_stats));
  bzero(wt_cycle_stats, sizeof(wt_cycle_stats));
  wt_success_stats= 0;
  { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
    int i;
    double from= log(1);   /* 1 us */
    double to= log(60e6);  /* 1 min */
    for (i= 0; i < WT_WAIT_STATS; i++)
    {
      wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
      DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
    }
  }
  wt_init_done= 1;
  DBUG_VOID_RETURN;
}

void wt_end()
{
  DBUG_ENTER("wt_end");
  if (!wt_init_done)
    DBUG_VOID_RETURN;

  DBUG_ASSERT(reshash.count == 0);
  lf_hash_destroy(&reshash);
  reshash.alloc.constructor= NULL;
  wt_init_done= 0;
  DBUG_VOID_RETURN;
}

/**
  Lazy WT_THD initialization

  Cheap initialization of WT_THD. Only initializes fields that don't require
  memory allocations - basically, it only does assignments. The rest of the
  WT_THD structure will be initialized on demand, on the first use.
  This allows one to initialize lazily all WT_THD structures, even if some
  (or even most) of them will never be used for deadlock detection.

  @param ds a pointer to deadlock search depth short value
  @param ts a pointer to deadlock timeout short value (microseconds)
  @param dl a pointer to deadlock search depth long value
  @param tl a pointer to deadlock timeout long value (microseconds)

  @note these are pointers to values, and WT_THD stores them as pointers.
  This allows one to later change search depths and timeouts for existing
  threads. It also means that the pointers must stay valid for the lifetime
  of the WT_THD.
*/
void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
                      const ulong *dl, const ulong *tl)
{
  DBUG_ENTER("wt_thd_lazy_init");
  thd->waiting_for= 0;
  thd->weight= 0;
  thd->deadlock_search_depth_short= ds;
  thd->timeout_short= ts;
  thd->deadlock_search_depth_long= dl;
  thd->timeout_long= tl;
  /* dynamic array is also initialized lazily - without memory allocations */
  my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5,
                        MYF(0));
#ifndef DBUG_OFF
  thd->name= my_thread_name();
#endif
  DBUG_VOID_RETURN;
}

/**
  Finalize WT_THD initialization

  After lazy WT_THD initialization, parts of the structure are still
  uninitialized. This function completes the initialization, allocating
  memory, if necessary. It's called automatically on demand, when WT_THD
  is about to be used.
*/
static int fix_thd_pins(WT_THD *thd)
{
  if (unlikely(thd->pins == 0))
  {
    thd->pins= lf_hash_get_pins(&reshash);
#ifndef DBUG_OFF
    thd->name= my_thread_name();
#endif
  }
  return thd->pins == 0;
}

void wt_thd_destroy(WT_THD *thd)
{
  DBUG_ENTER("wt_thd_destroy");

  DBUG_ASSERT(thd->my_resources.elements == 0);
  DBUG_ASSERT(thd->waiting_for == 0);

  if (thd->pins != 0)
    lf_hash_put_pins(thd->pins);

  delete_dynamic(&thd->my_resources);
  DBUG_VOID_RETURN;
}

/**
  Trivial resource id comparison function - bytewise memcmp.

  It can be used in WT_RESOURCE_TYPE structures where bytewise
  comparison of values is sufficient.
*/
my_bool wt_resource_id_memcmp(const void *a, const void *b)
{
  /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
  compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
  return MY_TEST(memcmp(a, b, sizeof_WT_RESOURCE_ID));
}

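/*
  For illustration, a hypothetical resource type built on this comparator
  (a sketch only; see waiting_threads.h for the actual WT_RESOURCE_TYPE
  definition - the 'compare' and 'make_key' field names below are the ones
  referenced elsewhere in this file):

    static WT_RESOURCE_TYPE my_restype=
    { .compare= wt_resource_id_memcmp, .make_key= 0 };
*/
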
/**
  arguments for the recursive deadlock_search() function
*/
struct deadlock_arg {
  WT_THD *const thd;           /**< starting point of a search */
  uint const max_depth;        /**< search depth limit */
  WT_THD *victim;              /**< a thread to be killed to resolve a deadlock */
  WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
};

/**
  helper function to change the victim, according to the weight
*/
static void change_victim(WT_THD* found, struct deadlock_arg *arg)
{
  if (found->weight < arg->victim->weight)
  {
    if (arg->victim != arg->thd)
    {
      rc_unlock(arg->victim->waiting_for); /* release the previous victim */
      DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
    }
    arg->victim= found;
    arg->last_locked_rc= 0;
  }
}

/**
  recursive loop detection in a wait-for graph with a limited search depth
*/
static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
                           uint depth)
{
  WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for;
  WT_THD *cursor;
  uint i;
  int ret= WT_OK;
  DBUG_ENTER("deadlock_search");
  DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u",
                    arg->thd->name, blocker->name, depth));

  arg->last_locked_rc= 0;

  if (depth > arg->max_depth)
  {
    DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)"));
    DBUG_RETURN(WT_DEPTH_EXCEEDED);
  }

retry:
  /*
    safe dereference as explained in lf_alloc-pin.c
    (in short: protects against lf_alloc_free() in lf_hash_delete())
  */
  do
  {
    rc= *shared_ptr;
    lf_pin(arg->thd->pins, 0, rc);
  } while (rc != *shared_ptr && LF_BACKOFF());

  if (rc == 0)
  {
    DBUG_PRINT("wt", ("exit: OK (early)"));
    DBUG_RETURN(0);
  }

  rc_rdlock(rc);
  if (rc->state != ACTIVE || *shared_ptr != rc)
  {
    /* blocker is not waiting on this resource anymore */
    rc_unlock(rc);
    lf_unpin(arg->thd->pins, 0);
    goto retry;
  }
  /* as the state is locked, we can unpin now */
  lf_unpin(arg->thd->pins, 0);

  /*
    Below is not a pure depth-first search. It's a depth-first search with
    the slightest hint of breadth-first. Depth-first is:

      check(element, X):
        foreach current in element->nodes[] do:
          if current == X return error;
          check(current, X);

    while we do

      check(element, X):
        foreach current in element->nodes[] do:
          if current == X return error;
        foreach current in element->nodes[] do:
          check(current, X);

    preferring shorter deadlocks over longer ones.
  */
  for (i= 0; i < rc->owners.elements; i++)
  {
    cursor= *dynamic_element(&rc->owners, i, WT_THD**);
    /*
      We're only looking for (and detecting) cycles that include 'arg->thd'.
      That is, only deadlocks that *we* have created. For example,
        thd->A->B->thd
      (thd waits for A, A waits for B, while B is waiting for thd).
      While walking the graph we can encounter other cycles, e.g.
        thd->A->B->C->A
      These will not be detected. Instead we will walk them in circles until
      the search depth limit is reached (the latter guarantees that an
      infinite loop is impossible). We expect the thread that has created
      the cycle (one of A, B, and C) to detect its deadlock.
    */
    if (cursor == arg->thd)
    {
      ret= WT_DEADLOCK;
      increment_cycle_stats(depth, arg->max_depth ==
                                   *arg->thd->deadlock_search_depth_long);
      arg->victim= cursor;
      goto end;
    }
  }
  for (i= 0; i < rc->owners.elements; i++)
  {
    cursor= *dynamic_element(&rc->owners, i, WT_THD**);
    switch (deadlock_search(arg, cursor, depth+1)) {
    case WT_OK:
      break;
    case WT_DEPTH_EXCEEDED:
      ret= WT_DEPTH_EXCEEDED;
      break;
    case WT_DEADLOCK:
      ret= WT_DEADLOCK;
      change_victim(cursor, arg); /* also sets arg->last_locked_rc to 0 */
      i= rc->owners.elements;     /* jump out of the loop */
      break;
    default:
      DBUG_ASSERT(0);
    }
    if (arg->last_locked_rc)
      rc_unlock(arg->last_locked_rc);
  }
end:
  /*
    Note that 'rc' is locked in this function, but it's never unlocked here.
    Instead it's saved in arg->last_locked_rc and the *caller* is
    expected to unlock it. It's done to support different killing
    strategies. This is how it works:
    Assuming a graph

      thd->A->B->C->thd

    deadlock_search() starts from thd, locks it (in fact it locks not
    the thd, but the resource it is waiting on, but below, for simplicity,
    I'll talk about "locking a thd"). Then it goes down recursively, locks
    A, and so on. Goes down recursively, locks B. Goes down recursively,
    locks C. Notices that C is waiting on thd. Deadlock detected. Sets
    arg->victim=thd. Returns from the last deadlock_search() call.
    C stays locked!
    Now it checks whether C is a more appropriate victim than 'thd'.
    If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
    Now it checks whether B is a more appropriate victim than arg->victim.
    If yes - the old arg->victim is unlocked and arg->victim=B,
    otherwise B is unlocked. Return.
    And so on.

    In short, a resource is locked in a frame. But it's not unlocked in the
    same frame, it's unlocked by the caller, and only after the caller checks
    that it doesn't need to use the current WT_THD as a victim. If it does -
    the lock is kept and the old victim's resource is unlocked. When the
    recursion is unrolled and we are back in the deadlock() function, there
    are only two locks left - on thd and on the victim.
  */
  arg->last_locked_rc= rc;
  DBUG_PRINT("wt", ("exit: %s",
                    ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
                    ret ? "WT_DEADLOCK" : "OK"));
  DBUG_RETURN(ret);
}

/**
  Deadlock detection in a wait-for graph

  A wrapper for the recursive deadlock_search() - prepares the deadlock_arg
  structure, invokes deadlock_search(), increments statistics, and notifies
  the victim.

  @param thd        thread that is going to wait. Deadlock is detected
                    if, while walking the graph, we reach a thread that
                    is waiting on thd
  @param blocker    starting point of a search. In wt_thd_cond_timedwait()
                    it's thd, in wt_thd_will_wait_for() it's a thread that
                    thd is going to wait for
  @param depth      starting search depth. In general it's the number of
                    edges in the wait-for graph between thd and the
                    blocker. Practically only two values are used (and
                    supported) - when thd == blocker it's 0, when thd
                    waits directly for blocker, it's 1
  @param max_depth  search depth limit
*/
static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
                    uint max_depth)
{
  struct deadlock_arg arg= {thd, max_depth, 0, 0};
  int ret;
  DBUG_ENTER("deadlock");
  DBUG_ASSERT(depth < 2);
  ret= deadlock_search(&arg, blocker, depth);
  if (ret == WT_DEPTH_EXCEEDED)
  {
    increment_cycle_stats(WT_CYCLE_STATS, max_depth ==
                                          *thd->deadlock_search_depth_long);
    ret= WT_OK;
  }
  /*
    if we started with depth==1, blocker was never considered for a victim
    in deadlock_search(). Do it here.
  */
  if (ret == WT_DEADLOCK && depth)
    change_victim(blocker, &arg);
  if (arg.last_locked_rc)
  {
    /*
      Special return code if there's nobody to wait for.

      depth == 0 means that we started the search from thd (thd == blocker).
      ret == WT_OK means that no cycle was found and
      arg.last_locked_rc == thd->waiting_for.
      arg.last_locked_rc->owners.elements == 0 then means that
      (applying the rule above) thd->waiting_for->owners.elements == 0,
      and thd doesn't have anybody to wait for.
    */
    if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
    {
      DBUG_ASSERT(thd == blocker);
      DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
      ret= WT_FREE_TO_GO;
    }
    rc_unlock(arg.last_locked_rc);
  }
  /* notify the victim, if appropriate */
  if (ret == WT_DEADLOCK && arg.victim != thd)
  {
    DBUG_PRINT("wt", ("killing %s", arg.victim->name));
    arg.victim->killed= 1;
    mysql_cond_broadcast(&arg.victim->waiting_for->cond);
    rc_unlock(arg.victim->waiting_for);
    ret= WT_OK;
  }
  DBUG_RETURN(ret);
}


/**
  Delete an element from reshash if it has no waiters or owners

  rc->lock must be locked by the caller and it's unlocked on return.
*/
static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
{
  uint keylen;
  const void *key;
  DBUG_ENTER("unlock_lock_and_free_resource");

  DBUG_ASSERT(rc->state == ACTIVE);

  if (rc->owners.elements || rc->waiter_count)
  {
    DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
                      rc->owners.elements, rc->waiter_count));
    rc_unlock(rc);
    DBUG_RETURN(0);
  }

  if (fix_thd_pins(thd))
  {
    rc_unlock(rc);
    DBUG_RETURN(1);
  }

  /* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */
  {
    key= &rc->id;
    keylen= sizeof_WT_RESOURCE_ID;
  }

  /*
    To free the element correctly we need to:
      1. take its lock (already done).
      2. set the state to FREE
      3. release the lock
      4. remove it from the hash
  */
  rc->state= FREE;
  rc_unlock(rc);
  DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
}

/**
  register the fact that thd is not waiting anymore

  decrease waiter_count, clear waiting_for, free the resource if appropriate.
  thd->waiting_for must be locked!
*/
static int stop_waiting_locked(WT_THD *thd)
{
  int ret;
  WT_RESOURCE *rc= thd->waiting_for;
  DBUG_ENTER("stop_waiting_locked");

  DBUG_ASSERT(rc->waiter_count);
  DBUG_ASSERT(rc->state == ACTIVE);
  rc->waiter_count--;
  thd->waiting_for= 0;
  ret= unlock_lock_and_free_resource(thd, rc);
  DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK);
}

/**
  register the fact that thd is not waiting anymore

  locks thd->waiting_for and calls stop_waiting_locked().
*/
static int stop_waiting(WT_THD *thd)
{
  int ret;
  WT_RESOURCE *rc= thd->waiting_for;
  DBUG_ENTER("stop_waiting");

  if (!rc)
    DBUG_RETURN(WT_OK);
  /*
    nobody's trying to free the resource now,
    as its waiter_count is guaranteed to be non-zero
  */
  rc_wrlock(rc);
  ret= stop_waiting_locked(thd);
  DBUG_RETURN(ret);
}

/**
  notify the system that a thread needs to wait for another thread

  Called by a *waiter* to declare that it (thd) will wait for another
  thread (blocker) on a specific resource (resid).
  It can be called many times, if many blockers own a blocking resource.
  But it must always be called with the same resource id - a thread cannot
  wait for more than one resource at a time.

  @return WT_OK or WT_DEADLOCK

  As a new edge is added to the wait-for graph, a deadlock detection is
  performed for this new edge.
*/
int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
                         const WT_RESOURCE_ID *resid)
{
  uint i;
  WT_RESOURCE *rc;
  DBUG_ENTER("wt_thd_will_wait_for");

  DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu",
                    thd->name, blocker->name, (ulong)resid->value));

  if (fix_thd_pins(thd))
    DBUG_RETURN(WT_DEADLOCK);

  if (thd->waiting_for == 0)
  {
    uint keylen;
    const void *key;
    /* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */
    {
      key= resid;
      keylen= sizeof_WT_RESOURCE_ID;
    }

    DBUG_PRINT("wt", ("first blocker"));

retry:
    while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0)
    {
      DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));

      if (lf_hash_insert(&reshash, thd->pins, resid) == -1) /* if OOM */
        DBUG_RETURN(WT_DEADLOCK);
      /*
        Two cases: either lf_hash_insert() failed - because another thread
        has just inserted a resource with the same id - and we need to retry.
        Or lf_hash_insert() succeeded, and then we need to repeat
        lf_hash_search() to find the real address of the newly inserted
        element. That is, we don't care what lf_hash_insert() has returned.
        And we need to repeat the loop anyway.
      */
    }
    if (rc == MY_ERRPTR)
      DBUG_RETURN(WT_DEADLOCK);

    DBUG_PRINT("wt", ("found in hash rc=%p", rc));

    rc_wrlock(rc);
    if (rc->state != ACTIVE)
    {
      DBUG_PRINT("wt", ("but it's not active, retrying"));
      /* Somebody has freed the element while we weren't looking */
      rc_unlock(rc);
      lf_hash_search_unpin(thd->pins);
      goto retry;
    }

    lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */
    thd->waiting_for= rc;
    rc->waiter_count++;
    thd->killed= 0;
  }
  else
  {
    DBUG_ASSERT(thd->waiting_for->id.type == resid->type);
    DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0);
    DBUG_PRINT("wt", ("adding another blocker"));

    /*
      we can safely access the resource here, it's in the hash as it has
      a non-zero waiter_count
    */
    rc= thd->waiting_for;
    rc_wrlock(rc);
    DBUG_ASSERT(rc->waiter_count);
    DBUG_ASSERT(rc->state == ACTIVE);

    if (thd->killed)
    {
      stop_waiting_locked(thd);
      DBUG_RETURN(WT_DEADLOCK);
    }
  }
  /*
    Another thread could be waiting on this resource for this very 'blocker'.
    In this case we should not add it to the list for the second time.
  */
  for (i= 0; i < rc->owners.elements; i++)
    if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
      break;
  if (i >= rc->owners.elements)
  {
    if (push_dynamic(&blocker->my_resources, (void*)&rc))
    {
      stop_waiting_locked(thd);
      DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
    }
    if (push_dynamic(&rc->owners, (void*)&blocker))
    {
      pop_dynamic(&blocker->my_resources);
      stop_waiting_locked(thd);
      DBUG_RETURN(WT_DEADLOCK);
    }
  }
  rc_unlock(rc);

  if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
  {
    stop_waiting(thd);
    DBUG_RETURN(WT_DEADLOCK);
  }
  DBUG_RETURN(WT_OK);
}

/**
  called by a *waiter* (thd) to start waiting

  It's supposed to be a drop-in replacement for
  mysql_cond_timedwait(), and it takes a mutex as an argument.

  @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
*/
int wt_thd_cond_timedwait(WT_THD *thd, mysql_mutex_t *mutex)
{
  int ret= WT_TIMEOUT;
  struct timespec timeout;
  my_hrtime_t before, after, starttime;
  WT_RESOURCE *rc= thd->waiting_for;
  ulonglong end_wait_time;
  DBUG_ENTER("wt_thd_cond_timedwait");
  DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));

#ifndef DBUG_OFF
  if (rc->cond_mutex)
    DBUG_ASSERT(rc->cond_mutex == mutex);
  else
    rc->cond_mutex= mutex;
  mysql_mutex_assert_owner(mutex);
#endif

  before= starttime= my_hrtime();

  rc_wrlock(rc);
  if (rc->owners.elements == 0)
    ret= WT_OK;
  rc_unlock(rc);

  end_wait_time= starttime.val*1000 + (*thd->timeout_short)*1000000ULL;
  set_timespec_time_nsec(timeout, end_wait_time);
  if (ret == WT_TIMEOUT && !thd->killed)
    ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
  if (ret == WT_TIMEOUT && !thd->killed)
  {
    int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
    if (r == WT_FREE_TO_GO)
      ret= WT_OK;
    else if (r != WT_OK)
      ret= WT_DEADLOCK;
    else if (*thd->timeout_long > *thd->timeout_short)
    {
      end_wait_time= starttime.val*1000 + (*thd->timeout_long)*1000000ULL;
      set_timespec_time_nsec(timeout, end_wait_time);
      if (!thd->killed)
        ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
    }
  }
  after= my_hrtime();
  if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
    ret= WT_DEADLOCK;
  increment_wait_stats(after.val-before.val, ret);
  if (ret == WT_OK)
    increment_success_stats();
  DBUG_RETURN(ret);
}

/**
  called by a *blocker* when it releases a resource

  it's conceptually similar to pthread_cond_broadcast, and must be done
  under the same mutex as wt_thd_cond_timedwait().

  @param resid a resource to release. 0 to release all resources
*/

void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
{
  uint i;
  DBUG_ENTER("wt_thd_release");

  for (i= 0; i < thd->my_resources.elements; i++)
  {
    WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
    if (!resid || (resid->type->compare(&rc->id, resid) == 0))
    {
      uint j;

      rc_wrlock(rc);
      /*
        nobody's trying to free the resource now,
        as its owners[] array is not empty (at least thd must be there)
      */
      DBUG_ASSERT(rc->state == ACTIVE);
      for (j= 0; j < rc->owners.elements; j++)
        if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
          break;
      DBUG_ASSERT(j < rc->owners.elements);
      delete_dynamic_element(&rc->owners, j);
      if (rc->owners.elements == 0)
      {
        mysql_cond_broadcast(&rc->cond);
#ifndef DBUG_OFF
        if (rc->cond_mutex)
          mysql_mutex_assert_owner(rc->cond_mutex);
#endif
      }
      unlock_lock_and_free_resource(thd, rc);
      if (resid)
      {
        delete_dynamic_element(&thd->my_resources, i);
        DBUG_VOID_RETURN;
      }
    }
  }
  if (!resid)
    reset_dynamic(&thd->my_resources);
  DBUG_VOID_RETURN;
}
