1/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
2 Copyright (c) 2011, 2013, Monty Program Ab.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
16
17/**
18 @file
19
20 "waiting threads" subsystem - a unified interface for threads to wait
21 on each other, with built-in deadlock detection.
22
23 Main concepts
24 ^^^^^^^^^^^^^
25 a thread - is represented by a WT_THD structure. One physical thread
26 can have only one WT_THD descriptor at any given moment.
27
28 a resource - a thread does not wait for other threads directly,
29 instead it waits for a "resource", which is "owned" by other threads.
30 It waits, exactly, for all "owners" to "release" a resource.
31 It does not have to correspond to a physical resource. For example, it
32 may be convenient in certain cases to force resource == thread.
33 A resource is represented by a WT_RESOURCE structure.
34
35 a resource identifier - a pair of {resource type, value}. A value is
36 an ulonglong number. Represented by a WT_RESOURCE_ID structure.
37
38 a resource type - a pointer to a statically defined instance of
39 WT_RESOURCE_TYPE structure. This structure contains a pointer to
40 a function that knows how to compare values of this resource type.
41 In the simple case it could be wt_resource_id_memcmp().
42
  a wait-for graph - a graph that represents "wait-for" relationships.
44 It has two types of nodes - threads and resources. There are directed
45 edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
46 from a thread to resources that it "owns" (WT_THD::my_resources),
47 and from a resource to threads that "own" it (WT_RESOURCE::owners)
48
49 Graph completeness
50 ^^^^^^^^^^^^^^^^^^
51
52 For flawless deadlock detection wait-for graph must be complete.
53 It means that when a thread starts waiting it needs to know *all* its
54 blockers, and call wt_thd_will_wait_for() for every one of them.
55 Otherwise two phenomena should be expected:
56
57 1. Fuzzy timeouts:
58
59 thread A needs to get a lock, and is blocked by a thread B.
60 it waits.
61 Just before the timeout thread B releases the lock.
62 thread A is ready to grab the lock but discovers that it is also
63 blocked by a thread C.
64 It waits and times out.
65
66 As a result thread A has waited two timeout intervals, instead of one.
67
68 2. Unreliable cycle detection:
69
70 Thread A waits for threads B and C
71 Thread C waits for D
72 Thread D wants to start waiting for A
73
74 one can see immediately that thread D creates a cycle, and thus
75 a deadlock is detected.
76
77 But if thread A would only wait for B, and start waiting for C
78 when B would unlock, thread D would be allowed to wait, a deadlock
79 would be only detected when B unlocks or somebody times out.
80
  These two phenomena don't affect correctness, and strictly speaking,
82 the caller is not required to call wt_thd_will_wait_for() for *all*
83 blockers - it may optimize wt_thd_will_wait_for() calls. But they
84 may be perceived as bugs by users, it must be understood that such
85 an optimization comes with its price.
86
87 Usage
88 ^^^^^
89
90 First, the wt* subsystem must be initialized by calling
91 wt_init(). In the server you don't need to do it, it's done
92 in mysqld.cc.
93
94 Similarly, wt_end() frees wt* structures, should be called
95 at the end, but in the server mysqld.cc takes care of that.
96
97 Every WT_THD should be initialized with wt_thd_lazy_init().
98 After that they can be used in other wt_thd_* calls.
99 Before discarding, WT_THD should be free'd with
100 wt_thd_destroy(). In the server both are handled in sql_class.cc,
101 it's an error to try to do it manually.
102
103 To use the deadlock detection one needs to use this thread's WT_THD,
104 call wt_thd_will_wait_for() for every thread it needs to wait on,
105 then call wt_thd_cond_timedwait(). When thread releases a resource
106 it should call wt_thd_release() (or wt_thd_release_all()) - it will
107 notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
108 if appropriate.
109
110 Just like with pthread's cond_wait, there could be spurious
111 wake-ups from wt_thd_cond_timedwait(). A caller is expected to
112 handle that (that is, to re-check the blocking criteria).
113
114 wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
115 WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
116 WT_TIMEOUT. Out of memory and other fatal errors are reported as
117 WT_DEADLOCK - and a transaction must be aborted just the same.
118
119 Configuration
120 ^^^^^^^^^^^^^
121 There are four config variables. Two deadlock search depths - short and
122 long - and two timeouts. Deadlock search is performed with the short
123 depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
124 waits with a short timeout, performs a deadlock search with the long
125 depth, and waits with a long timeout. As most deadlock cycles are supposed
126 to be short, most deadlocks will be detected at once, and waits will
127 rarely be necessary.
128
129 These config variables are thread-local. Different threads may have
130 different search depth and timeout values.
131
132 Also, deadlock detector supports different killing strategies, the victim
133 in a deadlock cycle is selected based on the "weight". See "weight"
134 description in waiting_threads.h for details. It's up to the caller to
135 set weights accordingly.
136
137 Status
138 ^^^^^^
139 We calculate the number of successful waits (WT_OK returned from
140 wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
141 length distribution - number of deadlocks with every length from
142 1 to WT_CYCLE_STATS, and a wait time distribution - number
143 of waits with a time from 1 us to 1 min in WT_WAIT_STATS
144 intervals on a log e scale.
145*/
146
147/*
  Note that if your lock system satisfies the following condition:
149
  there exist four lock levels A, B, C, D, such that
151 A is compatible with B
152 A is not compatible with C
153 D is not compatible with B
154
155 (example A=IX, B=IS, C=S, D=X)
156
157 you need to include lock level in the resource identifier - a
158 thread waiting for lock of the type A on resource R and another
159 thread waiting for lock of the type B on resource R should wait on
160 different WT_RESOURCE structures, on different {lock, resource}
161 pairs. Otherwise the following is possible:
162
163 thread1> take S-lock on R
164 thread2> take IS-lock on R
165 thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
166 thread3 is killed (or timeout or whatever)
167 WT_RESOURCE structure for R is still in the hash, as it has two owners
168 thread4> wants an IX-lock on R
169 WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
170 !! now thread4 is waiting for both thread1 and thread2
171 !! while, in fact, IX-lock and IS-lock are compatible and
172 !! thread4 should not wait for thread2.
173*/
174
175#include <my_global.h>
176#include <waiting_threads.h>
177#include <m_string.h>
178
/* status variables */

/**
  preset table of wait intervals

  bucket boundaries for the wait-time distribution; filled in wt_init()
  with values from 1 us to 1 min on a log e scale
*/
ulonglong wt_wait_table[WT_WAIT_STATS];
/**
  wait time distribution (log e scale)

  the extra last element counts waits that ended with ETIMEDOUT
*/
uint32 wt_wait_stats[WT_WAIT_STATS+1];
/**
  distribution of cycle lengths
  first column tells whether this was during short or long detection
*/
uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
/** number of successful waits (WT_OK returned from wt_thd_cond_timedwait) */
uint32 wt_success_stats;

#ifdef HAVE_PSI_INTERFACE
extern PSI_cond_key key_WT_RESOURCE_cond;
#endif

/*
  incr() increments a statistics counter. With SAFE_STATISTICS defined the
  increment is atomic; otherwise it's a plain (cheap, possibly racy)
  increment - statistics may then be slightly inaccurate, which is accepted.
  The LOCK argument is currently unused.
*/
#ifdef SAFE_STATISTICS
#define incr(VAR, LOCK) do { my_atomic_add32(&(VAR), 1); } while(0)
#else
#define incr(VAR,LOCK) do { (VAR)++; } while(0)
#endif
205
/** increment the count of successful waits */
static void increment_success_stats()
{
  incr(wt_success_stats, success_stats_lock);
}
210
211static void increment_cycle_stats(uint depth, uint slot)
212{
213 if (depth >= WT_CYCLE_STATS)
214 depth= WT_CYCLE_STATS;
215 incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
216}
217
218static void increment_wait_stats(ulonglong waited,int ret)
219{
220 uint i;
221 if ((ret) == ETIMEDOUT)
222 i= WT_WAIT_STATS;
223 else
224 for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
225 incr(wt_wait_stats[i], wait_stats_lock);
226}
227
228/*
229 'lock' protects 'owners', 'state', and 'waiter_count'
230 'id' is read-only
231
232 a resource is picked up from a hash in a lock-free manner
233 it's returned pinned, so it cannot be freed at once
234 but it may be freed right after the pin is removed
235 to free a resource it should
236 1. have no owners
237 2. have no waiters
238
239 two ways to access a resource:
240 1. find it in a hash
241 - it's returned pinned.
242 a) take a lock in exclusive mode
243 b) check the state, it should be ACTIVE to be usable
244 c) unpin
245 2. by a direct reference
246 - could only used if a resource cannot be freed
247 e.g. accessing a resource by thd->waiting_for is safe,
248 a resource cannot be freed as there's a thread waiting for it
249*/
250struct st_wt_resource {
251 WT_RESOURCE_ID id;
252 uint waiter_count;
253 enum { ACTIVE, FREE } state;
254#ifndef DBUG_OFF
255 mysql_mutex_t *cond_mutex; /* a mutex for the 'cond' below */
256#endif
257
258#ifdef WT_RWLOCKS_USE_MUTEXES
259 /*
260 we need a special rwlock-like 'lock' to allow readers bypass
261 waiting writers, otherwise readers can deadlock. For example:
262
263 A waits on resource x, owned by B, B waits on resource y, owned
264 by A, we have a cycle (A->x->B->y->A)
265 Both A and B start deadlock detection:
266
267 A locks x B locks y
268 A goes deeper B goes deeper
269 A locks y B locks x
270
271 with mutexes it would deadlock. With rwlocks it won't, as long
272 as both A and B are taking read locks (and they do).
273 But other threads may take write locks. Assume there's
274 C who wants to start waiting on x, and D who wants to start
275 waiting on y.
276
277 A read-locks x B read-locks y
278 A goes deeper B goes deeper
279 => C write-locks x (to add a new edge) D write-locks y
280 .. C is blocked D is blocked
281 A read-locks y B read-locks x
282
283 Now, if a read lock can bypass a pending wrote lock request, we're fine.
284 If it can not, we have a deadlock.
285
286 writer starvation is technically possible, but unlikely, because
287 the contention is expected to be low.
288 */
289 struct {
290 pthread_cond_t cond;
291 pthread_mutex_t mutex;
292 uint readers: 16;
293 uint pending_writers: 15;
294 uint write_locked: 1;
295 } lock;
296#else
297 rw_lock_t lock;
298#endif
299 mysql_cond_t cond; /* the corresponding mutex is provided by the caller */
300 DYNAMIC_ARRAY owners;
301};
302
303#ifdef WT_RWLOCKS_USE_MUTEXES
/* initialize the mutex+condvar based rwlock emulation of a resource */
static void rc_rwlock_init(WT_RESOURCE *rc)
{
  pthread_cond_init(&rc->lock.cond, 0);
  pthread_mutex_init(&rc->lock.mutex, MY_MUTEX_INIT_FAST);
}
/* destroy the rwlock emulation; the lock must not be held by anyone */
static void rc_rwlock_destroy(WT_RESOURCE *rc)
{
  DBUG_ASSERT(rc->lock.write_locked == 0);
  DBUG_ASSERT(rc->lock.readers == 0);
  pthread_cond_destroy(&rc->lock.cond);
  pthread_mutex_destroy(&rc->lock.mutex);
}
/* take a read lock; readers bypass *pending* writers by design (see above) */
static void rc_rdlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
  pthread_mutex_lock(&rc->lock.mutex);
  /* only an *active* writer blocks us, not pending ones */
  while (rc->lock.write_locked)
    pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
  rc->lock.readers++;
  pthread_mutex_unlock(&rc->lock.mutex);
  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
}
/* take a write lock; waits until there are no readers and no writer */
static void rc_wrlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
  pthread_mutex_lock(&rc->lock.mutex);
  while (rc->lock.write_locked || rc->lock.readers)
    pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
  rc->lock.write_locked= 1;
  pthread_mutex_unlock(&rc->lock.mutex);
  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
}
/* release a read or write lock taken by rc_rdlock()/rc_wrlock() */
static void rc_unlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
  pthread_mutex_lock(&rc->lock.mutex);
  if (rc->lock.write_locked)
  {
    rc->lock.write_locked= 0;
    pthread_cond_broadcast(&rc->lock.cond);
  }
  else if (--rc->lock.readers == 0)
    /* wake everybody only when the last reader leaves */
    pthread_cond_broadcast(&rc->lock.cond);
  pthread_mutex_unlock(&rc->lock.mutex);
}
349#else
/* initialize the resource lock - thin wrapper over the native rwlock */
static void rc_rwlock_init(WT_RESOURCE *rc)
{
  my_rwlock_init(&rc->lock, 0);
}
/* destroy the resource lock - thin wrapper over the native rwlock */
static void rc_rwlock_destroy(WT_RESOURCE *rc)
{
  rwlock_destroy(&rc->lock);
}
/* take a read lock on the resource */
static void rc_rdlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
  rw_rdlock(&rc->lock);
  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
}
/* take a write lock on the resource */
static void rc_wrlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
  rw_wrlock(&rc->lock);
  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
}
/* release a read or write lock on the resource */
static void rc_unlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
  rw_unlock(&rc->lock);
}
375#endif
376
377/*
378 All resources are stored in a lock-free hash. Different threads
379 may add new resources and perform deadlock detection concurrently.
380*/
381static LF_HASH reshash;
382
383/**
384 WT_RESOURCE constructor
385
386 It's called from lf_hash and takes a pointer to an LF_SLIST instance.
387 WT_RESOURCE is located at arg+sizeof(LF_SLIST)
388*/
389static void wt_resource_create(uchar *arg)
390{
391 WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
392 DBUG_ENTER("wt_resource_create");
393
394 bzero(rc, sizeof(*rc));
395 rc_rwlock_init(rc);
396 mysql_cond_init(key_WT_RESOURCE_cond, &rc->cond, 0);
397 my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 0, 5, MYF(0));
398 DBUG_VOID_RETURN;
399}
400
401/**
402 WT_RESOURCE destructor
403
404 It's called from lf_hash and takes a pointer to an LF_SLIST instance.
405 WT_RESOURCE is located at arg+sizeof(LF_SLIST)
406*/
407static void wt_resource_destroy(uchar *arg)
408{
409 WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
410 DBUG_ENTER("wt_resource_destroy");
411
412 DBUG_ASSERT(rc->owners.elements == 0);
413 rc_rwlock_destroy(rc);
414 mysql_cond_destroy(&rc->cond);
415 delete_dynamic(&rc->owners);
416 DBUG_VOID_RETURN;
417}
418
419/**
420 WT_RESOURCE initializer
421
422 It's called from lf_hash when an element is inserted.
423*/
424static void wt_resource_init(LF_HASH *hash __attribute__((unused)),
425 WT_RESOURCE *rc, WT_RESOURCE_ID *id)
426{
427 DBUG_ENTER("wt_resource_init");
428 rc->id= *id;
429 rc->waiter_count= 0;
430 rc->state= ACTIVE;
431#ifndef DBUG_OFF
432 rc->cond_mutex= 0;
433#endif
434 DBUG_VOID_RETURN;
435}
436
437static int wt_init_done;
438
439void wt_init()
440{
441 DBUG_ENTER("wt_init");
442 DBUG_ASSERT(reshash.alloc.constructor != wt_resource_create);
443
444 lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
445 sizeof_WT_RESOURCE_ID, 0, 0);
446 reshash.alloc.constructor= wt_resource_create;
447 reshash.alloc.destructor= wt_resource_destroy;
448 reshash.initializer= (lf_hash_initializer) wt_resource_init;
449
450 bzero(wt_wait_stats, sizeof(wt_wait_stats));
451 bzero(wt_cycle_stats, sizeof(wt_cycle_stats));
452 wt_success_stats= 0;
453 { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
454 int i;
455 double from= log(1); /* 1 us */
456 double to= log(60e6); /* 1 min */
457 for (i= 0; i < WT_WAIT_STATS; i++)
458 {
459 wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
460 DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
461 }
462 }
463 wt_init_done= 1;
464 DBUG_VOID_RETURN;
465}
466
467void wt_end()
468{
469 DBUG_ENTER("wt_end");
470 if (!wt_init_done)
471 DBUG_VOID_RETURN;
472
473 DBUG_ASSERT(reshash.count == 0);
474 lf_hash_destroy(&reshash);
475 reshash.alloc.constructor= NULL;
476 wt_init_done= 0;
477 DBUG_VOID_RETURN;
478}
479
480/**
481 Lazy WT_THD initialization
482
483 Cheap initialization of WT_THD. Only initialize fields that don't require
484 memory allocations - basically, it only does assignments. The rest of the
485 WT_THD structure will be initialized on demand, on the first use.
486 This allows one to initialize lazily all WT_THD structures, even if some
487 (or even most) of them will never be used for deadlock detection.
488
489 @param ds a pointer to deadlock search depth short value
490 @param ts a pointer to deadlock timeout short value (microseconds)
491 @param dl a pointer to deadlock search depth long value
492 @param tl a pointer to deadlock timeout long value (microseconds)
493
494 @note these are pointers to values, and WT_THD stores them as pointers.
495 It allows one later to change search depths and timeouts for existing
496 threads. It also means that the pointers must stay valid for the lifetime
497 of WT_THD.
498*/
499void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
500 const ulong *dl, const ulong *tl)
501{
502 DBUG_ENTER("wt_thd_lazy_init");
503 thd->waiting_for= 0;
504 thd->weight= 0;
505 thd->deadlock_search_depth_short= ds;
506 thd->timeout_short= ts;
507 thd->deadlock_search_depth_long= dl;
508 thd->timeout_long= tl;
509 /* dynamic array is also initialized lazily - without memory allocations */
510 my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5, MYF(0));
511#ifndef DBUG_OFF
512 thd->name= my_thread_name();
513#endif
514 DBUG_VOID_RETURN;
515}
516
517/**
518 Finalize WT_THD initialization
519
520 After lazy WT_THD initialization, parts of the structure are still
521 uninitialized. This function completes the initialization, allocating
522 memory, if necessary. It's called automatically on demand, when WT_THD
523 is about to be used.
524*/
525static int fix_thd_pins(WT_THD *thd)
526{
527 if (unlikely(thd->pins == 0))
528 {
529 thd->pins= lf_hash_get_pins(&reshash);
530#ifndef DBUG_OFF
531 thd->name= my_thread_name();
532#endif
533 }
534 return thd->pins == 0;
535}
536
537void wt_thd_destroy(WT_THD *thd)
538{
539 DBUG_ENTER("wt_thd_destroy");
540
541 DBUG_ASSERT(thd->my_resources.elements == 0);
542 DBUG_ASSERT(thd->waiting_for == 0);
543
544 if (thd->pins != 0)
545 lf_hash_put_pins(thd->pins);
546
547 delete_dynamic(&thd->my_resources);
548 DBUG_VOID_RETURN;
549}
550/**
551 Trivial resource id comparison function - bytewise memcmp.
552
553 It can be used in WT_RESOURCE_TYPE structures where bytewise
554 comparison of values is sufficient.
555*/
556my_bool wt_resource_id_memcmp(const void *a, const void *b)
557{
558 /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
559 compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
560 return MY_TEST(memcmp(a, b, sizeof_WT_RESOURCE_ID));
561}
562
563/**
564 arguments for the recursive deadlock_search function
565*/
566struct deadlock_arg {
567 WT_THD * const thd; /**< starting point of a search */
568 uint const max_depth; /**< search depth limit */
569 WT_THD *victim; /**< a thread to be killed to resolve a deadlock */
570 WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
571};
572
573/**
574 helper function to change the victim, according to the weight
575*/
576static void change_victim(WT_THD* found, struct deadlock_arg *arg)
577{
578 if (found->weight < arg->victim->weight)
579 {
580 if (arg->victim != arg->thd)
581 {
582 rc_unlock(arg->victim->waiting_for); /* release the previous victim */
583 DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
584 }
585 arg->victim= found;
586 arg->last_locked_rc= 0;
587 }
588}
589
590/**
591 recursive loop detection in a wait-for graph with a limited search depth
592*/
593static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
594 uint depth)
595{
596 WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for;
597 WT_THD *cursor;
598 uint i;
599 int ret= WT_OK;
600 DBUG_ENTER("deadlock_search");
601 DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u",
602 arg->thd->name, blocker->name, depth));
603
604 arg->last_locked_rc= 0;
605
606 if (depth > arg->max_depth)
607 {
608 DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)"));
609 DBUG_RETURN(WT_DEPTH_EXCEEDED);
610 }
611
612retry:
613 /*
614 safe dereference as explained in lf_alloc-pin.c
615 (in short: protects against lf_alloc_free() in lf_hash_delete())
616 */
617 do
618 {
619 rc= *shared_ptr;
620 lf_pin(arg->thd->pins, 0, rc);
621 } while (rc != *shared_ptr && LF_BACKOFF());
622
623 if (rc == 0)
624 {
625 DBUG_PRINT("wt", ("exit: OK (early)"));
626 DBUG_RETURN(0);
627 }
628
629 rc_rdlock(rc);
630 if (rc->state != ACTIVE || *shared_ptr != rc)
631 {
632 /* blocker is not waiting on this resource anymore */
633 rc_unlock(rc);
634 lf_unpin(arg->thd->pins, 0);
635 goto retry;
636 }
637 /* as the state is locked, we can unpin now */
638 lf_unpin(arg->thd->pins, 0);
639
640 /*
641 Below is not a pure depth-first search. It's a depth-first with a
642 slightest hint of breadth-first. Depth-first is:
643
644 check(element, X):
645 foreach current in element->nodes[] do:
646 if current == X return error;
647 check(current, X);
648
649 while we do
650
651 check(element, X):
652 foreach current in element->nodes[] do:
653 if current == X return error;
654 foreach current in element->nodes[] do:
655 check(current, X);
656
657 preferring shorter deadlocks over longer ones.
658 */
659 for (i= 0; i < rc->owners.elements; i++)
660 {
661 cursor= *dynamic_element(&rc->owners, i, WT_THD**);
662 /*
663 We're only looking for (and detecting) cycles that include 'arg->thd'.
664 That is, only deadlocks that *we* have created. For example,
665 thd->A->B->thd
666 (thd waits for A, A waits for B, while B is waiting for thd).
667 While walking the graph we can encounter other cicles, e.g.
668 thd->A->B->C->A
669 This will not be detected. Instead we will walk it in circles until
670 the search depth limit is reached (the latter guarantees that an
671 infinite loop is impossible). We expect the thread that has created
672 the cycle (one of A, B, and C) to detect its deadlock.
673 */
674 if (cursor == arg->thd)
675 {
676 ret= WT_DEADLOCK;
677 increment_cycle_stats(depth, arg->max_depth ==
678 *arg->thd->deadlock_search_depth_long);
679 arg->victim= cursor;
680 goto end;
681 }
682 }
683 for (i= 0; i < rc->owners.elements; i++)
684 {
685 cursor= *dynamic_element(&rc->owners, i, WT_THD**);
686 switch (deadlock_search(arg, cursor, depth+1)) {
687 case WT_OK:
688 break;
689 case WT_DEPTH_EXCEEDED:
690 ret= WT_DEPTH_EXCEEDED;
691 break;
692 case WT_DEADLOCK:
693 ret= WT_DEADLOCK;
694 change_victim(cursor, arg); /* also sets arg->last_locked_rc to 0 */
695 i= rc->owners.elements; /* jump out of the loop */
696 break;
697 default:
698 DBUG_ASSERT(0);
699 }
700 if (arg->last_locked_rc)
701 rc_unlock(arg->last_locked_rc);
702 }
703end:
704 /*
705 Note that 'rc' is locked in this function, but it's never unlocked here.
706 Instead it's saved in arg->last_locked_rc and the *caller* is
707 expected to unlock it. It's done to support different killing
708 strategies. This is how it works:
709 Assuming a graph
710
711 thd->A->B->C->thd
712
713 deadlock_search() function starts from thd, locks it (in fact it locks not
714 a thd, but a resource it is waiting on, but below, for simplicity, I'll
715 talk about "locking a thd"). Then it goes down recursively, locks A, and so
716 on. Goes down recursively, locks B. Goes down recursively, locks C.
717 Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
718 Returns from the last deadlock_search() call. C stays locked!
719 Now it checks whether C is a more appropriate victim than 'thd'.
720 If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
721 Now it checks whether B is a more appropriate victim than arg->victim.
722 If yes - old arg->victim is unlocked and arg->victim=B,
723 otherwise B is unlocked. Return.
724 And so on.
725
726 In short, a resource is locked in a frame. But it's not unlocked in the
727 same frame, it's unlocked by the caller, and only after the caller checks
728 that it doesn't need to use current WT_THD as a victim. If it does - the
729 lock is kept and the old victim's resource is unlocked. When the recursion
730 is unrolled and we are back to deadlock() function, there are only two
731 locks left - on thd and on the victim.
732 */
733 arg->last_locked_rc= rc;
734 DBUG_PRINT("wt", ("exit: %s",
735 ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
736 ret ? "WT_DEADLOCK" : "OK"));
737 DBUG_RETURN(ret);
738}
739
740/**
741 Deadlock detection in a wait-for graph
742
743 A wrapper for recursive deadlock_search() - prepares deadlock_arg structure,
744 invokes deadlock_search(), increments statistics, notifies the victim.
745
746 @param thd thread that is going to wait. Deadlock is detected
747 if, while walking the graph, we reach a thread that
748 is waiting on thd
749 @param blocker starting point of a search. In wt_thd_cond_timedwait()
750 it's thd, in wt_thd_will_wait_for() it's a thread that
751 thd is going to wait for
752 @param depth starting search depth. In general it's the number of
753 edges in the wait-for graph between thd and the
754 blocker. Practically only two values are used (and
755 supported) - when thd == blocker it's 0, when thd
756 waits directly for blocker, it's 1
757 @param max_depth search depth limit
758*/
759static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
760 uint max_depth)
761{
762 struct deadlock_arg arg= {thd, max_depth, 0, 0};
763 int ret;
764 DBUG_ENTER("deadlock");
765 DBUG_ASSERT(depth < 2);
766 ret= deadlock_search(&arg, blocker, depth);
767 if (ret == WT_DEPTH_EXCEEDED)
768 {
769 increment_cycle_stats(WT_CYCLE_STATS, max_depth ==
770 *thd->deadlock_search_depth_long);
771 ret= WT_OK;
772 }
773 /*
774 if we started with depth==1, blocker was never considered for a victim
775 in deadlock_search(). Do it here.
776 */
777 if (ret == WT_DEADLOCK && depth)
778 change_victim(blocker, &arg);
779 if (arg.last_locked_rc)
780 {
781 /*
782 Special return code if there's nobody to wait for.
783
784 depth == 0 means that we start the search from thd (thd == blocker).
785 ret == WT_OK means that no cycle was found and
786 arg.last_locked_rc == thd->waiting_for.
787 and arg.last_locked_rc->owners.elements == 0 means that
788 (applying the rule above) thd->waiting_for->owners.elements == 0,
789 and thd doesn't have anybody to wait for.
790 */
791 if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
792 {
793 DBUG_ASSERT(thd == blocker);
794 DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
795 ret= WT_FREE_TO_GO;
796 }
797 rc_unlock(arg.last_locked_rc);
798 }
799 /* notify the victim, if appropriate */
800 if (ret == WT_DEADLOCK && arg.victim != thd)
801 {
802 DBUG_PRINT("wt", ("killing %s", arg.victim->name));
803 arg.victim->killed= 1;
804 mysql_cond_broadcast(&arg.victim->waiting_for->cond);
805 rc_unlock(arg.victim->waiting_for);
806 ret= WT_OK;
807 }
808 DBUG_RETURN(ret);
809}
810
811
812/**
813 Delete an element from reshash if it has no waiters or owners
814
815 rc->lock must be locked by the caller and it's unlocked on return.
816*/
817static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
818{
819 uint keylen;
820 const void *key;
821 DBUG_ENTER("unlock_lock_and_free_resource");
822
823 DBUG_ASSERT(rc->state == ACTIVE);
824
825 if (rc->owners.elements || rc->waiter_count)
826 {
827 DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
828 rc->owners.elements, rc->waiter_count));
829 rc_unlock(rc);
830 DBUG_RETURN(0);
831 }
832
833 if (fix_thd_pins(thd))
834 {
835 rc_unlock(rc);
836 DBUG_RETURN(1);
837 }
838
839 /* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */
840 {
841 key= &rc->id;
842 keylen= sizeof_WT_RESOURCE_ID;
843 }
844
845 /*
846 To free the element correctly we need to:
847 1. take its lock (already done).
848 2. set the state to FREE
849 3. release the lock
850 4. remove from the hash
851 */
852 rc->state= FREE;
853 rc_unlock(rc);
854 DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
855}
856
857
858/**
859 register the fact that thd is not waiting anymore
860
861 decrease waiter_count, clear waiting_for, free the resource if appropriate.
862 thd->waiting_for must be locked!
863*/
864static int stop_waiting_locked(WT_THD *thd)
865{
866 int ret;
867 WT_RESOURCE *rc= thd->waiting_for;
868 DBUG_ENTER("stop_waiting_locked");
869
870 DBUG_ASSERT(rc->waiter_count);
871 DBUG_ASSERT(rc->state == ACTIVE);
872 rc->waiter_count--;
873 thd->waiting_for= 0;
874 ret= unlock_lock_and_free_resource(thd, rc);
875 DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK);
876}
877
878/**
879 register the fact that thd is not waiting anymore
880
881 locks thd->waiting_for and calls stop_waiting_locked().
882*/
883static int stop_waiting(WT_THD *thd)
884{
885 int ret;
886 WT_RESOURCE *rc= thd->waiting_for;
887 DBUG_ENTER("stop_waiting");
888
889 if (!rc)
890 DBUG_RETURN(WT_OK);
891 /*
892 nobody's trying to free the resource now,
893 as its waiter_count is guaranteed to be non-zero
894 */
895 rc_wrlock(rc);
896 ret= stop_waiting_locked(thd);
897 DBUG_RETURN(ret);
898}
899
900/**
901 notify the system that a thread needs to wait for another thread
902
903 called by a *waiter* to declare that it (thd) will wait for another
904 thread (blocker) on a specific resource (resid).
905 can be called many times, if many blockers own a blocking resource.
906 but must always be called with the same resource id - a thread cannot
907 wait for more than one resource at a time.
908
909 @return WT_OK or WT_DEADLOCK
910
911 As a new edge is added to the wait-for graph, a deadlock detection is
912 performed for this new edge.
913*/
914int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
915 const WT_RESOURCE_ID *resid)
916{
917 uint i;
918 WT_RESOURCE *rc;
919 DBUG_ENTER("wt_thd_will_wait_for");
920
921 DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu",
922 thd->name, blocker->name, (ulong)resid->value));
923
924 if (fix_thd_pins(thd))
925 DBUG_RETURN(WT_DEADLOCK);
926
927 if (thd->waiting_for == 0)
928 {
929 uint keylen;
930 const void *key;
931 /* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */
932 {
933 key= resid;
934 keylen= sizeof_WT_RESOURCE_ID;
935 }
936
937 DBUG_PRINT("wt", ("first blocker"));
938
939retry:
940 while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0)
941 {
942 DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));
943
944 if (lf_hash_insert(&reshash, thd->pins, resid) == -1) /* if OOM */
945 DBUG_RETURN(WT_DEADLOCK);
946 /*
947 Two cases: either lf_hash_insert() failed - because another thread
948 has just inserted a resource with the same id - and we need to retry.
949 Or lf_hash_insert() succeeded, and then we need to repeat
950 lf_hash_search() to find a real address of the newly inserted element.
951 That is, we don't care what lf_hash_insert() has returned.
952 And we need to repeat the loop anyway.
953 */
954 }
955 if (rc == MY_ERRPTR)
956 DBUG_RETURN(WT_DEADLOCK);
957
958 DBUG_PRINT("wt", ("found in hash rc=%p", rc));
959
960 rc_wrlock(rc);
961 if (rc->state != ACTIVE)
962 {
963 DBUG_PRINT("wt", ("but it's not active, retrying"));
964 /* Somebody has freed the element while we weren't looking */
965 rc_unlock(rc);
966 lf_hash_search_unpin(thd->pins);
967 goto retry;
968 }
969
970 lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */
971 thd->waiting_for= rc;
972 rc->waiter_count++;
973 thd->killed= 0;
974 }
975 else
976 {
977 DBUG_ASSERT(thd->waiting_for->id.type == resid->type);
978 DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0);
979 DBUG_PRINT("wt", ("adding another blocker"));
980
981 /*
982 we can safely access the resource here, it's in the hash as it has
983 non-zero waiter_count
984 */
985 rc= thd->waiting_for;
986 rc_wrlock(rc);
987 DBUG_ASSERT(rc->waiter_count);
988 DBUG_ASSERT(rc->state == ACTIVE);
989
990 if (thd->killed)
991 {
992 stop_waiting_locked(thd);
993 DBUG_RETURN(WT_DEADLOCK);
994 }
995 }
996 /*
997 Another thread could be waiting on this resource for this very 'blocker'.
998 In this case we should not add it to the list for the second time.
999 */
1000 for (i= 0; i < rc->owners.elements; i++)
1001 if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
1002 break;
1003 if (i >= rc->owners.elements)
1004 {
1005 if (push_dynamic(&blocker->my_resources, (void*)&rc))
1006 {
1007 stop_waiting_locked(thd);
1008 DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
1009 }
1010 if (push_dynamic(&rc->owners, (void*)&blocker))
1011 {
1012 pop_dynamic(&blocker->my_resources);
1013 stop_waiting_locked(thd);
1014 DBUG_RETURN(WT_DEADLOCK);
1015 }
1016 }
1017 rc_unlock(rc);
1018
1019 if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
1020 {
1021 stop_waiting(thd);
1022 DBUG_RETURN(WT_DEADLOCK);
1023 }
1024 DBUG_RETURN(WT_OK);
1025}
1026
1027/**
1028 called by a *waiter* (thd) to start waiting
1029
1030 It's supposed to be a drop-in replacement for
1031 mysql_cond_timedwait(), and it takes mutex as an argument.
1032
1033 @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
1034*/
int wt_thd_cond_timedwait(WT_THD *thd, mysql_mutex_t *mutex)
{
  int ret= WT_TIMEOUT;
  struct timespec timeout;
  my_hrtime_t before, after, starttime;
  WT_RESOURCE *rc= thd->waiting_for;
  ulonglong end_wait_time;
  DBUG_ENTER("wt_thd_cond_timedwait");
  DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));

#ifndef DBUG_OFF
  /*
    Debug-only sanity check: all waiters of one resource must wait on the
    same mutex, and the caller must hold it (condition-wait precondition).
  */
  if (rc->cond_mutex)
    DBUG_ASSERT(rc->cond_mutex == mutex);
  else
    rc->cond_mutex= mutex;
  mysql_mutex_assert_owner(mutex);
#endif

  before= starttime= my_hrtime();

  /* If every owner has already released the resource, don't wait at all */
  rc_wrlock(rc);
  if (rc->owners.elements == 0)
    ret= WT_OK;
  rc_unlock(rc);

  /*
    NOTE(review): starttime.val is presumably in microseconds - it is scaled
    by 1000 to feed the nanosecond timespec helper; confirm my_hrtime() units.
  */
  end_wait_time= starttime.val *1000 + (*thd->timeout_short)*1000000ULL;
  set_timespec_time_nsec(timeout, end_wait_time);
  /*
    NOTE(review): this assumes mysql_cond_timedwait()'s timeout return value
    equals WT_TIMEOUT - confirm against the WT_TIMEOUT definition.
  */
  if (ret == WT_TIMEOUT && !thd->killed)
    ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
  if (ret == WT_TIMEOUT && !thd->killed)
  {
    /*
      Short timeout expired: now pay for the expensive deep deadlock
      search, and if no deadlock is found, keep waiting until the long
      timeout (both timeouts are measured from the original start time).
    */
    int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
    if (r == WT_FREE_TO_GO)
      ret= WT_OK;
    else if (r != WT_OK)
      ret= WT_DEADLOCK;
    else if (*thd->timeout_long > *thd->timeout_short)
    {
      end_wait_time= starttime.val *1000 + (*thd->timeout_long)*1000000ULL;
      set_timespec_time_nsec(timeout, end_wait_time);
      if (!thd->killed)
        ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
    }
  }
  after= my_hrtime();
  if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
    ret= WT_DEADLOCK;
  increment_wait_stats(after.val-before.val, ret);
  if (ret == WT_OK)
    increment_success_stats();
  DBUG_RETURN(ret);
}
1087
1088/**
1089 called by a *blocker* when it releases a resource
1090
1091 it's conceptually similar to pthread_cond_broadcast, and must be done
1092 under the same mutex as wt_thd_cond_timedwait().
1093
1094 @param resid a resource to release. 0 to release all resources
1095*/
1096
void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
{
  uint i;
  DBUG_ENTER("wt_thd_release");

  /*
    Scan everything this thread owns; release either the single matching
    resource (resid != 0) or all of them (resid == 0).
  */
  for (i= 0; i < thd->my_resources.elements; i++)
  {
    WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
    if (!resid || (resid->type->compare(&rc->id, resid) == 0))
    {
      uint j;

      rc_wrlock(rc);
      /*
        nobody's trying to free the resource now,
        as its owners[] array is not empty (at least thd must be there)
      */
      DBUG_ASSERT(rc->state == ACTIVE);
      /* Find and remove ourselves from the resource's owner list */
      for (j= 0; j < rc->owners.elements; j++)
        if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
          break;
      DBUG_ASSERT(j < rc->owners.elements);
      delete_dynamic_element(&rc->owners, j);
      if (rc->owners.elements == 0)
      {
        /* Last owner gone: wake up all waiters on this resource */
        mysql_cond_broadcast(&rc->cond);
#ifndef DBUG_OFF
        /* The broadcast must happen under the same mutex the waiters use */
        if (rc->cond_mutex)
          mysql_mutex_assert_owner(rc->cond_mutex);
#endif
      }
      /*
        NOTE(review): the helper name suggests it unlocks rc and frees it
        once it has no owners or waiters left - confirm in its definition.
      */
      unlock_lock_and_free_resource(thd, rc);
      if (resid)
      {
        /* Single-resource mode: the match was found and released, done.
           (Deleting element i mid-loop is safe only because we return.) */
        delete_dynamic_element(&thd->my_resources, i);
        DBUG_VOID_RETURN;
      }
    }
  }
  if (!resid)
    reset_dynamic(&thd->my_resources); /* released everything: clear the list */
  DBUG_VOID_RETURN;
}
1140
1141