1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
#include <string.h>

#include <toku_race_tools.h>

#include "treenode.h"
40 | |
41 | // TODO: source location info might have to be pulled up one caller |
42 | // to be useful |
void treenode::mutex_lock(void) {
    toku_mutex_lock(&m_mutex);
}

void treenode::mutex_unlock(void) {
    toku_mutex_unlock(&m_mutex);
}
48 | |
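// effect: initializes this node as empty and non-root, with the given
//         comparator, its own adaptive mutex, and no children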
49 | void treenode::init(const comparator *cmp) { |
50 | m_txnid = TXNID_NONE; |
51 | m_is_root = false; |
52 | m_is_empty = true; |
53 | m_cmp = cmp; |
54 | // use an adaptive mutex at each node since we expect the time the |
55 | // lock is held to be relatively short compared to a context switch. |
56 | // indeed, this improves performance at high thread counts considerably. |
57 | memset(&m_mutex, 0, sizeof(toku_mutex_t)); |
58 | toku_pthread_mutexattr_t attr; |
59 | toku_mutexattr_init(&attr); |
60 | toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE); |
61 | toku_mutex_init(*treenode_mutex_key, &m_mutex, &attr); |
62 | toku_mutexattr_destroy(&attr); |
63 | m_left_child.set(nullptr); |
64 | m_right_child.set(nullptr); |
65 | } |
66 | |
67 | void treenode::create_root(const comparator *cmp) { |
68 | init(cmp); |
69 | m_is_root = true; |
70 | } |
71 | |
72 | void treenode::destroy_root(void) { |
73 | invariant(is_root()); |
74 | invariant(is_empty()); |
75 | toku_mutex_destroy(&m_mutex); |
76 | m_cmp = nullptr; |
77 | } |
78 | |
79 | void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid) { |
80 | // allocates a new copy of the range for this node |
81 | m_range.create_copy(range); |
82 | m_txnid = txnid; |
83 | m_is_empty = false; |
84 | } |
85 | |
86 | bool treenode::is_root(void) { |
87 | return m_is_root; |
88 | } |
89 | |
90 | bool treenode::is_empty(void) { |
91 | return m_is_empty; |
92 | } |
93 | |
94 | bool treenode::range_overlaps(const keyrange &range) { |
95 | return m_range.overlaps(*m_cmp, range); |
96 | } |
97 | |
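// effect: allocates and initializes a new node with the given comparator
//         and a copy of the given range and txnid
// returns: the new node, not yet locked or linked into any tree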
98 | treenode *treenode::alloc(const comparator *cmp, const keyrange &range, TXNID txnid) { |
99 | treenode *XCALLOC(node); |
100 | node->init(cmp); |
101 | node->set_range_and_txnid(range, txnid); |
102 | return node; |
103 | } |
104 | |
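// effect: swaps the range and txnid of the two nodes, leaving their
//         child pointers and mutexes untouched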
105 | void treenode::swap_in_place(treenode *node1, treenode *node2) { |
106 | keyrange tmp_range = node1->m_range; |
107 | TXNID tmp_txnid = node1->m_txnid; |
108 | node1->m_range = node2->m_range; |
109 | node1->m_txnid = node2->m_txnid; |
110 | node2->m_range = tmp_range; |
111 | node2->m_txnid = tmp_txnid; |
112 | } |
113 | |
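// effect: destroys the node's range. the root is kept and marked empty,
//         while non-root nodes have their mutex destroyed and are freed.
// requires: the root is locked, non-root nodes are not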
114 | void treenode::free(treenode *node) { |
115 | // destroy the range, freeing any copied keys |
116 | node->m_range.destroy(); |
117 | |
118 | // the root is simply marked as empty. |
119 | if (node->is_root()) { |
120 | toku_mutex_assert_locked(&node->m_mutex); |
121 | node->m_is_empty = true; |
122 | } else { |
123 | toku_mutex_assert_unlocked(&node->m_mutex); |
124 | toku_mutex_destroy(&node->m_mutex); |
125 | toku_free(node); |
126 | } |
127 | } |
128 | |
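// returns: an estimate of this subtree's depth: one more than the larger
//          of the children's cached depth estimates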
129 | uint32_t treenode::get_depth_estimate(void) const { |
130 | const uint32_t left_est = m_left_child.depth_est; |
131 | const uint32_t right_est = m_right_child.depth_est; |
132 | return (left_est > right_est ? left_est : right_est) + 1; |
133 | } |
134 | |
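// effect: searches down the subtree rooted at this locked node,
//         rebalancing as it goes, for the deepest node whose next child
//         on the search path is either missing or overlaps the given
//         range. every other node visited is unlocked.
// requires: this node is locked and does not itself overlap the range
// returns: that node, locked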
135 | treenode *treenode::find_node_with_overlapping_child(const keyrange &range, |
136 | const keyrange::comparison *cmp_hint) { |
137 | |
138 | // determine which child to look at based on a comparison. if we were |
139 | // given a comparison hint, use that. otherwise, compare them now. |
140 | keyrange::comparison c = cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range); |
141 | |
142 | treenode *child; |
143 | if (c == keyrange::comparison::LESS_THAN) { |
144 | child = lock_and_rebalance_left(); |
145 | } else { |
146 | // The caller (locked_keyrange::acquire) handles the case where |
147 | // the root of the locked_keyrange is the node that overlaps. |
148 | // range is guaranteed not to overlap this node. |
149 | invariant(c == keyrange::comparison::GREATER_THAN); |
150 | child = lock_and_rebalance_right(); |
151 | } |
152 | |
    // if the search would lead us to an empty subtree (child == nullptr),
    // or the child overlaps, then we know this node is the parent we want.
    // otherwise we need to recurse into that child.
156 | if (child == nullptr) { |
157 | return this; |
158 | } else { |
159 | c = range.compare(*m_cmp, child->m_range); |
160 | if (c == keyrange::comparison::EQUALS || c == keyrange::comparison::OVERLAPS) { |
161 | child->mutex_unlock(); |
162 | return this; |
163 | } else { |
            // unlock this node before recursing into the locked child,
            // passing in a comparison hint since we just compared range
            // to the child's range.
167 | mutex_unlock(); |
168 | return child->find_node_with_overlapping_child(range, &c); |
169 | } |
170 | } |
171 | } |
172 | |
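// effect: performs an in-order traversal of this subtree, calling
//         function->fn(range, txnid) on each node whose range overlaps
//         the given range. if fn returns false, the right subtree of
//         the current node is not visited.
// requires: this node is locked; child locks are taken and released
//           as the traversal proceeds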
173 | template <class F> |
174 | void treenode::traverse_overlaps(const keyrange &range, F *function) { |
175 | keyrange::comparison c = range.compare(*m_cmp, m_range); |
176 | if (c == keyrange::comparison::EQUALS) { |
        // the tree is non-overlapping, so an exact match means nothing
        // else can overlap the target range. call fn once and return,
        // regardless of whether fn wants to keep going.
179 | function->fn(m_range, m_txnid); |
180 | return; |
181 | } |
182 | |
183 | treenode *left = m_left_child.get_locked(); |
184 | if (left) { |
185 | if (c != keyrange::comparison::GREATER_THAN) { |
186 | // Target range is less than this node, or it overlaps this |
187 | // node. There may be something on the left. |
188 | left->traverse_overlaps(range, function); |
189 | } |
190 | left->mutex_unlock(); |
191 | } |
192 | |
193 | if (c == keyrange::comparison::OVERLAPS) { |
194 | bool keep_going = function->fn(m_range, m_txnid); |
195 | if (!keep_going) { |
196 | return; |
197 | } |
198 | } |
199 | |
200 | treenode *right = m_right_child.get_locked(); |
201 | if (right) { |
202 | if (c != keyrange::comparison::LESS_THAN) { |
203 | // Target range is greater than this node, or it overlaps this |
204 | // node. There may be something on the right. |
205 | right->traverse_overlaps(range, function); |
206 | } |
207 | right->mutex_unlock(); |
208 | } |
209 | } |
210 | |
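// effect: inserts a new node with a copy of the given range and txnid
//         into this subtree, rebalancing on the way down.
// requires: this node is locked and the range does not overlap any
//           range already stored in the subtree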
211 | void treenode::insert(const keyrange &range, TXNID txnid) { |
    // choose a child to check. if that child is null, then insert the
    // new node there. otherwise, recurse down that child's subtree.
214 | keyrange::comparison c = range.compare(*m_cmp, m_range); |
215 | if (c == keyrange::comparison::LESS_THAN) { |
216 | treenode *left_child = lock_and_rebalance_left(); |
217 | if (left_child == nullptr) { |
218 | left_child = treenode::alloc(m_cmp, range, txnid); |
219 | m_left_child.set(left_child); |
220 | } else { |
221 | left_child->insert(range, txnid); |
222 | left_child->mutex_unlock(); |
223 | } |
224 | } else { |
225 | invariant(c == keyrange::comparison::GREATER_THAN); |
226 | treenode *right_child = lock_and_rebalance_right(); |
227 | if (right_child == nullptr) { |
228 | right_child = treenode::alloc(m_cmp, range, txnid); |
229 | m_right_child.set(right_child); |
230 | } else { |
231 | right_child->insert(range, txnid); |
232 | right_child->mutex_unlock(); |
233 | } |
234 | } |
235 | } |
236 | |
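// effect: recursively finds the rightmost (direction > 0) or leftmost
//         (direction < 0) node in this subtree, setting *parent to the
//         parent of the node found.
// requires: this node is locked. locks taken while descending are
//           released before returning.
// returns: the extreme node, which may be this node itself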
237 | treenode *treenode::find_child_at_extreme(int direction, treenode **parent) { |
238 | treenode *child = direction > 0 ? |
239 | m_right_child.get_locked() : m_left_child.get_locked(); |
240 | |
241 | if (child) { |
242 | *parent = this; |
243 | treenode *child_extreme = child->find_child_at_extreme(direction, parent); |
244 | child->mutex_unlock(); |
245 | return child_extreme; |
246 | } else { |
247 | return this; |
248 | } |
249 | } |
250 | |
251 | treenode *treenode::find_leftmost_child(treenode **parent) { |
252 | return find_child_at_extreme(-1, parent); |
253 | } |
254 | |
255 | treenode *treenode::find_rightmost_child(treenode **parent) { |
256 | return find_child_at_extreme(1, parent); |
257 | } |
258 | |
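// effect: removes this node's range from the subtree it roots. if the
//         node has a child, its contents are replaced with those of its
//         in-order predecessor or successor, which is then detached and
//         freed. a childless root is marked empty; a childless non-root
//         node is freed.
// requires: this node is locked
// returns: this node, still locked, or nullptr if it had no children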
259 | treenode *treenode::remove_root_of_subtree() { |
260 | // if this node has no children, just free it and return null |
261 | if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) { |
262 | // treenode::free requires that non-root nodes are unlocked |
263 | if (!is_root()) { |
264 | mutex_unlock(); |
265 | } |
266 | treenode::free(this); |
267 | return nullptr; |
268 | } |
269 | |
270 | // we have a child, so get either the in-order successor or |
271 | // predecessor of this node to be our replacement. |
272 | // replacement_parent is updated by the find functions as |
273 | // they recur down the tree, so initialize it to this. |
274 | treenode *child, *replacement; |
275 | treenode *replacement_parent = this; |
276 | if (m_left_child.ptr != nullptr) { |
277 | child = m_left_child.get_locked(); |
278 | replacement = child->find_rightmost_child(&replacement_parent); |
279 | invariant(replacement == child || replacement_parent != this); |
280 | |
281 | // detach the replacement from its parent |
282 | if (replacement_parent == this) { |
283 | m_left_child = replacement->m_left_child; |
284 | } else { |
285 | replacement_parent->m_right_child = replacement->m_left_child; |
286 | } |
287 | } else { |
288 | child = m_right_child.get_locked(); |
289 | replacement = child->find_leftmost_child(&replacement_parent); |
290 | invariant(replacement == child || replacement_parent != this); |
291 | |
292 | // detach the replacement from its parent |
293 | if (replacement_parent == this) { |
294 | m_right_child = replacement->m_right_child; |
295 | } else { |
296 | replacement_parent->m_left_child = replacement->m_right_child; |
297 | } |
298 | } |
299 | child->mutex_unlock(); |
300 | |
301 | // swap in place with the detached replacement, then destroy it |
302 | treenode::swap_in_place(replacement, this); |
303 | treenode::free(replacement); |
304 | |
305 | return this; |
306 | } |
307 | |
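// effect: removes and frees every node in this subtree. the root itself
//         is not freed, just marked empty.
// requires: the root is locked, non-root nodes are not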
308 | void treenode::recursive_remove(void) { |
309 | treenode *left = m_left_child.ptr; |
310 | if (left) { |
311 | left->recursive_remove(); |
312 | } |
313 | m_left_child.set(nullptr); |
314 | |
315 | treenode *right = m_right_child.ptr; |
316 | if (right) { |
317 | right->recursive_remove(); |
318 | } |
319 | m_right_child.set(nullptr); |
320 | |
321 | // we do not take locks on the way down, so we know non-root nodes |
322 | // are unlocked here and the caller is required to pass a locked |
323 | // root, so this free is correct. |
324 | treenode::free(this); |
325 | } |
326 | |
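// effect: removes the node whose range exactly matches the given range
//         from this subtree.
// requires: this node is locked and a node with that exact range exists
//           in the subtree
// returns: this node, still locked, or nullptr if this node itself was
//          removed and had no children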
327 | treenode *treenode::remove(const keyrange &range) { |
328 | treenode *child; |
329 | // if the range is equal to this node's range, then just remove |
330 | // the root of this subtree. otherwise search down the tree |
331 | // in either the left or right children. |
332 | keyrange::comparison c = range.compare(*m_cmp, m_range); |
333 | switch (c) { |
334 | case keyrange::comparison::EQUALS: |
335 | return remove_root_of_subtree(); |
336 | case keyrange::comparison::LESS_THAN: |
337 | child = m_left_child.get_locked(); |
338 | invariant_notnull(child); |
339 | child = child->remove(range); |
340 | |
        // unlock the child if there still is one.
        // regardless, set the left child pointer
343 | if (child) { |
344 | child->mutex_unlock(); |
345 | } |
346 | m_left_child.set(child); |
347 | break; |
348 | case keyrange::comparison::GREATER_THAN: |
349 | child = m_right_child.get_locked(); |
350 | invariant_notnull(child); |
351 | child = child->remove(range); |
352 | |
353 | // unlock the child if there still is one. |
354 | // regardless, set the right child pointer |
355 | if (child) { |
356 | child->mutex_unlock(); |
357 | } |
358 | m_right_child.set(child); |
359 | break; |
360 | case keyrange::comparison::OVERLAPS: |
361 | // shouldn't be overlapping, since the tree is |
362 | // non-overlapping and this range must exist |
363 | abort(); |
364 | } |
365 | |
366 | return this; |
367 | } |
368 | |
369 | bool treenode::left_imbalanced(int threshold) const { |
370 | uint32_t left_depth = m_left_child.depth_est; |
371 | uint32_t right_depth = m_right_child.depth_est; |
372 | return m_left_child.ptr != nullptr && left_depth > threshold + right_depth; |
373 | } |
374 | |
375 | bool treenode::right_imbalanced(int threshold) const { |
376 | uint32_t left_depth = m_left_child.depth_est; |
377 | uint32_t right_depth = m_right_child.depth_est; |
378 | return m_right_child.ptr != nullptr && right_depth > threshold + left_depth; |
379 | } |
380 | |
381 | // effect: rebalances the subtree rooted at this node |
382 | // using AVL style O(1) rotations. unlocks this |
383 | // node if it is not the new root of the subtree. |
384 | // requires: node is locked by this thread, children are not |
385 | // returns: locked root node of the rebalanced tree |
386 | treenode *treenode::maybe_rebalance(void) { |
387 | // if we end up not rotating at all, the new root is this |
388 | treenode *new_root = this; |
389 | treenode *child = nullptr; |
390 | |
391 | if (left_imbalanced(IMBALANCE_THRESHOLD)) { |
392 | child = m_left_child.get_locked(); |
393 | if (child->right_imbalanced(0)) { |
394 | treenode *grandchild = child->m_right_child.get_locked(); |
395 | |
396 | child->m_right_child = grandchild->m_left_child; |
397 | grandchild->m_left_child.set(child); |
398 | |
399 | m_left_child = grandchild->m_right_child; |
400 | grandchild->m_right_child.set(this); |
401 | |
402 | new_root = grandchild; |
403 | } else { |
404 | m_left_child = child->m_right_child; |
405 | child->m_right_child.set(this); |
406 | new_root = child; |
407 | } |
408 | } else if (right_imbalanced(IMBALANCE_THRESHOLD)) { |
409 | child = m_right_child.get_locked(); |
410 | if (child->left_imbalanced(0)) { |
411 | treenode *grandchild = child->m_left_child.get_locked(); |
412 | |
413 | child->m_left_child = grandchild->m_right_child; |
414 | grandchild->m_right_child.set(child); |
415 | |
416 | m_right_child = grandchild->m_left_child; |
417 | grandchild->m_left_child.set(this); |
418 | |
419 | new_root = grandchild; |
420 | } else { |
421 | m_right_child = child->m_left_child; |
422 | child->m_left_child.set(this); |
423 | new_root = child; |
424 | } |
425 | } |
426 | |
427 | // up to three nodes may be locked. |
428 | // - this |
429 | // - child |
    // - grandchild (but if it is locked, it's the new root)
431 | // |
432 | // one of them is the new root. we unlock everything except the new root. |
433 | if (child && child != new_root) { |
434 | TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex); |
435 | child->mutex_unlock(); |
436 | } |
437 | if (this != new_root) { |
438 | TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex); |
439 | mutex_unlock(); |
440 | } |
441 | TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex); |
442 | return new_root; |
443 | } |
444 | |
445 | |
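// effect: locks this node's left child, if it exists, rebalances the
//         subtree rooted there, and points m_left_child at its new root.
//         lock_and_rebalance_right is symmetric.
// requires: this node is locked
// returns: the locked new root of the left subtree, or nullptr if there
//          is no left child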
446 | treenode *treenode::lock_and_rebalance_left(void) { |
447 | treenode *child = m_left_child.get_locked(); |
448 | if (child) { |
449 | treenode *new_root = child->maybe_rebalance(); |
450 | m_left_child.set(new_root); |
451 | child = new_root; |
452 | } |
453 | return child; |
454 | } |
455 | |
456 | treenode *treenode::lock_and_rebalance_right(void) { |
457 | treenode *child = m_right_child.get_locked(); |
458 | if (child) { |
459 | treenode *new_root = child->maybe_rebalance(); |
460 | m_right_child.set(new_root); |
461 | child = new_root; |
462 | } |
463 | return child; |
464 | } |
465 | |
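// effect: points this child pointer at the given node and caches its
//         depth estimate (zero for nullptr)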
466 | void treenode::child_ptr::set(treenode *node) { |
467 | ptr = node; |
468 | depth_est = ptr ? ptr->get_depth_estimate() : 0; |
469 | } |
470 | |
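// effect: locks the child node, if there is one, and refreshes the
//         cached depth estimate
// returns: the locked child, or nullptr if there is no child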
471 | treenode *treenode::child_ptr::get_locked(void) { |
472 | if (ptr) { |
473 | ptr->mutex_lock(); |
474 | depth_est = ptr->get_depth_estimate(); |
475 | } |
476 | return ptr; |
477 | } |
478 | |