1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of PerconaFT.
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35======= */
36
37#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39#include <toku_race_tools.h>
40
41// TODO: source location info might have to be pulled up one caller
42// to be useful
43void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
44
45void treenode::mutex_unlock(void) {
46 toku_mutex_unlock(&m_mutex);
47}
48
49void treenode::init(const comparator *cmp) {
50 m_txnid = TXNID_NONE;
51 m_is_root = false;
52 m_is_empty = true;
53 m_cmp = cmp;
54 // use an adaptive mutex at each node since we expect the time the
55 // lock is held to be relatively short compared to a context switch.
56 // indeed, this improves performance at high thread counts considerably.
57 memset(&m_mutex, 0, sizeof(toku_mutex_t));
58 toku_pthread_mutexattr_t attr;
59 toku_mutexattr_init(&attr);
60 toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE);
61 toku_mutex_init(*treenode_mutex_key, &m_mutex, &attr);
62 toku_mutexattr_destroy(&attr);
63 m_left_child.set(nullptr);
64 m_right_child.set(nullptr);
65}
66
67void treenode::create_root(const comparator *cmp) {
68 init(cmp);
69 m_is_root = true;
70}
71
72void treenode::destroy_root(void) {
73 invariant(is_root());
74 invariant(is_empty());
75 toku_mutex_destroy(&m_mutex);
76 m_cmp = nullptr;
77}
78
79void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid) {
80 // allocates a new copy of the range for this node
81 m_range.create_copy(range);
82 m_txnid = txnid;
83 m_is_empty = false;
84}
85
86bool treenode::is_root(void) {
87 return m_is_root;
88}
89
90bool treenode::is_empty(void) {
91 return m_is_empty;
92}
93
94bool treenode::range_overlaps(const keyrange &range) {
95 return m_range.overlaps(*m_cmp, range);
96}
97
// effect: allocates and initializes a fresh non-root node holding a
//         copy of the given range and the given txnid.
// returns: the new node. presumably freed later via treenode::free() —
//          TODO confirm ownership at call sites.
treenode *treenode::alloc(const comparator *cmp, const keyrange &range, TXNID txnid) {
    // XCALLOC yields zero-initialized memory for the node before init() runs
    treenode *XCALLOC(node);
    node->init(cmp);
    node->set_range_and_txnid(range, txnid);
    return node;
}
104
105void treenode::swap_in_place(treenode *node1, treenode *node2) {
106 keyrange tmp_range = node1->m_range;
107 TXNID tmp_txnid = node1->m_txnid;
108 node1->m_range = node2->m_range;
109 node1->m_txnid = node2->m_txnid;
110 node2->m_range = tmp_range;
111 node2->m_txnid = tmp_txnid;
112}
113
114void treenode::free(treenode *node) {
115 // destroy the range, freeing any copied keys
116 node->m_range.destroy();
117
118 // the root is simply marked as empty.
119 if (node->is_root()) {
120 toku_mutex_assert_locked(&node->m_mutex);
121 node->m_is_empty = true;
122 } else {
123 toku_mutex_assert_unlocked(&node->m_mutex);
124 toku_mutex_destroy(&node->m_mutex);
125 toku_free(node);
126 }
127}
128
129uint32_t treenode::get_depth_estimate(void) const {
130 const uint32_t left_est = m_left_child.depth_est;
131 const uint32_t right_est = m_right_child.depth_est;
132 return (left_est > right_est ? left_est : right_est) + 1;
133}
134
// effect: searches down the tree (hand-over-hand locking) for the node
//         whose child overlaps the given range.
// returns: a locked node: either `this` (when the relevant child is
//          empty or overlaps the range) or a locked descendant found by
//          recursion. every other node touched is unlocked on the way.
// requires: this node is locked and does not itself overlap range.
treenode *treenode::find_node_with_overlapping_child(const keyrange &range,
                                                     const keyrange::comparison *cmp_hint) {

    // determine which child to look at based on a comparison. if we were
    // given a comparison hint, use that. otherwise, compare them now.
    keyrange::comparison c = cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);

    treenode *child;
    if (c == keyrange::comparison::LESS_THAN) {
        child = lock_and_rebalance_left();
    } else {
        // The caller (locked_keyrange::acquire) handles the case where
        // the root of the locked_keyrange is the node that overlaps.
        // range is guaranteed not to overlap this node.
        invariant(c == keyrange::comparison::GREATER_THAN);
        child = lock_and_rebalance_right();
    }

    // if the search would lead us to an empty subtree (child == nullptr),
    // or the child overlaps, then we know this node is the parent we want.
    // otherwise we need to recur into that child.
    if (child == nullptr) {
        return this;
    } else {
        c = range.compare(*m_cmp, child->m_range);
        if (c == keyrange::comparison::EQUALS || c == keyrange::comparison::OVERLAPS) {
            child->mutex_unlock();
            return this;
        } else {
            // unlock this node before recurring into the locked child,
            // passing in a comparison hint since we just compared range
            // to the child's range.
            mutex_unlock();
            return child->find_node_with_overlapping_child(range, &c);
        }
    }
}
172
// effect: in-order traversal (left, self, right) of every node whose
//         range overlaps the given range, invoking function->fn() on
//         each overlapping node. traversal stops early when fn()
//         returns false. child locks are taken and released
//         hand-over-hand as the traversal descends.
// NOTE(review): appears to require that the caller holds this node's
//               mutex — confirm against call sites.
template <class F>
void treenode::traverse_overlaps(const keyrange &range, F *function) {
    keyrange::comparison c = range.compare(*m_cmp, m_range);
    if (c == keyrange::comparison::EQUALS) {
        // Doesn't matter if fn wants to keep going, there
        // is nothing left, so return.
        function->fn(m_range, m_txnid);
        return;
    }

    treenode *left = m_left_child.get_locked();
    if (left) {
        if (c != keyrange::comparison::GREATER_THAN) {
            // Target range is less than this node, or it overlaps this
            // node. There may be something on the left.
            left->traverse_overlaps(range, function);
        }
        left->mutex_unlock();
    }

    if (c == keyrange::comparison::OVERLAPS) {
        // visit this node between the two children to preserve order
        bool keep_going = function->fn(m_range, m_txnid);
        if (!keep_going) {
            return;
        }
    }

    treenode *right = m_right_child.get_locked();
    if (right) {
        if (c != keyrange::comparison::LESS_THAN) {
            // Target range is greater than this node, or it overlaps this
            // node. There may be something on the right.
            right->traverse_overlaps(range, function);
        }
        right->mutex_unlock();
    }
}
210
// effect: inserts a new node holding a copy of the given range and the
//         given txnid into the subtree rooted at this node, rebalancing
//         subtrees along the descent.
// requires: range compares strictly less-than or greater-than every
//           node visited (the invariant aborts on anything else, so the
//           tree is presumably non-overlapping — see remove()).
void treenode::insert(const keyrange &range, TXNID txnid) {
    // choose a child to check. if that child is null, then insert the new node there.
    // otherwise recur down that child's subtree
    keyrange::comparison c = range.compare(*m_cmp, m_range);
    if (c == keyrange::comparison::LESS_THAN) {
        // lock_and_rebalance_left returns the locked new root of the
        // left subtree, or nullptr if the left subtree is empty
        treenode *left_child = lock_and_rebalance_left();
        if (left_child == nullptr) {
            left_child = treenode::alloc(m_cmp, range, txnid);
            m_left_child.set(left_child);
        } else {
            left_child->insert(range, txnid);
            left_child->mutex_unlock();
        }
    } else {
        invariant(c == keyrange::comparison::GREATER_THAN);
        treenode *right_child = lock_and_rebalance_right();
        if (right_child == nullptr) {
            right_child = treenode::alloc(m_cmp, range, txnid);
            m_right_child.set(right_child);
        } else {
            right_child->insert(range, txnid);
            right_child->mutex_unlock();
        }
    }
}
236
237treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
238 treenode *child = direction > 0 ?
239 m_right_child.get_locked() : m_left_child.get_locked();
240
241 if (child) {
242 *parent = this;
243 treenode *child_extreme = child->find_child_at_extreme(direction, parent);
244 child->mutex_unlock();
245 return child_extreme;
246 } else {
247 return this;
248 }
249}
250
251treenode *treenode::find_leftmost_child(treenode **parent) {
252 return find_child_at_extreme(-1, parent);
253}
254
255treenode *treenode::find_rightmost_child(treenode **parent) {
256 return find_child_at_extreme(1, parent);
257}
258
// effect: removes this node's payload from the subtree it roots. if the
//         node has a child, the payload of its in-order predecessor (or
//         successor) is swapped into this node and that replacement node
//         is freed instead, so this node's identity (and mutex) survive.
// returns: the root of the resulting subtree: nullptr if this node had
//          no children and was freed (or, for the tree root, marked
//          empty), otherwise `this`.
treenode *treenode::remove_root_of_subtree() {
    // if this node has no children, just free it and return null
    if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) {
        // treenode::free requires that non-root nodes are unlocked
        if (!is_root()) {
            mutex_unlock();
        }
        treenode::free(this);
        return nullptr;
    }

    // we have a child, so get either the in-order successor or
    // predecessor of this node to be our replacement.
    // replacement_parent is updated by the find functions as
    // they recur down the tree, so initialize it to this.
    treenode *child, *replacement;
    treenode *replacement_parent = this;
    if (m_left_child.ptr != nullptr) {
        child = m_left_child.get_locked();
        // in-order predecessor: rightmost node of the left subtree
        replacement = child->find_rightmost_child(&replacement_parent);
        invariant(replacement == child || replacement_parent != this);

        // detach the replacement from its parent
        if (replacement_parent == this) {
            // the left child itself had no right subtree, so it is the
            // replacement; adopt its left subtree directly
            m_left_child = replacement->m_left_child;
        } else {
            replacement_parent->m_right_child = replacement->m_left_child;
        }
    } else {
        child = m_right_child.get_locked();
        // in-order successor: leftmost node of the right subtree
        replacement = child->find_leftmost_child(&replacement_parent);
        invariant(replacement == child || replacement_parent != this);

        // detach the replacement from its parent
        if (replacement_parent == this) {
            m_right_child = replacement->m_right_child;
        } else {
            replacement_parent->m_left_child = replacement->m_right_child;
        }
    }
    child->mutex_unlock();

    // swap in place with the detached replacement, then destroy it
    treenode::swap_in_place(replacement, this);
    treenode::free(replacement);

    return this;
}
307
308void treenode::recursive_remove(void) {
309 treenode *left = m_left_child.ptr;
310 if (left) {
311 left->recursive_remove();
312 }
313 m_left_child.set(nullptr);
314
315 treenode *right = m_right_child.ptr;
316 if (right) {
317 right->recursive_remove();
318 }
319 m_right_child.set(nullptr);
320
321 // we do not take locks on the way down, so we know non-root nodes
322 // are unlocked here and the caller is required to pass a locked
323 // root, so this free is correct.
324 treenode::free(this);
325}
326
// effect: removes the node whose range compares EQUALS to the given
//         range from the subtree rooted at this node.
// returns: the (possibly new) root of this subtree, or nullptr if the
//          subtree became empty.
// requires: a node with exactly this range exists below here — an
//           OVERLAPS comparison aborts the process.
treenode *treenode::remove(const keyrange &range) {
    treenode *child;
    // if the range is equal to this node's range, then just remove
    // the root of this subtree. otherwise search down the tree
    // in either the left or right children.
    keyrange::comparison c = range.compare(*m_cmp, m_range);
    switch (c) {
    case keyrange::comparison::EQUALS:
        return remove_root_of_subtree();
    case keyrange::comparison::LESS_THAN:
        child = m_left_child.get_locked();
        invariant_notnull(child);
        child = child->remove(range);

        // unlock the child if there still is one.
        // regardless, set the left child pointer
        if (child) {
            child->mutex_unlock();
        }
        m_left_child.set(child);
        break;
    case keyrange::comparison::GREATER_THAN:
        child = m_right_child.get_locked();
        invariant_notnull(child);
        child = child->remove(range);

        // unlock the child if there still is one.
        // regardless, set the right child pointer
        if (child) {
            child->mutex_unlock();
        }
        m_right_child.set(child);
        break;
    case keyrange::comparison::OVERLAPS:
        // shouldn't be overlapping, since the tree is
        // non-overlapping and this range must exist
        abort();
    }

    return this;
}
368
369bool treenode::left_imbalanced(int threshold) const {
370 uint32_t left_depth = m_left_child.depth_est;
371 uint32_t right_depth = m_right_child.depth_est;
372 return m_left_child.ptr != nullptr && left_depth > threshold + right_depth;
373}
374
375bool treenode::right_imbalanced(int threshold) const {
376 uint32_t left_depth = m_left_child.depth_est;
377 uint32_t right_depth = m_right_child.depth_est;
378 return m_right_child.ptr != nullptr && right_depth > threshold + left_depth;
379}
380
// effect: rebalances the subtree rooted at this node
//         using AVL style O(1) rotations. unlocks this
//         node if it is not the new root of the subtree.
// requires: node is locked by this thread, children are not
// returns: locked root node of the rebalanced tree
treenode *treenode::maybe_rebalance(void) {
    // if we end up not rotating at all, the new root is this
    treenode *new_root = this;
    treenode *child = nullptr;

    if (left_imbalanced(IMBALANCE_THRESHOLD)) {
        child = m_left_child.get_locked();
        if (child->right_imbalanced(0)) {
            // left-right case: double rotation — the grandchild
            // (left child's right child) becomes the subtree root
            treenode *grandchild = child->m_right_child.get_locked();

            child->m_right_child = grandchild->m_left_child;
            grandchild->m_left_child.set(child);

            m_left_child = grandchild->m_right_child;
            grandchild->m_right_child.set(this);

            new_root = grandchild;
        } else {
            // left-left case: single rotation — the left child
            // becomes the subtree root
            m_left_child = child->m_right_child;
            child->m_right_child.set(this);
            new_root = child;
        }
    } else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
        child = m_right_child.get_locked();
        if (child->left_imbalanced(0)) {
            // right-left case: double rotation — the grandchild
            // (right child's left child) becomes the subtree root
            treenode *grandchild = child->m_left_child.get_locked();

            child->m_left_child = grandchild->m_right_child;
            grandchild->m_right_child.set(child);

            m_right_child = grandchild->m_left_child;
            grandchild->m_left_child.set(this);

            new_root = grandchild;
        } else {
            // right-right case: single rotation — the right child
            // becomes the subtree root
            m_right_child = child->m_left_child;
            child->m_left_child.set(this);
            new_root = child;
        }
    }

    // up to three nodes may be locked.
    // - this
    // - child
    // - grandchild (but if it is locked, its the new root)
    //
    // one of them is the new root. we unlock everything except the new root.
    if (child && child != new_root) {
        TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex);
        child->mutex_unlock();
    }
    if (this != new_root) {
        TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex);
        mutex_unlock();
    }
    TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex);
    return new_root;
}
444
445
446treenode *treenode::lock_and_rebalance_left(void) {
447 treenode *child = m_left_child.get_locked();
448 if (child) {
449 treenode *new_root = child->maybe_rebalance();
450 m_left_child.set(new_root);
451 child = new_root;
452 }
453 return child;
454}
455
456treenode *treenode::lock_and_rebalance_right(void) {
457 treenode *child = m_right_child.get_locked();
458 if (child) {
459 treenode *new_root = child->maybe_rebalance();
460 m_right_child.set(new_root);
461 child = new_root;
462 }
463 return child;
464}
465
466void treenode::child_ptr::set(treenode *node) {
467 ptr = node;
468 depth_est = ptr ? ptr->get_depth_estimate() : 0;
469}
470
471treenode *treenode::child_ptr::get_locked(void) {
472 if (ptr) {
473 ptr->mutex_lock();
474 depth_est = ptr->get_depth_estimate();
475 }
476 return ptr;
477}
478