1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
39 | #pragma once |
40 | |
41 | #include <atomic> |
42 | |
43 | #include <db.h> |
44 | #include <toku_pthread.h> |
45 | #include <toku_time.h> |
46 | |
47 | #include <ft/comparator.h> |
48 | #include <ft/ft-ops.h> // just for DICTIONARY_ID.. |
49 | |
50 | #include <util/omt.h> |
51 | |
52 | #include "txnid_set.h" |
53 | #include "wfg.h" |
54 | #include "range_buffer.h" |
55 | |
56 | |
57 | namespace toku { |
58 | |
59 | class locktree; |
60 | class locktree_manager; |
61 | class lock_request; |
62 | class concurrent_tree; |
63 | |
64 | typedef int (*lt_create_cb)(locktree *lt, void *); |
65 | typedef void (*lt_destroy_cb)(locktree *lt); |
66 | typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *); |
67 | |
68 | struct lt_counters { |
69 | uint64_t wait_count, wait_time; |
70 | uint64_t long_wait_count, long_wait_time; |
71 | uint64_t timeout_count; |
72 | |
73 | void add(const lt_counters &rhs) { |
74 | wait_count += rhs.wait_count; |
75 | wait_time += rhs.wait_time; |
76 | long_wait_count += rhs.long_wait_count; |
77 | long_wait_time += rhs.long_wait_time; |
78 | timeout_count += rhs.timeout_count; |
79 | } |
80 | }; |
81 | |
82 | // Lock request state for some locktree |
83 | struct lt_lock_request_info { |
84 | omt<lock_request *> pending_lock_requests; |
85 | std::atomic_bool pending_is_empty; |
86 | toku_mutex_t mutex; |
87 | bool should_retry_lock_requests; |
88 | lt_counters counters; |
89 | std::atomic_ullong retry_want; |
90 | unsigned long long retry_done; |
91 | toku_mutex_t retry_mutex; |
92 | toku_cond_t retry_cv; |
93 | bool running_retry; |
94 | |
95 | void init(void); |
96 | void destroy(void); |
97 | }; |
98 | |
99 | // The locktree manager manages a set of locktrees, one for each open |
100 | // dictionary. Locktrees are retrieved from the manager. When they are no |
101 | // longer needed, they are be released by the user. |
102 | class locktree_manager { |
103 | public: |
104 | // param: create_cb, called just after a locktree is first created. |
105 | // destroy_cb, called just before a locktree is destroyed. |
106 | // escalate_cb, called after a locktree is escalated (with extra |
107 | // param) |
108 | void create(lt_create_cb create_cb, |
109 | lt_destroy_cb destroy_cb, |
110 | lt_escalate_cb escalate_cb, |
111 | void *); |
112 | |
113 | void destroy(void); |
114 | |
115 | size_t get_max_lock_memory(void); |
116 | |
117 | int set_max_lock_memory(size_t max_lock_memory); |
118 | |
119 | // effect: Get a locktree from the manager. If a locktree exists with the given |
120 | // dict_id, it is referenced and then returned. If one did not exist, it |
121 | // is created. It will use the comparator for comparing keys. The on_create |
122 | // callback (passed to locktree_manager::create()) will be called with the |
123 | // given extra parameter. |
124 | locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp, void *); |
125 | |
126 | void reference_lt(locktree *lt); |
127 | |
128 | // effect: Releases one reference on a locktree. If the reference count transitions |
129 | // to zero, the on_destroy callback is called before it gets destroyed. |
130 | void release_lt(locktree *lt); |
131 | |
132 | void get_status(LTM_STATUS status); |
133 | |
134 | // effect: calls the iterate function on each pending lock request |
135 | // note: holds the manager's mutex |
136 | typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id, |
137 | TXNID txnid, |
138 | const DBT *left_key, |
139 | const DBT *right_key, |
140 | TXNID blocking_txnid, |
141 | uint64_t start_time, |
142 | void *); |
143 | int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *); |
144 | |
145 | // effect: Determines if too many locks or too much memory is being used, |
146 | // Runs escalation on the manager if so. |
147 | // param: big_txn, if the current transaction is 'big' (has spilled rollback logs) |
148 | // returns: 0 if there enough resources to create a new lock, or TOKUDB_OUT_OF_LOCKS |
149 | // if there are not enough resources and lock escalation failed to free up |
150 | // enough resources for a new lock. |
151 | int check_current_lock_constraints(bool big_txn); |
152 | |
153 | bool over_big_threshold(void); |
154 | |
155 | void note_mem_used(uint64_t mem_used); |
156 | |
157 | void note_mem_released(uint64_t mem_freed); |
158 | |
159 | bool out_of_locks(void) const; |
160 | |
161 | // Escalate all locktrees |
162 | void escalate_all_locktrees(void); |
163 | |
164 | // Escalate a set of locktrees |
165 | void escalate_locktrees(locktree **locktrees, int num_locktrees); |
166 | |
167 | // effect: calls the private function run_escalation(), only ok to |
168 | // do for tests. |
169 | // rationale: to get better stress test coverage, we want a way to |
170 | // deterministicly trigger lock escalation. |
171 | void run_escalation_for_test(void); |
172 | void run_escalation(void); |
173 | |
174 | // Add time t to the escalator's wait time statistics |
175 | void add_escalator_wait_time(uint64_t t); |
176 | |
177 | void kill_waiter(void *); |
178 | |
179 | private: |
180 | static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024; |
181 | |
182 | // tracks the current number of locks and lock memory |
183 | uint64_t m_max_lock_memory; |
184 | uint64_t m_current_lock_memory; |
185 | |
186 | struct lt_counters m_lt_counters; |
187 | |
188 | // the create and destroy callbacks for the locktrees |
189 | lt_create_cb m_lt_create_callback; |
190 | lt_destroy_cb m_lt_destroy_callback; |
191 | lt_escalate_cb m_lt_escalate_callback; |
192 | void *; |
193 | |
194 | omt<locktree *> m_locktree_map; |
195 | |
196 | // the manager's mutex protects the locktree map |
197 | toku_mutex_t m_mutex; |
198 | |
199 | void mutex_lock(void); |
200 | |
201 | void mutex_unlock(void); |
202 | |
203 | // Manage the set of open locktrees |
204 | locktree *locktree_map_find(const DICTIONARY_ID &dict_id); |
205 | void locktree_map_put(locktree *lt); |
206 | void locktree_map_remove(locktree *lt); |
207 | |
208 | static int find_by_dict_id(locktree *const <, const DICTIONARY_ID &dict_id); |
209 | |
210 | void escalator_init(void); |
211 | void escalator_destroy(void); |
212 | |
213 | // statistics about lock escalation. |
214 | toku_mutex_t m_escalation_mutex; |
215 | uint64_t m_escalation_count; |
216 | tokutime_t m_escalation_time; |
217 | uint64_t m_escalation_latest_result; |
218 | uint64_t m_wait_escalation_count; |
219 | uint64_t m_wait_escalation_time; |
220 | uint64_t m_long_wait_escalation_count; |
221 | uint64_t m_long_wait_escalation_time; |
222 | |
223 | // the escalator coordinates escalation on a set of locktrees for a bunch of threads |
224 | class locktree_escalator { |
225 | public: |
226 | void create(void); |
227 | void destroy(void); |
228 | void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *), void *); |
229 | |
230 | private: |
231 | toku_mutex_t m_escalator_mutex; |
232 | toku_cond_t m_escalator_done; |
233 | bool m_escalator_running; |
234 | }; |
235 | |
236 | locktree_escalator m_escalator; |
237 | |
238 | friend class manager_unit_test; |
239 | }; |
240 | |
241 | // A locktree represents the set of row locks owned by all transactions |
242 | // over an open dictionary. Read and write ranges are represented as |
243 | // a left and right key which are compared with the given comparator |
244 | // |
245 | // Locktrees are not created and destroyed by the user. Instead, they are |
246 | // referenced and released using the locktree manager. |
247 | // |
248 | // A sample workflow looks like this: |
249 | // - Create a manager. |
250 | // - Get a locktree by dictionaroy id from the manager. |
251 | // - Perform read/write lock acquision on the locktree, add references to |
252 | // the locktree using the manager, release locks, release references, etc. |
253 | // - ... |
254 | // - Release the final reference to the locktree. It will be destroyed. |
255 | // - Destroy the manager. |
256 | class locktree { |
257 | public: |
258 | // effect: Creates a locktree |
259 | void create(locktree_manager *mgr, DICTIONARY_ID dict_id, const comparator &cmp); |
260 | |
261 | void destroy(void); |
262 | |
263 | // For thread-safe, external reference counting |
264 | void add_reference(void); |
265 | |
266 | // requires: the reference count is > 0 |
267 | // returns: the reference count, after decrementing it by one |
268 | uint32_t release_reference(void); |
269 | |
270 | // returns: the current reference count |
271 | uint32_t get_reference_count(void); |
272 | |
273 | // effect: Attempts to grant a read lock for the range of keys between [left_key, right_key]. |
274 | // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the |
275 | // given conflicts set with the txnids that hold conflicting locks in the range. |
276 | // If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS. |
277 | // note: Read locks cannot be shared between txnids, as one would expect. |
278 | // This is for simplicity since read locks are rare in MySQL. |
279 | int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn); |
280 | |
281 | // effect: Attempts to grant a write lock for the range of keys between [left_key, right_key]. |
282 | // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the |
283 | // given conflicts set with the txnids that hold conflicting locks in the range. |
284 | // If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS. |
285 | int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn); |
286 | |
287 | // effect: populate the conflicts set with the txnids that would preventing |
288 | // the given txnid from getting a lock on [left_key, right_key] |
289 | void get_conflicts(bool is_write_request, TXNID txnid, |
290 | const DBT *left_key, const DBT *right_key, txnid_set *conflicts); |
291 | |
292 | // effect: Release all of the lock ranges represented by the range buffer for a txnid. |
293 | void release_locks(TXNID txnid, const range_buffer *ranges); |
294 | |
295 | // effect: Runs escalation on this locktree |
296 | void escalate(lt_escalate_cb after_escalate_callback, void *); |
297 | |
298 | // returns: The userdata associated with this locktree, or null if it has not been set. |
299 | void *get_userdata(void) const; |
300 | |
301 | void set_userdata(void *userdata); |
302 | |
303 | locktree_manager *get_manager(void) const; |
304 | |
305 | void set_comparator(const comparator &cmp); |
306 | |
307 | int compare(const locktree *lt) const; |
308 | |
309 | DICTIONARY_ID get_dict_id() const; |
310 | |
311 | // Private info struct for storing pending lock request state. |
312 | // Only to be used by lock requests. We store it here as |
313 | // something less opaque than usual to strike a tradeoff between |
314 | // abstraction and code complexity. It is still fairly abstract |
315 | // since the lock_request object is opaque |
316 | struct lt_lock_request_info *get_lock_request_info(void); |
317 | |
318 | private: |
319 | locktree_manager *m_mgr; |
320 | DICTIONARY_ID m_dict_id; |
321 | uint32_t m_reference_count; |
322 | |
323 | // Since the memory referenced by this comparator is not owned by the |
324 | // locktree, the user must guarantee it will outlive the locktree. |
325 | // |
326 | // The ydb API accomplishes this by opening an ft_handle in the on_create |
327 | // callback, which will keep the underlying FT (and its descriptor) in memory |
328 | // for as long as the handle is open. The ft_handle is stored opaquely in the |
329 | // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra |
330 | comparator m_cmp; |
331 | |
332 | concurrent_tree *m_rangetree; |
333 | |
334 | void *m_userdata; |
335 | struct lt_lock_request_info m_lock_request_info; |
336 | |
337 | // The following fields and members prefixed with "sto_" are for |
338 | // the single txnid optimization, intended to speed up the case |
339 | // when only one transaction is using the locktree. If we know |
340 | // the locktree has only one transaction, then acquiring locks |
341 | // takes O(1) work and releasing all locks takes O(1) work. |
342 | // |
343 | // How do we know that the locktree only has a single txnid? |
344 | // What do we do if it does? |
345 | // |
346 | // When a txn with txnid T requests a lock: |
347 | // - If the tree is empty, the optimization is possible. Set the single |
348 | // txnid to T, and insert the lock range into the buffer. |
349 | // - If the tree is not empty, check if the single txnid is T. If so, |
350 | // append the lock range to the buffer. Otherwise, migrate all of |
351 | // the locks in the buffer into the rangetree on behalf of txnid T, |
352 | // and invalid the single txnid. |
353 | // |
354 | // When a txn with txnid T releases its locks: |
355 | // - If the single txnid is valid, it must be for T. Destroy the buffer. |
356 | // - If it's not valid, release locks the normal way in the rangetree. |
357 | // |
358 | // To carry out the optimization we need to record a single txnid |
359 | // and a range buffer for each locktree, each protected by the root |
360 | // lock of the locktree's rangetree. The root lock for a rangetree |
361 | // is grabbed by preparing a locked keyrange on the rangetree. |
362 | TXNID m_sto_txnid; |
363 | range_buffer m_sto_buffer; |
364 | |
365 | // The single txnid optimization speeds up the case when only one |
366 | // transaction is using the locktree. But it has the potential to |
367 | // hurt the case when more than one txnid exists. |
368 | // |
369 | // There are two things we need to do to make the optimization only |
370 | // optimize the case we care about, and not hurt the general case. |
371 | // |
372 | // Bound the worst-case latency for lock migration when the |
373 | // optimization stops working: |
374 | // - Idea: Stop the optimization and migrate immediate if we notice |
375 | // the single txnid has takes many locks in the range buffer. |
376 | // - Implementation: Enforce a max size on the single txnid range buffer. |
377 | // - Analysis: Choosing the perfect max value, M, is difficult to do |
378 | // without some feedback from the field. Intuition tells us that M should |
379 | // not be so small that the optimization is worthless, and it should not |
380 | // be so big that it's unreasonable to have to wait behind a thread doing |
381 | // the work of converting M buffer locks into rangetree locks. |
382 | // |
383 | // Prevent concurrent-transaction workloads from trying the optimization |
384 | // in vain: |
385 | // - Idea: Don't even bother trying the optimization if we think the |
386 | // system is in a concurrent-transaction state. |
387 | // - Implementation: Do something even simpler than detecting whether the |
388 | // system is in a concurent-transaction state. Just keep a "score" value |
389 | // and some threshold. If at any time the locktree is eligible for the |
390 | // optimization, only do it if the score is at this threshold. When you |
391 | // actually do the optimization but someone has to migrate locks in the buffer |
392 | // (expensive), then reset the score back to zero. Each time a txn |
393 | // releases locks, the score is incremented by 1. |
394 | // - Analysis: If you let the threshold be "C", then at most 1 / C txns will |
395 | // do the optimization in a concurrent-transaction system. Similarly, it |
396 | // takes at most C txns to start using the single txnid optimzation, which |
397 | // is good when the system transitions from multithreaded to single threaded. |
398 | // |
399 | // STO_BUFFER_MAX_SIZE: |
400 | // |
401 | // We choose the max value to be 1 million since most transactions are smaller |
402 | // than 1 million and we can create a rangetree of 1 million elements in |
403 | // less than a second. So we can be pretty confident that this threshold |
404 | // enables the optimization almost always, and prevents super pathological |
405 | // latency issues for the first lock taken by a second thread. |
406 | // |
407 | // STO_SCORE_THRESHOLD: |
408 | // |
409 | // A simple first guess at a good value for the score threshold is 100. |
410 | // By our analysis, we'd end up doing the optimization in vain for |
411 | // around 1% of all transactions, which seems reasonable. Further, |
412 | // if the system goes single threaded, it ought to be pretty quick |
413 | // for 100 transactions to go by, so we won't have to wait long before |
414 | // we start doing the single txind optimzation again. |
415 | static const int STO_BUFFER_MAX_SIZE = 50 * 1024; |
416 | static const int STO_SCORE_THRESHOLD = 100; |
417 | int m_sto_score; |
418 | |
419 | // statistics about time spent ending the STO early |
420 | uint64_t m_sto_end_early_count; |
421 | tokutime_t m_sto_end_early_time; |
422 | |
423 | // effect: begins the single txnid optimizaiton, setting m_sto_txnid |
424 | // to the given txnid. |
425 | // requires: m_sto_txnid is invalid |
426 | void sto_begin(TXNID txnid); |
427 | |
428 | // effect: append a range to the sto buffer |
429 | // requires: m_sto_txnid is valid |
430 | void sto_append(const DBT *left_key, const DBT *right_key); |
431 | |
432 | // effect: ends the single txnid optimization, releaseing any memory |
433 | // stored in the sto buffer, notifying the tracker, and |
434 | // invalidating m_sto_txnid. |
435 | // requires: m_sto_txnid is valid |
436 | void sto_end(void); |
437 | |
438 | // params: prepared_lkr is a void * to a prepared locked keyrange. see below. |
439 | // effect: ends the single txnid optimization early, migrating buffer locks |
440 | // into the rangetree, calling sto_end(), and then setting the |
441 | // sto_score back to zero. |
442 | // requires: m_sto_txnid is valid |
443 | void sto_end_early(void *prepared_lkr); |
444 | void sto_end_early_no_accounting(void *prepared_lkr); |
445 | |
446 | // params: prepared_lkr is a void * to a prepared locked keyrange. we can't use |
447 | // the real type because the compiler won't allow us to forward declare |
448 | // concurrent_tree::locked_keyrange without including concurrent_tree.h, |
449 | // which we cannot do here because it is a template implementation. |
450 | // requires: the prepared locked keyrange is for the locktree's rangetree |
451 | // requires: m_sto_txnid is valid |
452 | // effect: migrates each lock in the single txnid buffer into the locktree's |
453 | // rangetree, notifying the memory tracker as necessary. |
454 | void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr); |
455 | |
456 | // effect: If m_sto_txnid is valid, then release the txnid's locks |
457 | // by ending the optimization. |
458 | // requires: If m_sto_txnid is valid, it is equal to the given txnid |
459 | // returns: True if locks were released for this txnid |
460 | bool sto_try_release(TXNID txnid); |
461 | |
462 | // params: prepared_lkr is a void * to a prepared locked keyrange. see above. |
463 | // requires: the prepared locked keyrange is for the locktree's rangetree |
464 | // effect: If m_sto_txnid is valid and equal to the given txnid, then |
465 | // append a range onto the buffer. Otherwise, if m_sto_txnid is valid |
466 | // but not equal to this txnid, then migrate the buffer's locks |
467 | // into the rangetree and end the optimization, setting the score |
468 | // back to zero. |
469 | // returns: true if the lock was acquired for this txnid |
470 | bool sto_try_acquire(void *prepared_lkr, TXNID txnid, |
471 | const DBT *left_key, const DBT *right_key); |
472 | |
473 | // Effect: |
474 | // Provides a hook for a helgrind suppression. |
475 | // Returns: |
476 | // true if m_sto_txnid is not TXNID_NONE |
477 | bool sto_txnid_is_valid_unsafe(void) const; |
478 | |
479 | // Effect: |
480 | // Provides a hook for a helgrind suppression. |
481 | // Returns: |
482 | // m_sto_score |
483 | int sto_get_score_unsafe(void )const; |
484 | |
485 | void remove_overlapping_locks_for_txnid(TXNID txnid, |
486 | const DBT *left_key, const DBT *right_key); |
487 | |
488 | int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, |
489 | const DBT *left_key, const DBT *right_key, |
490 | txnid_set *conflicts); |
491 | |
492 | int acquire_lock(bool is_write_request, TXNID txnid, |
493 | const DBT *left_key, const DBT *right_key, |
494 | txnid_set *conflicts); |
495 | |
496 | int try_acquire_lock(bool is_write_request, TXNID txnid, |
497 | const DBT *left_key, const DBT *right_key, |
498 | txnid_set *conflicts, bool big_txn); |
499 | |
500 | |
501 | friend class locktree_unit_test; |
502 | friend class manager_unit_test; |
503 | friend class lock_request_unit_test; |
504 | |
505 | // engine status reaches into the locktree to read some stats |
506 | friend void locktree_manager::get_status(LTM_STATUS status); |
507 | }; |
508 | |
509 | } /* namespace toku */ |
510 | |