locktree.h source code [MariaDB/storage/tokudb/PerconaFT/locktree/locktree.h]

1	/ -- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -- /
2	// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3	#ident "$Id$"
4	/======*
5	This file is part of PerconaFT.
6
7
8	Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10	PerconaFT is free software: you can redistribute it and/or modify
11	it under the terms of the GNU General Public License, version 2,
12	as published by the Free Software Foundation.
13
14	PerconaFT is distributed in the hope that it will be useful,
15	but WITHOUT ANY WARRANTY; without even the implied warranty of
16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	GNU General Public License for more details.
18
19	You should have received a copy of the GNU General Public License
20	along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22	----------------------------------------
23
24	PerconaFT is free software: you can redistribute it and/or modify
25	it under the terms of the GNU Affero General Public License, version 3,
26	as published by the Free Software Foundation.
27
28	PerconaFT is distributed in the hope that it will be useful,
29	but WITHOUT ANY WARRANTY; without even the implied warranty of
30	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31	GNU Affero General Public License for more details.
32
33	You should have received a copy of the GNU Affero General Public License
34	along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35	======= /*
36
37	#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39	#pragma once
40
41	#include <atomic>
42
43	#include <db.h>
44	#include <toku_pthread.h>
45	#include <toku_time.h>
46
47	#include <ft/comparator.h>
48	#include <ft/ft-ops.h> // just for DICTIONARY_ID..
49
50	#include <util/omt.h>
51
52	#include "txnid_set.h"
53	#include "wfg.h"
54	#include "range_buffer.h"
55
56
57	namespace toku {
58
59	class locktree;
60	class locktree_manager;
61	class lock_request;
62	class concurrent_tree;
63
64	typedef int (lt_create_cb)(locktree lt, void *extra);
65	typedef void (lt_destroy_cb)(locktree lt);
66	typedef void (lt_escalate_cb)(TXNID txnid, const* locktree lt, const* range_buffer &buffer, void *extra);
67
68	struct lt_counters {
69	uint64_t wait_count, wait_time;
70	uint64_t long_wait_count, long_wait_time;
71	uint64_t timeout_count;
72
73	void add(const lt_counters &rhs) {
74	wait_count += rhs.wait_count;
75	wait_time += rhs.wait_time;
76	long_wait_count += rhs.long_wait_count;
77	long_wait_time += rhs.long_wait_time;
78	timeout_count += rhs.timeout_count;
79	}
80	};
81
82	// Lock request state for some locktree
83	struct lt_lock_request_info {
84	omt<lock_request *> pending_lock_requests;
85	std::atomic_bool pending_is_empty;
86	toku_mutex_t mutex;
87	bool should_retry_lock_requests;
88	lt_counters counters;
89	std::atomic_ullong retry_want;
90	unsigned long long retry_done;
91	toku_mutex_t retry_mutex;
92	toku_cond_t retry_cv;
93	bool running_retry;
94
95	void init(void);
96	void destroy(void);
97	};
98
99	// The locktree manager manages a set of locktrees, one for each open
100	// dictionary. Locktrees are retrieved from the manager. When they are no
101	// longer needed, they are be released by the user.
102	class locktree_manager {
103	public:
104	// param: create_cb, called just after a locktree is first created.
105	// destroy_cb, called just before a locktree is destroyed.
106	// escalate_cb, called after a locktree is escalated (with extra
107	// param)
108	void create(lt_create_cb create_cb,
109	lt_destroy_cb destroy_cb,
110	lt_escalate_cb escalate_cb,
111	void *extra);
112
113	void destroy(void);
114
115	size_t get_max_lock_memory(void);
116
117	int set_max_lock_memory(size_t max_lock_memory);
118
119	// effect: Get a locktree from the manager. If a locktree exists with the given
120	// dict_id, it is referenced and then returned. If one did not exist, it
121	// is created. It will use the comparator for comparing keys. The on_create
122	// callback (passed to locktree_manager::create()) will be called with the
123	// given extra parameter.
124	locktree get_lt(DICTIONARY_ID dict_id, const* comparator &cmp, void *on_create_extra);
125
126	void reference_lt(locktree *lt);
127
128	// effect: Releases one reference on a locktree. If the reference count transitions
129	// to zero, the on_destroy callback is called before it gets destroyed.
130	void release_lt(locktree *lt);
131
132	void get_status(LTM_STATUS status);
133
134	// effect: calls the iterate function on each pending lock request
135	// note: holds the manager's mutex
136	typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
137	TXNID txnid,
138	const DBT *left_key,
139	const DBT *right_key,
140	TXNID blocking_txnid,
141	uint64_t start_time,
142	void *extra);
143	int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *extra);
144
145	// effect: Determines if too many locks or too much memory is being used,
146	// Runs escalation on the manager if so.
147	// param: big_txn, if the current transaction is 'big' (has spilled rollback logs)
148	// returns: 0 if there enough resources to create a new lock, or TOKUDB_OUT_OF_LOCKS
149	// if there are not enough resources and lock escalation failed to free up
150	// enough resources for a new lock.
151	int check_current_lock_constraints(bool big_txn);
152
153	bool over_big_threshold(void);
154
155	void note_mem_used(uint64_t mem_used);
156
157	void note_mem_released(uint64_t mem_freed);
158
159	bool out_of_locks(void) const;
160
161	// Escalate all locktrees
162	void escalate_all_locktrees(void);
163
164	// Escalate a set of locktrees
165	void escalate_locktrees(locktree *locktrees, int* num_locktrees);
166
167	// effect: calls the private function run_escalation(), only ok to
168	// do for tests.
169	// rationale: to get better stress test coverage, we want a way to
170	// deterministicly trigger lock escalation.
171	void run_escalation_for_test(void);
172	void run_escalation(void);
173
174	// Add time t to the escalator's wait time statistics
175	void add_escalator_wait_time(uint64_t t);
176
177	void kill_waiter(void *extra);
178
179	private:
180	static const uint64_t DEFAULT_MAX_LOCK_MEMORY = `64L` * `1024` * `1024`;
181
182	// tracks the current number of locks and lock memory
183	uint64_t m_max_lock_memory;
184	uint64_t m_current_lock_memory;
185
186	struct lt_counters m_lt_counters;
187
188	// the create and destroy callbacks for the locktrees
189	lt_create_cb m_lt_create_callback;
190	lt_destroy_cb m_lt_destroy_callback;
191	lt_escalate_cb m_lt_escalate_callback;
192	void *m_lt_escalate_callback_extra;
193
194	omt<locktree *> m_locktree_map;
195
196	// the manager's mutex protects the locktree map
197	toku_mutex_t m_mutex;
198
199	void mutex_lock(void);
200
201	void mutex_unlock(void);
202
203	// Manage the set of open locktrees
204	locktree locktree_map_find(const* DICTIONARY_ID &dict_id);
205	void locktree_map_put(locktree *lt);
206	void locktree_map_remove(locktree *lt);
207
208	static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
209
210	void escalator_init(void);
211	void escalator_destroy(void);
212
213	// statistics about lock escalation.
214	toku_mutex_t m_escalation_mutex;
215	uint64_t m_escalation_count;
216	tokutime_t m_escalation_time;
217	uint64_t m_escalation_latest_result;
218	uint64_t m_wait_escalation_count;
219	uint64_t m_wait_escalation_time;
220	uint64_t m_long_wait_escalation_count;
221	uint64_t m_long_wait_escalation_time;
222
223	// the escalator coordinates escalation on a set of locktrees for a bunch of threads
224	class locktree_escalator {
225	public:
226	void create(void);
227	void destroy(void);
228	void run(locktree_manager mgr, void* (escalate_locktrees_fun)(void* extra), void* *extra);
229
230	private:
231	toku_mutex_t m_escalator_mutex;
232	toku_cond_t m_escalator_done;
233	bool m_escalator_running;
234	};
235
236	locktree_escalator m_escalator;
237
238	friend class manager_unit_test;
239	};
240
241	// A locktree represents the set of row locks owned by all transactions
242	// over an open dictionary. Read and write ranges are represented as
243	// a left and right key which are compared with the given comparator
244	//
245	// Locktrees are not created and destroyed by the user. Instead, they are
246	// referenced and released using the locktree manager.
247	//
248	// A sample workflow looks like this:
249	// - Create a manager.
250	// - Get a locktree by dictionaroy id from the manager.
251	// - Perform read/write lock acquision on the locktree, add references to
252	// the locktree using the manager, release locks, release references, etc.
253	// - ...
254	// - Release the final reference to the locktree. It will be destroyed.
255	// - Destroy the manager.
256	class locktree {
257	public:
258	// effect: Creates a locktree
259	void create(locktree_manager mgr, DICTIONARY_ID dict_id, const* comparator &cmp);
260
261	void destroy(void);
262
263	// For thread-safe, external reference counting
264	void add_reference(void);
265
266	// requires: the reference count is > 0
267	// returns: the reference count, after decrementing it by one
268	uint32_t release_reference(void);
269
270	// returns: the current reference count
271	uint32_t get_reference_count(void);
272
273	// effect: Attempts to grant a read lock for the range of keys between [left_key, right_key].
274	// returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
275	// given conflicts set with the txnids that hold conflicting locks in the range.
276	// If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
277	// note: Read locks cannot be shared between txnids, as one would expect.
278	// This is for simplicity since read locks are rare in MySQL.
279	int acquire_read_lock(TXNID txnid, const DBT left_key, const* DBT right_key, txnid_set conflicts, bool big_txn);
280
281	// effect: Attempts to grant a write lock for the range of keys between [left_key, right_key].
282	// returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
283	// given conflicts set with the txnids that hold conflicting locks in the range.
284	// If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
285	int acquire_write_lock(TXNID txnid, const DBT left_key, const* DBT right_key, txnid_set conflicts, bool big_txn);
286
287	// effect: populate the conflicts set with the txnids that would preventing
288	// the given txnid from getting a lock on [left_key, right_key]
289	void get_conflicts(bool is_write_request, TXNID txnid,
290	const DBT left_key, const* DBT right_key, txnid_set conflicts);
291
292	// effect: Release all of the lock ranges represented by the range buffer for a txnid.
293	void release_locks(TXNID txnid, const range_buffer *ranges);
294
295	// effect: Runs escalation on this locktree
296	void escalate(lt_escalate_cb after_escalate_callback, void *extra);
297
298	// returns: The userdata associated with this locktree, or null if it has not been set.
299	void get_userdata(void) const*;
300
301	void set_userdata(void *userdata);
302
303	locktree_manager get_manager(void) const*;
304
305	void set_comparator(const comparator &cmp);
306
307	int compare(const locktree lt) const*;
308
309	DICTIONARY_ID get_dict_id() const;
310
311	// Private info struct for storing pending lock request state.
312	// Only to be used by lock requests. We store it here as
313	// something less opaque than usual to strike a tradeoff between
314	// abstraction and code complexity. It is still fairly abstract
315	// since the lock_request object is opaque
316	struct lt_lock_request_info get_lock_request_info(void*);
317
318	private:
319	locktree_manager *m_mgr;
320	DICTIONARY_ID m_dict_id;
321	uint32_t m_reference_count;
322
323	// Since the memory referenced by this comparator is not owned by the
324	// locktree, the user must guarantee it will outlive the locktree.
325	//
326	// The ydb API accomplishes this by opening an ft_handle in the on_create
327	// callback, which will keep the underlying FT (and its descriptor) in memory
328	// for as long as the handle is open. The ft_handle is stored opaquely in the
329	// userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
330	comparator m_cmp;
331
332	concurrent_tree *m_rangetree;
333
334	void *m_userdata;
335	struct lt_lock_request_info m_lock_request_info;
336
337	// The following fields and members prefixed with "sto_" are for
338	// the single txnid optimization, intended to speed up the case
339	// when only one transaction is using the locktree. If we know
340	// the locktree has only one transaction, then acquiring locks
341	// takes O(1) work and releasing all locks takes O(1) work.
342	//
343	// How do we know that the locktree only has a single txnid?
344	// What do we do if it does?
345	//
346	// When a txn with txnid T requests a lock:
347	// - If the tree is empty, the optimization is possible. Set the single
348	// txnid to T, and insert the lock range into the buffer.
349	// - If the tree is not empty, check if the single txnid is T. If so,
350	// append the lock range to the buffer. Otherwise, migrate all of
351	// the locks in the buffer into the rangetree on behalf of txnid T,
352	// and invalid the single txnid.
353	//
354	// When a txn with txnid T releases its locks:
355	// - If the single txnid is valid, it must be for T. Destroy the buffer.
356	// - If it's not valid, release locks the normal way in the rangetree.
357	//
358	// To carry out the optimization we need to record a single txnid
359	// and a range buffer for each locktree, each protected by the root
360	// lock of the locktree's rangetree. The root lock for a rangetree
361	// is grabbed by preparing a locked keyrange on the rangetree.
362	TXNID m_sto_txnid;
363	range_buffer m_sto_buffer;
364
365	// The single txnid optimization speeds up the case when only one
366	// transaction is using the locktree. But it has the potential to
367	// hurt the case when more than one txnid exists.
368	//
369	// There are two things we need to do to make the optimization only
370	// optimize the case we care about, and not hurt the general case.
371	//
372	// Bound the worst-case latency for lock migration when the
373	// optimization stops working:
374	// - Idea: Stop the optimization and migrate immediate if we notice
375	// the single txnid has takes many locks in the range buffer.
376	// - Implementation: Enforce a max size on the single txnid range buffer.
377	// - Analysis: Choosing the perfect max value, M, is difficult to do
378	// without some feedback from the field. Intuition tells us that M should
379	// not be so small that the optimization is worthless, and it should not
380	// be so big that it's unreasonable to have to wait behind a thread doing
381	// the work of converting M buffer locks into rangetree locks.
382	//
383	// Prevent concurrent-transaction workloads from trying the optimization
384	// in vain:
385	// - Idea: Don't even bother trying the optimization if we think the
386	// system is in a concurrent-transaction state.
387	// - Implementation: Do something even simpler than detecting whether the
388	// system is in a concurent-transaction state. Just keep a "score" value
389	// and some threshold. If at any time the locktree is eligible for the
390	// optimization, only do it if the score is at this threshold. When you
391	// actually do the optimization but someone has to migrate locks in the buffer
392	// (expensive), then reset the score back to zero. Each time a txn
393	// releases locks, the score is incremented by 1.
394	// - Analysis: If you let the threshold be "C", then at most 1 / C txns will
395	// do the optimization in a concurrent-transaction system. Similarly, it
396	// takes at most C txns to start using the single txnid optimzation, which
397	// is good when the system transitions from multithreaded to single threaded.
398	//
399	// STO_BUFFER_MAX_SIZE:
400	//
401	// We choose the max value to be 1 million since most transactions are smaller
402	// than 1 million and we can create a rangetree of 1 million elements in
403	// less than a second. So we can be pretty confident that this threshold
404	// enables the optimization almost always, and prevents super pathological
405	// latency issues for the first lock taken by a second thread.
406	//
407	// STO_SCORE_THRESHOLD:
408	//
409	// A simple first guess at a good value for the score threshold is 100.
410	// By our analysis, we'd end up doing the optimization in vain for
411	// around 1% of all transactions, which seems reasonable. Further,
412	// if the system goes single threaded, it ought to be pretty quick
413	// for 100 transactions to go by, so we won't have to wait long before
414	// we start doing the single txind optimzation again.
415	static const int STO_BUFFER_MAX_SIZE = `50` * `1024`;
416	static const int STO_SCORE_THRESHOLD = `100`;
417	int m_sto_score;
418
419	// statistics about time spent ending the STO early
420	uint64_t m_sto_end_early_count;
421	tokutime_t m_sto_end_early_time;
422
423	// effect: begins the single txnid optimizaiton, setting m_sto_txnid
424	// to the given txnid.
425	// requires: m_sto_txnid is invalid
426	void sto_begin(TXNID txnid);
427
428	// effect: append a range to the sto buffer
429	// requires: m_sto_txnid is valid
430	void sto_append(const DBT left_key, const* DBT *right_key);
431
432	// effect: ends the single txnid optimization, releaseing any memory
433	// stored in the sto buffer, notifying the tracker, and
434	// invalidating m_sto_txnid.
435	// requires: m_sto_txnid is valid
436	void sto_end(void);
437
438	// params: prepared_lkr is a void to a prepared locked keyrange. see below.*
439	// effect: ends the single txnid optimization early, migrating buffer locks
440	// into the rangetree, calling sto_end(), and then setting the
441	// sto_score back to zero.
442	// requires: m_sto_txnid is valid
443	void sto_end_early(void *prepared_lkr);
444	void sto_end_early_no_accounting(void *prepared_lkr);
445
446	// params: prepared_lkr is a void to a prepared locked keyrange. we can't use*
447	// the real type because the compiler won't allow us to forward declare
448	// concurrent_tree::locked_keyrange without including concurrent_tree.h,
449	// which we cannot do here because it is a template implementation.
450	// requires: the prepared locked keyrange is for the locktree's rangetree
451	// requires: m_sto_txnid is valid
452	// effect: migrates each lock in the single txnid buffer into the locktree's
453	// rangetree, notifying the memory tracker as necessary.
454	void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
455
456	// effect: If m_sto_txnid is valid, then release the txnid's locks
457	// by ending the optimization.
458	// requires: If m_sto_txnid is valid, it is equal to the given txnid
459	// returns: True if locks were released for this txnid
460	bool sto_try_release(TXNID txnid);
461
462	// params: prepared_lkr is a void to a prepared locked keyrange. see above.*
463	// requires: the prepared locked keyrange is for the locktree's rangetree
464	// effect: If m_sto_txnid is valid and equal to the given txnid, then
465	// append a range onto the buffer. Otherwise, if m_sto_txnid is valid
466	// but not equal to this txnid, then migrate the buffer's locks
467	// into the rangetree and end the optimization, setting the score
468	// back to zero.
469	// returns: true if the lock was acquired for this txnid
470	bool sto_try_acquire(void *prepared_lkr, TXNID txnid,
471	const DBT left_key, const* DBT *right_key);
472
473	// Effect:
474	// Provides a hook for a helgrind suppression.
475	// Returns:
476	// true if m_sto_txnid is not TXNID_NONE
477	bool sto_txnid_is_valid_unsafe(void) const;
478
479	// Effect:
480	// Provides a hook for a helgrind suppression.
481	// Returns:
482	// m_sto_score
483	int sto_get_score_unsafe(void )const;
484
485	void remove_overlapping_locks_for_txnid(TXNID txnid,
486	const DBT left_key, const* DBT *right_key);
487
488	int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
489	const DBT left_key, const* DBT *right_key,
490	txnid_set *conflicts);
491
492	int acquire_lock(bool is_write_request, TXNID txnid,
493	const DBT left_key, const* DBT *right_key,
494	txnid_set *conflicts);
495
496	int try_acquire_lock(bool is_write_request, TXNID txnid,
497	const DBT left_key, const* DBT *right_key,
498	txnid_set conflicts, bool* big_txn);
499
500
501	friend class locktree_unit_test;
502	friend class manager_unit_test;
503	friend class lock_request_unit_test;
504
505	// engine status reaches into the locktree to read some stats
506	friend void locktree_manager::get_status(LTM_STATUS status);
507	};
508
509	} / namespace toku /
510

Browse the source code of MariaDB/storage/tokudb/PerconaFT/locktree/locktree.h