1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of PerconaFT.
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35======= */
36
37#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39#pragma once
40
41#include "portability/toku_config.h"
42#include "portability/toku_list.h"
43#include "portability/toku_race_tools.h"
44
45#include "ft/cachetable/cachetable.h"
46#include "ft/comparator.h"
47#include "ft/ft.h"
48#include "ft/ft-ops.h"
49#include "ft/node.h"
50#include "ft/serialize/block_table.h"
51#include "ft/txn/rollback.h"
52#include "ft/ft-status.h"
53
54// Symbol TOKUDB_REVISION is not defined by fractal-tree makefiles, so
55// BUILD_ID of 1000 indicates development build of main, not a release build.
56#if defined(TOKUDB_REVISION)
57#define BUILD_ID TOKUDB_REVISION
58#else
59#error
60#endif
61
62struct ft_search;
63
64enum { FT_DEFAULT_FANOUT = 16 };
65enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 };
66enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };
67
68// We optimize for a sequential insert pattern if 100 consecutive injections
69// happen into the rightmost leaf node due to promotion.
70enum { FT_SEQINSERT_SCORE_THRESHOLD = 100 };
71
72uint32_t compute_child_fullhash (CACHEFILE cf, FTNODE node, int childnum);
73
74enum ft_type {
75 FT_CURRENT = 1,
76 FT_CHECKPOINT_INPROGRESS
77};
78
79// The ft_header is not managed by the cachetable. Instead, it hangs off the cachefile as userdata.
80struct ft_header {
81 enum ft_type type;
82
83 int dirty;
84
85 // Free-running counter incremented once per checkpoint (toggling LSB).
86 // LSB indicates which header location is used on disk so this
87 // counter is effectively a boolean which alternates with each checkpoint.
88 uint64_t checkpoint_count;
89 // LSN of creation of "checkpoint-begin" record in log.
90 LSN checkpoint_lsn;
91
92 // see serialize/ft_layout_version.h. maybe don't need this if we assume
93 // it's always the current version after deserializing
94 const int layout_version;
95 // different (<) from layout_version if upgraded from a previous
96 // version (useful for debugging)
97 const int layout_version_original;
98 // build_id (svn rev number) of software that wrote this node to
99 // disk. (read from disk, overwritten when written to disk, I
100 // think).
101 const uint32_t build_id;
102 // build_id of software that created this tree
103 const uint32_t build_id_original;
104
105 // time this tree was created
106 const uint64_t time_of_creation;
107 // and the root transaction id that created it
108 TXNID root_xid_that_created;
109 // last time this header was serialized to disk (read from disk,
110 // overwritten when written to disk)
111 uint64_t time_of_last_modification;
112 // last time that this tree was verified
113 uint64_t time_of_last_verification;
114
115 // this field is essentially a const
116 BLOCKNUM root_blocknum;
117
118 const unsigned int flags;
119
120 //protected by toku_ft_lock
121 unsigned int nodesize;
122 unsigned int basementnodesize;
123 enum toku_compression_method compression_method;
124 unsigned int fanout;
125
126 // Current Minimum MSN to be used when upgrading pre-MSN FT's.
127 // This is decremented from our currnt MIN_MSN so as not to clash
128 // with any existing 'normal' MSN's.
129 MSN highest_unused_msn_for_upgrade;
130 // Largest MSN ever injected into the tree. Used to set the MSN for
131 // messages as they get injected.
132 MSN max_msn_in_ft;
133
134 // last time that a hot optimize operation was begun
135 uint64_t time_of_last_optimize_begin;
136 // last time that a hot optimize operation was successfully completed
137 uint64_t time_of_last_optimize_end;
138 // the number of hot optimize operations currently in progress on this tree
139 uint32_t count_of_optimize_in_progress;
140 // the number of hot optimize operations in progress on this tree at the time of the last crash (this field is in-memory only)
141 uint32_t count_of_optimize_in_progress_read_from_disk;
142 // all messages before this msn have been applied to leaf nodes
143 MSN msn_at_start_of_last_completed_optimize;
144
145 STAT64INFO_S on_disk_stats;
146
147 // This represents the balance of inserts - deletes and should be
148 // closer to a logical representation of the number of records in an index
149 uint64_t on_disk_logical_rows;
150};
151typedef struct ft_header *FT_HEADER;
152
153// ft_header is always the current version.
154struct ft {
155 FT_HEADER h;
156 FT_HEADER checkpoint_header;
157
158 // These are (mostly) read-only.
159
160 CACHEFILE cf;
161 // unique id for dictionary
162 DICTIONARY_ID dict_id;
163
164 // protected by locktree
165 DESCRIPTOR_S descriptor;
166
167 // protected by locktree and user.
168 // User makes sure this is only changed when no activity on tree
169 DESCRIPTOR_S cmp_descriptor;
170 // contains a pointer to cmp_descriptor (above) - their lifetimes are bound
171 toku::comparator cmp;
172
173 // the update function always utilizes the cmp_descriptor, not the regular one
174 ft_update_func update_fun;
175
176 // These are not read-only:
177
178 // protected by blocktable lock
179 block_table blocktable;
180
181 // protected by atomic builtins
182 STAT64INFO_S in_memory_stats;
183 uint64_t in_memory_logical_rows;
184
185 // transient, not serialized to disk. updated when we do write to
186 // disk. tells us whether we can do partial eviction (we can't if
187 // the on-disk layout version is from before basement nodes)
188 int layout_version_read_from_disk;
189
190 // Logically the reference count is zero if live_ft_handles is empty, txns is 0, and pinned_by_checkpoint is false.
191
192 // ft_ref_lock protects modifying live_ft_handles, txns, and pinned_by_checkpoint.
193 toku_mutex_t ft_ref_lock;
194 struct toku_list live_ft_handles;
195 // Number of transactions that are using this FT. you should only be able
196 // to modify this if you have a valid handle in live_ft_handles
197 uint32_t num_txns;
198 // A checkpoint is running. If true, then keep this header around for checkpoint, like a transaction
199 bool pinned_by_checkpoint;
200
201 // is this ft a blackhole? if so, all messages are dropped.
202 bool blackhole;
203
204 // The blocknum of the rightmost leaf node in the tree. Stays constant through splits
205 // and merges using pair-swapping (like the root node, see toku_ftnode_swap_pair_values())
206 //
207 // This field only transitions from RESERVED_BLOCKNUM_NULL to non-null, never back.
208 // We initialize it when promotion inserts into a non-root leaf node on the right extreme.
209 // We use the blocktable lock to protect the initialize transition, though it's not really
210 // necessary since all threads should be setting it to the same value. We maintain that invariant
211 // on first initialization, see ft_set_or_verify_rightmost_blocknum()
212 BLOCKNUM rightmost_blocknum;
213
214 // sequential access pattern heuristic
215 // - when promotion pushes a message directly into the rightmost leaf, the score goes up.
216 // - if the score is high enough, we optimistically attempt to insert directly into the rightmost leaf
217 // - if our attempt fails because the key was not in range of the rightmost leaf, we reset the score back to 0
218 uint32_t seqinsert_score;
219};
220
221// Allocate a DB struct off the stack and only set its comparison
222// descriptor. We don't bother setting any other fields because
223// the comparison function doesn't need it, and we would like to
224// reduce the CPU work done per comparison.
225#define FAKE_DB(db, desc) struct __toku_db db; do { db.cmp_descriptor = const_cast<DESCRIPTOR>(desc); } while (0)
226
227struct ft_options {
228 unsigned int nodesize;
229 unsigned int basementnodesize;
230 enum toku_compression_method compression_method;
231 unsigned int fanout;
232 unsigned int flags;
233 uint8_t memcmp_magic;
234 ft_compare_func compare_fun;
235 ft_update_func update_fun;
236};
237
238struct ft_handle {
239 // The fractal tree.
240 FT ft;
241
242 on_redirect_callback redirect_callback;
243 void *redirect_callback_extra;
244 struct toku_list live_ft_handle_link;
245 bool did_set_flags;
246
247 struct ft_options options;
248};
249
250PAIR_ATTR make_ftnode_pair_attr(FTNODE node);
251PAIR_ATTR make_invalid_pair_attr(void);
252
253//
254// Field in ftnode_fetch_extra that tells the
255// partial fetch callback what piece of the node
256// is needed by the ydb
257//
258enum ftnode_fetch_type {
259 ftnode_fetch_none = 1, // no partitions needed.
260 ftnode_fetch_subset, // some subset of partitions needed
261 ftnode_fetch_prefetch, // this is part of a prefetch call
262 ftnode_fetch_all, // every partition is needed
263 ftnode_fetch_keymatch, // one child is needed if it holds both keys
264};
265
266// Info passed to cachetable fetch callbacks to say which parts of a node
267// should be fetched (perhaps a subset, perhaps the whole thing, depending
268// on operation)
269class ftnode_fetch_extra {
270public:
271 // Used when the whole node must be in memory, such as for flushes.
272 void create_for_full_read(FT ft);
273
274 // A subset of children are necessary. Used by point queries.
275 void create_for_subset_read(FT ft, ft_search *search, const DBT *left, const DBT *right,
276 bool left_is_neg_infty, bool right_is_pos_infty,
277 bool disable_prefetching, bool read_all_partitions);
278
279 // No partitions are necessary - only pivots and/or subtree estimates.
280 // Currently used for stat64.
281 void create_for_min_read(FT ft);
282
283 // Used to prefetch partitions that fall within the bounds given by the cursor.
284 void create_for_prefetch(FT ft, struct ft_cursor *cursor);
285
286 // Only a portion of the node (within a keyrange) is required.
287 // Used by keysrange when the left and right key are in the same basement node.
288 void create_for_keymatch(FT ft, const DBT *left, const DBT *right,
289 bool disable_prefetching, bool read_all_partitions);
290
291 void destroy(void);
292
293 // return: true if a specific childnum is required to be in memory
294 bool wants_child_available(int childnum) const;
295
296 // return: the childnum of the leftmost child that is required to be in memory
297 int leftmost_child_wanted(FTNODE node) const;
298
299 // return: the childnum of the rightmost child that is required to be in memory
300 int rightmost_child_wanted(FTNODE node) const;
301
302 // needed for reading a node off disk
303 FT ft;
304
305 enum ftnode_fetch_type type;
306
307 // used in the case where type == ftnode_fetch_subset
308 // parameters needed to find out which child needs to be decompressed (so it can be read)
309 ft_search *search;
310 DBT range_lock_left_key, range_lock_right_key;
311 bool left_is_neg_infty, right_is_pos_infty;
312
313 // states if we should try to aggressively fetch basement nodes
314 // that are not specifically needed for current query,
315 // but may be needed for other cursor operations user is doing
316 // For example, if we have not disabled prefetching,
317 // and the user is doing a dictionary wide scan, then
318 // even though a query may only want one basement node,
319 // we fetch all basement nodes in a leaf node.
320 bool disable_prefetching;
321
322 // this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback
323 // thi callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it
324 int child_to_read;
325
326 // when we read internal nodes, we want to read all the data off disk in one I/O
327 // then we'll treat it as normal and only decompress the needed partitions etc.
328 bool read_all_partitions;
329
330 // Accounting: How many bytes were read, and how much time did we spend doing I/O?
331 uint64_t bytes_read;
332 tokutime_t io_time;
333 tokutime_t decompress_time;
334 tokutime_t deserialize_time;
335
336private:
337 void _create_internal(FT ft_);
338};
339
340// Only exported for tests.
341// Cachetable callbacks for ftnodes.
342void toku_ftnode_clone_callback(void* value_data, void** cloned_value_data, long* clone_size, PAIR_ATTR* new_attr, bool for_checkpoint, void* write_extraargs);
343void toku_ftnode_checkpoint_complete_callback(void *value_data);
344void toku_ftnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM blocknum, void *ftnode_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, bool write_me, bool keep_me, bool for_checkpoint, bool is_clone);
345int toku_ftnode_fetch_callback (CACHEFILE cachefile, PAIR p, int fd, BLOCKNUM blocknum, uint32_t fullhash, void **ftnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int*dirty, void*extraargs);
346void toku_ftnode_pe_est_callback(void* ftnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs);
347int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *extraargs,
348 void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra);
349bool toku_ftnode_pf_req_callback(void* ftnode_pv, void* read_extraargs);
350int toku_ftnode_pf_callback(void* ftnode_pv, void* UU(disk_data), void* read_extraargs, int fd, PAIR_ATTR* sizep);
351int toku_ftnode_cleaner_callback( void *ftnode_pv, BLOCKNUM blocknum, uint32_t fullhash, void *extraargs);
352
353CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(FT ft);
354
355// This is only exported for tests.
356// append a child node to a parent node
357void toku_ft_nonleaf_append_child(FTNODE node, FTNODE child, const DBT *pivotkey);
358
359// This is only exported for tests.
360// append a message to a nonleaf node child buffer
361void toku_ft_append_to_child_buffer(const toku::comparator &cmp, FTNODE node, int childnum, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, const DBT *key, const DBT *val);
362
363STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
364
365//#define SLOW
366#ifdef SLOW
367#define VERIFY_NODE(t,n) (toku_verify_or_set_counts(n), toku_verify_estimates(t,n))
368#else
369#define VERIFY_NODE(t,n) ((void)0)
370#endif
371
372void toku_verify_or_set_counts(FTNODE);
373
374// TODO: consider moving this to ft/pivotkeys.cc
375class pivot_bounds {
376public:
377 pivot_bounds(const DBT &lbe_dbt, const DBT &ubi_dbt);
378
379 pivot_bounds next_bounds(FTNODE node, int childnum) const;
380
381 const DBT *lbe() const;
382 const DBT *ubi() const;
383
384 static pivot_bounds infinite_bounds();
385
386private:
387 DBT _prepivotkey(FTNODE node, int childnum, const DBT &lbe_dbt) const;
388 DBT _postpivotkey(FTNODE node, int childnum, const DBT &ubi_dbt) const;
389
390 // if toku_dbt_is_empty() is true for either bound, then it represents
391 // negative or positive infinity (which are exclusive in practice)
392 const DBT _lower_bound_exclusive;
393 const DBT _upper_bound_inclusive;
394};
395
396// allocate a block number
397// allocate and initialize a ftnode
398// put the ftnode into the cache table
399void toku_create_new_ftnode(FT_HANDLE ft_handle, FTNODE *result, int height, int n_children);
400
401/* Stuff for testing */
402// toku_testsetup_initialize() must be called before any other test_setup_xxx() functions are called.
403void toku_testsetup_initialize(void);
404int toku_testsetup_leaf(FT_HANDLE ft_h, BLOCKNUM *blocknum, int n_children, char **keys, int *keylens);
405int toku_testsetup_nonleaf (FT_HANDLE ft_h, int height, BLOCKNUM *blocknum, int n_children, BLOCKNUM *children, char **keys, int *keylens);
406int toku_testsetup_root(FT_HANDLE ft_h, BLOCKNUM);
407int toku_testsetup_get_sersize(FT_HANDLE ft_h, BLOCKNUM); // Return the size on disk.
408int toku_testsetup_insert_to_leaf (FT_HANDLE ft_h, BLOCKNUM, const char *key, int keylen, const char *val, int vallen);
409int toku_testsetup_insert_to_nonleaf (FT_HANDLE ft_h, BLOCKNUM, enum ft_msg_type, const char *key, int keylen, const char *val, int vallen);
410void toku_pin_node_with_min_bfe(FTNODE* node, BLOCKNUM b, FT_HANDLE t);
411
412void toku_ft_root_put_msg(FT ft, const ft_msg &msg, txn_gc_info *gc_info);
413
414// TODO: Rename
415void toku_get_node_for_verify(BLOCKNUM blocknum, FT_HANDLE ft_h, FTNODE* nodep);
416
417int
418toku_verify_ftnode (FT_HANDLE ft_h,
419 MSN rootmsn, MSN parentmsn_with_messages, bool messages_exist_above,
420 FTNODE node, int height,
421 const DBT *lesser_pivot, // Everything in the subtree should be > lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
422 const DBT *greatereq_pivot, // Everything in the subtree should be <= lesser_pivot. (lesser_pivot==NULL if there is no lesser pivot.)
423 int (*progress_callback)(void *extra, float progress), void *progress_extra,
424 int recurse, int verbose, int keep_going_on_failure)
425 __attribute__ ((warn_unused_result));
426
427int toku_db_badformat(void) __attribute__((__warn_unused_result__));
428
429typedef enum {
430 FT_UPGRADE_FOOTPRINT = 0,
431 FT_UPGRADE_STATUS_NUM_ROWS
432} ft_upgrade_status_entry;
433
434typedef struct {
435 bool initialized;
436 TOKU_ENGINE_STATUS_ROW_S status[FT_UPGRADE_STATUS_NUM_ROWS];
437} FT_UPGRADE_STATUS_S, *FT_UPGRADE_STATUS;
438
439void toku_ft_upgrade_get_status(FT_UPGRADE_STATUS);
440
441void toku_le_get_status(LE_STATUS);
442
443void toku_ft_status_update_pivot_fetch_reason(ftnode_fetch_extra *bfe);
444void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
445void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
446void toku_ft_status_update_deserialize_times(FTNODE node, tokutime_t deserialize_time, tokutime_t decompress_time);
447void toku_ft_status_note_msn_discard(void);
448void toku_ft_status_note_update(bool broadcast);
449void toku_ft_status_note_msg_bytes_out(size_t buffsize);
450void toku_ft_status_note_ftnode(int height, bool created); // created = false means destroyed
451
452void toku_ft_get_status(FT_STATUS);
453
454void toku_flusher_thread_set_callback(void (*callback_f)(int, void*), void* extra);
455
456// For upgrade
457int toku_upgrade_subtree_estimates_to_stat64info(int fd, FT ft) __attribute__((nonnull));
458int toku_upgrade_msn_from_root_to_header(int fd, FT ft) __attribute__((nonnull));
459
460// A callback function is invoked with the key, and the data.
461// The pointers (to the bytevecs) must not be modified. The data must be copied out before the callback function returns.
462// Note: In the thread-safe version, the ftnode remains locked while the callback function runs. So return soon, and don't call the ft code from the callback function.
463// If the callback function returns a nonzero value (an error code), then that error code is returned from the get function itself.
464// The cursor object will have been updated (so that if result==0 the current value is the value being passed)
465// (If r!=0 then the cursor won't have been updated.)
466// If r!=0, it's up to the callback function to return that value of r.
467// A 'key' pointer of NULL means that element is not found (effectively infinity or
468// -infinity depending on direction)
469// When lock_only is false, the callback does optional lock tree locking and then processes the key and val.
470// When lock_only is true, the callback only does optional lock tree locking.
471typedef int (*FT_GET_CALLBACK_FUNCTION)(uint32_t keylen, const void *key, uint32_t vallen, const void *val, void *extra, bool lock_only);
472
473typedef bool (*FT_CHECK_INTERRUPT_CALLBACK)(void *extra, uint64_t deleted_rows);
474
475struct ft_cursor;
476int toku_ft_search(FT_HANDLE ft_handle, ft_search *search, FT_GET_CALLBACK_FUNCTION getf, void *getf_v, struct ft_cursor *ftcursor, bool can_bulk_fetch);
477