1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | /*====== |
4 | This file is part of PerconaFT. |
5 | |
6 | |
7 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
8 | |
9 | PerconaFT is free software: you can redistribute it and/or modify |
10 | it under the terms of the GNU General Public License, version 2, |
11 | as published by the Free Software Foundation. |
12 | |
13 | PerconaFT is distributed in the hope that it will be useful, |
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | GNU General Public License for more details. |
17 | |
18 | You should have received a copy of the GNU General Public License |
19 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
20 | |
21 | ---------------------------------------- |
22 | |
23 | PerconaFT is free software: you can redistribute it and/or modify |
24 | it under the terms of the GNU Affero General Public License, version 3, |
25 | as published by the Free Software Foundation. |
26 | |
27 | PerconaFT is distributed in the hope that it will be useful, |
28 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
29 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
30 | GNU Affero General Public License for more details. |
31 | |
32 | You should have received a copy of the GNU Affero General Public License |
33 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
34 | ======= */ |
35 | |
36 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
37 | |
38 | #pragma once |
39 | |
40 | #include "ft/bndata.h" |
41 | #include "ft/comparator.h" |
42 | #include "ft/ft.h" |
43 | #include "ft/msg_buffer.h" |
44 | |
45 | /* Pivot keys. |
46 | * Child 0's keys are <= pivotkeys[0]. |
47 | * Child 1's keys are <= pivotkeys[1]. |
48 | * Child 1's keys are > pivotkeys[0]. |
49 | * etc |
50 | */ |
51 | class ftnode_pivot_keys { |
52 | public: |
53 | // effect: create an empty set of pivot keys |
54 | void create_empty(); |
55 | |
56 | // effect: create pivot keys by copying the given DBT array |
57 | void create_from_dbts(const DBT *keys, int n); |
58 | |
59 | // effect: create pivot keys as a clone of an existing set of pivotkeys |
60 | void create_from_pivot_keys(const ftnode_pivot_keys &pivotkeys); |
61 | |
62 | void destroy(); |
63 | |
64 | // effect: deserialize pivot keys previously serialized by serialize_to_wbuf() |
65 | void deserialize_from_rbuf(struct rbuf *rb, int n); |
66 | |
67 | // returns: unowned DBT representing the i'th pivot key |
68 | DBT get_pivot(int i) const; |
69 | |
70 | // effect: fills a DBT with the i'th pivot key |
71 | // returns: the given dbt |
72 | DBT *fill_pivot(int i, DBT *dbt) const; |
73 | |
74 | // effect: insert a pivot into the i'th position, shifting others to the right |
75 | void insert_at(const DBT *key, int i); |
76 | |
77 | // effect: append pivotkeys to the end of our own pivot keys |
78 | void append(const ftnode_pivot_keys &pivotkeys); |
79 | |
80 | // effect: replace the pivot at the i'th position |
81 | void replace_at(const DBT *key, int i); |
82 | |
83 | // effect: removes the i'th pivot key, shifting others to the left |
84 | void delete_at(int i); |
85 | |
86 | // effect: split the pivot keys, removing all pivots at position greater |
87 | // than or equal to `i' and storing them in *other |
88 | // requires: *other is empty (size == 0) |
89 | void split_at(int i, ftnode_pivot_keys *other); |
90 | |
91 | // effect: serialize pivot keys to a wbuf |
92 | // requires: wbuf has at least ftnode_pivot_keys::total_size() bytes available |
93 | void serialize_to_wbuf(struct wbuf *wb) const; |
94 | |
95 | int num_pivots() const; |
96 | |
97 | // return: the total size of this data structure |
98 | size_t total_size() const; |
99 | |
100 | // return: the sum of the keys sizes of each pivot (for serialization) |
101 | size_t serialized_size() const; |
102 | |
103 | private: |
104 | inline size_t _align4(size_t x) const { |
105 | return roundup_to_multiple(4, x); |
106 | } |
107 | |
108 | // effect: create pivot keys, in fixed key format, by copying the given key array |
109 | void _create_from_fixed_keys(const char *fixedkeys, size_t fixed_keylen, int n); |
110 | |
111 | char *_fixed_key(int i) const { |
112 | return &_fixed_keys[i * _fixed_keylen_aligned]; |
113 | } |
114 | |
115 | bool _fixed_format() const { |
116 | return _fixed_keys != nullptr; |
117 | } |
118 | |
119 | void sanity_check() const; |
120 | |
121 | void _insert_at_dbt(const DBT *key, int i); |
122 | void _append_dbt(const ftnode_pivot_keys &pivotkeys); |
123 | void _replace_at_dbt(const DBT *key, int i); |
124 | void _delete_at_dbt(int i); |
125 | void _split_at_dbt(int i, ftnode_pivot_keys *other); |
126 | |
127 | void _insert_at_fixed(const DBT *key, int i); |
128 | void _append_fixed(const ftnode_pivot_keys &pivotkeys); |
129 | void _replace_at_fixed(const DBT *key, int i); |
130 | void _delete_at_fixed(int i); |
131 | void _split_at_fixed(int i, ftnode_pivot_keys *other); |
132 | |
133 | // adds/destroys keys at a certain index (in dbt format), |
134 | // maintaining _total_size, but not _num_pivots |
135 | void _add_key_dbt(const DBT *key, int i); |
136 | void _destroy_key_dbt(int i); |
137 | |
138 | // conversions to and from packed key array format |
139 | void _convert_to_dbt_format(); |
140 | void _convert_to_fixed_format(); |
141 | |
142 | // If every key is _fixed_keylen long, then _fixed_key is a |
143 | // packed array of keys.. |
144 | char *_fixed_keys; |
145 | // The actual length of the fixed key |
146 | size_t _fixed_keylen; |
147 | // The aligned length that we use for fixed key storage |
148 | size_t _fixed_keylen_aligned; |
149 | |
150 | // ..otherwise _fixed_keys is null and we store an array of dbts, |
151 | // each representing a key. this is simpler but less cache-efficient. |
152 | DBT *_dbt_keys; |
153 | |
154 | int _num_pivots; |
155 | size_t _total_size; |
156 | }; |
157 | |
158 | // TODO: class me up |
159 | struct ftnode { |
160 | // max_msn_applied that will be written to disk |
161 | MSN max_msn_applied_to_node_on_disk; |
162 | unsigned int flags; |
163 | // Which block number is this node? |
164 | BLOCKNUM blocknum; |
165 | // What version of the data structure? |
166 | int layout_version; |
167 | // different (<) from layout_version if upgraded from a previous version |
168 | // (useful for debugging) |
169 | int layout_version_original; |
170 | // transient, not serialized to disk, (useful for debugging) |
171 | int layout_version_read_from_disk; |
172 | // build_id (svn rev number) of software that wrote this node to disk |
173 | uint32_t build_id; |
174 | // height is always >= 0. 0 for leaf, >0 for nonleaf. |
175 | int height; |
176 | int dirty; |
177 | uint32_t fullhash; |
178 | |
179 | // for internal nodes, if n_children==fanout+1 then the tree needs to be |
180 | // rebalanced. for leaf nodes, represents number of basement nodes |
181 | int n_children; |
182 | ftnode_pivot_keys pivotkeys; |
183 | |
184 | // What's the oldest referenced xid that this node knows about? The real |
185 | // oldest referenced xid might be younger, but this is our best estimate. |
186 | // We use it as a heuristic to transition provisional mvcc entries from |
187 | // provisional to committed (from implicity committed to really committed). |
188 | // |
189 | // A better heuristic would be the oldest live txnid, but we use this since |
190 | // it still works well most of the time, and its readily available on the |
191 | // inject code path. |
192 | TXNID oldest_referenced_xid_known; |
193 | |
194 | // array of size n_children, consisting of ftnode partitions |
195 | // each one is associated with a child for internal nodes, the ith |
196 | // partition corresponds to the ith message buffer for leaf nodes, the ith |
197 | // partition corresponds to the ith basement node |
198 | struct ftnode_partition *bp; |
199 | struct ctpair *ct_pair; |
200 | }; |
201 | typedef struct ftnode *FTNODE; |
202 | |
203 | // data of an available partition of a leaf ftnode |
204 | struct ftnode_leaf_basement_node { |
205 | bn_data data_buffer; |
206 | unsigned int seqinsert; // number of sequential inserts to this leaf |
207 | MSN max_msn_applied; // max message sequence number applied |
208 | bool stale_ancestor_messages_applied; |
209 | // current count of rows added or removed as a result of message application |
210 | // to this basement node, gets reset when node is undirtied. |
211 | // Used to back out tree scoped LRC id node is evicted but not persisted |
212 | int64_t logical_rows_delta; |
213 | STAT64INFO_S stat64_delta; // change in stat64 counters since basement was last written to disk |
214 | }; |
215 | typedef struct ftnode_leaf_basement_node *BASEMENTNODE; |
216 | |
217 | enum pt_state { // declare this to be packed so that when used below it will only take 1 byte. |
218 | PT_INVALID = 0, |
219 | PT_ON_DISK = 1, |
220 | PT_COMPRESSED = 2, |
221 | PT_AVAIL = 3}; |
222 | |
223 | enum ftnode_child_tag { |
224 | BCT_INVALID = 0, |
225 | BCT_NULL, |
226 | BCT_SUBBLOCK, |
227 | BCT_LEAF, |
228 | BCT_NONLEAF |
229 | }; |
230 | |
231 | typedef toku::omt<int32_t> off_omt_t; |
232 | typedef toku::omt<int32_t, int32_t, true> marked_off_omt_t; |
233 | |
234 | // data of an available partition of a nonleaf ftnode |
235 | struct ftnode_nonleaf_childinfo { |
236 | message_buffer msg_buffer; |
237 | off_omt_t broadcast_list; |
238 | marked_off_omt_t fresh_message_tree; |
239 | off_omt_t stale_message_tree; |
240 | uint64_t flow[2]; // current and last checkpoint |
241 | }; |
242 | typedef struct ftnode_nonleaf_childinfo *NONLEAF_CHILDINFO; |
243 | |
244 | typedef struct ftnode_child_pointer { |
245 | union { |
246 | struct sub_block *subblock; |
247 | struct ftnode_nonleaf_childinfo *nonleaf; |
248 | struct ftnode_leaf_basement_node *leaf; |
249 | } u; |
250 | enum ftnode_child_tag tag; |
251 | } FTNODE_CHILD_POINTER; |
252 | |
253 | struct ftnode_disk_data { |
254 | // |
255 | // stores the offset to the beginning of the partition on disk from the ftnode, and the length, needed to read a partition off of disk |
256 | // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless |
257 | // The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition |
258 | // The SIZE is the size of the compressed partition. |
259 | // Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be. |
260 | // However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align. |
261 | uint32_t start; |
262 | uint32_t size; |
263 | }; |
264 | typedef struct ftnode_disk_data *FTNODE_DISK_DATA; |
265 | |
266 | // TODO: Turn these into functions instead of macros |
267 | #define BP_START(node_dd,i) ((node_dd)[i].start) |
268 | #define BP_SIZE(node_dd,i) ((node_dd)[i].size) |
269 | |
270 | // a ftnode partition, associated with a child of a node |
271 | struct ftnode_partition { |
272 | // the following three variables are used for nonleaf nodes |
273 | // for leaf nodes, they are meaningless |
274 | BLOCKNUM blocknum; // blocknum of child |
275 | |
276 | // How many bytes worth of work was performed by messages in each buffer. |
277 | uint64_t workdone; |
278 | |
279 | // |
280 | // pointer to the partition. Depending on the state, they may be different things |
281 | // if state == PT_INVALID, then the node was just initialized and ptr == NULL |
282 | // if state == PT_ON_DISK, then ptr == NULL |
283 | // if state == PT_COMPRESSED, then ptr points to a struct sub_block* |
284 | // if state == PT_AVAIL, then ptr is: |
285 | // a struct ftnode_nonleaf_childinfo for internal nodes, |
286 | // a struct ftnode_leaf_basement_node for leaf nodes |
287 | // |
288 | struct ftnode_child_pointer ptr; |
289 | // |
290 | // at any time, the partitions may be in one of the following three states (stored in pt_state): |
291 | // PT_INVALID - means that the partition was just initialized |
292 | // PT_ON_DISK - means that the partition is not in memory and needs to be read from disk. To use, must read off disk and decompress |
293 | // PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress |
294 | // PT_AVAIL - means the partition is decompressed and in memory |
295 | // |
296 | enum pt_state state; // make this an enum to make debugging easier. |
297 | |
298 | // clock count used to for pe_callback to determine if a node should be evicted or not |
299 | // for now, saturating the count at 1 |
300 | uint8_t clock_count; |
301 | }; |
302 | |
303 | // |
304 | // TODO: Fix all these names |
305 | // Organize declarations |
306 | // Fix widespread parameter ordering inconsistencies |
307 | // |
308 | BASEMENTNODE toku_create_empty_bn(void); |
309 | BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer. |
310 | NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo); |
311 | BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn); |
312 | NONLEAF_CHILDINFO toku_create_empty_nl(void); |
313 | void destroy_basement_node (BASEMENTNODE bn); |
314 | void destroy_nonleaf_childinfo (NONLEAF_CHILDINFO nl); |
315 | void toku_destroy_ftnode_internals(FTNODE node); |
316 | void toku_ftnode_free (FTNODE *node); |
317 | bool toku_ftnode_fully_in_memory(FTNODE node); |
318 | void toku_ftnode_assert_fully_in_memory(FTNODE node); |
319 | void toku_evict_bn_from_memory(FTNODE node, int childnum, FT ft); |
320 | BASEMENTNODE toku_detach_bn(FTNODE node, int childnum); |
321 | void toku_ftnode_update_disk_stats(FTNODE ftnode, FT ft, bool for_checkpoint); |
322 | void toku_ftnode_clone_partitions(FTNODE node, FTNODE cloned_node); |
323 | |
324 | void toku_initialize_empty_ftnode(FTNODE node, BLOCKNUM blocknum, int height, int num_children, |
325 | int layout_version, unsigned int flags); |
326 | |
327 | int toku_ftnode_which_child(FTNODE node, const DBT *k, const toku::comparator &cmp); |
328 | void toku_ftnode_save_ct_pair(CACHEKEY key, void *value_data, PAIR p); |
329 | |
330 | // |
331 | // TODO: put the heaviside functions into their respective 'struct .*extra;' namespaces |
332 | // |
333 | struct { |
334 | const toku::comparator &; |
335 | message_buffer *; |
336 | const DBT *; |
337 | MSN ; |
338 | (const toku::comparator &c, message_buffer *mb, const DBT *k, MSN m) : |
339 | cmp(c), msg_buffer(mb), key(k), msn(m) { |
340 | } |
341 | }; |
342 | int (const int32_t &v, const struct toku_msg_buffer_key_msn_heaviside_extra &); |
343 | |
344 | struct { |
345 | const toku::comparator &; |
346 | message_buffer *; |
347 | (const toku::comparator &c, message_buffer *mb) : |
348 | cmp(c), msg_buffer(mb) { |
349 | } |
350 | }; |
351 | int (const struct toku_msg_buffer_key_msn_cmp_extra &, const int &a, const int &b); |
352 | |
353 | struct { |
354 | const toku::comparator &; |
355 | DBT const *const ; |
356 | (const toku::comparator &c, const DBT *k) : |
357 | cmp(c), key(k) { |
358 | } |
359 | }; |
360 | int (DBT const &kdbt, const struct toku_msg_leafval_heaviside_extra &be); |
361 | |
362 | unsigned int toku_bnc_nbytesinbuf(NONLEAF_CHILDINFO bnc); |
363 | int toku_bnc_n_entries(NONLEAF_CHILDINFO bnc); |
364 | long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc); |
365 | long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc); |
366 | void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, uint32_t keylen, const void *data, uint32_t datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, const toku::comparator &cmp); |
367 | void toku_bnc_empty(NONLEAF_CHILDINFO bnc); |
368 | void toku_bnc_flush_to_child(FT ft, NONLEAF_CHILDINFO bnc, FTNODE child, TXNID parent_oldest_referenced_xid_known); |
369 | bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull)); |
370 | |
371 | bool toku_ftnode_nonleaf_is_gorged(FTNODE node, uint32_t nodesize); |
372 | uint32_t toku_ftnode_leaf_num_entries(FTNODE node); |
373 | void toku_ftnode_leaf_rebalance(FTNODE node, unsigned int basementnodesize); |
374 | |
375 | void toku_ftnode_leaf_run_gc(FT ft, FTNODE node); |
376 | |
377 | enum reactivity { |
378 | RE_STABLE, |
379 | RE_FUSIBLE, |
380 | RE_FISSIBLE |
381 | }; |
382 | |
383 | enum reactivity toku_ftnode_get_reactivity(FT ft, FTNODE node); |
384 | enum reactivity toku_ftnode_get_nonleaf_reactivity(FTNODE node, unsigned int fanout); |
385 | enum reactivity toku_ftnode_get_leaf_reactivity(FTNODE node, uint32_t nodesize); |
386 | |
387 | inline const char* toku_ftnode_get_cachefile_fname_in_env(FTNODE node) { |
388 | if (node->ct_pair) { |
389 | CACHEFILE cf = toku_pair_get_cachefile(node->ct_pair); |
390 | if (cf) { |
391 | return toku_cachefile_fname_in_env(cf); |
392 | } |
393 | } |
394 | return nullptr; |
395 | } |
396 | |
397 | /** |
398 | * Finds the next child for HOT to flush to, given that everything up to |
399 | * and including k has been flattened. |
400 | * |
401 | * If k falls between pivots in node, then we return the childnum where k |
402 | * lies. |
403 | * |
404 | * If k is equal to some pivot, then we return the next (to the right) |
405 | * childnum. |
406 | */ |
407 | int toku_ftnode_hot_next_child( |
408 | FTNODE node, |
409 | const DBT* k, |
410 | const toku::comparator &cmp); |
411 | |
412 | void toku_ftnode_put_msg( |
413 | const toku::comparator& cmp, |
414 | ft_update_func update_fun, |
415 | FTNODE node, |
416 | int target_childnum, |
417 | const ft_msg& msg, |
418 | bool is_fresh, |
419 | txn_gc_info* gc_info, |
420 | size_t flow_deltas[], |
421 | STAT64INFO stats_to_update, |
422 | int64_t* logical_rows_delta); |
423 | |
424 | void toku_ft_bn_apply_msg_once( |
425 | BASEMENTNODE bn, |
426 | const ft_msg& msg, |
427 | uint32_t idx, |
428 | uint32_t le_keylen, |
429 | LEAFENTRY le, |
430 | txn_gc_info* gc_info, |
431 | uint64_t* workdonep, |
432 | STAT64INFO stats_to_update, |
433 | int64_t* logical_rows_delta); |
434 | |
435 | void toku_ft_bn_apply_msg( |
436 | const toku::comparator& cmp, |
437 | ft_update_func update_fun, |
438 | BASEMENTNODE bn, |
439 | const ft_msg& msg, |
440 | txn_gc_info* gc_info, |
441 | uint64_t* workdone, |
442 | STAT64INFO stats_to_update, |
443 | int64_t* logical_rows_delta); |
444 | |
445 | void toku_ft_leaf_apply_msg( |
446 | const toku::comparator& cmp, |
447 | ft_update_func update_fun, |
448 | FTNODE node, |
449 | int target_childnum, |
450 | const ft_msg& msg, |
451 | txn_gc_info* gc_info, |
452 | uint64_t* workdone, |
453 | STAT64INFO stats_to_update, |
454 | int64_t* logical_rows_delta); |
455 | |
456 | // |
457 | // Message management for orthopush |
458 | // |
459 | |
460 | struct ancestors { |
461 | // This is the root node if next is NULL (since the root has no ancestors) |
462 | FTNODE node; |
463 | // Which buffer holds messages destined to the node whose ancestors this list represents. |
464 | int childnum; |
465 | struct ancestors *next; |
466 | }; |
467 | typedef struct ancestors *ANCESTORS; |
468 | |
469 | void toku_ft_bnc_move_messages_to_stale(FT ft, NONLEAF_CHILDINFO bnc); |
470 | |
471 | void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node); |
472 | |
473 | // TODO: Should ft_handle just be FT? |
474 | class pivot_bounds; |
475 | void toku_apply_ancestors_messages_to_node(FT_HANDLE t, FTNODE node, ANCESTORS ancestors, |
476 | const pivot_bounds &bounds, |
477 | bool *msgs_applied, int child_to_read); |
478 | |
479 | bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, |
480 | const pivot_bounds &bounds, |
481 | MSN *const max_msn_in_path, int child_to_read); |
482 | |
483 | void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read); |
484 | |
485 | struct ft_search; |
486 | int toku_ft_search_which_child(const toku::comparator &cmp, FTNODE node, ft_search *search); |
487 | |
488 | // |
489 | // internal node inline functions |
490 | // TODO: Turn the macros into real functions |
491 | // |
492 | |
493 | static inline void set_BNULL(FTNODE node, int i) { |
494 | paranoid_invariant(i >= 0); |
495 | paranoid_invariant(i < node->n_children); |
496 | node->bp[i].ptr.tag = BCT_NULL; |
497 | } |
498 | |
499 | static inline bool is_BNULL (FTNODE node, int i) { |
500 | paranoid_invariant(i >= 0); |
501 | paranoid_invariant(i < node->n_children); |
502 | return node->bp[i].ptr.tag == BCT_NULL; |
503 | } |
504 | |
505 | static inline NONLEAF_CHILDINFO BNC(FTNODE node, int i) { |
506 | paranoid_invariant(i >= 0); |
507 | paranoid_invariant(i < node->n_children); |
508 | FTNODE_CHILD_POINTER p = node->bp[i].ptr; |
509 | paranoid_invariant(p.tag==BCT_NONLEAF); |
510 | return p.u.nonleaf; |
511 | } |
512 | |
513 | static inline void set_BNC(FTNODE node, int i, NONLEAF_CHILDINFO nl) { |
514 | paranoid_invariant(i >= 0); |
515 | paranoid_invariant(i < node->n_children); |
516 | FTNODE_CHILD_POINTER *p = &node->bp[i].ptr; |
517 | p->tag = BCT_NONLEAF; |
518 | p->u.nonleaf = nl; |
519 | } |
520 | |
521 | static inline BASEMENTNODE BLB(FTNODE node, int i) { |
522 | paranoid_invariant(i >= 0); |
523 | // The optimizer really doesn't like it when we compare |
524 | // i to n_children as signed integers. So we assert that |
525 | // n_children is in fact positive before doing a comparison |
526 | // on the values forcibly cast to unsigned ints. |
527 | paranoid_invariant(node->n_children > 0); |
528 | paranoid_invariant((unsigned) i < (unsigned) node->n_children); |
529 | FTNODE_CHILD_POINTER p = node->bp[i].ptr; |
530 | paranoid_invariant(p.tag==BCT_LEAF); |
531 | return p.u.leaf; |
532 | } |
533 | |
534 | static inline void set_BLB(FTNODE node, int i, BASEMENTNODE bn) { |
535 | paranoid_invariant(i >= 0); |
536 | paranoid_invariant(i < node->n_children); |
537 | FTNODE_CHILD_POINTER *p = &node->bp[i].ptr; |
538 | p->tag = BCT_LEAF; |
539 | p->u.leaf = bn; |
540 | } |
541 | |
542 | static inline struct sub_block *BSB(FTNODE node, int i) { |
543 | paranoid_invariant(i >= 0); |
544 | paranoid_invariant(i < node->n_children); |
545 | FTNODE_CHILD_POINTER p = node->bp[i].ptr; |
546 | paranoid_invariant(p.tag==BCT_SUBBLOCK); |
547 | return p.u.subblock; |
548 | } |
549 | |
550 | static inline void set_BSB(FTNODE node, int i, struct sub_block *sb) { |
551 | paranoid_invariant(i >= 0); |
552 | paranoid_invariant(i < node->n_children); |
553 | FTNODE_CHILD_POINTER *p = &node->bp[i].ptr; |
554 | p->tag = BCT_SUBBLOCK; |
555 | p->u.subblock = sb; |
556 | } |
557 | |
558 | // ftnode partition macros |
559 | // BP stands for ftnode_partition |
560 | #define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum) |
561 | #define BP_STATE(node,i) ((node)->bp[i].state) |
562 | #define BP_WORKDONE(node, i)((node)->bp[i].workdone) |
563 | |
564 | // |
565 | // macros for managing a node's clock |
566 | // Should be managed by ft-ops.c, NOT by serialize/deserialize |
567 | // |
568 | |
569 | // |
570 | // BP_TOUCH_CLOCK uses a compare and swap because multiple threads |
571 | // that have a read lock on an internal node may try to touch the clock |
572 | // simultaneously |
573 | // |
574 | #define BP_TOUCH_CLOCK(node, i) ((node)->bp[i].clock_count = 1) |
575 | #define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0) |
576 | #define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0) |
577 | // not crazy about having these two here, one is for the case where we create new |
578 | // nodes, such as in splits and creating new roots, and the other is for when |
579 | // we are deserializing a node and not all bp's are touched |
580 | #define BP_INIT_TOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 1) |
581 | #define BP_INIT_UNTOUCHED_CLOCK(node, i) ((node)->bp[i].clock_count = 0) |
582 | |
583 | // ftnode leaf basementnode macros, |
584 | #define BLB_MAX_MSN_APPLIED(node,i) (BLB(node,i)->max_msn_applied) |
585 | #define BLB_MAX_DSN_APPLIED(node,i) (BLB(node,i)->max_dsn_applied) |
586 | #define BLB_DATA(node,i) (&(BLB(node,i)->data_buffer)) |
587 | #define BLB_NBYTESINDATA(node,i) (BLB_DATA(node,i)->get_disk_size()) |
588 | #define BLB_SEQINSERT(node,i) (BLB(node,i)->seqinsert) |
589 | #define BLB_LRD(node, i) (BLB(node,i)->logical_rows_delta) |
590 | |