1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
39 | #include <my_global.h> |
40 | #include "ft/serialize/block_table.h" |
41 | #include "ft/ft-cachetable-wrappers.h" |
42 | #include "ft/ft-flusher.h" |
43 | #include "ft/ft-internal.h" |
44 | #include "ft/ft.h" |
45 | #include "ft/node.h" |
46 | |
47 | #include <util/context.h> |
48 | |
49 | static void |
50 | ftnode_get_key_and_fullhash( |
51 | BLOCKNUM* cachekey, |
52 | uint32_t* fullhash, |
53 | void* ) |
54 | { |
55 | FT ft = (FT) extra; |
56 | BLOCKNUM blocknum; |
57 | ft->blocktable.allocate_blocknum(&blocknum, ft); |
58 | *cachekey = blocknum; |
59 | *fullhash = toku_cachetable_hash(ft->cf, blocknum); |
60 | } |
61 | |
62 | void |
63 | cachetable_put_empty_node_with_dep_nodes( |
64 | FT ft, |
65 | uint32_t num_dependent_nodes, |
66 | FTNODE* dependent_nodes, |
67 | BLOCKNUM* blocknum, //output |
68 | uint32_t* fullhash, //output |
69 | FTNODE* result) |
70 | { |
71 | FTNODE XCALLOC(new_node); |
72 | PAIR dependent_pairs[num_dependent_nodes]; |
73 | enum cachetable_dirty dependent_dirty_bits[num_dependent_nodes]; |
74 | for (uint32_t i = 0; i < num_dependent_nodes; i++) { |
75 | dependent_pairs[i] = dependent_nodes[i]->ct_pair; |
76 | dependent_dirty_bits[i] = (enum cachetable_dirty) dependent_nodes[i]->dirty; |
77 | } |
78 | |
79 | toku_cachetable_put_with_dep_pairs( |
80 | ft->cf, |
81 | ftnode_get_key_and_fullhash, |
82 | new_node, |
83 | make_pair_attr(sizeof(FTNODE)), |
84 | get_write_callbacks_for_node(ft), |
85 | ft, |
86 | num_dependent_nodes, |
87 | dependent_pairs, |
88 | dependent_dirty_bits, |
89 | blocknum, |
90 | fullhash, |
91 | toku_ftnode_save_ct_pair); |
92 | *result = new_node; |
93 | } |
94 | |
95 | void |
96 | create_new_ftnode_with_dep_nodes( |
97 | FT ft, |
98 | FTNODE *result, |
99 | int height, |
100 | int n_children, |
101 | uint32_t num_dependent_nodes, |
102 | FTNODE* dependent_nodes) |
103 | { |
104 | uint32_t fullhash = 0; |
105 | BLOCKNUM blocknum; |
106 | |
107 | cachetable_put_empty_node_with_dep_nodes( |
108 | ft, |
109 | num_dependent_nodes, |
110 | dependent_nodes, |
111 | &blocknum, |
112 | &fullhash, |
113 | result); |
114 | |
115 | assert(ft->h->basementnodesize > 0); |
116 | if (height == 0) { |
117 | assert(n_children > 0); |
118 | } |
119 | |
120 | toku_initialize_empty_ftnode( |
121 | *result, |
122 | blocknum, |
123 | height, |
124 | n_children, |
125 | ft->h->layout_version, |
126 | ft->h->flags); |
127 | |
128 | (*result)->fullhash = fullhash; |
129 | } |
130 | |
131 | void |
132 | toku_create_new_ftnode ( |
133 | FT_HANDLE t, |
134 | FTNODE *result, |
135 | int height, |
136 | int n_children) |
137 | { |
138 | return create_new_ftnode_with_dep_nodes( |
139 | t->ft, |
140 | result, |
141 | height, |
142 | n_children, |
143 | 0, |
144 | NULL); |
145 | } |
146 | |
147 | // |
148 | // On success, this function assumes that the caller is trying to pin the node |
149 | // with a PL_READ lock. If message application is needed, |
150 | // then a PL_WRITE_CHEAP lock is grabbed |
151 | // |
152 | int |
153 | toku_pin_ftnode_for_query( |
154 | FT_HANDLE ft_handle, |
155 | BLOCKNUM blocknum, |
156 | uint32_t fullhash, |
157 | UNLOCKERS unlockers, |
158 | ANCESTORS ancestors, |
159 | const pivot_bounds &bounds, |
160 | ftnode_fetch_extra *bfe, |
161 | bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this |
162 | FTNODE *node_p, |
163 | bool* msgs_applied) |
164 | { |
165 | void *node_v; |
166 | *msgs_applied = false; |
167 | FTNODE node = nullptr; |
168 | MSN max_msn_in_path = ZERO_MSN; |
169 | bool needs_ancestors_messages = false; |
170 | // this function assumes that if you want ancestor messages applied, |
171 | // you are doing a read for a query. This is so we can make some optimizations |
172 | // below. |
173 | if (apply_ancestor_messages) { |
174 | paranoid_invariant(bfe->type == ftnode_fetch_subset); |
175 | } |
176 | |
177 | int r = toku_cachetable_get_and_pin_nonblocking( |
178 | ft_handle->ft->cf, |
179 | blocknum, |
180 | fullhash, |
181 | &node_v, |
182 | NULL, |
183 | get_write_callbacks_for_node(ft_handle->ft), |
184 | toku_ftnode_fetch_callback, |
185 | toku_ftnode_pf_req_callback, |
186 | toku_ftnode_pf_callback, |
187 | PL_READ, |
188 | bfe, //read_extraargs |
189 | unlockers); |
190 | if (r != 0) { |
191 | assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP. |
192 | goto exit; |
193 | } |
194 | node = static_cast<FTNODE>(node_v); |
195 | if (apply_ancestor_messages && node->height == 0) { |
196 | needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages( |
197 | ft_handle->ft, |
198 | node, |
199 | ancestors, |
200 | bounds, |
201 | &max_msn_in_path, |
202 | bfe->child_to_read |
203 | ); |
204 | if (needs_ancestors_messages) { |
205 | toku::context apply_messages_ctx(CTX_MESSAGE_APPLICATION); |
206 | |
207 | toku_unpin_ftnode_read_only(ft_handle->ft, node); |
208 | int rr = toku_cachetable_get_and_pin_nonblocking( |
209 | ft_handle->ft->cf, |
210 | blocknum, |
211 | fullhash, |
212 | &node_v, |
213 | NULL, |
214 | get_write_callbacks_for_node(ft_handle->ft), |
215 | toku_ftnode_fetch_callback, |
216 | toku_ftnode_pf_req_callback, |
217 | toku_ftnode_pf_callback, |
218 | PL_WRITE_CHEAP, |
219 | bfe, //read_extraargs |
220 | unlockers); |
221 | if (rr != 0) { |
222 | assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP. |
223 | r = TOKUDB_TRY_AGAIN; |
224 | goto exit; |
225 | } |
226 | node = static_cast<FTNODE>(node_v); |
227 | toku_apply_ancestors_messages_to_node( |
228 | ft_handle, |
229 | node, |
230 | ancestors, |
231 | bounds, |
232 | msgs_applied, |
233 | bfe->child_to_read |
234 | ); |
235 | } else { |
236 | // At this point, we aren't going to run |
237 | // toku_apply_ancestors_messages_to_node but that doesn't |
238 | // mean max_msn_applied shouldn't be updated if possible |
239 | // (this saves the CPU work involved in |
240 | // toku_ft_leaf_needs_ancestors_messages). |
241 | // |
242 | // We still have a read lock, so we have not resolved |
243 | // checkpointing. If the node is pending and dirty, we |
244 | // can't modify anything, including max_msn, until we |
245 | // resolve checkpointing. If we do, the node might get |
246 | // written out that way as part of a checkpoint with a |
247 | // root that was already written out with a smaller |
248 | // max_msn. During recovery, we would then inject a |
249 | // message based on the root's max_msn, and that message |
250 | // would get filtered by the leaf because it had too high |
251 | // a max_msn value. (see #5407) |
252 | // |
253 | // So for simplicity we only update the max_msn if the |
254 | // node is clean. That way, in order for the node to get |
255 | // written out, it would have to be dirtied. That |
256 | // requires a write lock, and a write lock requires you to |
257 | // resolve checkpointing. |
258 | if (!node->dirty) { |
259 | toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read); |
260 | } |
261 | } |
262 | } |
263 | *node_p = node; |
264 | exit: |
265 | return r; |
266 | } |
267 | |
268 | void |
269 | ( |
270 | FT ft, |
271 | BLOCKNUM blocknum, |
272 | uint32_t fullhash, |
273 | ftnode_fetch_extra *bfe, |
274 | pair_lock_type lock_type, |
275 | uint32_t num_dependent_nodes, |
276 | FTNODE *dependent_nodes, |
277 | FTNODE *node_p, |
278 | bool move_messages) |
279 | { |
280 | void *node_v; |
281 | PAIR dependent_pairs[num_dependent_nodes]; |
282 | enum cachetable_dirty dependent_dirty_bits[num_dependent_nodes]; |
283 | for (uint32_t i = 0; i < num_dependent_nodes; i++) { |
284 | dependent_pairs[i] = dependent_nodes[i]->ct_pair; |
285 | dependent_dirty_bits[i] = (enum cachetable_dirty) dependent_nodes[i]->dirty; |
286 | } |
287 | |
288 | int r = toku_cachetable_get_and_pin_with_dep_pairs( |
289 | ft->cf, |
290 | blocknum, |
291 | fullhash, |
292 | &node_v, |
293 | NULL, |
294 | get_write_callbacks_for_node(ft), |
295 | toku_ftnode_fetch_callback, |
296 | toku_ftnode_pf_req_callback, |
297 | toku_ftnode_pf_callback, |
298 | lock_type, |
299 | bfe, |
300 | num_dependent_nodes, |
301 | dependent_pairs, |
302 | dependent_dirty_bits |
303 | ); |
304 | invariant_zero(r); |
305 | FTNODE node = (FTNODE) node_v; |
306 | if (lock_type != PL_READ && node->height > 0 && move_messages) { |
307 | toku_move_ftnode_messages_to_stale(ft, node); |
308 | } |
309 | *node_p = node; |
310 | } |
311 | |
312 | void (FT ft, |
313 | BLOCKNUM blocknum, |
314 | uint32_t fullhash, |
315 | ftnode_fetch_extra *bfe, |
316 | pair_lock_type lock_type, |
317 | FTNODE *node_p, |
318 | bool move_messages) { |
319 | toku_pin_ftnode_with_dep_nodes(ft, blocknum, fullhash, bfe, lock_type, 0, nullptr, node_p, move_messages); |
320 | } |
321 | |
322 | int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pair_lock_type lock_type, FTNODE *nodep) { |
323 | void *node_v; |
324 | int r = toku_cachetable_maybe_get_and_pin_clean(ft->cf, blocknum, fullhash, lock_type, &node_v); |
325 | if (r != 0) { |
326 | goto cleanup; |
327 | } |
328 | CAST_FROM_VOIDP(*nodep, node_v); |
329 | if ((*nodep)->height > 0 && lock_type != PL_READ) { |
330 | toku_move_ftnode_messages_to_stale(ft, *nodep); |
331 | } |
332 | cleanup: |
333 | return r; |
334 | } |
335 | |
336 | void toku_unpin_ftnode(FT ft, FTNODE node) { |
337 | int r = toku_cachetable_unpin(ft->cf, |
338 | node->ct_pair, |
339 | static_cast<enum cachetable_dirty>(node->dirty), |
340 | make_ftnode_pair_attr(node)); |
341 | invariant_zero(r); |
342 | } |
343 | |
344 | void |
345 | toku_unpin_ftnode_read_only(FT ft, FTNODE node) |
346 | { |
347 | int r = toku_cachetable_unpin( |
348 | ft->cf, |
349 | node->ct_pair, |
350 | (enum cachetable_dirty) node->dirty, |
351 | make_invalid_pair_attr() |
352 | ); |
353 | assert(r==0); |
354 | } |
355 | |
356 | void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b) |
357 | // Effect: Swap the blocknum, fullhash, and PAIR for for a and b |
358 | // Requires: Both nodes are pinned |
359 | { |
360 | BLOCKNUM tmp_blocknum = a->blocknum; |
361 | uint32_t tmp_fullhash = a->fullhash; |
362 | PAIR tmp_pair = a->ct_pair; |
363 | |
364 | a->blocknum = b->blocknum; |
365 | a->fullhash = b->fullhash; |
366 | a->ct_pair = b->ct_pair; |
367 | |
368 | b->blocknum = tmp_blocknum; |
369 | b->fullhash = tmp_fullhash; |
370 | b->ct_pair = tmp_pair; |
371 | |
372 | // A and B swapped pair pointers, but we still have to swap |
373 | // the actual pair values (ie: the FTNODEs they represent) |
374 | // in the cachetable. |
375 | toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair); |
376 | } |
377 | |