1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of PerconaFT.
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35======= */
36
37#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39#include <my_global.h>
40#include "ft/serialize/block_table.h"
41#include "ft/ft-cachetable-wrappers.h"
42#include "ft/ft-flusher.h"
43#include "ft/ft-internal.h"
44#include "ft/ft.h"
45#include "ft/node.h"
46
47#include <util/context.h>
48
49static void
50ftnode_get_key_and_fullhash(
51 BLOCKNUM* cachekey,
52 uint32_t* fullhash,
53 void* extra)
54{
55 FT ft = (FT) extra;
56 BLOCKNUM blocknum;
57 ft->blocktable.allocate_blocknum(&blocknum, ft);
58 *cachekey = blocknum;
59 *fullhash = toku_cachetable_hash(ft->cf, blocknum);
60}
61
62void
63cachetable_put_empty_node_with_dep_nodes(
64 FT ft,
65 uint32_t num_dependent_nodes,
66 FTNODE* dependent_nodes,
67 BLOCKNUM* blocknum, //output
68 uint32_t* fullhash, //output
69 FTNODE* result)
70{
71 FTNODE XCALLOC(new_node);
72 PAIR dependent_pairs[num_dependent_nodes];
73 enum cachetable_dirty dependent_dirty_bits[num_dependent_nodes];
74 for (uint32_t i = 0; i < num_dependent_nodes; i++) {
75 dependent_pairs[i] = dependent_nodes[i]->ct_pair;
76 dependent_dirty_bits[i] = (enum cachetable_dirty) dependent_nodes[i]->dirty;
77 }
78
79 toku_cachetable_put_with_dep_pairs(
80 ft->cf,
81 ftnode_get_key_and_fullhash,
82 new_node,
83 make_pair_attr(sizeof(FTNODE)),
84 get_write_callbacks_for_node(ft),
85 ft,
86 num_dependent_nodes,
87 dependent_pairs,
88 dependent_dirty_bits,
89 blocknum,
90 fullhash,
91 toku_ftnode_save_ct_pair);
92 *result = new_node;
93}
94
95void
96create_new_ftnode_with_dep_nodes(
97 FT ft,
98 FTNODE *result,
99 int height,
100 int n_children,
101 uint32_t num_dependent_nodes,
102 FTNODE* dependent_nodes)
103{
104 uint32_t fullhash = 0;
105 BLOCKNUM blocknum;
106
107 cachetable_put_empty_node_with_dep_nodes(
108 ft,
109 num_dependent_nodes,
110 dependent_nodes,
111 &blocknum,
112 &fullhash,
113 result);
114
115 assert(ft->h->basementnodesize > 0);
116 if (height == 0) {
117 assert(n_children > 0);
118 }
119
120 toku_initialize_empty_ftnode(
121 *result,
122 blocknum,
123 height,
124 n_children,
125 ft->h->layout_version,
126 ft->h->flags);
127
128 (*result)->fullhash = fullhash;
129}
130
131void
132toku_create_new_ftnode (
133 FT_HANDLE t,
134 FTNODE *result,
135 int height,
136 int n_children)
137{
138 return create_new_ftnode_with_dep_nodes(
139 t->ft,
140 result,
141 height,
142 n_children,
143 0,
144 NULL);
145}
146
147//
148// On success, this function assumes that the caller is trying to pin the node
149// with a PL_READ lock. If message application is needed,
150// then a PL_WRITE_CHEAP lock is grabbed
151//
152int
153toku_pin_ftnode_for_query(
154 FT_HANDLE ft_handle,
155 BLOCKNUM blocknum,
156 uint32_t fullhash,
157 UNLOCKERS unlockers,
158 ANCESTORS ancestors,
159 const pivot_bounds &bounds,
160 ftnode_fetch_extra *bfe,
161 bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
162 FTNODE *node_p,
163 bool* msgs_applied)
164{
165 void *node_v;
166 *msgs_applied = false;
167 FTNODE node = nullptr;
168 MSN max_msn_in_path = ZERO_MSN;
169 bool needs_ancestors_messages = false;
170 // this function assumes that if you want ancestor messages applied,
171 // you are doing a read for a query. This is so we can make some optimizations
172 // below.
173 if (apply_ancestor_messages) {
174 paranoid_invariant(bfe->type == ftnode_fetch_subset);
175 }
176
177 int r = toku_cachetable_get_and_pin_nonblocking(
178 ft_handle->ft->cf,
179 blocknum,
180 fullhash,
181 &node_v,
182 NULL,
183 get_write_callbacks_for_node(ft_handle->ft),
184 toku_ftnode_fetch_callback,
185 toku_ftnode_pf_req_callback,
186 toku_ftnode_pf_callback,
187 PL_READ,
188 bfe, //read_extraargs
189 unlockers);
190 if (r != 0) {
191 assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
192 goto exit;
193 }
194 node = static_cast<FTNODE>(node_v);
195 if (apply_ancestor_messages && node->height == 0) {
196 needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
197 ft_handle->ft,
198 node,
199 ancestors,
200 bounds,
201 &max_msn_in_path,
202 bfe->child_to_read
203 );
204 if (needs_ancestors_messages) {
205 toku::context apply_messages_ctx(CTX_MESSAGE_APPLICATION);
206
207 toku_unpin_ftnode_read_only(ft_handle->ft, node);
208 int rr = toku_cachetable_get_and_pin_nonblocking(
209 ft_handle->ft->cf,
210 blocknum,
211 fullhash,
212 &node_v,
213 NULL,
214 get_write_callbacks_for_node(ft_handle->ft),
215 toku_ftnode_fetch_callback,
216 toku_ftnode_pf_req_callback,
217 toku_ftnode_pf_callback,
218 PL_WRITE_CHEAP,
219 bfe, //read_extraargs
220 unlockers);
221 if (rr != 0) {
222 assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
223 r = TOKUDB_TRY_AGAIN;
224 goto exit;
225 }
226 node = static_cast<FTNODE>(node_v);
227 toku_apply_ancestors_messages_to_node(
228 ft_handle,
229 node,
230 ancestors,
231 bounds,
232 msgs_applied,
233 bfe->child_to_read
234 );
235 } else {
236 // At this point, we aren't going to run
237 // toku_apply_ancestors_messages_to_node but that doesn't
238 // mean max_msn_applied shouldn't be updated if possible
239 // (this saves the CPU work involved in
240 // toku_ft_leaf_needs_ancestors_messages).
241 //
242 // We still have a read lock, so we have not resolved
243 // checkpointing. If the node is pending and dirty, we
244 // can't modify anything, including max_msn, until we
245 // resolve checkpointing. If we do, the node might get
246 // written out that way as part of a checkpoint with a
247 // root that was already written out with a smaller
248 // max_msn. During recovery, we would then inject a
249 // message based on the root's max_msn, and that message
250 // would get filtered by the leaf because it had too high
251 // a max_msn value. (see #5407)
252 //
253 // So for simplicity we only update the max_msn if the
254 // node is clean. That way, in order for the node to get
255 // written out, it would have to be dirtied. That
256 // requires a write lock, and a write lock requires you to
257 // resolve checkpointing.
258 if (!node->dirty) {
259 toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read);
260 }
261 }
262 }
263 *node_p = node;
264exit:
265 return r;
266}
267
268void
269toku_pin_ftnode_with_dep_nodes(
270 FT ft,
271 BLOCKNUM blocknum,
272 uint32_t fullhash,
273 ftnode_fetch_extra *bfe,
274 pair_lock_type lock_type,
275 uint32_t num_dependent_nodes,
276 FTNODE *dependent_nodes,
277 FTNODE *node_p,
278 bool move_messages)
279{
280 void *node_v;
281 PAIR dependent_pairs[num_dependent_nodes];
282 enum cachetable_dirty dependent_dirty_bits[num_dependent_nodes];
283 for (uint32_t i = 0; i < num_dependent_nodes; i++) {
284 dependent_pairs[i] = dependent_nodes[i]->ct_pair;
285 dependent_dirty_bits[i] = (enum cachetable_dirty) dependent_nodes[i]->dirty;
286 }
287
288 int r = toku_cachetable_get_and_pin_with_dep_pairs(
289 ft->cf,
290 blocknum,
291 fullhash,
292 &node_v,
293 NULL,
294 get_write_callbacks_for_node(ft),
295 toku_ftnode_fetch_callback,
296 toku_ftnode_pf_req_callback,
297 toku_ftnode_pf_callback,
298 lock_type,
299 bfe,
300 num_dependent_nodes,
301 dependent_pairs,
302 dependent_dirty_bits
303 );
304 invariant_zero(r);
305 FTNODE node = (FTNODE) node_v;
306 if (lock_type != PL_READ && node->height > 0 && move_messages) {
307 toku_move_ftnode_messages_to_stale(ft, node);
308 }
309 *node_p = node;
310}
311
312void toku_pin_ftnode(FT ft,
313 BLOCKNUM blocknum,
314 uint32_t fullhash,
315 ftnode_fetch_extra *bfe,
316 pair_lock_type lock_type,
317 FTNODE *node_p,
318 bool move_messages) {
319 toku_pin_ftnode_with_dep_nodes(ft, blocknum, fullhash, bfe, lock_type, 0, nullptr, node_p, move_messages);
320}
321
322int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pair_lock_type lock_type, FTNODE *nodep) {
323 void *node_v;
324 int r = toku_cachetable_maybe_get_and_pin_clean(ft->cf, blocknum, fullhash, lock_type, &node_v);
325 if (r != 0) {
326 goto cleanup;
327 }
328 CAST_FROM_VOIDP(*nodep, node_v);
329 if ((*nodep)->height > 0 && lock_type != PL_READ) {
330 toku_move_ftnode_messages_to_stale(ft, *nodep);
331 }
332cleanup:
333 return r;
334}
335
336void toku_unpin_ftnode(FT ft, FTNODE node) {
337 int r = toku_cachetable_unpin(ft->cf,
338 node->ct_pair,
339 static_cast<enum cachetable_dirty>(node->dirty),
340 make_ftnode_pair_attr(node));
341 invariant_zero(r);
342}
343
344void
345toku_unpin_ftnode_read_only(FT ft, FTNODE node)
346{
347 int r = toku_cachetable_unpin(
348 ft->cf,
349 node->ct_pair,
350 (enum cachetable_dirty) node->dirty,
351 make_invalid_pair_attr()
352 );
353 assert(r==0);
354}
355
356void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b)
357// Effect: Swap the blocknum, fullhash, and PAIR for for a and b
358// Requires: Both nodes are pinned
359{
360 BLOCKNUM tmp_blocknum = a->blocknum;
361 uint32_t tmp_fullhash = a->fullhash;
362 PAIR tmp_pair = a->ct_pair;
363
364 a->blocknum = b->blocknum;
365 a->fullhash = b->fullhash;
366 a->ct_pair = b->ct_pair;
367
368 b->blocknum = tmp_blocknum;
369 b->fullhash = tmp_fullhash;
370 b->ct_pair = tmp_pair;
371
372 // A and B swapped pair pointers, but we still have to swap
373 // the actual pair values (ie: the FTNODEs they represent)
374 // in the cachetable.
375 toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair);
376}
377