1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
39 | // Purpose of this file is to handle all modifications and queries to the database |
40 | // at the level of leafentry. |
41 | // |
42 | // ule = Unpacked Leaf Entry |
43 | // |
44 | // This design unpacks the leafentry into a convenient format, performs all work |
45 | // on the unpacked form, then repacks the leafentry into its compact format. |
46 | // |
47 | // See design documentation for nested transactions at |
48 | // TokuWiki/Imp/TransactionsOverview. |
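//
// A typical modification (see toku_le_apply_msg below) follows this pattern:
//
//     ULE_S ule;
//     le_unpack(&ule, old_leafentry);   // unpack into the convenient form
//     msg_modify_ule(&ule, msg);        // do all the work on the unpacked form
//     le_pack(&ule, ...);               // repack into the compact format
//     ule_cleanup(&ule);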
49 | |
50 | #include <my_global.h> |
51 | #include "portability/toku_portability.h" |
52 | |
53 | #include "ft/ft-internal.h" |
54 | #include "ft/leafentry.h" |
55 | #include "ft/logger/logger.h" |
56 | #include "ft/msg.h" |
57 | #include "ft/txn/txn.h" |
58 | #include "ft/txn/txn_manager.h" |
59 | #include "ft/ule.h" |
60 | #include "ft/ule-internal.h" |
61 | #include "ft/txn/xids.h" |
62 | #include "util/bytestring.h" |
63 | #include "util/omt.h" |
64 | #include "util/partitioned_counter.h" |
65 | #include "util/scoped_malloc.h" |
66 | #include "util/status.h" |
67 | |
68 | #define ULE_DEBUG 0 |
69 | |
70 | static uint32_t ule_get_innermost_numbytes(ULE ule, uint32_t keylen); |
71 | |
72 | void toku_le_get_status(LE_STATUS statp) { |
73 | le_status.init(); |
74 | *statp = le_status; |
75 | } |
76 | |
77 | //////////////////////////////////////////////////////////////////////////////// |
78 | // Accessor functions used by outside world (e.g. indexer) |
79 | // |
80 | |
81 | ULEHANDLE toku_ule_create(LEAFENTRY le) { |
82 | ULE XMALLOC(ule_p); |
83 | le_unpack(ule_p, le); |
84 | return (ULEHANDLE) ule_p; |
85 | } |
86 | |
87 | void toku_ule_free(ULEHANDLE ule_p) { |
88 | ule_cleanup((ULE) ule_p); |
89 | toku_free(ule_p); |
90 | } |
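
// Example usage (illustration only, as seen from e.g. the indexer):
//
//     ULEHANDLE h = toku_ule_create(le);   // unpack a leafentry
//     // ... inspect the unpacked entry through h ...
//     toku_ule_free(h);                    // release the unpacked copy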
91 | |
92 | //////////////////////////////////////////////////////////////////////////////// |
93 | // |
94 | // Question: Can any software outside this file modify or read a leafentry? |
95 | // If so, is it worthwhile to put it all here? |
96 | // |
97 | // There are two entries, one each for modification and query: |
98 | // toku_le_apply_msg() performs all inserts/deletes/aborts |
99 | // |
100 | // |
101 | // |
102 | // |
103 | |
//This is what we use to initialize ule->uxrs[0] in a new unpacked leafentry.
105 | const UXR_S committed_delete = { |
106 | .type = XR_DELETE, |
107 | .vallen = 0, |
108 | .valp = NULL, |
109 | .xid = 0 |
110 | }; // static allocation of uxr with type set to committed delete and xid = 0 |
111 | |
112 | #define INSERT_LENGTH(len) ((1U << 31) | len) |
113 | #define DELETE_LENGTH(len) (0) |
114 | #define GET_LENGTH(len) (len & ((1U << 31)-1)) |
115 | #define IS_INSERT(len) (len & (1U << 31)) |
116 | #define IS_VALID_LEN(len) (len < (1U<<31)) |
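
// For example (illustration only): bit 31 of the packed length word is the
// insert/delete bit.  INSERT_LENGTH(10) == 0x8000000a and
// GET_LENGTH(0x8000000a) == 10; DELETE_LENGTH(len) ignores its argument and
// always packs to 0; IS_INSERT() simply tests bit 31.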
117 | |
118 | // Local functions: |
119 | |
120 | static inline void msg_init_empty_ule(ULE ule); |
121 | static int64_t msg_modify_ule(ULE ule, const ft_msg &msg); |
122 | static inline void ule_init_empty_ule(ULE ule); |
123 | static void ule_do_implicit_promotions(ULE ule, XIDS xids); |
124 | static void ule_try_promote_provisional_outermost( |
125 | ULE ule, |
126 | TXNID oldest_possible_live_xid); |
127 | static void ule_promote_provisional_innermost_to_index(ULE ule, uint32_t index); |
128 | static void ule_promote_provisional_innermost_to_committed(ULE ule); |
129 | static inline int64_t ule_apply_insert_no_overwrite( |
130 | ULE ule, |
131 | XIDS xids, |
132 | uint32_t vallen, |
133 | void* valp); |
134 | static inline int64_t ule_apply_insert( |
135 | ULE ule, |
136 | XIDS xids, |
137 | uint32_t vallen, |
138 | void* valp); |
139 | static inline int64_t ule_apply_delete(ULE ule, XIDS xids); |
140 | static inline void ule_prepare_for_new_uxr(ULE ule, XIDS xids); |
141 | static inline int64_t ule_apply_abort(ULE ule, XIDS xids); |
142 | static void ule_apply_broadcast_commit_all (ULE ule); |
143 | static void ule_apply_commit(ULE ule, XIDS xids); |
144 | static inline void ule_push_insert_uxr( |
145 | ULE ule, |
146 | bool is_committed, |
147 | TXNID xid, |
148 | uint32_t vallen, |
149 | void* valp); |
150 | static inline void ule_push_delete_uxr(ULE ule, bool is_committed, TXNID xid); |
151 | static inline void ule_push_placeholder_uxr(ULE ule, TXNID xid); |
152 | static inline UXR ule_get_innermost_uxr(ULE ule); |
153 | static inline UXR ule_get_first_empty_uxr(ULE ule); |
154 | static inline void ule_remove_innermost_uxr(ULE ule); |
155 | static inline TXNID ule_get_innermost_xid(ULE ule); |
156 | static inline TXNID ule_get_xid(ULE ule, uint32_t index); |
157 | static void ule_remove_innermost_placeholders(ULE ule); |
158 | static void ule_add_placeholders(ULE ule, XIDS xids); |
159 | static void ule_optimize(ULE ule, XIDS xids); |
160 | static inline bool uxr_type_is_insert(uint8_t type); |
161 | static inline bool uxr_type_is_delete(uint8_t type); |
162 | static inline bool uxr_type_is_placeholder(uint8_t type); |
163 | static inline size_t uxr_pack_txnid(UXR uxr, uint8_t *p); |
164 | static inline size_t uxr_pack_type_and_length(UXR uxr, uint8_t *p); |
165 | static inline size_t uxr_pack_length_and_bit(UXR uxr, uint8_t *p); |
166 | static inline size_t uxr_pack_data(UXR uxr, uint8_t *p); |
167 | static inline size_t uxr_unpack_txnid(UXR uxr, uint8_t *p); |
168 | static inline size_t uxr_unpack_type_and_length(UXR uxr, uint8_t *p); |
169 | static inline size_t uxr_unpack_length_and_bit(UXR uxr, uint8_t *p); |
170 | static inline size_t uxr_unpack_data(UXR uxr, uint8_t *p); |
171 | |
172 | #if 0 |
173 | static void ule_print(ULE ule, const char* note) { |
174 | fprintf(stderr, "%s : ULE[0x%p]\n" , note, ule); |
175 | fprintf(stderr, " num_puxrs[%u]\n" , ule->num_puxrs); |
176 | fprintf(stderr, " num_cuxrs[%u]\n" , ule->num_cuxrs); |
177 | fprintf(stderr, " innermost[%u]\n" , ule->num_cuxrs + ule->num_puxrs - 1); |
178 | fprintf(stderr, " first_empty[%u]\n" , ule->num_cuxrs + ule->num_puxrs); |
179 | |
180 | uint32_t num_uxrs = ule->num_cuxrs + ule->num_puxrs - 1; |
181 | for (uint32_t uxr_num = 0; uxr_num <= num_uxrs; uxr_num++) { |
182 | UXR uxr = &(ule->uxrs[uxr_num]); |
183 | fprintf(stderr, " uxr[%u]\n" , uxr_num); |
184 | switch (uxr->type) { |
185 | case 0: fprintf(stderr, " type[NONE]\n" ); break; |
186 | case 1: fprintf(stderr, " type[INSERT]\n" ); break; |
187 | case 2: fprintf(stderr, " type[DELETE]\n" ); break; |
188 | case 3: fprintf(stderr, " type[PLACEHOLDER]\n" ); break; |
189 | default: fprintf(stderr, " type[WHAT??]\n" ); break; |
190 | } |
191 | fprintf(stderr, " xid[%lu]\n" , uxr->xid); |
192 | } |
193 | } |
194 | #endif |
195 | |
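// Allocate 'size' bytes for a packed leafentry: from plain malloc when there
// is no bn_data buffer, otherwise from the buffer itself (overwriting the old
// entry when old_le_size > 0, inserting a new one otherwise).  *maybe_free may
// be set to memory that the caller must free once it is done with the old
// leafentry.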
196 | static void get_space_for_le( |
197 | bn_data* data_buffer, |
198 | uint32_t idx, |
199 | void* keyp, |
200 | uint32_t keylen, |
201 | uint32_t old_keylen, |
202 | uint32_t old_le_size, |
203 | size_t size, |
204 | LEAFENTRY* new_le_space, |
205 | void** const maybe_free) { |
206 | |
207 | if (data_buffer == nullptr) { |
208 | CAST_FROM_VOIDP(*new_le_space, toku_xmalloc(size)); |
209 | } else if (old_le_size > 0) { |
210 | // this means we are overwriting something |
211 | data_buffer->get_space_for_overwrite( |
212 | idx, |
213 | keyp, |
214 | keylen, |
215 | old_keylen, |
216 | old_le_size, |
217 | size, |
218 | new_le_space, |
219 | maybe_free); |
220 | } else { |
221 | // this means we are inserting something new |
222 | data_buffer->get_space_for_insert( |
223 | idx, |
224 | keyp, |
225 | keylen, |
226 | size, |
227 | new_le_space, |
228 | maybe_free); |
229 | } |
230 | } |
231 | |
232 | |
233 | ///////////////////////////////////////////////////////////////////// |
234 | // Garbage collection related functions |
235 | // |
236 | |
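// Find the youngest TXNID in the omt that is strictly older than xc.
// Returns TXNID_NONE if no such txnid exists.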
237 | static TXNID get_next_older_txnid(TXNID xc, const xid_omt_t &omt) { |
238 | int r; |
239 | TXNID xid; |
240 | r = omt.find<TXNID, toku_find_xid_by_xid>(xc, -1, &xid, nullptr); |
241 | if (r==0) { |
242 | invariant(xid < xc); //sanity check |
243 | } else { |
244 | invariant(r==DB_NOTFOUND); |
245 | xid = TXNID_NONE; |
246 | } |
247 | return xid; |
248 | } |
249 | |
250 | // |
251 | // This function returns true if live transaction TL1 is allowed to read a |
252 | // value committed by transaction xc, false otherwise. |
253 | // |
254 | static bool xid_reads_committed_xid( |
255 | TXNID tl1, |
256 | TXNID xc, |
257 | const xid_omt_t& snapshot_txnids, |
258 | const rx_omt_t& referenced_xids) { |
259 | |
260 | bool rval; |
261 | if (tl1 < xc) { |
262 | rval = false; //cannot read a newer txn |
263 | } else { |
264 | TXNID x = |
265 | toku_get_youngest_live_list_txnid_for( |
266 | xc, |
267 | snapshot_txnids, |
268 | referenced_xids); |
269 | |
270 | if (x == TXNID_NONE) { |
271 | //Not in ANY live list, tl1 can read it. |
272 | rval = true; |
273 | } else { |
274 | //Newer than the 'newest one that has it in live list' |
275 | rval = tl1 > x; |
276 | } |
277 | // we know tl1 > xc |
278 | // we know x > xc |
279 | // if tl1 == x, then we do not read, because tl1 is in xc's live list |
280 | // if x is older than tl1, that means that xc < x < tl1 |
281 | // and if xc is in x's live list, it CANNOT be in tl1's live list |
282 | } |
283 | return rval; |
284 | } |
285 | |
286 | // |
287 | // This function does some simple garbage collection given a TXNID known |
288 | // to be the oldest referenced xid, that is, the oldest xid in any live list. |
289 | // We find the youngest entry in the stack with an xid less |
290 | // than oldest_referenced_xid. All elements below this entry are garbage, |
291 | // so we get rid of them. |
292 | // |
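// For example (illustration only): if the committed stack holds xids
// [0, 50, 75, 100] (oldest to youngest) and the oldest referenced xid is 80,
// then 75 is the youngest entry older than 80; the entries for 0 and 50 below
// it can never be read by any live transaction, so they are discarded and 75
// becomes the new bottom of the stack (its xid is then treated as TXNID_NONE).
//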
293 | static void ule_simple_garbage_collection(ULE ule, txn_gc_info *gc_info) { |
294 | if (ule->num_cuxrs == 1) { |
295 | return; |
296 | } |
297 | |
298 | uint32_t curr_index = 0; |
299 | if (gc_info->mvcc_needed) { |
300 | // starting at the top of the committed stack, find the first |
301 | // uxr with a txnid that is less than oldest_referenced_xid |
302 | for (uint32_t i = 0; i < ule->num_cuxrs; i++) { |
303 | curr_index = ule->num_cuxrs - i - 1; |
304 | if (ule->uxrs[curr_index].xid < |
305 | gc_info->oldest_referenced_xid_for_simple_gc) { |
306 | break; |
307 | } |
308 | } |
309 | } else { |
        // if mvcc is not needed, we only need the top committed
        // value and nothing else
312 | curr_index = ule->num_cuxrs - 1; |
313 | } |
314 | |
315 | // curr_index is now set to the youngest uxr older than |
316 | // oldest_referenced_xid so if it's not the bottom of the stack.. |
317 | if (curr_index != 0) { |
318 | // ..then we need to get rid of the entries below curr_index |
319 | uint32_t num_entries = ule->num_cuxrs + ule->num_puxrs - curr_index; |
320 | memmove( |
321 | &ule->uxrs[0], |
322 | &ule->uxrs[curr_index], |
323 | num_entries * sizeof(ule->uxrs[0])); |
324 | ule->uxrs[0].xid = TXNID_NONE; // New 'bottom of stack' loses its TXNID |
325 | ule->num_cuxrs -= curr_index; |
326 | } |
327 | } |
328 | |
329 | // TODO: Clean this up |
330 | extern bool garbage_collection_debug; |
331 | |
332 | static void ule_garbage_collect( |
333 | ULE ule, |
334 | const xid_omt_t& snapshot_xids, |
335 | const rx_omt_t& referenced_xids, |
336 | const xid_omt_t& live_root_txns) { |
337 | |
338 | if (ule->num_cuxrs == 1) { |
339 | return; |
340 | } |
341 | |
342 | toku::scoped_calloc necessary_buf(ule->num_cuxrs * sizeof(bool)); |
343 | bool *necessary = reinterpret_cast<bool *>(necessary_buf.get()); |
344 | |
345 | uint32_t curr_committed_entry; |
346 | curr_committed_entry = ule->num_cuxrs - 1; |
347 | while (true) { |
348 | // mark the curr_committed_entry as necessary |
349 | necessary[curr_committed_entry] = true; |
350 | if (curr_committed_entry == 0) break; //nothing left |
351 | |
352 | // find the youngest live transaction that reads something |
353 | // below curr_committed_entry, if it exists |
354 | TXNID tl1; |
355 | TXNID xc = ule->uxrs[curr_committed_entry].xid; |
356 | |
357 | // |
358 | // If we find that the committed transaction is in the live list, |
359 | // then xc is really in the process of being committed. It has not |
360 | // been fully committed. As a result, our assumption that transactions |
361 | // newer than what is currently in these OMTs will read the top of the |
362 | // stack is not necessarily accurate. Transactions may read what is |
363 | // just below xc. |
364 | // As a result, we must mark what is just below xc as necessary and |
365 | // move on. This issue was found while testing flusher threads, and was |
366 | // fixed for #3979 |
367 | // |
368 | bool is_xc_live = toku_is_txn_in_live_root_txn_list(live_root_txns, xc); |
369 | if (is_xc_live) { |
370 | curr_committed_entry--; |
371 | continue; |
372 | } |
373 | |
374 | tl1 = |
375 | toku_get_youngest_live_list_txnid_for( |
376 | xc, |
377 | snapshot_xids, |
378 | referenced_xids); |
379 | |
380 | // if tl1 == xc, that means xc should be live and show up in |
381 | // live_root_txns, which we check above. |
382 | invariant(tl1 != xc); |
383 | |
384 | if (tl1 == TXNID_NONE) { |
385 | // set tl1 to youngest live transaction older than |
386 | // ule->uxrs[curr_committed_entry]->xid |
387 | tl1 = get_next_older_txnid(xc, snapshot_xids); |
388 | if (tl1 == TXNID_NONE) { |
389 | // remainder is garbage, we're done |
390 | break; |
391 | } |
392 | } |
393 | if (garbage_collection_debug) { |
394 | int r = |
395 | snapshot_xids.find_zero<TXNID, toku_find_xid_by_xid>( |
396 | tl1, |
397 | nullptr, |
398 | nullptr); |
399 | // make sure that the txn you are claiming is live is actually live |
400 | invariant_zero(r); |
401 | } |
402 | // |
403 | // tl1 should now be set |
404 | // |
405 | curr_committed_entry--; |
406 | while (curr_committed_entry > 0) { |
407 | xc = ule->uxrs[curr_committed_entry].xid; |
408 | if (xid_reads_committed_xid( |
409 | tl1, |
410 | xc, |
411 | snapshot_xids, |
412 | referenced_xids)) { |
413 | break; |
414 | } |
415 | curr_committed_entry--; |
416 | } |
417 | } |
418 | uint32_t first_free = 0; |
419 | for (uint32_t i = 0; i < ule->num_cuxrs; i++) { |
420 | // Shift values to 'delete' garbage values. |
421 | if (necessary[i]) { |
422 | ule->uxrs[first_free] = ule->uxrs[i]; |
423 | first_free++; |
424 | } |
425 | } |
426 | uint32_t saved = first_free; |
427 | invariant(saved <= ule->num_cuxrs); |
428 | invariant(saved >= 1); |
429 | ule->uxrs[0].xid = TXNID_NONE; //New 'bottom of stack' loses its TXNID |
430 | if (first_free != ule->num_cuxrs) { |
431 | // Shift provisional values |
432 | memmove( |
433 | &ule->uxrs[first_free], |
434 | &ule->uxrs[ule->num_cuxrs], |
435 | ule->num_puxrs * sizeof(ule->uxrs[0])); |
436 | } |
437 | ule->num_cuxrs = saved; |
438 | } |
439 | |
440 | static size_t ule_packed_memsize(ULE ule) { |
    // Returns: The size 'ule' would occupy when packed into a leafentry, or 0
    // if the ule consists of a single committed delete (i.e., the leafentry
    // would be destroyed).
443 | if (ule->num_cuxrs == 1 && ule->num_puxrs == 0) { |
444 | UXR uxr = ule_get_innermost_uxr(ule); |
445 | if (uxr_is_delete(uxr)) { |
446 | return 0; |
447 | } |
448 | } |
449 | return le_memsize_from_ule(ule); |
450 | } |
451 | |
452 | // Heuristics to control when we decide to initialize |
453 | // txn manager state (possibly expensive) and run gc. |
454 | enum { |
455 | ULE_MIN_STACK_SIZE_TO_FORCE_GC = 5, |
456 | ULE_MIN_MEMSIZE_TO_FORCE_GC = 1024 * 1024 |
457 | }; |
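
// For example (see toku_le_apply_msg below): once a leafentry's committed
// stack reaches 5 entries, or its packed size reaches 1MB, we consider full
// gc worthwhile even if it means initializing the txn manager state first.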
458 | |
459 | //////////////////////////////////////////////////////////////////////////////// |
460 | // This is the big enchilada. (Bring Tums.) Note that this level of |
461 | // abstraction has no knowledge of the inner structure of either leafentry or |
462 | // msg. It makes calls into the next lower layer (msg_xxx) which handles |
463 | // messages. |
464 | // |
465 | // NOTE: This is the only function (at least in this body of code) that modifies |
466 | // a leafentry. |
467 | // NOTE: It is the responsibility of the caller to make sure that the key is set |
468 | // in the FT_MSG, as it will be used to store the data in the data_buffer |
469 | // |
// Returns -1, 0, or 1 that identifies the change in logical row count needed
// based on the results of the message application. For example, a delete that
// finds no logical leafentry, or an insert that finds an existing row and is
// converted to an update, results in a delta of 0.
474 | // |
475 | // old_leafentry - NULL if there was no stored data. |
476 | // data_buffer - bn_data storing leafentry, if NULL, means there is no bn_data |
477 | // idx - index in data_buffer where leafentry is stored |
478 | // (and should be replaced) |
// old_keylen - length of the old key currently stored in data_buffer
// new_leafentry_p - If the leafentry is destroyed it sets *new_leafentry_p
//                   to NULL. Otherwise *new_leafentry_p points at the new
//                   leaf entry.
483 | // numbytes_delta_p - change in total size of key and val, not including any |
484 | // overhead |
485 | int64_t toku_le_apply_msg( |
486 | const ft_msg& msg, |
487 | LEAFENTRY old_leafentry, |
488 | bn_data* data_buffer, |
489 | uint32_t idx, |
490 | uint32_t old_keylen, |
491 | txn_gc_info* gc_info, |
492 | LEAFENTRY* new_leafentry_p, |
493 | int64_t* numbytes_delta_p) { |
494 | |
495 | invariant_notnull(gc_info); |
496 | paranoid_invariant_notnull(new_leafentry_p); |
497 | ULE_S ule; |
498 | int64_t oldnumbytes = 0; |
499 | int64_t newnumbytes = 0; |
500 | uint64_t oldmemsize = 0; |
501 | uint32_t keylen = msg.kdbt()->size; |
502 | int32_t rowcountdelta = 0; |
503 | |
504 | if (old_leafentry == NULL) { |
505 | msg_init_empty_ule(&ule); |
506 | } else { |
507 | oldmemsize = leafentry_memsize(old_leafentry); |
508 | le_unpack(&ule, old_leafentry); // otherwise unpack leafentry |
509 | oldnumbytes = ule_get_innermost_numbytes(&ule, keylen); |
510 | } |
511 | |
512 | // modify unpacked leafentry |
513 | rowcountdelta = msg_modify_ule(&ule, msg); |
514 | |
    // - we may be able to immediately promote the newly-applied outermost
    //   provisional uxr
517 | // - either way, run simple gc first, and then full gc if there are still |
518 | // some committed uxrs. |
519 | ule_try_promote_provisional_outermost( |
520 | &ule, |
521 | gc_info->oldest_referenced_xid_for_implicit_promotion); |
522 | ule_simple_garbage_collection(&ule, gc_info); |
523 | txn_manager_state *txn_state_for_gc = gc_info->txn_state_for_gc; |
524 | size_t size_before_gc = 0; |
    // If there is garbage to clean, and our caller gave us state..
526 | // ..and either the state is pre-initialized, or the committed stack is |
527 | // large enough |
528 | // ..or the ule's raw memsize is sufficiently large |
529 | // ..then it's worth running gc, possibly initializing the txn manager |
530 | // state, if it isn't already |
531 | if (ule.num_cuxrs > 1 && txn_state_for_gc != nullptr && |
532 | (txn_state_for_gc->initialized || |
533 | ule.num_cuxrs >= ULE_MIN_STACK_SIZE_TO_FORCE_GC || |
534 | (size_before_gc = ule_packed_memsize(&ule)) >= |
535 | ULE_MIN_MEMSIZE_TO_FORCE_GC)) { |
536 | if (!txn_state_for_gc->initialized) { |
537 | txn_state_for_gc->init(); |
538 | } |
        // use the size computed above if we have it, otherwise compute it now
540 | size_before_gc = |
541 | size_before_gc != 0 ? size_before_gc : ule_packed_memsize(&ule); |
542 | ule_garbage_collect( |
543 | &ule, |
544 | txn_state_for_gc->snapshot_xids, |
545 | txn_state_for_gc->referenced_xids, |
546 | txn_state_for_gc->live_root_txns); |
547 | size_t size_after_gc = ule_packed_memsize(&ule); |
548 | |
549 | LE_STATUS_INC(LE_APPLY_GC_BYTES_IN, size_before_gc); |
550 | LE_STATUS_INC(LE_APPLY_GC_BYTES_OUT, size_after_gc); |
551 | } |
552 | |
553 | void* maybe_free = nullptr; |
554 | // create packed leafentry |
555 | // contract of this function is caller has keyp and keylen set, always |
556 | int r = |
557 | le_pack( |
558 | &ule, |
559 | data_buffer, |
560 | idx, |
561 | msg.kdbt()->data, |
562 | keylen, |
563 | old_keylen, |
564 | oldmemsize, |
565 | new_leafentry_p, |
566 | &maybe_free); |
567 | invariant_zero(r); |
568 | if (*new_leafentry_p) { |
569 | newnumbytes = ule_get_innermost_numbytes(&ule, keylen); |
570 | } |
571 | *numbytes_delta_p = newnumbytes - oldnumbytes; |
572 | |
573 | ule_cleanup(&ule); |
574 | if (maybe_free != nullptr) { |
575 | toku_free(maybe_free); |
576 | } |
577 | return rowcountdelta; |
578 | } |
579 | |
580 | bool toku_le_worth_running_garbage_collection( |
581 | LEAFENTRY le, |
582 | txn_gc_info* gc_info) { |
583 | // Effect: Quickly determines if it's worth trying to run garbage collection |
584 | // on a leafentry |
585 | // Return: True if it makes sense to try garbage collection, false otherwise. |
586 | // Rationale: Garbage collection is likely to clean up under two circumstances: |
587 | // 1.) There are multiple committed entries. Some may never be read |
588 | // by new txns. |
589 | // 2.) There is only one committed entry, but the outermost |
590 | // provisional entry is older than the oldest known referenced |
//         xid, so it must have committed. Therefore we can promote it to
592 | // committed and get rid of the old committed entry. |
593 | if (le->type != LE_MVCC) { |
594 | return false; |
595 | } |
596 | if (le->u.mvcc.num_cxrs > 1) { |
597 | return true; |
598 | } else { |
599 | paranoid_invariant(le->u.mvcc.num_cxrs == 1); |
600 | } |
601 | return le->u.mvcc.num_pxrs > 0 && |
602 | le_outermost_uncommitted_xid(le) < |
603 | gc_info->oldest_referenced_xid_for_implicit_promotion; |
604 | } |
605 | |
// Garbage collect one leaf entry, using the txn manager state in gc_info.
// Parameters:
// -- old_leaf_entry : the leaf entry we intend to clean up through garbage
//     collection.
// -- data_buffer : bn_data storing the leafentry; if NULL, there is no bn_data.
// -- idx : index in data_buffer where the leafentry is stored (and should be
//     replaced).
// -- keyp, keylen : the key for this leafentry; the key is unchanged by
//     garbage collection.
// -- gc_info : snapshot of the txn manager state (snapshot xids, referenced
//     xids, live root txns) used to determine what to garbage collect.
// -- new_leaf_entry (OUTPUT) : a pointer to the leaf entry after garbage
//     collection, or NULL if the leaf entry was destroyed.
// -- numbytes_delta_p (OUTPUT) : change in total size of key and val, not
//     including any overhead.
// NOTE: it is not a good idea to garbage collect a leaf entry with only one
//     committed value.
624 | void toku_le_garbage_collect( |
625 | LEAFENTRY old_leaf_entry, |
626 | bn_data* data_buffer, |
627 | uint32_t idx, |
628 | void* keyp, |
629 | uint32_t keylen, |
630 | txn_gc_info* gc_info, |
631 | LEAFENTRY* new_leaf_entry, |
632 | int64_t* numbytes_delta_p) { |
633 | |
634 | // We shouldn't want to run gc without having provided a snapshot of the |
635 | // txn system. |
636 | invariant_notnull(gc_info); |
637 | invariant_notnull(gc_info->txn_state_for_gc); |
638 | paranoid_invariant_notnull(new_leaf_entry); |
639 | ULE_S ule; |
640 | int64_t oldnumbytes = 0; |
641 | int64_t newnumbytes = 0; |
642 | |
643 | le_unpack(&ule, old_leaf_entry); |
644 | |
645 | oldnumbytes = ule_get_innermost_numbytes(&ule, keylen); |
646 | uint32_t old_mem_size = leafentry_memsize(old_leaf_entry); |
647 | |
    // Before running garbage collection, try to promote the outermost
    // provisional entry to committed if its xid is older than the oldest
    // possible live xid.
651 | // |
    // The oldest known referenced xid is a lower bound on the oldest possible
653 | // live xid, so we use that. It's usually close enough to get rid of most |
654 | // garbage in leafentries. |
655 | ule_try_promote_provisional_outermost( |
656 | &ule, |
657 | gc_info->oldest_referenced_xid_for_implicit_promotion); |
658 | // No need to run simple gc here if we're going straight for full gc. |
659 | if (ule.num_cuxrs > 1) { |
660 | size_t size_before_gc = ule_packed_memsize(&ule); |
661 | ule_garbage_collect( |
662 | &ule, |
663 | gc_info->txn_state_for_gc->snapshot_xids, |
664 | gc_info->txn_state_for_gc->referenced_xids, |
665 | gc_info->txn_state_for_gc->live_root_txns); |
666 | size_t size_after_gc = ule_packed_memsize(&ule); |
667 | |
668 | LE_STATUS_INC(LE_APPLY_GC_BYTES_IN, size_before_gc); |
669 | LE_STATUS_INC(LE_APPLY_GC_BYTES_OUT, size_after_gc); |
670 | } |
671 | |
672 | void *maybe_free = nullptr; |
673 | // old_keylen, same because the key isn't going to change for gc |
674 | int r = |
675 | le_pack( |
676 | &ule, |
677 | data_buffer, |
678 | idx, |
679 | keyp, |
680 | keylen, |
681 | keylen, |
682 | old_mem_size, |
683 | new_leaf_entry, |
684 | &maybe_free); |
685 | invariant_zero(r); |
686 | if (*new_leaf_entry) { |
687 | newnumbytes = ule_get_innermost_numbytes(&ule, keylen); |
688 | } |
689 | *numbytes_delta_p = newnumbytes - oldnumbytes; |
690 | |
691 | ule_cleanup(&ule); |
692 | if (maybe_free != nullptr) { |
693 | toku_free(maybe_free); |
694 | } |
695 | } |
696 | |
697 | //////////////////////////////////////////////////////////////////////////////// |
698 | // This layer of abstraction (msg_xxx) |
699 | // knows the accessors of msg, but not of leafentry or unpacked leaf entry. |
700 | // It makes calls into the lower layer (le_xxx) which handles leafentries. |
701 | |
702 | // Purpose is to init the ule with given key and no transaction records |
703 | // |
704 | static inline void msg_init_empty_ule(ULE ule) { |
705 | ule_init_empty_ule(ule); |
706 | } |
707 | |
708 | // Purpose is to modify the unpacked leafentry in our private workspace. |
709 | // |
// Returns -1, 0, or 1 that identifies the change in logical row count needed
// based on the results of the message application. For example, a delete that
// finds no logical leafentry, or an insert that finds an existing row and is
// converted to an update, results in a delta of 0.
714 | static int64_t msg_modify_ule(ULE ule, const ft_msg &msg) { |
715 | int64_t retval = 0; |
716 | XIDS xids = msg.xids(); |
717 | invariant(toku_xids_get_num_xids(xids) < MAX_TRANSACTION_RECORDS); |
718 | enum ft_msg_type type = msg.type(); |
719 | if (FT_LIKELY(type != FT_OPTIMIZE && type != FT_OPTIMIZE_FOR_UPGRADE)) { |
720 | ule_do_implicit_promotions(ule, xids); |
721 | } |
722 | switch (type) { |
723 | case FT_INSERT_NO_OVERWRITE: |
724 | retval = |
725 | ule_apply_insert_no_overwrite( |
726 | ule, |
727 | xids, |
728 | msg.vdbt()->size, |
729 | msg.vdbt()->data); |
730 | break; |
731 | case FT_INSERT: |
732 | retval = |
733 | ule_apply_insert( |
734 | ule, |
735 | xids, |
736 | msg.vdbt()->size, |
737 | msg.vdbt()->data); |
738 | break; |
739 | case FT_DELETE_ANY: |
740 | retval = ule_apply_delete(ule, xids); |
741 | break; |
742 | case FT_ABORT_ANY: |
743 | case FT_ABORT_BROADCAST_TXN: |
744 | retval = ule_apply_abort(ule, xids); |
745 | break; |
746 | case FT_COMMIT_BROADCAST_ALL: |
747 | ule_apply_broadcast_commit_all(ule); |
748 | break; |
749 | case FT_COMMIT_ANY: |
750 | case FT_COMMIT_BROADCAST_TXN: |
751 | ule_apply_commit(ule, xids); |
752 | break; |
753 | case FT_OPTIMIZE: |
754 | case FT_OPTIMIZE_FOR_UPGRADE: |
755 | ule_optimize(ule, xids); |
756 | break; |
757 | case FT_UPDATE: |
758 | case FT_UPDATE_BROADCAST_ALL: |
759 | // These messages don't get this far. Instead they get translated (in |
760 | // setval_fun in do_update) into FT_INSERT messages. |
761 | assert(false); |
762 | break; |
763 | default: |
764 | // illegal ft msg type |
765 | assert(false); |
766 | break; |
767 | } |
768 | return retval; |
769 | } |
770 | |
771 | void test_msg_modify_ule(ULE ule, const ft_msg &msg) { |
772 | msg_modify_ule(ule,msg); |
773 | } |
774 | |
775 | static void ule_optimize(ULE ule, XIDS xids) { |
776 | if (ule->num_puxrs) { |
777 | // outermost uncommitted |
778 | TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid; |
779 | TXNID oldest_living_xid = TXNID_NONE; |
780 | uint32_t num_xids = toku_xids_get_num_xids(xids); |
781 | if (num_xids > 0) { |
782 | invariant(num_xids==1); |
783 | oldest_living_xid = toku_xids_get_xid(xids, 0); |
784 | } |
785 | if (oldest_living_xid == TXNID_NONE || |
786 | uncommitted < oldest_living_xid) { |
787 | ule_promote_provisional_innermost_to_committed(ule); |
788 | } |
789 | } |
790 | } |
791 | |
792 | //////////////////////////////////////////////////////////////////////////////// |
793 | // This layer of abstraction (le_xxx) understands the structure of the leafentry |
794 | // and of the unpacked leafentry. It is the only layer that understands the |
795 | // structure of leafentry. It has no knowledge of any other data structures. |
796 | // |
797 | |
798 | // |
799 | // required for every le_unpack that is done |
800 | // |
801 | void ule_cleanup(ULE ule) { |
802 | invariant(ule->uxrs); |
803 | if (ule->uxrs != ule->uxrs_static) { |
804 | toku_free(ule->uxrs); |
805 | ule->uxrs = NULL; |
806 | } |
807 | } |
808 | |
809 | // populate an unpacked leafentry using pointers into the given leafentry. |
810 | // thus, the memory referenced by 'le' must live as long as the ULE. |
811 | void le_unpack(ULE ule, LEAFENTRY le) { |
812 | uint8_t type = le->type; |
813 | uint8_t *p; |
814 | uint32_t i; |
815 | switch (type) { |
816 | case LE_CLEAN: { |
817 | ule->uxrs = ule->uxrs_static; //Static version is always enough. |
818 | ule->num_cuxrs = 1; |
819 | ule->num_puxrs = 0; |
820 | UXR uxr = ule->uxrs; |
821 | uxr->type = XR_INSERT; |
822 | uxr->vallen = toku_dtoh32(le->u.clean.vallen); |
823 | uxr->valp = le->u.clean.val; |
824 | uxr->xid = TXNID_NONE; |
825 | //Set p to immediately after leafentry |
826 | p = le->u.clean.val + uxr->vallen; |
827 | break; |
828 | } |
829 | case LE_MVCC: |
830 | ule->num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
831 | invariant(ule->num_cuxrs); |
832 | ule->num_puxrs = le->u.mvcc.num_pxrs; |
833 | //Dynamic memory |
834 | if (ule->num_cuxrs < MAX_TRANSACTION_RECORDS) { |
835 | ule->uxrs = ule->uxrs_static; |
836 | } else { |
837 | XMALLOC_N( |
838 | ule->num_cuxrs + 1 + MAX_TRANSACTION_RECORDS, |
839 | ule->uxrs); |
840 | } |
841 | p = le->u.mvcc.xrs; |
842 | |
843 | //unpack interesting TXNIDs inner to outer. |
844 | if (ule->num_puxrs!=0) { |
845 | UXR outermost = ule->uxrs + ule->num_cuxrs; |
846 | p += uxr_unpack_txnid(outermost, p); |
847 | } |
848 | //unpack other TXNIDS (not for ule->uxrs[0]) |
849 | ule->uxrs[0].xid = TXNID_NONE; //0 for super-root is implicit |
850 | for (i = 0; i < ule->num_cuxrs - 1; i++) { |
851 | p += uxr_unpack_txnid(ule->uxrs + ule->num_cuxrs - 1 - i, p); |
852 | } |
853 | |
854 | //unpack interesting lengths inner to outer. |
855 | if (ule->num_puxrs!=0) { |
856 | UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1; |
857 | p += uxr_unpack_length_and_bit(innermost, p); |
858 | } |
859 | for (i = 0; i < ule->num_cuxrs; i++) { |
860 | p += |
861 | uxr_unpack_length_and_bit( |
862 | ule->uxrs + ule->num_cuxrs - 1 - i, |
863 | p); |
864 | } |
865 | |
866 | //unpack interesting values inner to outer |
867 | if (ule->num_puxrs!=0) { |
868 | UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1; |
869 | p += uxr_unpack_data(innermost, p); |
870 | } |
871 | for (i = 0; i < ule->num_cuxrs; i++) { |
872 | p += uxr_unpack_data(ule->uxrs + ule->num_cuxrs - 1 - i, p); |
873 | } |
874 | |
875 | //unpack provisional xrs outer to inner |
876 | if (ule->num_puxrs > 1) { |
877 | { |
878 | //unpack length, bit, data for outermost uncommitted |
879 | UXR outermost = ule->uxrs + ule->num_cuxrs; |
880 | p += uxr_unpack_type_and_length(outermost, p); |
881 | p += uxr_unpack_data(outermost, p); |
882 | } |
883 | //unpack txnid, length, bit, data for non-outermost, non-innermost |
884 | for (i = ule->num_cuxrs + 1; i < ule->num_cuxrs + ule->num_puxrs - 1; i++) { |
885 | UXR uxr = ule->uxrs + i; |
886 | p += uxr_unpack_txnid(uxr, p); |
887 | p += uxr_unpack_type_and_length(uxr, p); |
888 | p += uxr_unpack_data(uxr, p); |
889 | } |
890 | { |
891 | //Just unpack txnid for innermost |
892 | UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1; |
893 | p += uxr_unpack_txnid(innermost, p); |
894 | } |
895 | } |
896 | break; |
897 | default: |
898 | invariant(false); |
899 | } |
900 | |
901 | #if ULE_DEBUG |
902 | size_t memsize = le_memsize_from_ule(ule); |
903 | assert(p == ((uint8_t*)le) + memsize); |
904 | #endif |
905 | } |
906 | |
907 | static inline size_t uxr_pack_txnid(UXR uxr, uint8_t *p) { |
908 | *(TXNID*)p = toku_htod64(uxr->xid); |
909 | return sizeof(TXNID); |
910 | } |
911 | |
912 | static inline size_t uxr_pack_type_and_length(UXR uxr, uint8_t *p) { |
913 | size_t rval = 1; |
914 | *p = uxr->type; |
915 | if (uxr_is_insert(uxr)) { |
916 | *(uint32_t*)(p+1) = toku_htod32(uxr->vallen); |
917 | rval += sizeof(uint32_t); |
918 | } |
919 | return rval; |
920 | } |
921 | |
922 | static inline size_t uxr_pack_length_and_bit(UXR uxr, uint8_t *p) { |
923 | uint32_t length_and_bit; |
924 | if (uxr_is_insert(uxr)) { |
925 | length_and_bit = INSERT_LENGTH(uxr->vallen); |
926 | } else { |
927 | length_and_bit = DELETE_LENGTH(uxr->vallen); |
928 | } |
929 | *(uint32_t*)p = toku_htod32(length_and_bit); |
930 | return sizeof(uint32_t); |
931 | } |
932 | |
933 | static inline size_t uxr_pack_data(UXR uxr, uint8_t *p) { |
934 | if (uxr_is_insert(uxr)) { |
935 | memcpy(p, uxr->valp, uxr->vallen); |
936 | return uxr->vallen; |
937 | } |
938 | return 0; |
939 | } |
940 | |
941 | static inline size_t uxr_unpack_txnid(UXR uxr, uint8_t *p) { |
942 | uxr->xid = toku_dtoh64(*(TXNID*)p); |
943 | return sizeof(TXNID); |
944 | } |
945 | |
946 | static inline size_t uxr_unpack_type_and_length(UXR uxr, uint8_t *p) { |
947 | size_t rval = 1; |
948 | uxr->type = *p; |
949 | if (uxr_is_insert(uxr)) { |
950 | uxr->vallen = toku_dtoh32(*(uint32_t*)(p+1)); |
951 | rval += sizeof(uint32_t); |
952 | } |
953 | return rval; |
954 | } |
955 | |
956 | static inline size_t uxr_unpack_length_and_bit(UXR uxr, uint8_t *p) { |
957 | uint32_t length_and_bit = toku_dtoh32(*(uint32_t*)p); |
958 | if (IS_INSERT(length_and_bit)) { |
959 | uxr->type = XR_INSERT; |
960 | uxr->vallen = GET_LENGTH(length_and_bit); |
961 | } else { |
962 | uxr->type = XR_DELETE; |
963 | uxr->vallen = 0; |
964 | } |
965 | return sizeof(uint32_t); |
966 | } |
967 | |
968 | static inline size_t uxr_unpack_data(UXR uxr, uint8_t *p) { |
969 | if (uxr_is_insert(uxr)) { |
970 | uxr->valp = p; |
971 | return uxr->vallen; |
972 | } |
973 | return 0; |
974 | } |
975 | |
976 | // executed too often to be worth making threadsafe |
977 | static inline void update_le_status(ULE ule, size_t memsize) { |
978 | if (ule->num_cuxrs > LE_STATUS_VAL(LE_MAX_COMMITTED_XR)) |
979 | LE_STATUS_VAL(LE_MAX_COMMITTED_XR) = ule->num_cuxrs; |
980 | if (ule->num_puxrs > LE_STATUS_VAL(LE_MAX_PROVISIONAL_XR)) |
981 | LE_STATUS_VAL(LE_MAX_PROVISIONAL_XR) = ule->num_puxrs; |
982 | if (ule->num_cuxrs > MAX_TRANSACTION_RECORDS) |
983 | LE_STATUS_VAL(LE_EXPANDED)++; |
984 | if (memsize > LE_STATUS_VAL(LE_MAX_MEMSIZE)) |
985 | LE_STATUS_VAL(LE_MAX_MEMSIZE) = memsize; |
986 | } |
987 | |
988 | // Purpose is to return a newly allocated leaf entry in packed format, or |
989 | // return null if leaf entry should be destroyed (if no transaction records |
990 | // are for inserts). |
991 | // Transaction records in packed le are stored inner to outer (first xr is |
992 | // innermost), with some information extracted out of the transaction records |
993 | // into the header. |
994 | // Transaction records in ule are stored outer to inner (uxr[0] is outermost). |
// Takes 'ule' and creates 'new_leafentry_p'.
996 | int le_pack( |
997 | ULE ule, |
998 | bn_data* data_buffer, |
999 | uint32_t idx, |
1000 | void* keyp, |
1001 | uint32_t keylen, |
1002 | uint32_t old_keylen, |
1003 | uint32_t old_le_size, |
1004 | LEAFENTRY* const new_leafentry_p, |
1005 | void** const maybe_free) { |
1006 | |
1007 | invariant(ule->num_cuxrs > 0); |
1008 | invariant(ule->uxrs[0].xid == TXNID_NONE); |
1009 | int rval; |
1010 | size_t memsize = 0; |
1011 | { |
        // The unpacked leafentry may contain no inserts anywhere on its stack.
        // If so, then there IS no leafentry to pack and we should return NULL.
        // So, first we check the stack to see if there is any insert. If not,
        // then we return NULL and exit the function; otherwise, we goto
        // found_insert and proceed with packing the leafentry.
1017 | uint32_t i; |
1018 | for (i = 0; i < ule->num_cuxrs + ule->num_puxrs; i++) { |
1019 | if (uxr_is_insert(&ule->uxrs[i])) { |
1020 | goto found_insert; |
1021 | } |
1022 | } |
1023 | if (data_buffer && old_le_size > 0) { |
1024 | // must pass old_keylen and old_le_size, since that's what is |
1025 | // actually stored in data_buffer |
1026 | data_buffer->delete_leafentry(idx, old_keylen, old_le_size); |
1027 | } |
1028 | *new_leafentry_p = NULL; |
1029 | rval = 0; |
1030 | goto cleanup; |
1031 | } |
1032 | found_insert: |
1033 | memsize = le_memsize_from_ule(ule); |
1034 | LEAFENTRY new_leafentry; |
1035 | get_space_for_le( |
1036 | data_buffer, |
1037 | idx, |
1038 | keyp, |
1039 | keylen, |
1040 | old_keylen, |
1041 | old_le_size, |
1042 | memsize, |
1043 | &new_leafentry, |
1044 | maybe_free); |
1045 | |
1046 | //p always points to first unused byte after leafentry we are packing |
1047 | uint8_t *p; |
1048 | invariant(ule->num_cuxrs>0); |
1049 | //Type specific data |
1050 | if (ule->num_cuxrs == 1 && ule->num_puxrs == 0) { |
1051 | //Pack a 'clean leafentry' (no uncommitted transactions, only one |
1052 | //committed value) |
1053 | new_leafentry->type = LE_CLEAN; |
1054 | |
1055 | uint32_t vallen = ule->uxrs[0].vallen; |
1056 | //Store vallen |
1057 | new_leafentry->u.clean.vallen = toku_htod32(vallen); |
1058 | |
1059 | //Store actual val |
1060 | memcpy(new_leafentry->u.clean.val, ule->uxrs[0].valp, vallen); |
1061 | |
1062 | //Set p to after leafentry |
1063 | p = new_leafentry->u.clean.val + vallen; |
1064 | } else { |
1065 | uint32_t i; |
1066 | //Pack an 'mvcc leafentry' |
1067 | new_leafentry->type = LE_MVCC; |
1068 | |
1069 | new_leafentry->u.mvcc.num_cxrs = toku_htod32(ule->num_cuxrs); |
1070 | // invariant makes cast that follows ok, although not sure if |
1071 | // check should be "< MAX_TRANSACTION_RECORDS" or |
1072 | // "< MAX_TRANSACTION_RECORDS - 1" |
1073 | invariant(ule->num_puxrs < MAX_TRANSACTION_RECORDS); |
1074 | new_leafentry->u.mvcc.num_pxrs = (uint8_t)ule->num_puxrs; |
1075 | |
1076 | p = new_leafentry->u.mvcc.xrs; |
1077 | |
1078 | //pack interesting TXNIDs inner to outer. |
1079 | if (ule->num_puxrs!=0) { |
1080 | UXR outermost = ule->uxrs + ule->num_cuxrs; |
1081 | p += uxr_pack_txnid(outermost, p); |
1082 | } |
1083 | //pack other TXNIDS (not for ule->uxrs[0]) |
1084 | for (i = 0; i < ule->num_cuxrs - 1; i++) { |
1085 | p += uxr_pack_txnid(ule->uxrs + ule->num_cuxrs - 1 - i, p); |
1086 | } |
1087 | |
1088 | //pack interesting lengths inner to outer. |
1089 | if (ule->num_puxrs!=0) { |
1090 | UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1; |
1091 | p += uxr_pack_length_and_bit(innermost, p); |
1092 | } |
1093 | for (i = 0; i < ule->num_cuxrs; i++) { |
1094 | p += uxr_pack_length_and_bit(ule->uxrs + ule->num_cuxrs - 1 - i, p); |
1095 | } |
1096 | |
1097 | //pack interesting values inner to outer |
1098 | if (ule->num_puxrs!=0) { |
1099 | UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1; |
1100 | p += uxr_pack_data(innermost, p); |
1101 | } |
1102 | for (i = 0; i < ule->num_cuxrs; i++) { |
1103 | p += uxr_pack_data(ule->uxrs + ule->num_cuxrs - 1 - i, p); |
1104 | } |
1105 | |
1106 | //pack provisional xrs outer to inner |
1107 | if (ule->num_puxrs > 1) { |
1108 | { |
1109 | //pack length, bit, data for outermost uncommitted |
1110 | UXR outermost = ule->uxrs + ule->num_cuxrs; |
1111 | p += uxr_pack_type_and_length(outermost, p); |
1112 | p += uxr_pack_data(outermost, p); |
1113 | } |
1114 | //pack txnid, length, bit, data for non-outermost, non-innermost |
1115 | for (i = ule->num_cuxrs + 1; |
1116 | i < ule->num_cuxrs + ule->num_puxrs - 1; |
1117 | i++) { |
1118 | UXR uxr = ule->uxrs + i; |
1119 | p += uxr_pack_txnid(uxr, p); |
1120 | p += uxr_pack_type_and_length(uxr, p); |
1121 | p += uxr_pack_data(uxr, p); |
1122 | } |
1123 | { |
1124 | //Just pack txnid for innermost |
1125 | UXR innermost = ule->uxrs + ule->num_cuxrs + ule->num_puxrs - 1; |
1126 | p += uxr_pack_txnid(innermost, p); |
1127 | } |
1128 | } |
1129 | } |
1130 | |
1131 | //p points to first unused byte after packed leafentry |
1132 | |
1133 | size_t bytes_written; |
1134 | bytes_written = (size_t)p - (size_t)new_leafentry; |
1135 | invariant(bytes_written == memsize); |
1136 | |
1137 | #if ULE_DEBUG |
1138 | if (omt) { //Disable recursive debugging. |
1139 | size_t memsize_verify = leafentry_memsize(new_leafentry); |
1140 | invariant(memsize_verify == memsize); |
1141 | |
1142 | ULE_S ule_tmp; |
1143 | le_unpack(&ule_tmp, new_leafentry); |
1144 | |
1145 | memsize_verify = le_memsize_from_ule(&ule_tmp); |
1146 | invariant(memsize_verify == memsize); |
1147 | //Debugging code inside le_unpack will repack and verify it is the same. |
1148 | |
1149 | LEAFENTRY le_copy; |
1150 | |
1151 | int r_tmp = le_pack(&ule_tmp, &memsize_verify, &memsize_verify, |
1152 | &le_copy); |
1153 | invariant(r_tmp==0); |
1154 | invariant(memsize_verify == memsize); |
1155 | |
1156 | invariant(memcmp(new_leafentry, le_copy, memsize)==0); |
1157 | toku_free(le_copy); |
1158 | |
1159 | ule_cleanup(&ule_tmp); |
1160 | } |
1161 | #endif |
1162 | |
1163 | *new_leafentry_p = (LEAFENTRY)new_leafentry; |
1164 | rval = 0; |
1165 | cleanup: |
1166 | update_le_status(ule, memsize); |
1167 | return rval; |
1168 | } |
1169 | |
1170 | //////////////////////////////////////////////////////////////////////////////// |
1171 | // Following functions provide convenient access to a packed leafentry. |
1172 | |
1173 | //Requires: |
1174 | // Leafentry that ule represents should not be destroyed (is not just all |
1175 | // deletes) |
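// For example, a leafentry holding a single committed insert with a 7-byte
// value packs into 1 (type) + 4 (vallen) + 7 (val) = 12 bytes.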
1176 | size_t le_memsize_from_ule (ULE ule) { |
1177 | invariant(ule->num_cuxrs); |
1178 | size_t rval; |
1179 | if (ule->num_cuxrs == 1 && ule->num_puxrs == 0) { |
1180 | UXR committed = ule->uxrs; |
1181 | invariant(uxr_is_insert(committed)); |
1182 | rval = 1 //type |
1183 | +4 //vallen |
1184 | +committed->vallen; //actual val |
1185 | } else { |
1186 | rval = 1 //type |
1187 | +4 //num_cuxrs |
1188 | +1 //num_puxrs |
1189 | +4*(ule->num_cuxrs) //types+lengths for committed |
1190 | +8*(ule->num_cuxrs + ule->num_puxrs - 1); //txnids (excluding |
1191 | //superroot) |
1192 | uint32_t i; |
1193 | //Count data from committed uxrs and innermost puxr |
1194 | for (i = 0; i < ule->num_cuxrs; i++) { |
1195 | UXR uxr = &ule->uxrs[i]; |
1196 | if (uxr_is_insert(uxr)) { |
1197 | rval += uxr->vallen; //actual val |
1198 | } |
1199 | } |
1200 | if (ule->num_puxrs) { |
1201 | UXR uxr = ule_get_innermost_uxr(ule); |
1202 | if (uxr_is_insert(uxr)) { |
1203 | rval += uxr->vallen; //actual val |
1204 | } |
1205 | rval += 4; //type+length for innermost puxr |
1206 | rval += 1*(ule->num_puxrs - 1); //type for remaining puxrs. |
1207 | //Count data and lengths from other puxrs |
1208 | for (i = 0; i < ule->num_puxrs-1; i++) { |
1209 | uxr = &ule->uxrs[i+ule->num_cuxrs]; |
1210 | if (uxr_is_insert(uxr)) { |
1211 | rval += 4 + uxr->vallen; //length plus actual val |
1212 | } |
1213 | } |
1214 | } |
1215 | } |
1216 | return rval; |
1217 | } |
1218 | |
1219 | // TODO: rename |
1220 | size_t leafentry_rest_memsize( |
1221 | uint32_t num_puxrs, |
1222 | uint32_t num_cuxrs, |
1223 | uint8_t* start) { |
1224 | |
1225 | UXR_S uxr; |
1226 | size_t lengths = 0; |
1227 | uint8_t* p = start; |
1228 | |
1229 | //Skip TXNIDs |
1230 | if (num_puxrs!=0) { |
1231 | p += sizeof(TXNID); |
1232 | } |
1233 | p += (num_cuxrs-1)*sizeof(TXNID); |
1234 | |
1235 | //Retrieve interesting lengths inner to outer. |
1236 | if (num_puxrs!=0) { |
1237 | p += uxr_unpack_length_and_bit(&uxr, p); |
1238 | if (uxr_is_insert(&uxr)) { |
1239 | lengths += uxr.vallen; |
1240 | } |
1241 | } |
1242 | uint32_t i; |
1243 | for (i = 0; i < num_cuxrs; i++) { |
1244 | p += uxr_unpack_length_and_bit(&uxr, p); |
1245 | if (uxr_is_insert(&uxr)) { |
1246 | lengths += uxr.vallen; |
1247 | } |
1248 | } |
1249 | //Skip all interesting 'data' |
1250 | p += lengths; |
1251 | |
1252 | //unpack provisional xrs outer to inner |
1253 | if (num_puxrs > 1) { |
1254 | { |
1255 | p += uxr_unpack_type_and_length(&uxr, p); |
1256 | p += uxr_unpack_data(&uxr, p); |
1257 | } |
1258 | //unpack txnid, length, bit, data for non-outermost, non-innermost |
1259 | for (i = 0; i < num_puxrs - 2; i++) { |
1260 | p += uxr_unpack_txnid(&uxr, p); |
1261 | p += uxr_unpack_type_and_length(&uxr, p); |
1262 | p += uxr_unpack_data(&uxr, p); |
1263 | } |
1264 | { |
1265 | //Just unpack txnid for innermost |
1266 | p += uxr_unpack_txnid(&uxr, p); |
1267 | } |
1268 | } |
1269 | size_t rval = (size_t)p - (size_t)start; |
1270 | return rval; |
1271 | } |
1272 | |
1273 | size_t leafentry_memsize (LEAFENTRY le) { |
1274 | size_t rval = 0; |
1275 | |
1276 | uint8_t type = le->type; |
1277 | |
1278 | uint8_t *p = NULL; |
1279 | switch (type) { |
1280 | case LE_CLEAN: { |
1281 | uint32_t vallen = toku_dtoh32(le->u.clean.vallen); |
1282 | rval = LE_CLEAN_MEMSIZE(vallen); |
1283 | break; |
1284 | } |
1285 | case LE_MVCC: { |
1286 | p = le->u.mvcc.xrs; |
1287 | uint32_t num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
1288 | invariant(num_cuxrs); |
1289 | uint32_t num_puxrs = le->u.mvcc.num_pxrs; |
1290 | p += leafentry_rest_memsize(num_puxrs, num_cuxrs, p); |
1291 | rval = (size_t)p - (size_t)le; |
1292 | break; |
1293 | } |
1294 | default: |
1295 | invariant(false); |
1296 | } |
1297 | #if ULE_DEBUG |
1298 | ULE_S ule; |
1299 | le_unpack(&ule, le); |
1300 | size_t slow_rval = le_memsize_from_ule(&ule); |
1301 | if (slow_rval!=rval) { |
1302 | int r = print_klpair(stderr, le, NULL, 0); |
1303 | fprintf(stderr, "\nSlow: [%" PRIu64 "] Fast: [%" PRIu64 "]\n" , slow_rval, rval); |
1304 | invariant(r==0); |
1305 | } |
1306 | assert(slow_rval == rval); |
1307 | ule_cleanup(&ule); |
1308 | #endif |
1309 | return rval; |
1310 | } |
1311 | |
1312 | size_t leafentry_disksize (LEAFENTRY le) { |
1313 | return leafentry_memsize(le); |
1314 | } |
1315 | |
1316 | bool le_is_clean(LEAFENTRY le) { |
1317 | uint8_t type = le->type; |
1318 | uint32_t rval; |
1319 | switch (type) { |
1320 | case LE_CLEAN: |
1321 | rval = true; |
1322 | break; |
1323 | case LE_MVCC:; |
1324 | rval = false; |
1325 | break; |
1326 | default: |
1327 | invariant(false); |
1328 | } |
1329 | return rval; |
1330 | } |
1331 | |
1332 | int le_latest_is_del(LEAFENTRY le) { |
1333 | int rval; |
1334 | uint8_t type = le->type; |
1335 | uint8_t *p; |
1336 | switch (type) { |
1337 | case LE_CLEAN: { |
1338 | rval = 0; |
1339 | break; |
1340 | } |
1341 | case LE_MVCC: { |
1342 | UXR_S uxr; |
1343 | uint32_t num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
1344 | invariant(num_cuxrs); |
1345 | uint32_t num_puxrs = le->u.mvcc.num_pxrs; |
1346 | |
1347 | //Position p. |
1348 | p = le->u.mvcc.xrs; |
1349 | |
1350 | //Skip TXNIDs |
1351 | if (num_puxrs!=0) { |
1352 | p += sizeof(TXNID); |
1353 | } |
1354 | p += (num_cuxrs-1)*sizeof(TXNID); |
1355 | |
1356 | p += uxr_unpack_length_and_bit(&uxr, p); |
1357 | rval = uxr_is_delete(&uxr); |
1358 | break; |
1359 | } |
1360 | default: |
1361 | invariant(false); |
1362 | } |
1363 | #if ULE_DEBUG |
1364 | ULE_S ule; |
1365 | le_unpack(&ule, le); |
1366 | UXR uxr = ule_get_innermost_uxr(&ule); |
1367 | int slow_rval = uxr_is_delete(uxr); |
1368 | assert((rval==0) == (slow_rval==0)); |
1369 | ule_cleanup(&ule); |
1370 | #endif |
1371 | return rval; |
1372 | } |
1373 | |
1374 | |
1375 | // |
1376 | // returns true if the outermost provisional transaction id on the leafentry's |
1377 | // stack matches the outermost transaction id in xids |
1378 | // It is used to determine if a broadcast commit/abort message (look in ft-ops.c) |
1379 | // should be applied to this leafentry |
1380 | // If the outermost transactions match, then the broadcast commit/abort should |
1381 | // be applied |
1382 | // |
1383 | bool le_has_xids(LEAFENTRY le, XIDS xids) { |
1384 | //Read num_uxrs |
1385 | uint32_t num_xids = toku_xids_get_num_xids(xids); |
1386 | invariant(num_xids > 0); //Disallow checking for having TXNID_NONE |
1387 | TXNID xid = toku_xids_get_xid(xids, 0); |
1388 | invariant(xid!=TXNID_NONE); |
1389 | |
1390 | bool rval = (le_outermost_uncommitted_xid(le) == xid); |
1391 | return rval; |
1392 | } |
1393 | |
1394 | void* le_latest_val_and_len (LEAFENTRY le, uint32_t *len) { |
1395 | uint8_t type = le->type; |
1396 | void *valp; |
1397 | |
1398 | uint8_t *p; |
1399 | switch (type) { |
1400 | case LE_CLEAN: |
1401 | *len = toku_dtoh32(le->u.clean.vallen); |
1402 | valp = le->u.clean.val; |
1403 | break; |
1404 | case LE_MVCC:; |
1405 | UXR_S uxr; |
1406 | uint32_t num_cuxrs; |
1407 | num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
1408 | invariant(num_cuxrs); |
1409 | uint32_t num_puxrs; |
1410 | num_puxrs = le->u.mvcc.num_pxrs; |
1411 | |
1412 | //Position p. |
1413 | p = le->u.mvcc.xrs; |
1414 | |
1415 | //Skip TXNIDs |
1416 | if (num_puxrs!=0) { |
1417 | p += sizeof(TXNID); |
1418 | } |
1419 | p += (num_cuxrs-1)*sizeof(TXNID); |
1420 | |
1421 | p += uxr_unpack_length_and_bit(&uxr, p); |
1422 | if (uxr_is_insert(&uxr)) { |
1423 | *len = uxr.vallen; |
1424 | valp = p + (num_cuxrs - 1 + (num_puxrs!=0))*sizeof(uint32_t); |
1425 | } else { |
1426 | *len = 0; |
1427 | valp = NULL; |
1428 | } |
1429 | break; |
1430 | default: |
1431 | invariant(false); |
1432 | } |
1433 | #if ULE_DEBUG |
1434 | ULE_S ule; |
1435 | le_unpack(&ule, le); |
1436 | UXR uxr = ule_get_innermost_uxr(&ule); |
1437 | void *slow_valp; |
1438 | uint32_t slow_len; |
1439 | if (uxr_is_insert(uxr)) { |
1440 | slow_valp = uxr->valp; |
1441 | slow_len = uxr->vallen; |
1442 | } else { |
1443 | slow_valp = NULL; |
1444 | slow_len = 0; |
1445 | } |
1446 | assert(slow_valp == le_latest_val(le)); |
1447 | assert(slow_len == le_latest_vallen(le)); |
1448 | assert(valp==slow_valp); |
1449 | assert(*len==slow_len); |
1450 | ule_cleanup(&ule); |
1451 | #endif |
1452 | return valp; |
1453 | } |
1454 | |
1455 | //DEBUG ONLY can be slow |
1456 | void* le_latest_val (LEAFENTRY le) { |
1457 | ULE_S ule; |
1458 | le_unpack(&ule, le); |
1459 | UXR uxr = ule_get_innermost_uxr(&ule); |
1460 | void *slow_rval; |
1461 | if (uxr_is_insert(uxr)) |
1462 | slow_rval = uxr->valp; |
1463 | else |
1464 | slow_rval = NULL; |
1465 | ule_cleanup(&ule); |
1466 | return slow_rval; |
1467 | } |
1468 | |
1469 | //needed to be fast for statistics. |
1470 | uint32_t le_latest_vallen (LEAFENTRY le) { |
1471 | uint32_t rval; |
1472 | uint8_t type = le->type; |
1473 | uint8_t *p; |
1474 | switch (type) { |
1475 | case LE_CLEAN: |
1476 | rval = toku_dtoh32(le->u.clean.vallen); |
1477 | break; |
1478 | case LE_MVCC:; |
1479 | UXR_S uxr; |
1480 | uint32_t num_cuxrs; |
1481 | num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
1482 | invariant(num_cuxrs); |
1483 | uint32_t num_puxrs; |
1484 | num_puxrs = le->u.mvcc.num_pxrs; |
1485 | |
1486 | //Position p. |
1487 | p = le->u.mvcc.xrs; |
1488 | |
1489 | //Skip TXNIDs |
1490 | if (num_puxrs!=0) { |
1491 | p += sizeof(TXNID); |
1492 | } |
1493 | p += (num_cuxrs-1)*sizeof(TXNID); |
1494 | |
1495 | uxr_unpack_length_and_bit(&uxr, p); |
1496 | if (uxr_is_insert(&uxr)) { |
1497 | rval = uxr.vallen; |
1498 | } else { |
1499 | rval = 0; |
1500 | } |
1501 | break; |
1502 | default: |
1503 | invariant(false); |
1504 | } |
1505 | #if ULE_DEBUG |
1506 | ULE_S ule; |
1507 | le_unpack(&ule, le); |
1508 | UXR uxr = ule_get_innermost_uxr(&ule); |
1509 | uint32_t slow_rval; |
1510 | if (uxr_is_insert(uxr)) |
1511 | slow_rval = uxr->vallen; |
1512 | else |
1513 | slow_rval = 0; |
1514 | ule_cleanup(&ule); |
1515 | invariant(slow_rval == rval); |
1516 | #endif |
1517 | return rval; |
1518 | } |
1519 | |
1520 | uint64_t le_outermost_uncommitted_xid (LEAFENTRY le) { |
1521 | uint64_t rval = TXNID_NONE; |
1522 | uint8_t type = le->type; |
1523 | |
1524 | uint8_t *p; |
1525 | switch (type) { |
1526 | case LE_CLEAN: |
1527 | break; |
1528 | case LE_MVCC:; |
1529 | UXR_S uxr; |
1530 | uint32_t num_puxrs = le->u.mvcc.num_pxrs; |
1531 | |
1532 | if (num_puxrs) { |
1533 | p = le->u.mvcc.xrs; |
1534 | uxr_unpack_txnid(&uxr, p); |
1535 | rval = uxr.xid; |
1536 | } |
1537 | break; |
1538 | } |
1539 | #if ULE_DEBUG |
1540 | ULE_S ule; |
1541 | le_unpack(&ule, le); |
1542 | TXNID slow_rval = 0; |
1543 | if (ule.num_puxrs > 0) |
1544 | slow_rval = ule.uxrs[ule.num_cuxrs].xid; |
1545 | assert(rval==slow_rval); |
1546 | ule_cleanup(&ule); |
1547 | #endif |
1548 | return rval; |
1549 | } |
1550 | |
1551 | |
1552 | //Optimization not required. This is a debug only function. |
1553 | //Print a leafentry out in human-readable format |
int print_klpair (FILE *outf, const void* keyp, uint32_t keylen, LEAFENTRY le) {
    // Check for NULL before unpacking, since le_unpack dereferences le.
    if (!le) { fprintf(outf, "NULL"); return 0; }
    ULE_S ule;
    le_unpack(&ule, le);
    uint32_t i;
    invariant(ule.num_cuxrs > 0);
    UXR uxr;
1561 | if (keyp) { |
1562 | fprintf(outf, "{key=" ); |
1563 | toku_print_BYTESTRING(outf, keylen, (char *) keyp); |
1564 | } |
1565 | for (i = 0; i < ule.num_cuxrs+ule.num_puxrs; i++) { |
1566 | // fprintf(outf, "\n%*s", i+1, " "); //Nested indenting |
1567 | uxr = &ule.uxrs[i]; |
1568 | char prov = i < ule.num_cuxrs ? 'c' : 'p'; |
1569 | fprintf(outf, " " ); |
1570 | if (uxr_is_placeholder(uxr)) |
1571 | fprintf(outf, "P: xid=%016" PRIx64, uxr->xid); |
1572 | else if (uxr_is_delete(uxr)) |
1573 | fprintf(outf, "%cD: xid=%016" PRIx64, prov, uxr->xid); |
1574 | else { |
1575 | assert(uxr_is_insert(uxr)); |
1576 | fprintf(outf, "%cI: xid=%016" PRIx64 " val=" , prov, uxr->xid); |
1577 | toku_print_BYTESTRING(outf, uxr->vallen, (char *) uxr->valp); |
1578 | } |
1579 | } |
1580 | fprintf(outf, "}" ); |
1581 | ule_cleanup(&ule); |
1582 | return 0; |
1583 | } |
1584 | |
1585 | //////////////////////////////////////////////////////////////////////////////// |
1586 | // This layer of abstraction (ule_xxx) knows the structure of the unpacked |
1587 | // leafentry and no other structure. |
1588 | // |
1589 | |
1590 | // ule constructor |
1591 | // Note that transaction 0 is explicit in the ule |
1592 | static inline void ule_init_empty_ule(ULE ule) { |
1593 | ule->num_cuxrs = 1; |
1594 | ule->num_puxrs = 0; |
1595 | ule->uxrs = ule->uxrs_static; |
1596 | ule->uxrs[0] = committed_delete; |
1597 | } |
1598 | |
1599 | static inline int32_t min_i32(int32_t a, int32_t b) { |
1600 | int32_t rval = a < b ? a : b; |
1601 | return rval; |
1602 | } |
1603 | |
1604 | /////////////////// |
1605 | // Implicit promotion logic: |
1606 | // |
1607 | // If the leafentry has already been promoted, there is nothing to do. |
1608 | // We have two transaction stacks (one from message, one from leaf entry). |
1609 | // We want to implicitly promote transactions newer than (but not including) |
1610 | // the innermost common ancestor (ICA) of the two stacks of transaction ids. We |
1611 | // know that this is the right thing to do because each transaction with an id |
1612 | // greater (later) than the ICA must have been either committed or aborted. |
// If it was aborted then we would have seen an abort message and removed the
// xid from the stack of transaction records. So any transaction still on the
// leaf entry stack must have been successfully committed.
//
// After finding the ICA, promote transactions later than the ICA by copying
// the value and type from the innermost transaction record of the leafentry
// to the transaction record of the ICA, keeping the transaction id of the ICA.
// Outermost xid is zero for both the ule and xids<>.
// (A worked example follows ule_do_implicit_promotions() below.)
1621 | // |
1622 | static void ule_do_implicit_promotions(ULE ule, XIDS xids) { |
1623 | //Optimization for (most) common case. |
1624 | //No commits necessary if everything is already committed. |
1625 | if (ule->num_puxrs > 0) { |
1626 | int num_xids = toku_xids_get_num_xids(xids); |
1627 | invariant(num_xids>0); |
1628 | uint32_t max_index = ule->num_cuxrs + min_i32(ule->num_puxrs, num_xids) - 1; |
1629 | uint32_t ica_index = max_index; |
1630 | uint32_t index; |
1631 | for (index = ule->num_cuxrs; index <= max_index; index++) { |
1632 | TXNID current_msg_xid = toku_xids_get_xid(xids, index - ule->num_cuxrs); |
1633 | TXNID current_ule_xid = ule_get_xid(ule, index); |
1634 | if (current_msg_xid != current_ule_xid) { |
1635 | //ica is innermost transaction with matching xids. |
1636 | ica_index = index - 1; |
1637 | break; |
1638 | } |
1639 | } |
1640 | |
1641 | if (ica_index < ule->num_cuxrs) { |
1642 | invariant(ica_index == ule->num_cuxrs - 1); |
1643 | ule_promote_provisional_innermost_to_committed(ule); |
        } else if (ica_index < ule->num_cuxrs + ule->num_puxrs - 1) {
            //Promote to the ICA's slot. (If the ICA were the innermost uxr in
            //the leafentry, no promotion would be necessary, which is why
            //there is no else clause.)
            ule_promote_provisional_innermost_to_index(ule, ica_index);
1648 | } |
1649 | |
1650 | } |
1651 | } |
1652 | |
1653 | static void ule_promote_provisional_innermost_to_committed(ULE ule) { |
1654 | //Must be something to promote. |
1655 | invariant(ule->num_puxrs); |
1656 | //Take value (or delete flag) from innermost. |
1657 | //Take TXNID from outermost uncommitted txn |
1658 | //"Delete" provisional stack |
1659 | //add one UXR that is committed using saved TXNID,val/delete flag |
1660 | |
1661 | UXR old_innermost_uxr = ule_get_innermost_uxr(ule); |
1662 | assert(!uxr_is_placeholder(old_innermost_uxr)); |
1663 | |
1664 | UXR old_outermost_uncommitted_uxr = &ule->uxrs[ule->num_cuxrs]; |
1665 | |
1666 | ule->num_puxrs = 0; //Discard all provisional uxrs. |
1667 | if (uxr_is_delete(old_innermost_uxr)) { |
1668 | ule_push_delete_uxr(ule, true, old_outermost_uncommitted_uxr->xid); |
1669 | } else { |
1670 | ule_push_insert_uxr(ule, true, |
1671 | old_outermost_uncommitted_uxr->xid, |
1672 | old_innermost_uxr->vallen, |
1673 | old_innermost_uxr->valp); |
1674 | } |
1675 | } |
1676 | |
1677 | static void ule_try_promote_provisional_outermost( |
1678 | ULE ule, |
1679 | TXNID oldest_possible_live_xid) { |
1680 | // Effect: If there is a provisional record whose outermost xid is older than |
1681 | // the oldest known referenced_xid, promote it to committed. |
1682 | if (ule->num_puxrs > 0 && |
1683 | ule_get_xid(ule, ule->num_cuxrs) < oldest_possible_live_xid) { |
1684 | ule_promote_provisional_innermost_to_committed(ule); |
1685 | } |
1686 | } |
1687 | |
1688 | // Purpose is to promote the value (and type) of the innermost transaction |
1689 | // record to the uxr at the specified index (keeping the txnid of the uxr at |
1690 | // specified index.) |
1691 | static void ule_promote_provisional_innermost_to_index( |
1692 | ULE ule, |
1693 | uint32_t index) { |
1694 | //Must not promote to committed portion of stack. |
1695 | invariant(index >= ule->num_cuxrs); |
1696 | //Must actually be promoting. |
1697 | invariant(index < ule->num_cuxrs + ule->num_puxrs - 1); |
1698 | UXR old_innermost_uxr = ule_get_innermost_uxr(ule); |
1699 | assert(!uxr_is_placeholder(old_innermost_uxr)); |
1700 | TXNID new_innermost_xid = ule->uxrs[index].xid; |
1701 | //Discard old uxr at index (and everything inner) |
1702 | ule->num_puxrs = index - ule->num_cuxrs; |
1703 | if (uxr_is_delete(old_innermost_uxr)) { |
1704 | ule_push_delete_uxr(ule, false, new_innermost_xid); |
1705 | } else { |
1706 | ule_push_insert_uxr( |
1707 | ule, |
1708 | false, |
1709 | new_innermost_xid, |
1710 | old_innermost_uxr->vallen, |
1711 | old_innermost_uxr->valp); |
1712 | } |
1713 | } |
1714 | |
1715 | /////////////////// |
1716 | // All ule_apply_xxx operations are done after implicit promotions, |
1717 | // so the innermost transaction record in the leafentry is the ICA. |
1718 | // |
1719 | |
1720 | |
1721 | // Purpose is to apply an insert message to this leafentry: |
1722 | static inline int64_t ule_apply_insert_no_overwrite( |
1723 | ULE ule, |
1724 | XIDS xids, |
1725 | uint32_t vallen, |
1726 | void* valp) { |
1727 | |
1728 | invariant(IS_VALID_LEN(vallen)); |
1729 | int64_t retval = 0; |
1730 | UXR old_innermost_uxr = ule_get_innermost_uxr(ule); |
1731 | // If something exists, don't overwrite |
1732 | if (uxr_is_insert(old_innermost_uxr)) { |
1733 | retval = -1; |
1734 | return retval; |
1735 | } |
1736 | ule_prepare_for_new_uxr(ule, xids); |
1737 | // xid of transaction doing this insert |
1738 | TXNID this_xid = toku_xids_get_innermost_xid(xids); |
1739 | ule_push_insert_uxr(ule, this_xid == TXNID_NONE, this_xid, vallen, valp); |
1740 | return retval; |
1741 | } |
1742 | |
1743 | // Purpose is to apply an insert message to this leafentry: |
1744 | static inline int64_t ule_apply_insert( |
1745 | ULE ule, |
1746 | XIDS xids, |
1747 | uint32_t vallen, |
1748 | void* valp) { |
1749 | |
1750 | invariant(IS_VALID_LEN(vallen)); |
1751 | int64_t retval = 0; |
1752 | UXR old_innermost_uxr = ule_get_innermost_uxr(ule); |
1753 | // If something exists, overwrite |
1754 | if (uxr_is_insert(old_innermost_uxr)) { |
1755 | retval = -1; |
1756 | } |
1757 | ule_prepare_for_new_uxr(ule, xids); |
1758 | // xid of transaction doing this insert |
1759 | TXNID this_xid = toku_xids_get_innermost_xid(xids); |
1760 | ule_push_insert_uxr(ule, this_xid == TXNID_NONE, this_xid, vallen, valp); |
1761 | return retval; |
1762 | } |
1763 | |
1764 | // Purpose is to apply a delete message to this leafentry: |
1765 | static inline int64_t ule_apply_delete(ULE ule, XIDS xids) { |
1766 | int64_t retval = 0; |
1767 | UXR old_innermost_uxr = ule_get_innermost_uxr(ule); |
1768 | if (FT_UNLIKELY(uxr_is_delete(old_innermost_uxr))) { |
1769 | retval = 1; |
1770 | } |
1771 | ule_prepare_for_new_uxr(ule, xids); |
1772 | // xid of transaction doing this delete |
1773 | TXNID this_xid = toku_xids_get_innermost_xid(xids); |
1774 | ule_push_delete_uxr(ule, this_xid == TXNID_NONE, this_xid); |
1775 | return retval; |
1776 | } |
1777 | |
1778 | // First, discard anything done earlier by this transaction. |
1779 | // Then, add placeholders if necessary. This transaction may be nested within |
// outer transactions that are newer than the newest (innermost) transaction in
1781 | // the leafentry. If so, record those outer transactions in the leafentry |
1782 | // with placeholders. |
1783 | static inline void ule_prepare_for_new_uxr(ULE ule, XIDS xids) { |
1784 | TXNID this_xid = toku_xids_get_innermost_xid(xids); |
1785 | //This is for LOADER_USE_PUTS or transactionless environment |
1786 | //where messages use XIDS of 0 |
1787 | if (this_xid == TXNID_NONE && ule_get_innermost_xid(ule) == TXNID_NONE) { |
1788 | ule_remove_innermost_uxr(ule); |
1789 | } else if (ule->num_puxrs > 0 && ule_get_innermost_xid(ule) == this_xid) { |
1790 | // case where we are transactional and xids stack matches ule stack |
1791 | ule_remove_innermost_uxr(ule); |
1792 | } else { |
1793 | // case where we are transactional and xids stack does not match ule |
1794 | // stack |
1795 | ule_add_placeholders(ule, xids); |
1796 | } |
1797 | } |
1798 | |
1799 | // Purpose is to apply an abort message to this leafentry. |
1800 | // If the aborted transaction (the transaction whose xid is the innermost xid |
1801 | // in the id stack passed in the message), has not modified this leafentry, |
1802 | // then there is nothing to be done. |
1803 | // If this transaction did modify the leafentry, then undo whatever it did (by |
1804 | // removing the transaction record (uxr) and any placeholders underneath. |
1805 | // Remember, the innermost uxr can only be an insert or a delete, not a |
1806 | // placeholder. |
1807 | static inline int64_t ule_apply_abort(ULE ule, XIDS xids) { |
1808 | int64_t retval = 0; |
1809 | // xid of transaction doing this abort |
1810 | TXNID this_xid = toku_xids_get_innermost_xid(xids); |
1811 | invariant(this_xid!=TXNID_NONE); |
1812 | UXR innermost = ule_get_innermost_uxr(ule); |
    // Check for provisional entries in the ule; otherwise there is nothing to
    // abort. Skipping this check could cause a bug when the most recently
    // committed entry has the same xid as the innermost xid in XIDS.
1817 | if (ule->num_puxrs > 0 && innermost->xid == this_xid) { |
1818 | // if this is a rollback of a delete of a new ule, return 0 |
1819 | // (i.e. double delete) |
1820 | if (uxr_is_delete(innermost)) { |
1821 | if (ule->num_puxrs == 1 && ule->num_cuxrs == 1 && |
1822 | uxr_is_delete(&(ule->uxrs[0]))) { |
1823 | retval = 0; |
1824 | } else { |
1825 | retval = 1; |
1826 | } |
1827 | } else if (uxr_is_insert(innermost)) { |
1828 | if (ule->num_puxrs == 1 && ule->num_cuxrs == 1 && |
1829 | uxr_is_insert(&(ule->uxrs[0]))) { |
1830 | retval = 0; |
1831 | } else { |
1832 | retval = -1; |
1833 | } |
1834 | } |
        // if this is a rollback of an insert into an existing ule, return 0
        // (i.e. double insert)
1837 | invariant(ule->num_puxrs>0); |
1838 | ule_remove_innermost_uxr(ule); |
1839 | ule_remove_innermost_placeholders(ule); |
1840 | } |
1841 | invariant(ule->num_cuxrs > 0); |
1842 | return retval; |
1843 | } |
1844 | |
// Purpose is to apply a broadcast commit of all transactions to this
// leafentry: the innermost value (or delete) becomes the single committed
// value, owned by the root transaction (TXNID_NONE).
static void ule_apply_broadcast_commit_all (ULE ule) {
1846 | ule->uxrs[0] = ule->uxrs[ule->num_puxrs + ule->num_cuxrs - 1]; |
1847 | ule->uxrs[0].xid = TXNID_NONE; |
1848 | ule->num_puxrs = 0; |
1849 | ule->num_cuxrs = 1; |
1850 | } |
1851 | |
1852 | // Purpose is to apply a commit message to this leafentry. |
1853 | // If the committed transaction (the transaction whose xid is the innermost xid |
1854 | // in the id stack passed in the message), has not modified this leafentry, |
1855 | // then there is nothing to be done. |
1856 | // Also, if there are no uncommitted transaction records there is nothing to do. |
1857 | // If this transaction did modify the leafentry, then promote whatever it did. |
1858 | // Remember, the innermost uxr can only be an insert or a delete, not a |
1859 | // placeholder. |
1860 | void ule_apply_commit(ULE ule, XIDS xids) { |
1861 | // xid of transaction committing |
1862 | TXNID this_xid = toku_xids_get_innermost_xid(xids); |
1863 | invariant(this_xid!=TXNID_NONE); |
    // Check for provisional entries in the ule; otherwise there is nothing to
    // commit. Skipping this check could cause a bug when the most recently
    // committed entry has the same xid as the innermost xid in XIDS.
1868 | if (ule->num_puxrs > 0 && ule_get_innermost_xid(ule) == this_xid) { |
        // 3 cases:
        //1- it's already a committed value (do nothing) (num_puxrs==0)
        //2- it's provisional but root level (make a new committed value)
        //   (num_puxrs==1)
        //3- it's provisional and not root (promote) (num_puxrs>1)
1874 | if (ule->num_puxrs == 1) { //new committed value |
1875 | ule_promote_provisional_innermost_to_committed(ule); |
1876 | } else if (ule->num_puxrs > 1) { |
1877 | //ule->uxrs[ule->num_cuxrs+ule->num_puxrs-1] is the innermost |
1878 | // (this transaction) |
1879 | //ule->uxrs[ule->num_cuxrs+ule->num_puxrs-2] is the 2nd innermost |
1880 | //We want to promote the innermost uxr one level out. |
1881 | ule_promote_provisional_innermost_to_index( |
1882 | ule, |
1883 | ule->num_cuxrs+ule->num_puxrs-2); |
1884 | } |
1885 | } |
1886 | } |
1887 | |
1888 | /////////////////// |
1889 | // Helper functions called from the functions above: |
1890 | // |
1891 | |
1892 | // Purpose is to record an insert for this transaction (and set type correctly). |
1893 | static inline void ule_push_insert_uxr( |
1894 | ULE ule, |
1895 | bool is_committed, TXNID xid, |
1896 | uint32_t vallen, |
1897 | void* valp) { |
1898 | |
1899 | UXR uxr = ule_get_first_empty_uxr(ule); |
1900 | if (is_committed) { |
1901 | invariant(ule->num_puxrs==0); |
1902 | ule->num_cuxrs++; |
1903 | } else { |
1904 | ule->num_puxrs++; |
1905 | } |
1906 | uxr->xid = xid; |
1907 | uxr->vallen = vallen; |
1908 | uxr->valp = valp; |
1909 | uxr->type = XR_INSERT; |
1910 | } |
1911 | |
1912 | // Purpose is to record a delete for this transaction. If this transaction |
1913 | // is the root transaction, then truly delete the leafentry by marking the |
1914 | // ule as empty. |
1915 | static inline void ule_push_delete_uxr(ULE ule, bool is_committed, TXNID xid) { |
1916 | UXR uxr = ule_get_first_empty_uxr(ule); |
1917 | if (is_committed) { |
1918 | invariant(ule->num_puxrs==0); |
1919 | ule->num_cuxrs++; |
1920 | } else { |
1921 | ule->num_puxrs++; |
1922 | } |
1923 | uxr->xid = xid; |
1924 | uxr->type = XR_DELETE; |
1925 | } |
1926 | |
1927 | // Purpose is to push a placeholder on the top of the leafentry's transaction |
1928 | // stack. |
1929 | static inline void ule_push_placeholder_uxr(ULE ule, TXNID xid) { |
1930 | invariant(ule->num_cuxrs>0); |
1931 | UXR uxr = ule_get_first_empty_uxr(ule); |
1932 | uxr->xid = xid; |
1933 | uxr->type = XR_PLACEHOLDER; |
1934 | ule->num_puxrs++; |
1935 | } |
1936 | |
1937 | // Return innermost transaction record. |
1938 | static inline UXR ule_get_innermost_uxr(ULE ule) { |
1939 | invariant(ule->num_cuxrs > 0); |
1940 | UXR rval = &(ule->uxrs[ule->num_cuxrs + ule->num_puxrs - 1]); |
1941 | return rval; |
1942 | } |
1943 | |
1944 | // Return first empty transaction record |
1945 | static inline UXR ule_get_first_empty_uxr(ULE ule) { |
1946 | invariant(ule->num_puxrs < MAX_TRANSACTION_RECORDS-1); |
1947 | UXR rval = &(ule->uxrs[ule->num_cuxrs+ule->num_puxrs]); |
1948 | return rval; |
1949 | } |
1950 | |
1951 | // Remove the innermost transaction (pop the leafentry's stack), undoing |
1952 | // whatever the innermost transaction did. |
1953 | static inline void ule_remove_innermost_uxr(ULE ule) { |
1954 | //It is possible to remove the committed delete at first insert. |
1955 | invariant(ule->num_cuxrs > 0); |
1956 | if (ule->num_puxrs) { |
1957 | ule->num_puxrs--; |
1958 | } else { |
1959 | //This is for LOADER_USE_PUTS or transactionless environment |
1960 | //where messages use XIDS of 0 |
1961 | invariant(ule->num_cuxrs == 1); |
1962 | invariant(ule_get_innermost_xid(ule)==TXNID_NONE); |
1963 | ule->num_cuxrs--; |
1964 | } |
1965 | } |
1966 | |
1967 | static inline TXNID ule_get_innermost_xid(ULE ule) { |
1968 | TXNID rval = ule_get_xid(ule, ule->num_cuxrs + ule->num_puxrs - 1); |
1969 | return rval; |
1970 | } |
1971 | |
1972 | static inline TXNID ule_get_xid(ULE ule, uint32_t index) { |
1973 | invariant(index < ule->num_cuxrs + ule->num_puxrs); |
1974 | TXNID rval = ule->uxrs[index].xid; |
1975 | return rval; |
1976 | } |
1977 | |
1978 | // Purpose is to remove any placeholders from the top of the leaf stack (the |
1979 | // innermost recorded transactions), if necessary. This function is idempotent. |
1980 | // It makes no logical sense for a placeholder to be the innermost recorded |
1981 | // transaction record, so placeholders at the top of the stack are not legal. |
1982 | static void ule_remove_innermost_placeholders(ULE ule) { |
1983 | UXR uxr = ule_get_innermost_uxr(ule); |
1984 | while (uxr_is_placeholder(uxr)) { |
1985 | invariant(ule->num_puxrs>0); |
1986 | ule_remove_innermost_uxr(ule); |
1987 | uxr = ule_get_innermost_uxr(ule); |
1988 | } |
1989 | } |
1990 | |
1991 | // Purpose is to add placeholders to the top of the leaf stack (the innermost |
1992 | // recorded transactions), if necessary. This function is idempotent. |
1993 | // Note, after placeholders are added, an insert or delete will be added. This |
1994 | // function temporarily leaves the transaction stack in an illegal state (having |
1995 | // placeholders on top). |
1996 | static void ule_add_placeholders(ULE ule, XIDS xids) { |
1997 | //Placeholders can be placed on top of the committed uxr. |
1998 | invariant(ule->num_cuxrs > 0); |
1999 | |
2000 | uint32_t num_xids = toku_xids_get_num_xids(xids); |
    // we assume that implicit promotion has happened
    // when we get this call, so the number of xids MUST
    // be at least the number of provisional entries
    invariant(num_xids >= ule->num_puxrs);
2005 | // make sure that the xids stack matches up to a certain amount |
2006 | // this first for loop is just debug code |
2007 | for (uint32_t i = 0; i < ule->num_puxrs; i++) { |
2008 | TXNID current_msg_xid = toku_xids_get_xid(xids, i); |
2009 | TXNID current_ule_xid = ule_get_xid(ule, i + ule->num_cuxrs); |
2010 | invariant(current_msg_xid == current_ule_xid); |
2011 | } |
2012 | for (uint32_t i = ule->num_puxrs; i < num_xids-1; i++) { |
2013 | TXNID current_msg_xid = toku_xids_get_xid(xids, i); |
2014 | ule_push_placeholder_uxr(ule, current_msg_xid); |
2015 | } |
2016 | } |
2017 | |
2018 | uint64_t ule_num_uxrs(ULE ule) { |
2019 | return ule->num_cuxrs + ule->num_puxrs; |
2020 | } |
2021 | |
2022 | UXR ule_get_uxr(ULE ule, uint64_t ith) { |
2023 | invariant(ith < ule_num_uxrs(ule)); |
2024 | return &ule->uxrs[ith]; |
2025 | } |
2026 | |
2027 | uint32_t ule_get_num_committed(ULE ule) { |
2028 | return ule->num_cuxrs; |
2029 | } |
2030 | |
2031 | uint32_t ule_get_num_provisional(ULE ule) { |
2032 | return ule->num_puxrs; |
2033 | } |
2034 | |
2035 | int ule_is_committed(ULE ule, uint64_t ith) { |
2036 | invariant(ith < ule_num_uxrs(ule)); |
2037 | return ith < ule->num_cuxrs; |
2038 | } |
2039 | |
2040 | int ule_is_provisional(ULE ule, uint64_t ith) { |
2041 | invariant(ith < ule_num_uxrs(ule)); |
2042 | return ith >= ule->num_cuxrs; |
2043 | } |
2044 | |
// Return the number of data bytes for the innermost uxr: keylen plus the
// length of the val, or 0 if the innermost record is a delete.
2046 | uint32_t ule_get_innermost_numbytes(ULE ule, uint32_t keylen) { |
2047 | uint32_t rval; |
2048 | UXR uxr = ule_get_innermost_uxr(ule); |
2049 | if (uxr_is_delete(uxr)) { |
2050 | rval = 0; |
2051 | } else { |
2052 | rval = uxr_get_vallen(uxr) + keylen; |
2053 | } |
2054 | return rval; |
2055 | } |
2056 | |
2057 | |
2058 | ///////////////////////////////////////////////////////////////////////////////// |
2059 | // This layer of abstraction (uxr_xxx) understands uxr and nothing else. |
2060 | // |
2061 | |
2062 | static inline bool uxr_type_is_insert(uint8_t type) { |
2063 | bool rval = (bool)(type == XR_INSERT); |
2064 | return rval; |
2065 | } |
2066 | |
2067 | bool uxr_is_insert(UXR uxr) { |
2068 | return uxr_type_is_insert(uxr->type); |
2069 | } |
2070 | |
2071 | static inline bool uxr_type_is_delete(uint8_t type) { |
2072 | bool rval = (bool)(type == XR_DELETE); |
2073 | return rval; |
2074 | } |
2075 | |
2076 | bool uxr_is_delete(UXR uxr) { |
2077 | return uxr_type_is_delete(uxr->type); |
2078 | } |
2079 | |
2080 | static inline bool uxr_type_is_placeholder(uint8_t type) { |
2081 | bool rval = (bool)(type == XR_PLACEHOLDER); |
2082 | return rval; |
2083 | } |
2084 | |
2085 | bool uxr_is_placeholder(UXR uxr) { |
2086 | return uxr_type_is_placeholder(uxr->type); |
2087 | } |
2088 | |
2089 | void* uxr_get_val(UXR uxr) { |
2090 | return uxr->valp; |
2091 | } |
2092 | |
2093 | uint32_t uxr_get_vallen(UXR uxr) { |
2094 | return uxr->vallen; |
2095 | } |
2096 | |
2097 | |
2098 | TXNID uxr_get_txnid(UXR uxr) { |
2099 | return uxr->xid; |
2100 | } |
2101 | |
2102 | static int le_iterate_get_accepted_index( |
2103 | TXNID* xids, |
2104 | uint32_t* index, |
2105 | uint32_t num_xids, |
2106 | LE_ITERATE_CALLBACK f, |
2107 | TOKUTXN context, |
2108 | bool top_is_provisional) { |
2109 | |
2110 | uint32_t i; |
2111 | int r = 0; |
    // if the loop below accepts nothing, *index ends up as num_xids-1, which
    // maps to T_0
2114 | for (i = 0; i < num_xids - 1; i++) { |
2115 | TXNID xid = toku_dtoh64(xids[i]); |
2116 | r = f(xid, context, (i == 0 && top_is_provisional)); |
        if (r==TOKUDB_ACCEPT) {
            r = 0;
            break;
2120 | } else if (r!=0) { |
2121 | break; |
2122 | } |
2123 | } |
2124 | *index = i; |
2125 | return r; |
2126 | } |
2127 | |
2128 | #if ULE_DEBUG |
2129 | static void ule_verify_xids(ULE ule, uint32_t interesting, TXNID *xids) { |
2130 | int has_p = (ule->num_puxrs != 0); |
2131 | invariant(ule->num_cuxrs + has_p == interesting); |
2132 | uint32_t i; |
2133 | for (i = 0; i < interesting - 1; i++) { |
2134 | TXNID xid = toku_dtoh64(xids[i]); |
2135 | invariant(ule->uxrs[ule->num_cuxrs - 1 + has_p - i].xid == xid); |
2136 | } |
2137 | } |
2138 | #endif |
2139 | |
2140 | // |
2141 | // Iterates over "possible" TXNIDs in a leafentry's stack, until one is |
2142 | // accepted by 'f'. If the value associated with the accepted TXNID is not an |
// insert, then set *is_delp to true, otherwise false.
2144 | // The "possible" TXNIDs are: |
2145 | // If provisionals exist, then the first possible TXNID is the outermost |
2146 | // provisional. |
2147 | // The next possible TXNIDs are the committed TXNIDs, from most recently |
2148 | // committed to T_0. |
2149 | // If provisionals exist, and the outermost provisional is accepted by 'f', |
2150 | // the associated value checked is the innermost provisional's value. |
2151 | // Parameters: |
2152 | // le - leafentry to iterate over |
2153 | // f - callback function that checks if a TXNID in le is accepted, and its |
2154 | // associated value should be examined. |
//  is_delp - output parameter, set to true if the accepted value is a delete
2156 | // context - parameter for f |
2157 | // |
2158 | static int le_iterate_is_del( |
2159 | LEAFENTRY le, |
2160 | LE_ITERATE_CALLBACK f, |
2161 | bool* is_delp, |
2162 | TOKUTXN context) { |
2163 | |
2164 | #if ULE_DEBUG |
2165 | ULE_S ule; |
2166 | le_unpack(&ule, le); |
2167 | #endif |
2168 | |
2169 | uint8_t type = le->type; |
2170 | int r; |
2171 | bool is_del = false; |
2172 | switch (type) { |
2173 | case LE_CLEAN: { |
2174 | r = 0; |
2175 | #if ULE_DEBUG |
2176 | invariant(ule.num_cuxrs == 1); |
2177 | invariant(ule.num_puxrs == 0); |
2178 | invariant(uxr_is_insert(ule.uxrs)); |
2179 | #endif |
2180 | break; |
2181 | } |
2182 | case LE_MVCC:; |
2183 | uint32_t num_cuxrs; |
2184 | num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
2185 | uint32_t num_puxrs; |
2186 | num_puxrs = le->u.mvcc.num_pxrs; |
2187 | uint8_t *p; |
2188 | p = le->u.mvcc.xrs; |
2189 | |
2190 | uint32_t index; |
2191 | uint32_t num_interesting; |
2192 | num_interesting = num_cuxrs + (num_puxrs != 0); |
2193 | TXNID *xids; |
2194 | xids = (TXNID*)p; |
2195 | #if ULE_DEBUG |
2196 | ule_verify_xids(&ule, num_interesting, xids); |
2197 | #endif |
2198 | r = |
2199 | le_iterate_get_accepted_index( |
2200 | xids, |
2201 | &index, |
2202 | num_interesting, |
2203 | f, |
2204 | context, |
2205 | (num_puxrs != 0)); |
2206 | if (r != 0) { |
2207 | goto cleanup; |
2208 | } |
2209 | invariant(index < num_interesting); |
2210 | |
2211 | //Skip TXNIDs |
2212 | p += (num_interesting - 1)*sizeof(TXNID); |
2213 | |
2214 | uint32_t *length_and_bits; |
2215 | length_and_bits = (uint32_t*)p; |
2216 | uint32_t my_length_and_bit; |
2217 | my_length_and_bit = toku_dtoh32(length_and_bits[index]); |
2218 | is_del = !IS_INSERT(my_length_and_bit); |
2219 | #if ULE_DEBUG |
2220 | { |
2221 | uint32_t has_p = (ule.num_puxrs != 0); |
2222 | uint32_t ule_index = (index==0) ? |
2223 | ule.num_cuxrs + ule.num_puxrs - 1 : |
2224 | ule.num_cuxrs - 1 + has_p - index; |
2225 | UXR uxr = ule.uxrs + ule_index; |
2226 | invariant(uxr_is_delete(uxr) == is_del); |
2227 | } |
2228 | #endif |
2229 | break; |
2230 | default: |
2231 | invariant(false); |
2232 | } |
2233 | cleanup: |
2234 | #if ULE_DEBUG |
2235 | ule_cleanup(&ule); |
2236 | #endif |
2237 | if (!r) *is_delp = is_del; |
2238 | return r; |
2239 | } |
2240 | |
2241 | static int le_iterate_read_committed_callback( |
2242 | TXNID txnid, |
2243 | TOKUTXN txn, |
2244 | bool is_provisional UU()) { |
2245 | |
2246 | if (is_provisional) { |
2247 | return toku_txn_reads_txnid(txnid, txn, is_provisional); |
2248 | } |
2249 | return TOKUDB_ACCEPT; |
2250 | } |
2251 | |
2252 | // |
// Returns true if the value that is to be read is a delete (i.e. the row is
// logically empty).
2254 | // |
2255 | int le_val_is_del(LEAFENTRY le, enum cursor_read_type read_type, TOKUTXN txn) { |
2256 | int rval; |
2257 | if (read_type == C_READ_SNAPSHOT || read_type == C_READ_COMMITTED) { |
2258 | LE_ITERATE_CALLBACK f = (read_type == C_READ_SNAPSHOT) ? |
2259 | toku_txn_reads_txnid : |
2260 | le_iterate_read_committed_callback; |
2261 | bool is_del = false; |
2262 | le_iterate_is_del( |
2263 | le, |
2264 | f, |
2265 | &is_del, |
2266 | txn |
2267 | ); |
2268 | rval = is_del; |
2269 | } else if (read_type == C_READ_ANY) { |
2270 | rval = le_latest_is_del(le); |
2271 | } else { |
2272 | invariant(false); |
2273 | } |
2274 | return rval; |
2275 | } |
2276 | |
2277 | // |
2278 | // Iterates over "possible" TXNIDs in a leafentry's stack, until one is accepted |
2279 | // by 'f'. Set valpp and vallenp to value and length associated with accepted |
2280 | // TXNID |
2281 | // The "possible" TXNIDs are: |
2282 | // If provisionals exist, then the first possible TXNID is the outermost |
2283 | // provisional. |
2284 | // The next possible TXNIDs are the committed TXNIDs, from most recently |
2285 | // committed to T_0. |
2286 | // If provisionals exist, and the outermost provisional is accepted by 'f', |
//  the associated length and value are the innermost provisional's length and value.
2288 | // Parameters: |
2289 | // le - leafentry to iterate over |
2290 | // f - callback function that checks if a TXNID in le is accepted, and its |
2291 | // associated value should be examined. |
2292 | // valpp - output parameter that returns pointer to value |
2293 | // vallenp - output parameter that returns length of value |
2294 | // context - parameter for f |
2295 | // |
2296 | int le_iterate_val( |
2297 | LEAFENTRY le, |
2298 | LE_ITERATE_CALLBACK f, |
2299 | void** valpp, |
2300 | uint32_t* vallenp, |
2301 | TOKUTXN context) { |
2302 | |
2303 | #if ULE_DEBUG |
2304 | ULE_S ule; |
2305 | le_unpack(&ule, le); |
2306 | #endif |
2307 | |
2308 | uint8_t type = le->type; |
2309 | int r; |
2310 | uint32_t vallen = 0; |
2311 | void *valp = NULL; |
2312 | switch (type) { |
2313 | case LE_CLEAN: { |
2314 | vallen = toku_dtoh32(le->u.clean.vallen); |
2315 | valp = le->u.clean.val; |
2316 | r = 0; |
2317 | #if ULE_DEBUG |
2318 | invariant(ule.num_cuxrs == 1); |
2319 | invariant(ule.num_puxrs == 0); |
2320 | invariant(uxr_is_insert(ule.uxrs)); |
2321 | invariant(ule.uxrs[0].vallen == vallen); |
2322 | invariant(ule.uxrs[0].valp == valp); |
2323 | #endif |
2324 | break; |
2325 | } |
2326 | case LE_MVCC:; |
2327 | uint32_t num_cuxrs; |
2328 | num_cuxrs = toku_dtoh32(le->u.mvcc.num_cxrs); |
2329 | uint32_t num_puxrs; |
2330 | num_puxrs = le->u.mvcc.num_pxrs; |
2331 | uint8_t *p; |
2332 | p = le->u.mvcc.xrs; |
2333 | |
2334 | uint32_t index; |
2335 | uint32_t num_interesting; |
2336 | num_interesting = num_cuxrs + (num_puxrs != 0); |
2337 | TXNID *xids; |
2338 | xids = (TXNID*)p; |
2339 | #if ULE_DEBUG |
2340 | ule_verify_xids(&ule, num_interesting, xids); |
2341 | #endif |
2342 | r = |
2343 | le_iterate_get_accepted_index( |
2344 | xids, |
2345 | &index, |
2346 | num_interesting, |
2347 | f, |
2348 | context, |
2349 | (num_puxrs != 0)); |
2350 | if (r != 0) { |
2351 | goto cleanup; |
2352 | } |
2353 | invariant(index < num_interesting); |
2354 | |
2355 | //Skip TXNIDs |
2356 | p += (num_interesting - 1)*sizeof(TXNID); |
2357 | |
2358 | UXR_S temp; |
2359 | size_t offset; |
2360 | offset = 0; |
2361 | |
2362 | uint32_t *length_and_bits; |
2363 | length_and_bits = (uint32_t*)p; |
2364 | uint32_t i; |
2365 | //evaluate the offset |
2366 | for (i=0; i < index; i++){ |
2367 | uxr_unpack_length_and_bit(&temp, (uint8_t*)&length_and_bits[i]); |
2368 | offset += temp.vallen; |
2369 | } |
2370 | uxr_unpack_length_and_bit(&temp, (uint8_t*)&length_and_bits[index]); |
2371 | if (uxr_is_delete(&temp)) { |
2372 | goto verify_is_empty; |
2373 | } |
2374 | vallen = temp.vallen; |
2375 | |
2376 | // move p past the length and bits, now points to beginning of data |
2377 | p += num_interesting*sizeof(uint32_t); |
2378 | // move p to point to the data we care about |
2379 | p += offset; |
2380 | valp = p; |
2381 | |
2382 | #if ULE_DEBUG |
2383 | { |
2384 | uint32_t has_p = (ule.num_puxrs != 0); |
2385 | uint32_t ule_index = (index==0) ? |
2386 | ule.num_cuxrs + ule.num_puxrs - 1 : |
2387 | ule.num_cuxrs - 1 + has_p - index; |
2388 | UXR uxr = ule.uxrs + ule_index; |
2389 | invariant(uxr_is_insert(uxr)); |
2390 | invariant(uxr->vallen == vallen); |
2391 | invariant(uxr->valp == valp); |
2392 | } |
2393 | #endif |
2394 | if (0) { |
2395 | verify_is_empty:; |
2396 | #if ULE_DEBUG |
2397 | uint32_t has_p = (ule.num_puxrs != 0); |
2398 | UXR uxr = ule.uxrs + ule.num_cuxrs - 1 + has_p - index; |
2399 | invariant(uxr_is_delete(uxr)); |
2400 | #endif |
2401 | } |
2402 | break; |
2403 | default: |
2404 | invariant(false); |
2405 | } |
2406 | cleanup: |
2407 | #if ULE_DEBUG |
2408 | ule_cleanup(&ule); |
2409 | #endif |
2410 | if (!r) { |
2411 | *valpp = valp; |
2412 | *vallenp = vallen; |
2413 | } |
2414 | return r; |
2415 | } |
2416 | |
void le_extract_val(
2418 | LEAFENTRY le, |
2419 | // should we return the entire leafentry as the val? |
2420 | bool is_leaf_mode, |
2421 | enum cursor_read_type read_type, |
2422 | TOKUTXN ttxn, |
2423 | uint32_t* vallen, |
2424 | void** val) { |
2425 | |
2426 | if (is_leaf_mode) { |
2427 | *val = le; |
2428 | *vallen = leafentry_memsize(le); |
2429 | } else if (read_type == C_READ_SNAPSHOT || read_type == C_READ_COMMITTED) { |
2430 | LE_ITERATE_CALLBACK f = (read_type == C_READ_SNAPSHOT) ? |
2431 | toku_txn_reads_txnid : |
2432 | le_iterate_read_committed_callback; |
2433 | int r = le_iterate_val(le, f, val, vallen, ttxn); |
2434 | lazy_assert_zero(r); |
2435 | } else if (read_type == C_READ_ANY){ |
2436 | *val = le_latest_val_and_len(le, vallen); |
2437 | } else { |
2438 | assert(false); |
2439 | } |
2440 | } |
2441 | |
2442 | // This is an on-disk format. static_asserts verify everything is packed and aligned correctly. |
2443 | struct __attribute__ ((__packed__)) leafentry_13 { |
2444 | struct leafentry_committed_13 { |
2445 | uint8_t key_val[0]; //Actual key, then actual val |
2446 | }; |
2447 | static_assert(0 == sizeof(leafentry_committed_13), "wrong size" ); |
2448 | static_assert(0 == __builtin_offsetof(leafentry_committed_13, key_val), "wrong offset" ); |
2449 | struct __attribute__ ((__packed__)) leafentry_provisional_13 { |
2450 | uint8_t innermost_type; |
2451 | TXNID xid_outermost_uncommitted; |
2452 | uint8_t key_val_xrs[0]; //Actual key, |
2453 | //then actual innermost inserted val, |
2454 | //then transaction records. |
2455 | }; |
2456 | static_assert(9 == sizeof(leafentry_provisional_13), "wrong size" ); |
2457 | static_assert(9 == __builtin_offsetof(leafentry_provisional_13, key_val_xrs), "wrong offset" ); |
2458 | |
2459 | uint8_t num_xrs; |
2460 | uint32_t keylen; |
2461 | uint32_t innermost_inserted_vallen; |
2462 | union __attribute__ ((__packed__)) { |
2463 | struct leafentry_committed_13 comm; |
2464 | struct leafentry_provisional_13 prov; |
2465 | } u; |
2466 | }; |
2467 | static_assert(18 == sizeof(leafentry_13), "wrong size" ); |
2468 | static_assert(9 == __builtin_offsetof(leafentry_13, u), "wrong offset" ); |
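
// Layout sketch for the asserts above (sizes in bytes, for illustration):
//   fixed header: num_xrs (1) + keylen (4) + innermost_inserted_vallen (4) = 9,
//   which is why __builtin_offsetof(leafentry_13, u) == 9.
//   committed form   (num_xrs == 1): header, then key, then val.
//   provisional form (num_xrs  > 1): header, then innermost_type (1) and
//   xid_outermost_uncommitted (8), then key, the innermost inserted val, and
//   the remaining transaction records.
//   sizeof(leafentry_13) == 18 == 9 (header) + 9 (provisional sub-header);
//   the zero-length key_val arrays contribute nothing.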
2469 | |
2470 | //Requires: |
2471 | // Leafentry that ule represents should not be destroyed (is not just all |
2472 | // deletes) |
2473 | static size_t le_memsize_from_ule_13 (ULE ule, LEAFENTRY_13 le) { |
2474 | uint32_t num_uxrs = ule->num_cuxrs + ule->num_puxrs; |
2475 | assert(num_uxrs); |
2476 | size_t rval; |
2477 | if (num_uxrs == 1) { |
2478 | assert(uxr_is_insert(&ule->uxrs[0])); |
2479 | rval = 1 //num_uxrs |
2480 | +4 //keylen |
2481 | +4 //vallen |
2482 | +le->keylen //actual key |
2483 | +ule->uxrs[0].vallen; //actual val |
2484 | } else { |
2485 | rval = 1 //num_uxrs |
2486 | +4 //keylen |
2487 | +le->keylen //actual key |
2488 | +1*num_uxrs //types |
2489 | +8*(num_uxrs-1); //txnids |
2490 | uint8_t i; |
2491 | for (i = 0; i < num_uxrs; i++) { |
2492 | UXR uxr = &ule->uxrs[i]; |
2493 | if (uxr_is_insert(uxr)) { |
2494 | rval += 4; //vallen |
2495 | rval += uxr->vallen; //actual val |
2496 | } |
2497 | } |
2498 | } |
2499 | return rval; |
2500 | } |
2501 | |
2502 | // This function is mostly copied from 4.1.1 (which is version 12, same as 13 |
2503 | // except that only 13 is upgradable). |
2504 | // Note, number of transaction records in version 13 has been replaced by |
2505 | // separate counters in version 14 (MVCC), one counter for committed transaction |
2506 | // records and one counter for provisional transaction records. When upgrading |
2507 | // a version 13 le to version 14, the number of committed transaction records is |
2508 | // always set to one (1) and the number of provisional transaction records is |
2509 | // set to the original number of transaction records minus one. The bottom |
2510 | // transaction record is assumed to be a committed value. (If there is no |
2511 | // committed value then the bottom transaction record of version 13 is a |
2512 | // committed delete.) |
2513 | // This is the only change from the 4.1.1 code. The rest of the leafentry is |
2514 | // read as is. |
2515 | static void le_unpack_13(ULE ule, LEAFENTRY_13 le) { |
2516 | //Read num_uxrs |
2517 | uint8_t num_xrs = le->num_xrs; |
2518 | assert(num_xrs > 0); |
2519 | ule->uxrs = ule->uxrs_static; //Static version is always enough. |
2520 | ule->num_cuxrs = 1; |
2521 | ule->num_puxrs = num_xrs - 1; |
2522 | |
2523 | //Read the keylen |
2524 | uint32_t keylen = toku_dtoh32(le->keylen); |
2525 | |
2526 | //Read the vallen of innermost insert |
2527 | uint32_t vallen_of_innermost_insert = toku_dtoh32(le->innermost_inserted_vallen); |
2528 | |
2529 | uint8_t *p; |
2530 | if (num_xrs == 1) { |
2531 | //Unpack a 'committed leafentry' (No uncommitted transactions exist) |
        //Must be an insert, or the leafentry would not exist
2533 | ule->uxrs[0].type = XR_INSERT; |
2534 | ule->uxrs[0].vallen = vallen_of_innermost_insert; |
2535 | ule->uxrs[0].valp = &le->u.comm.key_val[keylen]; |
2536 | ule->uxrs[0].xid = 0; //Required. |
2537 | |
2538 | //Set p to immediately after leafentry |
2539 | p = &le->u.comm.key_val[keylen + vallen_of_innermost_insert]; |
2540 | } else { |
2541 | //Unpack a 'provisional leafentry' (Uncommitted transactions exist) |
2542 | |
2543 | //Read in type. |
2544 | uint8_t innermost_type = le->u.prov.innermost_type; |
2545 | assert(!uxr_type_is_placeholder(innermost_type)); |
2546 | |
2547 | //Read in xid |
2548 | TXNID xid_outermost_uncommitted = toku_dtoh64(le->u.prov.xid_outermost_uncommitted); |
2549 | |
2550 | //Read pointer to innermost inserted val (immediately after key) |
2551 | uint8_t *valp_of_innermost_insert = &le->u.prov.key_val_xrs[keylen]; |
2552 | |
2553 | //Point p to immediately after 'header' |
2554 | p = &le->u.prov.key_val_xrs[keylen + vallen_of_innermost_insert]; |
2555 | |
2556 | bool found_innermost_insert = false; |
2557 | int i; //Index in ULE.uxrs[] |
2558 | //Loop inner to outer |
2559 | for (i = num_xrs - 1; i >= 0; i--) { |
2560 | UXR uxr = &ule->uxrs[i]; |
2561 | |
2562 | //Innermost's type is in header. |
2563 | if (i < num_xrs - 1) { |
2564 | //Not innermost, so load the type. |
2565 | uxr->type = *p; |
2566 | p += 1; |
2567 | } else { |
2568 | //Innermost, load the type previously read from header |
2569 | uxr->type = innermost_type; |
2570 | } |
2571 | |
2572 | //Committed txn id is implicit (0). (i==0) |
2573 | //Outermost uncommitted txnid is stored in header. (i==1) |
2574 | if (i > 1) { |
2575 | //Not committed nor outermost uncommitted, so load the xid. |
2576 | uxr->xid = toku_dtoh64(*(TXNID*)p); |
2577 | p += 8; |
2578 | } else if (i == 1) { |
2579 | //Outermost uncommitted, load the xid previously read from |
2580 | //header |
2581 | uxr->xid = xid_outermost_uncommitted; |
2582 | } else { |
2583 | // i == 0, committed entry |
2584 | uxr->xid = 0; |
2585 | } |
2586 | |
2587 | if (uxr_is_insert(uxr)) { |
2588 | if (found_innermost_insert) { |
2589 | //Not the innermost insert. Load vallen/valp |
2590 | uxr->vallen = toku_dtoh32(*(uint32_t*)p); |
2591 | p += 4; |
2592 | |
2593 | uxr->valp = p; |
2594 | p += uxr->vallen; |
2595 | } else { |
2596 | //Innermost insert, load the vallen/valp previously read |
2597 | //from header |
2598 | uxr->vallen = vallen_of_innermost_insert; |
2599 | uxr->valp = valp_of_innermost_insert; |
2600 | found_innermost_insert = true; |
2601 | } |
2602 | } |
2603 | } |
2604 | assert(found_innermost_insert); |
2605 | } |
2606 | #if ULE_DEBUG |
    size_t memsize = le_memsize_from_ule_13(ule, le);
2608 | assert(p == ((uint8_t*)le) + memsize); |
2609 | #endif |
2610 | } |
2611 | |
2612 | size_t leafentry_disksize_13(LEAFENTRY_13 le) { |
2613 | ULE_S ule; |
2614 | le_unpack_13(&ule, le); |
2615 | size_t memsize = le_memsize_from_ule_13(&ule, le); |
2616 | ule_cleanup(&ule); |
2617 | return memsize; |
2618 | } |
2619 | |
2620 | int toku_le_upgrade_13_14( |
2621 | LEAFENTRY_13 old_leafentry, |
2622 | void** keyp, |
2623 | uint32_t* keylen, |
2624 | size_t* new_leafentry_memorysize, |
2625 | LEAFENTRY* new_leafentry_p) { |
2626 | |
2627 | ULE_S ule; |
2628 | int rval; |
2629 | invariant(old_leafentry); |
2630 | le_unpack_13(&ule, old_leafentry); |
2631 | // get the key |
2632 | *keylen = old_leafentry->keylen; |
2633 | if (old_leafentry->num_xrs == 1) { |
2634 | *keyp = old_leafentry->u.comm.key_val; |
2635 | } else { |
2636 | *keyp = old_leafentry->u.prov.key_val_xrs; |
2637 | } |
2638 | // We used to pass NULL for omt and mempool, so that we would use |
2639 | // malloc instead of a mempool. However after supporting upgrade, |
2640 | // we need to use mempools and the OMT. |
2641 | rval = |
2642 | le_pack( |
2643 | &ule, // create packed leafentry |
2644 | nullptr, |
2645 | 0, //only matters if we are passing in a bn_data |
2646 | nullptr, //only matters if we are passing in a bn_data |
2647 | 0, //only matters if we are passing in a bn_data |
2648 | 0, //only matters if we are passing in a bn_data |
2649 | 0, //only matters if we are passing in a bn_data |
2650 | new_leafentry_p, |
2651 | nullptr); //only matters if we are passing in a bn_data |
2652 | ule_cleanup(&ule); |
2653 | *new_leafentry_memorysize = leafentry_memsize(*new_leafentry_p); |
2654 | return rval; |
2655 | } |
2656 | |
2657 | #include <toku_race_tools.h> |
2658 | void __attribute__((__constructor__)) toku_ule_helgrind_ignore(void); |
2659 | void |
2660 | toku_ule_helgrind_ignore(void) { |
2661 | TOKU_VALGRIND_HG_DISABLE_CHECKING(&le_status, sizeof le_status); |
2662 | } |
2663 | |