| 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
| 2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
| 3 | #ident "$Id$" |
| 4 | /*====== |
| 5 | This file is part of PerconaFT. |
| 6 | |
| 7 | |
| 8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
| 9 | |
| 10 | PerconaFT is free software: you can redistribute it and/or modify |
| 11 | it under the terms of the GNU General Public License, version 2, |
| 12 | as published by the Free Software Foundation. |
| 13 | |
| 14 | PerconaFT is distributed in the hope that it will be useful, |
| 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | GNU General Public License for more details. |
| 18 | |
| 19 | You should have received a copy of the GNU General Public License |
| 20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 21 | |
| 22 | ---------------------------------------- |
| 23 | |
| 24 | PerconaFT is free software: you can redistribute it and/or modify |
| 25 | it under the terms of the GNU Affero General Public License, version 3, |
| 26 | as published by the Free Software Foundation. |
| 27 | |
| 28 | PerconaFT is distributed in the hope that it will be useful, |
| 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 31 | GNU Affero General Public License for more details. |
| 32 | |
| 33 | You should have received a copy of the GNU Affero General Public License |
| 34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 35 | ======= */ |
| 36 | |
| 37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
| 38 | |
| 39 | #pragma once |
| 40 | |
| 41 | #include <toku_portability.h> |
| 42 | |
| 43 | #include <util/mempool.h> |
| 44 | #include <util/omt.h> |
| 45 | |
| 46 | #include "ft/txn/txn_manager.h" |
| 47 | #include "ft/serialize/rbuf.h" |
| 48 | #include "ft/msg.h" |
| 49 | |
| 50 | /* |
| 51 | Memory format of packed leaf entry |
| 52 | CONSTANTS: |
| 53 | num_uxrs |
| 54 | keylen |
| 55 | Run-time-constants |
| 56 | voffset of val/vallen??? (for le_any_val) This must be small if it is interpreted as voffset = realoffset_of_val - keylen |
| 57 | GOOD performance optimization. |
| 58 | ALSO good for simplicity (no having to scan packed version) |
| 59 | key[] |
| 60 | variable length |
| 61 | |
| 62 | |
| 63 | Memory format of packed dup leaf entry |
| 64 | CONSTANTS: |
| 65 | num_uxrs |
| 66 | keylen |
| 67 | vallen |
| 68 | Run-time-constants |
| 69 | key[] |
| 70 | val[] |
| 71 | */ |
| 72 | |
| 73 | enum cursor_read_type { |
| 74 | C_READ_ANY = 0, |
| 75 | C_READ_SNAPSHOT = 1, |
| 76 | C_READ_COMMITTED = 2 |
| 77 | }; |
| 78 | |
| 79 | // |
| 80 | // enum of possible values for LEAFENTRY->type field |
| 81 | // LE_CLEAN means that there is a single committed value in a format that saves disk space |
| 82 | // LE_MVCC means that there may be multiple committed values or there are provisional values |
| 83 | // |
| 84 | enum { LE_CLEAN = 0, LE_MVCC = 1 }; |
| 85 | |
| 86 | // This is an on-disk format. static_asserts verify everything is packed and aligned correctly. |
| 87 | struct leafentry { |
| 88 | struct leafentry_clean { |
| 89 | uint32_t vallen; |
| 90 | uint8_t val[0]; //actual val |
| 91 | }; // For the case where LEAFENTRY->type is LE_CLEAN |
| 92 | static_assert(4 == sizeof(leafentry::leafentry_clean), "leafentry_clean size is wrong" ); |
| 93 | static_assert(4 == __builtin_offsetof(leafentry::leafentry_clean, val), "val is in the wrong place" ); |
| 94 | struct __attribute__ ((__packed__)) leafentry_mvcc { |
| 95 | uint32_t num_cxrs; // number of committed transaction records |
| 96 | uint8_t num_pxrs; // number of provisional transaction records |
| 97 | uint8_t xrs[0]; //then TXNIDs of XRs relevant for reads: |
| 98 | // if provisional XRs exist, store OUTERMOST TXNID |
| 99 | // store committed TXNIDs, from most recently committed to least recently committed (newest first) |
| 100 | //then lengths of XRs relevant for reads (length is at most 1<<31, MSB is 1 for insert, 0 for delete): |
| 101 | // if provisional XRs exist (num_pxrs>0), store length and insert/delete flag associated with INNERMOST TXNID |
| 102 | // store length and insert/delete flag associated with each committed TXNID, in same order as above (newest first) |
| 103 | //then data of XRs relevant for reads |
| 104 | // if provisional XRs exist (num_pxrs>0), store data associated with INNERMOST provisional TXNID |
| 105 | // store data associated with committed TXNIDs (all committed data, newest committed values first) |
| 106 | //if provisional XRs still exist (that is, num_puxrs > 1, so INNERMOST provisional TXNID != OUTERMOST provisional TXNID): |
| 107 | // for OUTERMOST provisional XR: |
| 108 | // 1 byte: store type (insert/delete/placeholder) |
| 109 | // 4 bytes: length (if type is INSERT, no length stored if placeholder or delete) |
| 110 | // data |
| 111 | // for rest of provisional stack (if num_pxrs > 2), from second-outermost to second-innermost (outermost is stored above, innermost is stored separately): |
| 112 | // 8 bytes: TXNID |
| 113 | // 1 byte: store type (insert/delete/placeholder) |
| 114 | // 4 bytes: length (if type is INSERT) |
| 115 | // data |
| 116 | // for INNERMOST provisional XR: |
| 117 | // 8 bytes: TXNID |
| 118 | // (innermost data and length with insert/delete flag are stored above, cannot be a placeholder) |
| 119 | }; // For the case where LEAFENTRY->type is LE_MVCC |
| 120 | static_assert(5 == sizeof(leafentry::leafentry_mvcc), "leafentry_mvcc size is wrong" ); |
| 121 | static_assert(5 == __builtin_offsetof(leafentry::leafentry_mvcc, xrs), "xrs is in the wrong place" ); |
| 122 | |
| 123 | uint8_t type; // type is LE_CLEAN or LE_MVCC |
| 124 | //uint32_t keylen; |
| 125 | union __attribute__ ((__packed__)) { |
| 126 | struct leafentry_clean clean; |
| 127 | struct leafentry_mvcc mvcc; |
| 128 | } u; |
| 129 | }; |
| 130 | static_assert(6 == sizeof(leafentry), "leafentry size is wrong" ); |
| 131 | static_assert(1 == __builtin_offsetof(leafentry, u), "union is in the wrong place" ); |
| 132 | |
| 133 | #define LE_CLEAN_MEMSIZE(_vallen) \ |
| 134 | (sizeof(((LEAFENTRY)NULL)->type) /* type */ \ |
| 135 | +sizeof(((LEAFENTRY)NULL)->u.clean.vallen) /* vallen */ \ |
| 136 | +(_vallen)) /* actual val */ |
| 137 | |
| 138 | #define \ |
| 139 | (sizeof(((LEAFENTRY)NULL)->type) /* type */ \ |
| 140 | +sizeof(((LEAFENTRY)NULL)->u.mvcc.num_cxrs) /* committed */ \ |
| 141 | +sizeof(((LEAFENTRY)NULL)->u.mvcc.num_pxrs) /* provisional */ \ |
| 142 | +sizeof(TXNID) /* transaction */ \ |
| 143 | +sizeof(uint32_t) /* length+bit */ \ |
| 144 | +sizeof(uint32_t)) /* length+bit */ |
| 145 | |
| 146 | #define LE_MVCC_COMMITTED_MEMSIZE(_vallen) \ |
| 147 | (LE_MVCC_COMMITTED_HEADER_MEMSIZE \ |
| 148 | +(_vallen)) /* actual val */ |
| 149 | |
| 150 | |
| 151 | typedef struct leafentry *LEAFENTRY; |
| 152 | typedef struct leafentry_13 *LEAFENTRY_13; |
| 153 | |
| 154 | // |
| 155 | // TODO: consistency among names is very poor. |
| 156 | // |
| 157 | |
| 158 | // TODO: rename this helper function for deserialization |
| 159 | size_t leafentry_rest_memsize(uint32_t num_puxrs, uint32_t num_cuxrs, uint8_t* start); |
| 160 | size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory. |
| 161 | size_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk. |
| 162 | void wbuf_nocrc_LEAFENTRY(struct wbuf *w, LEAFENTRY le); |
| 163 | int print_klpair (FILE *outf, const void* key, uint32_t keylen, LEAFENTRY v); // Print a leafentry out in human-readable form. |
| 164 | |
| 165 | int le_latest_is_del(LEAFENTRY le); // Return true if it is a provisional delete. |
| 166 | int le_val_is_del(LEAFENTRY le, enum cursor_read_type read_type, TOKUTXN txn); // Returns true if the value that is to be read is empty |
| 167 | bool le_is_clean(LEAFENTRY le); //Return how many xids exist (0 does not count) |
| 168 | bool le_has_xids(LEAFENTRY le, XIDS xids); // Return true transaction represented by xids is still provisional in this leafentry (le's xid stack is a superset or equal to xids) |
| 169 | void* le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes) |
| 170 | uint32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen. Returns 0 for provisional deletes. |
| 171 | void* le_latest_val_and_len (LEAFENTRY le, uint32_t *len); |
| 172 | |
| 173 | uint64_t le_outermost_uncommitted_xid (LEAFENTRY le); |
| 174 | |
| 175 | //Callback contract: |
| 176 | // Function checks to see if id is accepted by context. |
| 177 | // Returns: |
| 178 | // 0: context ignores this entry, id. |
| 179 | // TOKUDB_ACCEPT: context accepts id |
| 180 | // r|r!=0&&r!=TOKUDB_ACCEPT: Quit early, return r, because something unexpected went wrong (error case) |
| 181 | typedef int(*LE_ITERATE_CALLBACK)(TXNID id, TOKUTXN context, bool is_provisional); |
| 182 | |
| 183 | int le_iterate_val( |
| 184 | LEAFENTRY le, |
| 185 | LE_ITERATE_CALLBACK f, |
| 186 | void** valpp, |
| 187 | uint32_t* vallenp, |
| 188 | TOKUTXN context); |
| 189 | |
| 190 | void ( |
| 191 | LEAFENTRY le, |
| 192 | // should we return the entire leafentry as the val? |
| 193 | bool is_leaf_mode, |
| 194 | enum cursor_read_type read_type, |
| 195 | TOKUTXN ttxn, |
| 196 | uint32_t* vallen, |
| 197 | void** val); |
| 198 | |
| 199 | size_t leafentry_disksize_13(LEAFENTRY_13 le); |
| 200 | |
| 201 | int toku_le_upgrade_13_14( |
| 202 | // NULL if there was no stored data. |
| 203 | LEAFENTRY_13 old_leafentry, |
| 204 | void** keyp, |
| 205 | uint32_t* keylen, |
| 206 | size_t* new_leafentry_memorysize, |
| 207 | LEAFENTRY *new_leafentry_p); |
| 208 | |
| 209 | class bn_data; |
| 210 | |
| 211 | int64_t toku_le_apply_msg( |
| 212 | const ft_msg &msg, |
| 213 | // NULL if there was no stored data. |
| 214 | LEAFENTRY old_leafentry, |
| 215 | // bn_data storing leafentry, if NULL, means there is no bn_data |
| 216 | bn_data* data_buffer, |
| 217 | // index in data_buffer where leafentry is stored (and should be replaced |
| 218 | uint32_t idx, |
| 219 | uint32_t old_keylen, |
| 220 | txn_gc_info* gc_info, |
| 221 | LEAFENTRY *new_leafentry_p, |
| 222 | int64_t* numbytes_delta_p); |
| 223 | |
| 224 | bool toku_le_worth_running_garbage_collection( |
| 225 | LEAFENTRY le, |
| 226 | txn_gc_info* gc_info); |
| 227 | |
| 228 | void toku_le_garbage_collect( |
| 229 | LEAFENTRY old_leaf_entry, |
| 230 | bn_data* data_buffer, |
| 231 | uint32_t idx, |
| 232 | void* keyp, |
| 233 | uint32_t keylen, |
| 234 | txn_gc_info* gc_info, |
| 235 | LEAFENTRY* new_leaf_entry, |
| 236 | int64_t* numbytes_delta_p); |
| 237 | |