1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
39 | #pragma once |
40 | |
41 | #include <toku_portability.h> |
42 | |
43 | #include <util/mempool.h> |
44 | #include <util/omt.h> |
45 | |
46 | #include "ft/txn/txn_manager.h" |
47 | #include "ft/serialize/rbuf.h" |
48 | #include "ft/msg.h" |
49 | |
50 | /* |
51 | Memory format of packed leaf entry |
52 | CONSTANTS: |
53 | num_uxrs |
54 | keylen |
55 | Run-time-constants |
56 | voffset of val/vallen??? (for le_any_val) This must be small if it is interpreted as voffset = realoffset_of_val - keylen |
57 | GOOD performance optimization. |
58 | ALSO good for simplicity (no having to scan packed version) |
59 | key[] |
60 | variable length |
61 | |
62 | |
63 | Memory format of packed dup leaf entry |
64 | CONSTANTS: |
65 | num_uxrs |
66 | keylen |
67 | vallen |
68 | Run-time-constants |
69 | key[] |
70 | val[] |
71 | */ |
72 | |
73 | enum cursor_read_type { |
74 | C_READ_ANY = 0, |
75 | C_READ_SNAPSHOT = 1, |
76 | C_READ_COMMITTED = 2 |
77 | }; |
78 | |
79 | // |
80 | // enum of possible values for LEAFENTRY->type field |
81 | // LE_CLEAN means that there is a single committed value in a format that saves disk space |
82 | // LE_MVCC means that there may be multiple committed values or there are provisional values |
83 | // |
84 | enum { LE_CLEAN = 0, LE_MVCC = 1 }; |
85 | |
86 | // This is an on-disk format. static_asserts verify everything is packed and aligned correctly. |
87 | struct leafentry { |
88 | struct leafentry_clean { |
89 | uint32_t vallen; |
90 | uint8_t val[0]; //actual val |
91 | }; // For the case where LEAFENTRY->type is LE_CLEAN |
92 | static_assert(4 == sizeof(leafentry::leafentry_clean), "leafentry_clean size is wrong" ); |
93 | static_assert(4 == __builtin_offsetof(leafentry::leafentry_clean, val), "val is in the wrong place" ); |
94 | struct __attribute__ ((__packed__)) leafentry_mvcc { |
95 | uint32_t num_cxrs; // number of committed transaction records |
96 | uint8_t num_pxrs; // number of provisional transaction records |
97 | uint8_t xrs[0]; //then TXNIDs of XRs relevant for reads: |
98 | // if provisional XRs exist, store OUTERMOST TXNID |
99 | // store committed TXNIDs, from most recently committed to least recently committed (newest first) |
100 | //then lengths of XRs relevant for reads (length is at most 1<<31, MSB is 1 for insert, 0 for delete): |
101 | // if provisional XRs exist (num_pxrs>0), store length and insert/delete flag associated with INNERMOST TXNID |
102 | // store length and insert/delete flag associated with each committed TXNID, in same order as above (newest first) |
103 | //then data of XRs relevant for reads |
104 | // if provisional XRs exist (num_pxrs>0), store data associated with INNERMOST provisional TXNID |
105 | // store data associated with committed TXNIDs (all committed data, newest committed values first) |
106 | //if provisional XRs still exist (that is, num_puxrs > 1, so INNERMOST provisional TXNID != OUTERMOST provisional TXNID): |
107 | // for OUTERMOST provisional XR: |
108 | // 1 byte: store type (insert/delete/placeholder) |
109 | // 4 bytes: length (if type is INSERT, no length stored if placeholder or delete) |
110 | // data |
111 | // for rest of provisional stack (if num_pxrs > 2), from second-outermost to second-innermost (outermost is stored above, innermost is stored separately): |
112 | // 8 bytes: TXNID |
113 | // 1 byte: store type (insert/delete/placeholder) |
114 | // 4 bytes: length (if type is INSERT) |
115 | // data |
116 | // for INNERMOST provisional XR: |
117 | // 8 bytes: TXNID |
118 | // (innermost data and length with insert/delete flag are stored above, cannot be a placeholder) |
119 | }; // For the case where LEAFENTRY->type is LE_MVCC |
120 | static_assert(5 == sizeof(leafentry::leafentry_mvcc), "leafentry_mvcc size is wrong" ); |
121 | static_assert(5 == __builtin_offsetof(leafentry::leafentry_mvcc, xrs), "xrs is in the wrong place" ); |
122 | |
123 | uint8_t type; // type is LE_CLEAN or LE_MVCC |
124 | //uint32_t keylen; |
125 | union __attribute__ ((__packed__)) { |
126 | struct leafentry_clean clean; |
127 | struct leafentry_mvcc mvcc; |
128 | } u; |
129 | }; |
130 | static_assert(6 == sizeof(leafentry), "leafentry size is wrong" ); |
131 | static_assert(1 == __builtin_offsetof(leafentry, u), "union is in the wrong place" ); |
132 | |
133 | #define LE_CLEAN_MEMSIZE(_vallen) \ |
134 | (sizeof(((LEAFENTRY)NULL)->type) /* type */ \ |
135 | +sizeof(((LEAFENTRY)NULL)->u.clean.vallen) /* vallen */ \ |
136 | +(_vallen)) /* actual val */ |
137 | |
138 | #define \ |
139 | (sizeof(((LEAFENTRY)NULL)->type) /* type */ \ |
140 | +sizeof(((LEAFENTRY)NULL)->u.mvcc.num_cxrs) /* committed */ \ |
141 | +sizeof(((LEAFENTRY)NULL)->u.mvcc.num_pxrs) /* provisional */ \ |
142 | +sizeof(TXNID) /* transaction */ \ |
143 | +sizeof(uint32_t) /* length+bit */ \ |
144 | +sizeof(uint32_t)) /* length+bit */ |
145 | |
146 | #define LE_MVCC_COMMITTED_MEMSIZE(_vallen) \ |
147 | (LE_MVCC_COMMITTED_HEADER_MEMSIZE \ |
148 | +(_vallen)) /* actual val */ |
149 | |
150 | |
151 | typedef struct leafentry *LEAFENTRY; |
152 | typedef struct leafentry_13 *LEAFENTRY_13; |
153 | |
154 | // |
155 | // TODO: consistency among names is very poor. |
156 | // |
157 | |
158 | // TODO: rename this helper function for deserialization |
159 | size_t leafentry_rest_memsize(uint32_t num_puxrs, uint32_t num_cuxrs, uint8_t* start); |
160 | size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory. |
161 | size_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk. |
162 | void wbuf_nocrc_LEAFENTRY(struct wbuf *w, LEAFENTRY le); |
163 | int print_klpair (FILE *outf, const void* key, uint32_t keylen, LEAFENTRY v); // Print a leafentry out in human-readable form. |
164 | |
165 | int le_latest_is_del(LEAFENTRY le); // Return true if it is a provisional delete. |
166 | int le_val_is_del(LEAFENTRY le, enum cursor_read_type read_type, TOKUTXN txn); // Returns true if the value that is to be read is empty |
167 | bool le_is_clean(LEAFENTRY le); //Return how many xids exist (0 does not count) |
168 | bool le_has_xids(LEAFENTRY le, XIDS xids); // Return true transaction represented by xids is still provisional in this leafentry (le's xid stack is a superset or equal to xids) |
169 | void* le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes) |
170 | uint32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen. Returns 0 for provisional deletes. |
171 | void* le_latest_val_and_len (LEAFENTRY le, uint32_t *len); |
172 | |
173 | uint64_t le_outermost_uncommitted_xid (LEAFENTRY le); |
174 | |
175 | //Callback contract: |
176 | // Function checks to see if id is accepted by context. |
177 | // Returns: |
178 | // 0: context ignores this entry, id. |
179 | // TOKUDB_ACCEPT: context accepts id |
180 | // r|r!=0&&r!=TOKUDB_ACCEPT: Quit early, return r, because something unexpected went wrong (error case) |
181 | typedef int(*LE_ITERATE_CALLBACK)(TXNID id, TOKUTXN context, bool is_provisional); |
182 | |
183 | int le_iterate_val( |
184 | LEAFENTRY le, |
185 | LE_ITERATE_CALLBACK f, |
186 | void** valpp, |
187 | uint32_t* vallenp, |
188 | TOKUTXN context); |
189 | |
190 | void ( |
191 | LEAFENTRY le, |
192 | // should we return the entire leafentry as the val? |
193 | bool is_leaf_mode, |
194 | enum cursor_read_type read_type, |
195 | TOKUTXN ttxn, |
196 | uint32_t* vallen, |
197 | void** val); |
198 | |
199 | size_t leafentry_disksize_13(LEAFENTRY_13 le); |
200 | |
201 | int toku_le_upgrade_13_14( |
202 | // NULL if there was no stored data. |
203 | LEAFENTRY_13 old_leafentry, |
204 | void** keyp, |
205 | uint32_t* keylen, |
206 | size_t* new_leafentry_memorysize, |
207 | LEAFENTRY *new_leafentry_p); |
208 | |
209 | class bn_data; |
210 | |
211 | int64_t toku_le_apply_msg( |
212 | const ft_msg &msg, |
213 | // NULL if there was no stored data. |
214 | LEAFENTRY old_leafentry, |
215 | // bn_data storing leafentry, if NULL, means there is no bn_data |
216 | bn_data* data_buffer, |
217 | // index in data_buffer where leafentry is stored (and should be replaced |
218 | uint32_t idx, |
219 | uint32_t old_keylen, |
220 | txn_gc_info* gc_info, |
221 | LEAFENTRY *new_leafentry_p, |
222 | int64_t* numbytes_delta_p); |
223 | |
224 | bool toku_le_worth_running_garbage_collection( |
225 | LEAFENTRY le, |
226 | txn_gc_info* gc_info); |
227 | |
228 | void toku_le_garbage_collect( |
229 | LEAFENTRY old_leaf_entry, |
230 | bn_data* data_buffer, |
231 | uint32_t idx, |
232 | void* keyp, |
233 | uint32_t keylen, |
234 | txn_gc_info* gc_info, |
235 | LEAFENTRY* new_leaf_entry, |
236 | int64_t* numbytes_delta_p); |
237 | |