1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
39 | #include "ft/ft.h" |
40 | #include "ft/ft-internal.h" |
41 | #include "ft/msg.h" |
42 | #include "ft/serialize/block_allocator.h" |
43 | #include "ft/serialize/block_table.h" |
44 | #include "ft/serialize/compress.h" |
45 | #include "ft/serialize/ft-serialize.h" |
46 | |
47 | // not version-sensitive because we only serialize a descriptor using the current layout_version |
48 | uint32_t |
49 | toku_serialize_descriptor_size(DESCRIPTOR desc) { |
50 | //Checksum NOT included in this. Checksum only exists in header's version. |
51 | uint32_t size = 4; // four bytes for size of descriptor |
52 | size += desc->dbt.size; |
53 | return size; |
54 | } |
55 | |
56 | static uint32_t |
57 | deserialize_descriptor_size(DESCRIPTOR desc, int layout_version) { |
58 | //Checksum NOT included in this. Checksum only exists in header's version. |
59 | uint32_t size = 4; // four bytes for size of descriptor |
60 | if (layout_version == FT_LAYOUT_VERSION_13) |
61 | size += 4; // for version 13, include four bytes of "version" |
62 | size += desc->dbt.size; |
63 | return size; |
64 | } |
65 | |
66 | void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc) { |
67 | wbuf_bytes(wb, desc->dbt.data, desc->dbt.size); |
68 | } |
69 | |
70 | //Descriptor is written to disk during toku_ft_handle_open iff we have a new (or changed) |
71 | //descriptor. |
72 | //Descriptors are NOT written during the header checkpoint process. |
73 | void |
74 | toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset) { |
75 | // make the checksum |
76 | int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum |
77 | int64_t size_aligned = roundup_to_multiple(512, size); |
78 | struct wbuf w; |
79 | char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf); |
80 | for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0; |
81 | wbuf_init(&w, aligned_buf, size); |
82 | toku_serialize_descriptor_contents_to_wbuf(&w, desc); |
83 | { |
84 | //Add checksum |
85 | uint32_t checksum = toku_x1764_finish(&w.checksum); |
86 | wbuf_int(&w, checksum); |
87 | } |
88 | lazy_assert(w.ndone==w.size); |
89 | { |
90 | //Actual Write translation table |
91 | toku_os_full_pwrite(fd, w.buf, size_aligned, offset); |
92 | } |
93 | toku_free(w.buf); |
94 | } |
95 | |
96 | static void |
97 | deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) { |
98 | if (layout_version <= FT_LAYOUT_VERSION_13) { |
99 | // in older versions of tokuft, the descriptor had a 4 byte |
100 | // version, which we skip over |
101 | (void) rbuf_int(rb); |
102 | } |
103 | |
104 | uint32_t size; |
105 | const void *data; |
106 | rbuf_bytes(rb, &data, &size); |
107 | toku_memdup_dbt(&desc->dbt, data, size); |
108 | } |
109 | |
110 | static int |
111 | deserialize_descriptor_from(int fd, block_table *bt, DESCRIPTOR desc, int layout_version) { |
112 | int r = 0; |
113 | DISKOFF offset; |
114 | DISKOFF size; |
115 | unsigned char *dbuf = nullptr; |
116 | bt->get_descriptor_offset_size(&offset, &size); |
117 | memset(desc, 0, sizeof(*desc)); |
118 | if (size > 0) { |
119 | lazy_assert(size>=4); //4 for checksum |
120 | { |
121 | ssize_t size_to_malloc = roundup_to_multiple(512, size); |
122 | XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf); |
123 | { |
124 | |
125 | ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset); |
126 | lazy_assert(sz_read==size_to_malloc); |
127 | } |
128 | { |
129 | // check the checksum |
130 | uint32_t x1764 = toku_x1764_memory(dbuf, size-4); |
131 | //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk); |
132 | uint32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4)); |
133 | if (x1764 != stored_x1764) { |
134 | fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n" , x1764, stored_x1764); |
135 | r = TOKUDB_BAD_CHECKSUM; |
136 | toku_free(dbuf); |
137 | goto exit; |
138 | } |
139 | } |
140 | |
141 | struct rbuf rb = { .buf = dbuf, .size = (unsigned int) size, .ndone = 0 }; |
142 | deserialize_descriptor_from_rbuf(&rb, desc, layout_version); |
143 | lazy_assert(deserialize_descriptor_size(desc, layout_version) + 4 == size); |
144 | toku_free(dbuf); |
145 | } |
146 | } |
147 | exit: |
148 | return r; |
149 | } |
150 | |
151 | int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version) |
152 | // Effect: Deserialize the ft header. |
153 | // We deserialize ft_header only once and then share everything with all the FTs. |
154 | { |
155 | int r; |
156 | FT ft = NULL; |
157 | paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION); |
158 | paranoid_invariant(version <= FT_LAYOUT_VERSION); |
159 | // We already know: |
160 | // we have an rbuf representing the header. |
161 | // The checksum has been validated |
162 | |
163 | //Verification of initial elements. |
164 | //Check magic number |
165 | const void *magic; |
166 | rbuf_literal_bytes(rb, &magic, 8); |
167 | lazy_assert(memcmp(magic,"tokudata" ,8)==0); |
168 | |
169 | XCALLOC(ft); |
170 | ft->checkpoint_header = NULL; |
171 | toku_list_init(&ft->live_ft_handles); |
172 | |
173 | //version MUST be in network order on disk regardless of disk order |
174 | ft->layout_version_read_from_disk = rbuf_network_int(rb); |
175 | invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION); |
176 | invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION); |
177 | |
178 | //build_id MUST be in network order on disk regardless of disk order |
179 | uint32_t build_id; |
180 | build_id = rbuf_network_int(rb); |
181 | |
182 | //Size MUST be in network order regardless of disk order. |
183 | uint32_t size; |
184 | size = rbuf_network_int(rb); |
185 | lazy_assert(size == rb->size); |
186 | |
187 | const void *tmp_byte_order_check; |
188 | lazy_assert((sizeof tmp_byte_order_check) >= 8); |
189 | rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order |
190 | int64_t byte_order_stored; |
191 | byte_order_stored = *(int64_t*)tmp_byte_order_check; |
192 | lazy_assert(byte_order_stored == toku_byte_order_host); |
193 | |
194 | uint64_t checkpoint_count; |
195 | checkpoint_count = rbuf_ulonglong(rb); |
196 | LSN checkpoint_lsn; |
197 | checkpoint_lsn = rbuf_LSN(rb); |
198 | unsigned nodesize; |
199 | nodesize = rbuf_int(rb); |
200 | DISKOFF translation_address_on_disk; |
201 | translation_address_on_disk = rbuf_DISKOFF(rb); |
202 | DISKOFF translation_size_on_disk; |
203 | translation_size_on_disk = rbuf_DISKOFF(rb); |
204 | lazy_assert(translation_address_on_disk > 0); |
205 | lazy_assert(translation_size_on_disk > 0); |
206 | |
207 | // initialize the tree lock |
208 | toku_ft_init_reflock(ft); |
209 | |
210 | //Load translation table |
211 | { |
212 | size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk); |
213 | unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf); |
214 | { |
215 | // This cast is messed up in 32-bits if the block translation |
216 | // table is ever more than 4GB. But in that case, the |
217 | // translation table itself won't fit in main memory. |
218 | ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read, |
219 | translation_address_on_disk); |
220 | invariant(readsz >= translation_size_on_disk); |
221 | invariant(readsz <= (ssize_t)size_to_read); |
222 | } |
223 | // Create table and read in data. |
224 | r = ft->blocktable.create_from_buffer(fd, |
225 | translation_address_on_disk, |
226 | translation_size_on_disk, |
227 | tbuf); |
228 | toku_free(tbuf); |
229 | if (r != 0) { |
230 | goto exit; |
231 | } |
232 | } |
233 | |
234 | BLOCKNUM root_blocknum; |
235 | root_blocknum = rbuf_blocknum(rb); |
236 | unsigned flags; |
237 | flags = rbuf_int(rb); |
238 | if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) { |
239 | // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag |
240 | flags &= ~TOKU_DB_VALCMP_BUILTIN_13; |
241 | } |
242 | int layout_version_original; |
243 | layout_version_original = rbuf_int(rb); |
244 | uint32_t build_id_original; |
245 | build_id_original = rbuf_int(rb); |
246 | uint64_t time_of_creation; |
247 | time_of_creation = rbuf_ulonglong(rb); |
248 | uint64_t time_of_last_modification; |
249 | time_of_last_modification = rbuf_ulonglong(rb); |
250 | |
251 | if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) { |
252 | // 17 was the last version with these fields, we no longer store |
253 | // them, so read and discard them |
254 | (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13 |
255 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { |
256 | (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14 |
257 | } |
258 | } |
259 | |
260 | // fake creation during the last checkpoint |
261 | TXNID root_xid_that_created; |
262 | root_xid_that_created = checkpoint_lsn.lsn; |
263 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) { |
264 | rbuf_TXNID(rb, &root_xid_that_created); |
265 | } |
266 | |
267 | // TODO(leif): get this to default to what's specified, not the |
268 | // hard-coded default |
269 | unsigned basementnodesize; |
270 | basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE; |
271 | uint64_t time_of_last_verification; |
272 | time_of_last_verification = 0; |
273 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { |
274 | basementnodesize = rbuf_int(rb); |
275 | time_of_last_verification = rbuf_ulonglong(rb); |
276 | } |
277 | |
278 | STAT64INFO_S on_disk_stats; |
279 | on_disk_stats = ZEROSTATS; |
280 | uint64_t time_of_last_optimize_begin; |
281 | time_of_last_optimize_begin = 0; |
282 | uint64_t time_of_last_optimize_end; |
283 | time_of_last_optimize_end = 0; |
284 | uint32_t count_of_optimize_in_progress; |
285 | count_of_optimize_in_progress = 0; |
286 | MSN msn_at_start_of_last_completed_optimize; |
287 | msn_at_start_of_last_completed_optimize = ZERO_MSN; |
288 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) { |
289 | on_disk_stats.numrows = rbuf_ulonglong(rb); |
290 | on_disk_stats.numbytes = rbuf_ulonglong(rb); |
291 | ft->in_memory_stats = on_disk_stats; |
292 | time_of_last_optimize_begin = rbuf_ulonglong(rb); |
293 | time_of_last_optimize_end = rbuf_ulonglong(rb); |
294 | count_of_optimize_in_progress = rbuf_int(rb); |
295 | msn_at_start_of_last_completed_optimize = rbuf_MSN(rb); |
296 | } |
297 | |
298 | enum toku_compression_method compression_method; |
299 | MSN highest_unused_msn_for_upgrade; |
300 | highest_unused_msn_for_upgrade.msn = (MIN_MSN.msn - 1); |
301 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) { |
302 | unsigned char method = rbuf_char(rb); |
303 | compression_method = (enum toku_compression_method) method; |
304 | highest_unused_msn_for_upgrade = rbuf_MSN(rb); |
305 | } else { |
306 | // we hard coded zlib until 5.2, then quicklz in 5.2 |
307 | if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { |
308 | compression_method = TOKU_ZLIB_METHOD; |
309 | } else { |
310 | compression_method = TOKU_QUICKLZ_METHOD; |
311 | } |
312 | } |
313 | |
314 | MSN max_msn_in_ft; |
315 | max_msn_in_ft = ZERO_MSN; // We'll upgrade it from the root node later if necessary |
316 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) { |
317 | max_msn_in_ft = rbuf_MSN(rb); |
318 | } |
319 | |
320 | unsigned fanout; |
321 | fanout = FT_DEFAULT_FANOUT; |
322 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_28) { |
323 | fanout = rbuf_int(rb); |
324 | } |
325 | |
326 | uint64_t on_disk_logical_rows; |
327 | on_disk_logical_rows = (uint64_t)-1; |
328 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_29) { |
329 | on_disk_logical_rows = rbuf_ulonglong(rb); |
330 | } |
331 | ft->in_memory_logical_rows = on_disk_logical_rows; |
332 | |
333 | (void) rbuf_int(rb); //Read in checksum and ignore (already verified). |
334 | if (rb->ndone != rb->size) { |
335 | fprintf(stderr, "Header size did not match contents.\n" ); |
336 | r = EINVAL; |
337 | goto exit; |
338 | } |
339 | |
340 | { |
341 | struct ft_header h = { |
342 | .type = FT_CURRENT, |
343 | .dirty = 0, |
344 | .checkpoint_count = checkpoint_count, |
345 | .checkpoint_lsn = checkpoint_lsn, |
346 | .layout_version = FT_LAYOUT_VERSION, |
347 | .layout_version_original = layout_version_original, |
348 | .build_id = build_id, |
349 | .build_id_original = build_id_original, |
350 | .time_of_creation = time_of_creation, |
351 | .root_xid_that_created = root_xid_that_created, |
352 | .time_of_last_modification = time_of_last_modification, |
353 | .time_of_last_verification = time_of_last_verification, |
354 | .root_blocknum = root_blocknum, |
355 | .flags = flags, |
356 | .nodesize = nodesize, |
357 | .basementnodesize = basementnodesize, |
358 | .compression_method = compression_method, |
359 | .fanout = fanout, |
360 | .highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade, |
361 | .max_msn_in_ft = max_msn_in_ft, |
362 | .time_of_last_optimize_begin = time_of_last_optimize_begin, |
363 | .time_of_last_optimize_end = time_of_last_optimize_end, |
364 | .count_of_optimize_in_progress = count_of_optimize_in_progress, |
365 | .count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress, |
366 | .msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize, |
367 | .on_disk_stats = on_disk_stats, |
368 | .on_disk_logical_rows = on_disk_logical_rows |
369 | }; |
370 | XMEMDUP(ft->h, &h); |
371 | } |
372 | |
373 | if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { |
374 | // This needs ft->h to be non-null, so we have to do it after we |
375 | // read everything else. |
376 | r = toku_upgrade_subtree_estimates_to_stat64info(fd, ft); |
377 | if (r != 0) { |
378 | goto exit; |
379 | } |
380 | } |
381 | if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) { |
382 | r = toku_upgrade_msn_from_root_to_header(fd, ft); |
383 | if (r != 0) { |
384 | goto exit; |
385 | } |
386 | } |
387 | |
388 | invariant((uint32_t) ft->layout_version_read_from_disk == version); |
389 | r = deserialize_descriptor_from(fd, &ft->blocktable, &ft->descriptor, version); |
390 | if (r != 0) { |
391 | goto exit; |
392 | } |
393 | |
394 | // initialize for svn #4541 |
395 | toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt); |
396 | |
397 | // Version 13 descriptors had an extra 4 bytes that we don't read |
398 | // anymore. Since the header is going to think it's the current |
399 | // version if it gets written out, we need to write the descriptor in |
400 | // the new format (without those bytes) before that happens. |
401 | if (version <= FT_LAYOUT_VERSION_13) { |
402 | toku_ft_update_descriptor_with_fd(ft, &ft->cmp_descriptor, fd); |
403 | } |
404 | r = 0; |
405 | exit: |
406 | if (r != 0 && ft != NULL) { |
407 | toku_free(ft); |
408 | ft = NULL; |
409 | } |
410 | *ftp = ft; |
411 | return r; |
412 | } |
413 | |
414 | static size_t serialize_ft_min_size(uint32_t version) { |
415 | size_t size = 0; |
416 | |
417 | switch (version) { |
418 | case FT_LAYOUT_VERSION_29: |
419 | size += sizeof(uint64_t); // logrows in ft |
420 | // fallthrough |
421 | case FT_LAYOUT_VERSION_28: |
422 | size += sizeof(uint32_t); // fanout in ft |
423 | // fallthrough |
424 | case FT_LAYOUT_VERSION_27: |
425 | case FT_LAYOUT_VERSION_26: |
426 | case FT_LAYOUT_VERSION_25: |
427 | case FT_LAYOUT_VERSION_24: |
428 | case FT_LAYOUT_VERSION_23: |
429 | case FT_LAYOUT_VERSION_22: |
430 | case FT_LAYOUT_VERSION_21: |
431 | size += sizeof(MSN); // max_msn_in_ft |
432 | // fallthrough |
433 | case FT_LAYOUT_VERSION_20: |
434 | case FT_LAYOUT_VERSION_19: |
435 | size += 1; // compression method |
436 | size += sizeof(MSN); // highest_unused_msn_for_upgrade |
437 | // fallthrough |
438 | case FT_LAYOUT_VERSION_18: |
439 | size += sizeof(uint64_t); // time_of_last_optimize_begin |
440 | size += sizeof(uint64_t); // time_of_last_optimize_end |
441 | size += sizeof(uint32_t); // count_of_optimize_in_progress |
442 | size += sizeof(MSN); // msn_at_start_of_last_completed_optimize |
443 | size -= 8; // removed num_blocks_to_upgrade_14 |
444 | size -= 8; // removed num_blocks_to_upgrade_13 |
445 | // fallthrough |
446 | case FT_LAYOUT_VERSION_17: |
447 | size += 16; |
448 | invariant(sizeof(STAT64INFO_S) == 16); |
449 | // fallthrough |
450 | case FT_LAYOUT_VERSION_16: |
451 | case FT_LAYOUT_VERSION_15: |
452 | size += 4; // basement node size |
453 | size += 8; // num_blocks_to_upgrade_14 (previously |
454 | // num_blocks_to_upgrade, now one int each for upgrade |
455 | // from 13, 14 |
456 | size += 8; // time of last verification |
457 | // fallthrough |
458 | case FT_LAYOUT_VERSION_14: |
459 | size += 8; // TXNID that created |
460 | // fallthrough |
461 | case FT_LAYOUT_VERSION_13: |
462 | size += (4 // build_id |
463 | + |
464 | 4 // build_id_original |
465 | + |
466 | 8 // time_of_creation |
467 | + |
468 | 8 // time_of_last_modification |
469 | ); |
470 | // fallthrough |
471 | case FT_LAYOUT_VERSION_12: |
472 | size += (+8 // "tokudata" |
473 | + |
474 | 4 // version |
475 | + |
476 | 4 // original_version |
477 | + |
478 | 4 // size |
479 | + |
480 | 8 // byte order verification |
481 | + |
482 | 8 // checkpoint_count |
483 | + |
484 | 8 // checkpoint_lsn |
485 | + |
486 | 4 // tree's nodesize |
487 | + |
488 | 8 // translation_size_on_disk |
489 | + |
490 | 8 // translation_address_on_disk |
491 | + |
492 | 4 // checksum |
493 | + |
494 | 8 // Number of blocks in old version. |
495 | + |
496 | 8 // diskoff |
497 | + |
498 | 4 // flags |
499 | ); |
500 | break; |
501 | default: |
502 | abort(); |
503 | } |
504 | |
505 | lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); |
506 | return size; |
507 | } |
508 | |
509 | int deserialize_ft_from_fd_into_rbuf(int fd, |
510 | toku_off_t , |
511 | struct rbuf *rb, |
512 | uint64_t *checkpoint_count, |
513 | LSN *checkpoint_lsn, |
514 | uint32_t *version_p) |
515 | // Effect: Read and parse the header of a fractalal tree |
516 | // |
517 | // Simply reading the raw bytes of the header into an rbuf is insensitive |
518 | // to disk format version. If that ever changes, then modify this. |
519 | // |
520 | // TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the |
521 | // file AND the header is useless |
522 | { |
523 | int r = 0; |
524 | const int64_t prefix_size = 8 + // magic ("tokudata") |
525 | 4 + // version |
526 | 4 + // build_id |
527 | 4; // size |
528 | const int64_t read_size = roundup_to_multiple(512, prefix_size); |
529 | unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix); |
530 | rb->buf = NULL; |
531 | int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header); |
532 | if (n != read_size) { |
533 | if (n == 0) { |
534 | r = TOKUDB_DICTIONARY_NO_HEADER; |
535 | } else if (n < 0) { |
536 | r = get_error_errno(); |
537 | } else { |
538 | r = EINVAL; |
539 | } |
540 | toku_free(prefix); |
541 | goto exit; |
542 | } |
543 | |
544 | rbuf_init(rb, prefix, prefix_size); |
545 | |
546 | // Check magic number |
547 | const void *magic; |
548 | rbuf_literal_bytes(rb, &magic, 8); |
549 | if (memcmp(magic, "tokudata" , 8) != 0) { |
550 | if ((*(uint64_t *)magic) == 0) { |
551 | r = TOKUDB_DICTIONARY_NO_HEADER; |
552 | } else { |
553 | r = EINVAL; // Not a tokudb file! Do not use. |
554 | } |
555 | goto exit; |
556 | } |
557 | |
558 | // Version MUST be in network order regardless of disk order. |
559 | uint32_t version; |
560 | version = rbuf_network_int(rb); |
561 | *version_p = version; |
562 | if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) { |
563 | r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use |
564 | goto exit; |
565 | } else if (version > FT_LAYOUT_VERSION) { |
566 | r = TOKUDB_DICTIONARY_TOO_NEW; // Cannot use |
567 | goto exit; |
568 | } |
569 | |
570 | // build_id MUST be in network order regardless of disk order. |
571 | uint32_t build_id __attribute__((__unused__)); |
572 | build_id = rbuf_network_int(rb); |
573 | int64_t ; |
574 | min_header_size = serialize_ft_min_size(version); |
575 | |
576 | // Size MUST be in network order regardless of disk order. |
577 | uint32_t size; |
578 | size = rbuf_network_int(rb); |
579 | // If too big, it is corrupt. We would probably notice during checksum |
580 | // but may have to do a multi-gigabyte malloc+read to find out. |
581 | // If its too small reading rbuf would crash, so verify. |
582 | if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE || |
583 | size < min_header_size) { |
584 | r = TOKUDB_DICTIONARY_NO_HEADER; |
585 | goto exit; |
586 | } |
587 | |
588 | lazy_assert(rb->ndone == prefix_size); |
589 | rb->size = size; |
590 | { |
591 | toku_free(rb->buf); |
592 | uint32_t size_to_read = roundup_to_multiple(512, size); |
593 | XMALLOC_N_ALIGNED(512, size_to_read, rb->buf); |
594 | |
595 | invariant(offset_of_header % 512 == 0); |
596 | n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header); |
597 | if (n != size_to_read) { |
598 | if (n < 0) { |
599 | r = get_error_errno(); |
600 | } else { |
601 | r = EINVAL; // Header might be useless (wrong size) or could be |
602 | // a disk read error. |
603 | } |
604 | goto exit; |
605 | } |
606 | } |
607 | // It's version 14 or later. Magic looks OK. |
608 | // We have an rbuf that represents the header. |
609 | // Size is within acceptable bounds. |
610 | |
611 | // Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function |
612 | // changed) |
613 | uint32_t calculated_x1764; |
614 | calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4); |
615 | uint32_t stored_x1764; |
616 | stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4)); |
617 | if (calculated_x1764 != stored_x1764) { |
618 | r = TOKUDB_BAD_CHECKSUM; // Header useless |
619 | fprintf(stderr, |
620 | "Header checksum failure: calc=0x%08x read=0x%08x\n" , |
621 | calculated_x1764, |
622 | stored_x1764); |
623 | goto exit; |
624 | } |
625 | |
626 | // Verify byte order |
627 | const void *tmp_byte_order_check; |
628 | lazy_assert((sizeof toku_byte_order_host) == 8); |
629 | rbuf_literal_bytes( |
630 | rb, &tmp_byte_order_check, 8); // Must not translate byte order |
631 | int64_t byte_order_stored; |
632 | byte_order_stored = *(int64_t *)tmp_byte_order_check; |
633 | if (byte_order_stored != toku_byte_order_host) { |
634 | r = TOKUDB_DICTIONARY_NO_HEADER; // Cannot use dictionary |
635 | goto exit; |
636 | } |
637 | |
638 | // Load checkpoint count |
639 | *checkpoint_count = rbuf_ulonglong(rb); |
640 | *checkpoint_lsn = rbuf_LSN(rb); |
641 | // Restart at beginning during regular deserialization |
642 | rb->ndone = 0; |
643 | |
644 | exit: |
645 | if (r != 0 && rb->buf != NULL) { |
646 | toku_free(rb->buf); |
647 | rb->buf = NULL; |
648 | } |
649 | return r; |
650 | } |
651 | |
652 | // Read ft from file into struct. Read both headers and use one. |
653 | // We want the latest acceptable header whose checkpoint_lsn is no later |
654 | // than max_acceptable_lsn. |
655 | #define dump_state_of_toku_deserialize_ft_from() \ |
656 | fprintf(stderr, \ |
657 | "%s:%d toku_deserialize_ft_from: " \ |
658 | "filename[%s] " \ |
659 | "r[%d] max_acceptable_lsn[%lu]" \ |
660 | "r0[%d] checkpoint_lsn_0[%lu] checkpoint_count_0[%lu] " \ |
661 | "r1[%d] checkpoint_lsn_1[%lu] checkpoint_count_1[%lu]\n", \ |
662 | __FILE__, \ |
663 | __LINE__, \ |
664 | fn, \ |
665 | r, \ |
666 | max_acceptable_lsn.lsn, \ |
667 | r0, \ |
668 | checkpoint_lsn_0.lsn, \ |
669 | checkpoint_count_0, \ |
670 | r1, \ |
671 | checkpoint_lsn_1.lsn, \ |
672 | checkpoint_count_1); |
673 | |
674 | int toku_deserialize_ft_from(int fd, |
675 | const char *fn, |
676 | LSN max_acceptable_lsn, |
677 | FT *ft) { |
678 | struct rbuf rb_0; |
679 | struct rbuf rb_1; |
680 | uint64_t checkpoint_count_0 = 0; |
681 | uint64_t checkpoint_count_1 = 0; |
682 | LSN checkpoint_lsn_0; |
683 | LSN checkpoint_lsn_1; |
684 | uint32_t version_0 = 0, version_1 = 0, version = 0; |
685 | bool h0_acceptable = false; |
686 | bool h1_acceptable = false; |
687 | struct rbuf *rb = NULL; |
688 | int r0, r1, r = 0; |
689 | |
690 | toku_off_t = 0; |
691 | r0 = deserialize_ft_from_fd_into_rbuf(fd, |
692 | header_0_off, |
693 | &rb_0, |
694 | &checkpoint_count_0, |
695 | &checkpoint_lsn_0, |
696 | &version_0); |
697 | if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) { |
698 | h0_acceptable = true; |
699 | } |
700 | |
701 | toku_off_t = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; |
702 | r1 = deserialize_ft_from_fd_into_rbuf(fd, |
703 | header_1_off, |
704 | &rb_1, |
705 | &checkpoint_count_1, |
706 | &checkpoint_lsn_1, |
707 | &version_1); |
708 | if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) { |
709 | h1_acceptable = true; |
710 | } |
711 | |
712 | // if either header is too new, the dictionary is unreadable |
713 | if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW || |
714 | !(h0_acceptable || h1_acceptable)) { |
715 | // We were unable to read either header or at least one is too |
716 | // new. Certain errors are higher priority than others. Order of |
717 | // these if/else if is important. |
718 | if (r0 == TOKUDB_DICTIONARY_TOO_NEW || |
719 | r1 == TOKUDB_DICTIONARY_TOO_NEW) { |
720 | r = TOKUDB_DICTIONARY_TOO_NEW; |
721 | } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || |
722 | r1 == TOKUDB_DICTIONARY_TOO_OLD) { |
723 | r = TOKUDB_DICTIONARY_TOO_OLD; |
724 | } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) { |
725 | fprintf(stderr, "Both header checksums failed.\n" ); |
726 | r = TOKUDB_BAD_CHECKSUM; |
727 | } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || |
728 | r1 == TOKUDB_DICTIONARY_NO_HEADER) { |
729 | r = TOKUDB_DICTIONARY_NO_HEADER; |
730 | } else { |
731 | r = r0 ? r0 : r1; // Arbitrarily report the error from the |
732 | // first header, unless it's readable |
733 | } |
734 | |
735 | if (r != TOKUDB_DICTIONARY_NO_HEADER) { |
736 | dump_state_of_toku_deserialize_ft_from(); |
737 | } |
738 | |
739 | // it should not be possible for both headers to be later than the |
740 | // max_acceptable_lsn |
741 | invariant( |
742 | !((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) && |
743 | (r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn))); |
744 | invariant(r != 0); |
745 | goto exit; |
746 | } |
747 | |
748 | if (h0_acceptable && h1_acceptable) { |
749 | if (checkpoint_count_0 > checkpoint_count_1) { |
750 | if (!(checkpoint_count_0 == checkpoint_count_1 + 1) || |
751 | !(version_0 >= version_1)) { |
752 | dump_state_of_toku_deserialize_ft_from(); |
753 | } |
754 | invariant(checkpoint_count_0 == checkpoint_count_1 + 1); |
755 | invariant(version_0 >= version_1); |
756 | rb = &rb_0; |
757 | version = version_0; |
758 | } else { |
759 | if (!(checkpoint_count_1 == checkpoint_count_0 + 1) || |
760 | !(version_1 >= version_0)) { |
761 | dump_state_of_toku_deserialize_ft_from(); |
762 | } |
763 | invariant(checkpoint_count_1 == checkpoint_count_0 + 1); |
764 | invariant(version_1 >= version_0); |
765 | rb = &rb_1; |
766 | version = version_1; |
767 | } |
768 | } else if (h0_acceptable) { |
769 | if (r1 == TOKUDB_BAD_CHECKSUM) { |
770 | // print something reassuring |
771 | fprintf( |
772 | stderr, |
773 | "Header 2 checksum failed, but header 1 ok. Proceeding.\n" ); |
774 | dump_state_of_toku_deserialize_ft_from(); |
775 | } |
776 | rb = &rb_0; |
777 | version = version_0; |
778 | } else if (h1_acceptable) { |
779 | if (r0 == TOKUDB_BAD_CHECKSUM) { |
780 | // print something reassuring |
781 | fprintf( |
782 | stderr, |
783 | "Header 1 checksum failed, but header 2 ok. Proceeding.\n" ); |
784 | dump_state_of_toku_deserialize_ft_from(); |
785 | } |
786 | rb = &rb_1; |
787 | version = version_1; |
788 | } |
789 | |
790 | if (!rb) { |
791 | dump_state_of_toku_deserialize_ft_from(); |
792 | } |
793 | paranoid_invariant(rb); |
794 | r = deserialize_ft_versioned(fd, rb, ft, version); |
795 | |
796 | exit: |
797 | if (rb_0.buf) { |
798 | toku_free(rb_0.buf); |
799 | } |
800 | if (rb_1.buf) { |
801 | toku_free(rb_1.buf); |
802 | } |
803 | return r; |
804 | } |
805 | |
806 | size_t (FT_HEADER h) { |
807 | size_t size = serialize_ft_min_size(h->layout_version); |
808 | // There is no dynamic data. |
809 | lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); |
810 | return size; |
811 | } |
812 | |
813 | void ( |
814 | struct wbuf *wbuf, |
815 | FT_HEADER h, |
816 | DISKOFF translation_location_on_disk, |
817 | DISKOFF translation_size_on_disk |
818 | ) |
819 | { |
820 | wbuf_literal_bytes(wbuf, "tokudata" , 8); |
821 | wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order |
822 | wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order |
823 | wbuf_network_int (wbuf, wbuf->size); //MUST be in network order regardless of disk order |
824 | wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order |
825 | wbuf_ulonglong(wbuf, h->checkpoint_count); |
826 | wbuf_LSN (wbuf, h->checkpoint_lsn); |
827 | wbuf_int (wbuf, h->nodesize); |
828 | |
829 | wbuf_DISKOFF(wbuf, translation_location_on_disk); |
830 | wbuf_DISKOFF(wbuf, translation_size_on_disk); |
831 | wbuf_BLOCKNUM(wbuf, h->root_blocknum); |
832 | wbuf_int(wbuf, h->flags); |
833 | wbuf_int(wbuf, h->layout_version_original); |
834 | wbuf_int(wbuf, h->build_id_original); |
835 | wbuf_ulonglong(wbuf, h->time_of_creation); |
836 | wbuf_ulonglong(wbuf, h->time_of_last_modification); |
837 | wbuf_TXNID(wbuf, h->root_xid_that_created); |
838 | wbuf_int(wbuf, h->basementnodesize); |
839 | wbuf_ulonglong(wbuf, h->time_of_last_verification); |
840 | wbuf_ulonglong(wbuf, h->on_disk_stats.numrows); |
841 | wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes); |
842 | wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin); |
843 | wbuf_ulonglong(wbuf, h->time_of_last_optimize_end); |
844 | wbuf_int(wbuf, h->count_of_optimize_in_progress); |
845 | wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize); |
846 | wbuf_char(wbuf, (unsigned char) h->compression_method); |
847 | wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade); |
848 | wbuf_MSN(wbuf, h->max_msn_in_ft); |
849 | wbuf_int(wbuf, h->fanout); |
850 | wbuf_ulonglong(wbuf, h->on_disk_logical_rows); |
851 | uint32_t checksum = toku_x1764_finish(&wbuf->checksum); |
852 | wbuf_int(wbuf, checksum); |
853 | lazy_assert(wbuf->ndone == wbuf->size); |
854 | } |
855 | |
856 | void (int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) { |
857 | lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS); |
858 | struct wbuf w_translation; |
859 | int64_t size_translation; |
860 | int64_t address_translation; |
861 | |
862 | // Must serialize translation first, to get address,size for header. |
863 | bt->serialize_translation_to_wbuf( |
864 | fd, &w_translation, &address_translation, &size_translation); |
865 | invariant(size_translation == w_translation.ndone); |
866 | |
867 | // the number of bytes available in the buffer is 0 mod 512, and those last |
868 | // bytes are all initialized. |
869 | invariant(w_translation.size % 512 == 0); |
870 | |
871 | struct wbuf w_main; |
872 | size_t size_main = toku_serialize_ft_size(h); |
873 | size_t size_main_aligned = roundup_to_multiple(512, size_main); |
874 | invariant(size_main_aligned < |
875 | BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); |
876 | char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf); |
877 | for (size_t i = size_main; i < size_main_aligned; i++) |
878 | mainbuf[i] = 0; // initialize the end of the buffer with zeros |
879 | wbuf_init(&w_main, mainbuf, size_main); |
880 | toku_serialize_ft_to_wbuf( |
881 | &w_main, h, address_translation, size_translation); |
882 | lazy_assert(w_main.ndone == size_main); |
883 | |
884 | // Actually write translation table |
885 | // This write is guaranteed to read good data at the end of the buffer, |
886 | // since the |
887 | // w_translation.buf is padded with zeros to a 512-byte boundary. |
888 | toku_os_full_pwrite(fd, |
889 | w_translation.buf, |
890 | roundup_to_multiple(512, size_translation), |
891 | address_translation); |
892 | |
893 | // Everything but the header MUST be on disk before header starts. |
894 | // Otherwise we will think the header is good and some blocks might not |
895 | // yet be on disk. |
896 | // If the header has a cachefile we need to do cachefile fsync (to |
897 | // prevent crash if we redirected to dev null) |
898 | // If there is no cachefile we still need to do an fsync. |
899 | if (cf) { |
900 | toku_cachefile_fsync(cf); |
901 | } else { |
902 | toku_file_fsync(fd); |
903 | } |
904 | |
905 | // Alternate writing header to two locations: |
906 | // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE |
907 | toku_off_t main_offset; |
908 | main_offset = (h->checkpoint_count & 0x1) |
909 | ? 0 |
910 | : BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; |
911 | toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset); |
912 | toku_free(w_main.buf); |
913 | toku_free(w_translation.buf); |
914 | } |
915 | |