1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of PerconaFT.
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35======= */
36
37#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39#include "ft/ft.h"
40#include "ft/ft-internal.h"
41#include "ft/msg.h"
42#include "ft/serialize/block_allocator.h"
43#include "ft/serialize/block_table.h"
44#include "ft/serialize/compress.h"
45#include "ft/serialize/ft-serialize.h"
46
47// not version-sensitive because we only serialize a descriptor using the current layout_version
48uint32_t
49toku_serialize_descriptor_size(DESCRIPTOR desc) {
50 //Checksum NOT included in this. Checksum only exists in header's version.
51 uint32_t size = 4; // four bytes for size of descriptor
52 size += desc->dbt.size;
53 return size;
54}
55
56static uint32_t
57deserialize_descriptor_size(DESCRIPTOR desc, int layout_version) {
58 //Checksum NOT included in this. Checksum only exists in header's version.
59 uint32_t size = 4; // four bytes for size of descriptor
60 if (layout_version == FT_LAYOUT_VERSION_13)
61 size += 4; // for version 13, include four bytes of "version"
62 size += desc->dbt.size;
63 return size;
64}
65
66void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc) {
67 wbuf_bytes(wb, desc->dbt.data, desc->dbt.size);
68}
69
70//Descriptor is written to disk during toku_ft_handle_open iff we have a new (or changed)
71//descriptor.
72//Descriptors are NOT written during the header checkpoint process.
73void
74toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset) {
75 // make the checksum
76 int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum
77 int64_t size_aligned = roundup_to_multiple(512, size);
78 struct wbuf w;
79 char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf);
80 for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0;
81 wbuf_init(&w, aligned_buf, size);
82 toku_serialize_descriptor_contents_to_wbuf(&w, desc);
83 {
84 //Add checksum
85 uint32_t checksum = toku_x1764_finish(&w.checksum);
86 wbuf_int(&w, checksum);
87 }
88 lazy_assert(w.ndone==w.size);
89 {
90 //Actual Write translation table
91 toku_os_full_pwrite(fd, w.buf, size_aligned, offset);
92 }
93 toku_free(w.buf);
94}
95
96static void
97deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) {
98 if (layout_version <= FT_LAYOUT_VERSION_13) {
99 // in older versions of tokuft, the descriptor had a 4 byte
100 // version, which we skip over
101 (void) rbuf_int(rb);
102 }
103
104 uint32_t size;
105 const void *data;
106 rbuf_bytes(rb, &data, &size);
107 toku_memdup_dbt(&desc->dbt, data, size);
108}
109
110static int
111deserialize_descriptor_from(int fd, block_table *bt, DESCRIPTOR desc, int layout_version) {
112 int r = 0;
113 DISKOFF offset;
114 DISKOFF size;
115 unsigned char *dbuf = nullptr;
116 bt->get_descriptor_offset_size(&offset, &size);
117 memset(desc, 0, sizeof(*desc));
118 if (size > 0) {
119 lazy_assert(size>=4); //4 for checksum
120 {
121 ssize_t size_to_malloc = roundup_to_multiple(512, size);
122 XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf);
123 {
124
125 ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset);
126 lazy_assert(sz_read==size_to_malloc);
127 }
128 {
129 // check the checksum
130 uint32_t x1764 = toku_x1764_memory(dbuf, size-4);
131 //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
132 uint32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4));
133 if (x1764 != stored_x1764) {
134 fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
135 r = TOKUDB_BAD_CHECKSUM;
136 toku_free(dbuf);
137 goto exit;
138 }
139 }
140
141 struct rbuf rb = { .buf = dbuf, .size = (unsigned int) size, .ndone = 0 };
142 deserialize_descriptor_from_rbuf(&rb, desc, layout_version);
143 lazy_assert(deserialize_descriptor_size(desc, layout_version) + 4 == size);
144 toku_free(dbuf);
145 }
146 }
147exit:
148 return r;
149}
150
151int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
152// Effect: Deserialize the ft header.
153// We deserialize ft_header only once and then share everything with all the FTs.
154{
155 int r;
156 FT ft = NULL;
157 paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
158 paranoid_invariant(version <= FT_LAYOUT_VERSION);
159 // We already know:
160 // we have an rbuf representing the header.
161 // The checksum has been validated
162
163 //Verification of initial elements.
164 //Check magic number
165 const void *magic;
166 rbuf_literal_bytes(rb, &magic, 8);
167 lazy_assert(memcmp(magic,"tokudata",8)==0);
168
169 XCALLOC(ft);
170 ft->checkpoint_header = NULL;
171 toku_list_init(&ft->live_ft_handles);
172
173 //version MUST be in network order on disk regardless of disk order
174 ft->layout_version_read_from_disk = rbuf_network_int(rb);
175 invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
176 invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION);
177
178 //build_id MUST be in network order on disk regardless of disk order
179 uint32_t build_id;
180 build_id = rbuf_network_int(rb);
181
182 //Size MUST be in network order regardless of disk order.
183 uint32_t size;
184 size = rbuf_network_int(rb);
185 lazy_assert(size == rb->size);
186
187 const void *tmp_byte_order_check;
188 lazy_assert((sizeof tmp_byte_order_check) >= 8);
189 rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
190 int64_t byte_order_stored;
191 byte_order_stored = *(int64_t*)tmp_byte_order_check;
192 lazy_assert(byte_order_stored == toku_byte_order_host);
193
194 uint64_t checkpoint_count;
195 checkpoint_count = rbuf_ulonglong(rb);
196 LSN checkpoint_lsn;
197 checkpoint_lsn = rbuf_LSN(rb);
198 unsigned nodesize;
199 nodesize = rbuf_int(rb);
200 DISKOFF translation_address_on_disk;
201 translation_address_on_disk = rbuf_DISKOFF(rb);
202 DISKOFF translation_size_on_disk;
203 translation_size_on_disk = rbuf_DISKOFF(rb);
204 lazy_assert(translation_address_on_disk > 0);
205 lazy_assert(translation_size_on_disk > 0);
206
207 // initialize the tree lock
208 toku_ft_init_reflock(ft);
209
210 //Load translation table
211 {
212 size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk);
213 unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf);
214 {
215 // This cast is messed up in 32-bits if the block translation
216 // table is ever more than 4GB. But in that case, the
217 // translation table itself won't fit in main memory.
218 ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
219 translation_address_on_disk);
220 invariant(readsz >= translation_size_on_disk);
221 invariant(readsz <= (ssize_t)size_to_read);
222 }
223 // Create table and read in data.
224 r = ft->blocktable.create_from_buffer(fd,
225 translation_address_on_disk,
226 translation_size_on_disk,
227 tbuf);
228 toku_free(tbuf);
229 if (r != 0) {
230 goto exit;
231 }
232 }
233
234 BLOCKNUM root_blocknum;
235 root_blocknum = rbuf_blocknum(rb);
236 unsigned flags;
237 flags = rbuf_int(rb);
238 if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) {
239 // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag
240 flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
241 }
242 int layout_version_original;
243 layout_version_original = rbuf_int(rb);
244 uint32_t build_id_original;
245 build_id_original = rbuf_int(rb);
246 uint64_t time_of_creation;
247 time_of_creation = rbuf_ulonglong(rb);
248 uint64_t time_of_last_modification;
249 time_of_last_modification = rbuf_ulonglong(rb);
250
251 if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) {
252 // 17 was the last version with these fields, we no longer store
253 // them, so read and discard them
254 (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13
255 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
256 (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14
257 }
258 }
259
260 // fake creation during the last checkpoint
261 TXNID root_xid_that_created;
262 root_xid_that_created = checkpoint_lsn.lsn;
263 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) {
264 rbuf_TXNID(rb, &root_xid_that_created);
265 }
266
267 // TODO(leif): get this to default to what's specified, not the
268 // hard-coded default
269 unsigned basementnodesize;
270 basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE;
271 uint64_t time_of_last_verification;
272 time_of_last_verification = 0;
273 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
274 basementnodesize = rbuf_int(rb);
275 time_of_last_verification = rbuf_ulonglong(rb);
276 }
277
278 STAT64INFO_S on_disk_stats;
279 on_disk_stats = ZEROSTATS;
280 uint64_t time_of_last_optimize_begin;
281 time_of_last_optimize_begin = 0;
282 uint64_t time_of_last_optimize_end;
283 time_of_last_optimize_end = 0;
284 uint32_t count_of_optimize_in_progress;
285 count_of_optimize_in_progress = 0;
286 MSN msn_at_start_of_last_completed_optimize;
287 msn_at_start_of_last_completed_optimize = ZERO_MSN;
288 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) {
289 on_disk_stats.numrows = rbuf_ulonglong(rb);
290 on_disk_stats.numbytes = rbuf_ulonglong(rb);
291 ft->in_memory_stats = on_disk_stats;
292 time_of_last_optimize_begin = rbuf_ulonglong(rb);
293 time_of_last_optimize_end = rbuf_ulonglong(rb);
294 count_of_optimize_in_progress = rbuf_int(rb);
295 msn_at_start_of_last_completed_optimize = rbuf_MSN(rb);
296 }
297
298 enum toku_compression_method compression_method;
299 MSN highest_unused_msn_for_upgrade;
300 highest_unused_msn_for_upgrade.msn = (MIN_MSN.msn - 1);
301 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) {
302 unsigned char method = rbuf_char(rb);
303 compression_method = (enum toku_compression_method) method;
304 highest_unused_msn_for_upgrade = rbuf_MSN(rb);
305 } else {
306 // we hard coded zlib until 5.2, then quicklz in 5.2
307 if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
308 compression_method = TOKU_ZLIB_METHOD;
309 } else {
310 compression_method = TOKU_QUICKLZ_METHOD;
311 }
312 }
313
314 MSN max_msn_in_ft;
315 max_msn_in_ft = ZERO_MSN; // We'll upgrade it from the root node later if necessary
316 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) {
317 max_msn_in_ft = rbuf_MSN(rb);
318 }
319
320 unsigned fanout;
321 fanout = FT_DEFAULT_FANOUT;
322 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_28) {
323 fanout = rbuf_int(rb);
324 }
325
326 uint64_t on_disk_logical_rows;
327 on_disk_logical_rows = (uint64_t)-1;
328 if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_29) {
329 on_disk_logical_rows = rbuf_ulonglong(rb);
330 }
331 ft->in_memory_logical_rows = on_disk_logical_rows;
332
333 (void) rbuf_int(rb); //Read in checksum and ignore (already verified).
334 if (rb->ndone != rb->size) {
335 fprintf(stderr, "Header size did not match contents.\n");
336 r = EINVAL;
337 goto exit;
338 }
339
340 {
341 struct ft_header h = {
342 .type = FT_CURRENT,
343 .dirty = 0,
344 .checkpoint_count = checkpoint_count,
345 .checkpoint_lsn = checkpoint_lsn,
346 .layout_version = FT_LAYOUT_VERSION,
347 .layout_version_original = layout_version_original,
348 .build_id = build_id,
349 .build_id_original = build_id_original,
350 .time_of_creation = time_of_creation,
351 .root_xid_that_created = root_xid_that_created,
352 .time_of_last_modification = time_of_last_modification,
353 .time_of_last_verification = time_of_last_verification,
354 .root_blocknum = root_blocknum,
355 .flags = flags,
356 .nodesize = nodesize,
357 .basementnodesize = basementnodesize,
358 .compression_method = compression_method,
359 .fanout = fanout,
360 .highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade,
361 .max_msn_in_ft = max_msn_in_ft,
362 .time_of_last_optimize_begin = time_of_last_optimize_begin,
363 .time_of_last_optimize_end = time_of_last_optimize_end,
364 .count_of_optimize_in_progress = count_of_optimize_in_progress,
365 .count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress,
366 .msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize,
367 .on_disk_stats = on_disk_stats,
368 .on_disk_logical_rows = on_disk_logical_rows
369 };
370 XMEMDUP(ft->h, &h);
371 }
372
373 if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
374 // This needs ft->h to be non-null, so we have to do it after we
375 // read everything else.
376 r = toku_upgrade_subtree_estimates_to_stat64info(fd, ft);
377 if (r != 0) {
378 goto exit;
379 }
380 }
381 if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) {
382 r = toku_upgrade_msn_from_root_to_header(fd, ft);
383 if (r != 0) {
384 goto exit;
385 }
386 }
387
388 invariant((uint32_t) ft->layout_version_read_from_disk == version);
389 r = deserialize_descriptor_from(fd, &ft->blocktable, &ft->descriptor, version);
390 if (r != 0) {
391 goto exit;
392 }
393
394 // initialize for svn #4541
395 toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt);
396
397 // Version 13 descriptors had an extra 4 bytes that we don't read
398 // anymore. Since the header is going to think it's the current
399 // version if it gets written out, we need to write the descriptor in
400 // the new format (without those bytes) before that happens.
401 if (version <= FT_LAYOUT_VERSION_13) {
402 toku_ft_update_descriptor_with_fd(ft, &ft->cmp_descriptor, fd);
403 }
404 r = 0;
405exit:
406 if (r != 0 && ft != NULL) {
407 toku_free(ft);
408 ft = NULL;
409 }
410 *ftp = ft;
411 return r;
412}
413
414static size_t serialize_ft_min_size(uint32_t version) {
415 size_t size = 0;
416
417 switch (version) {
418 case FT_LAYOUT_VERSION_29:
419 size += sizeof(uint64_t); // logrows in ft
420 // fallthrough
421 case FT_LAYOUT_VERSION_28:
422 size += sizeof(uint32_t); // fanout in ft
423 // fallthrough
424 case FT_LAYOUT_VERSION_27:
425 case FT_LAYOUT_VERSION_26:
426 case FT_LAYOUT_VERSION_25:
427 case FT_LAYOUT_VERSION_24:
428 case FT_LAYOUT_VERSION_23:
429 case FT_LAYOUT_VERSION_22:
430 case FT_LAYOUT_VERSION_21:
431 size += sizeof(MSN); // max_msn_in_ft
432 // fallthrough
433 case FT_LAYOUT_VERSION_20:
434 case FT_LAYOUT_VERSION_19:
435 size += 1; // compression method
436 size += sizeof(MSN); // highest_unused_msn_for_upgrade
437 // fallthrough
438 case FT_LAYOUT_VERSION_18:
439 size += sizeof(uint64_t); // time_of_last_optimize_begin
440 size += sizeof(uint64_t); // time_of_last_optimize_end
441 size += sizeof(uint32_t); // count_of_optimize_in_progress
442 size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
443 size -= 8; // removed num_blocks_to_upgrade_14
444 size -= 8; // removed num_blocks_to_upgrade_13
445 // fallthrough
446 case FT_LAYOUT_VERSION_17:
447 size += 16;
448 invariant(sizeof(STAT64INFO_S) == 16);
449 // fallthrough
450 case FT_LAYOUT_VERSION_16:
451 case FT_LAYOUT_VERSION_15:
452 size += 4; // basement node size
453 size += 8; // num_blocks_to_upgrade_14 (previously
454 // num_blocks_to_upgrade, now one int each for upgrade
455 // from 13, 14
456 size += 8; // time of last verification
457 // fallthrough
458 case FT_LAYOUT_VERSION_14:
459 size += 8; // TXNID that created
460 // fallthrough
461 case FT_LAYOUT_VERSION_13:
462 size += (4 // build_id
463 +
464 4 // build_id_original
465 +
466 8 // time_of_creation
467 +
468 8 // time_of_last_modification
469 );
470 // fallthrough
471 case FT_LAYOUT_VERSION_12:
472 size += (+8 // "tokudata"
473 +
474 4 // version
475 +
476 4 // original_version
477 +
478 4 // size
479 +
480 8 // byte order verification
481 +
482 8 // checkpoint_count
483 +
484 8 // checkpoint_lsn
485 +
486 4 // tree's nodesize
487 +
488 8 // translation_size_on_disk
489 +
490 8 // translation_address_on_disk
491 +
492 4 // checksum
493 +
494 8 // Number of blocks in old version.
495 +
496 8 // diskoff
497 +
498 4 // flags
499 );
500 break;
501 default:
502 abort();
503 }
504
505 lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
506 return size;
507}
508
509int deserialize_ft_from_fd_into_rbuf(int fd,
510 toku_off_t offset_of_header,
511 struct rbuf *rb,
512 uint64_t *checkpoint_count,
513 LSN *checkpoint_lsn,
514 uint32_t *version_p)
515// Effect: Read and parse the header of a fractalal tree
516//
517// Simply reading the raw bytes of the header into an rbuf is insensitive
518// to disk format version. If that ever changes, then modify this.
519//
520// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
521// file AND the header is useless
522{
523 int r = 0;
524 const int64_t prefix_size = 8 + // magic ("tokudata")
525 4 + // version
526 4 + // build_id
527 4; // size
528 const int64_t read_size = roundup_to_multiple(512, prefix_size);
529 unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
530 rb->buf = NULL;
531 int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
532 if (n != read_size) {
533 if (n == 0) {
534 r = TOKUDB_DICTIONARY_NO_HEADER;
535 } else if (n < 0) {
536 r = get_error_errno();
537 } else {
538 r = EINVAL;
539 }
540 toku_free(prefix);
541 goto exit;
542 }
543
544 rbuf_init(rb, prefix, prefix_size);
545
546 // Check magic number
547 const void *magic;
548 rbuf_literal_bytes(rb, &magic, 8);
549 if (memcmp(magic, "tokudata", 8) != 0) {
550 if ((*(uint64_t *)magic) == 0) {
551 r = TOKUDB_DICTIONARY_NO_HEADER;
552 } else {
553 r = EINVAL; // Not a tokudb file! Do not use.
554 }
555 goto exit;
556 }
557
558 // Version MUST be in network order regardless of disk order.
559 uint32_t version;
560 version = rbuf_network_int(rb);
561 *version_p = version;
562 if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
563 r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use
564 goto exit;
565 } else if (version > FT_LAYOUT_VERSION) {
566 r = TOKUDB_DICTIONARY_TOO_NEW; // Cannot use
567 goto exit;
568 }
569
570 // build_id MUST be in network order regardless of disk order.
571 uint32_t build_id __attribute__((__unused__));
572 build_id = rbuf_network_int(rb);
573 int64_t min_header_size;
574 min_header_size = serialize_ft_min_size(version);
575
576 // Size MUST be in network order regardless of disk order.
577 uint32_t size;
578 size = rbuf_network_int(rb);
579 // If too big, it is corrupt. We would probably notice during checksum
580 // but may have to do a multi-gigabyte malloc+read to find out.
581 // If its too small reading rbuf would crash, so verify.
582 if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE ||
583 size < min_header_size) {
584 r = TOKUDB_DICTIONARY_NO_HEADER;
585 goto exit;
586 }
587
588 lazy_assert(rb->ndone == prefix_size);
589 rb->size = size;
590 {
591 toku_free(rb->buf);
592 uint32_t size_to_read = roundup_to_multiple(512, size);
593 XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
594
595 invariant(offset_of_header % 512 == 0);
596 n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
597 if (n != size_to_read) {
598 if (n < 0) {
599 r = get_error_errno();
600 } else {
601 r = EINVAL; // Header might be useless (wrong size) or could be
602 // a disk read error.
603 }
604 goto exit;
605 }
606 }
607 // It's version 14 or later. Magic looks OK.
608 // We have an rbuf that represents the header.
609 // Size is within acceptable bounds.
610
611 // Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function
612 // changed)
613 uint32_t calculated_x1764;
614 calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4);
615 uint32_t stored_x1764;
616 stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4));
617 if (calculated_x1764 != stored_x1764) {
618 r = TOKUDB_BAD_CHECKSUM; // Header useless
619 fprintf(stderr,
620 "Header checksum failure: calc=0x%08x read=0x%08x\n",
621 calculated_x1764,
622 stored_x1764);
623 goto exit;
624 }
625
626 // Verify byte order
627 const void *tmp_byte_order_check;
628 lazy_assert((sizeof toku_byte_order_host) == 8);
629 rbuf_literal_bytes(
630 rb, &tmp_byte_order_check, 8); // Must not translate byte order
631 int64_t byte_order_stored;
632 byte_order_stored = *(int64_t *)tmp_byte_order_check;
633 if (byte_order_stored != toku_byte_order_host) {
634 r = TOKUDB_DICTIONARY_NO_HEADER; // Cannot use dictionary
635 goto exit;
636 }
637
638 // Load checkpoint count
639 *checkpoint_count = rbuf_ulonglong(rb);
640 *checkpoint_lsn = rbuf_LSN(rb);
641 // Restart at beginning during regular deserialization
642 rb->ndone = 0;
643
644exit:
645 if (r != 0 && rb->buf != NULL) {
646 toku_free(rb->buf);
647 rb->buf = NULL;
648 }
649 return r;
650}
651
652// Read ft from file into struct. Read both headers and use one.
653// We want the latest acceptable header whose checkpoint_lsn is no later
654// than max_acceptable_lsn.
// Diagnostic dump used by toku_deserialize_ft_from.  Prints, to stderr, the
// file name, the combined result `r`, max_acceptable_lsn, and for each of the
// two headers its load result, checkpoint LSN and checkpoint count.
//
// It expands in the caller's scope and expects these names to be visible:
// fn, r, r0, r1, max_acceptable_lsn, checkpoint_lsn_0, checkpoint_lsn_1,
// checkpoint_count_0, checkpoint_count_1.
//
// The body is wrapped in do { } while (0) with no trailing semicolon so that
// `dump_state_of_toku_deserialize_ft_from();` is exactly one statement, even
// in an unbraced if/else.  The 64-bit values are cast to unsigned long long
// and printed with %llu so the format matches on every platform.
#define dump_state_of_toku_deserialize_ft_from()                             \
    do {                                                                     \
        fprintf(stderr,                                                      \
                "%s:%d toku_deserialize_ft_from: "                           \
                "filename[%s] "                                              \
                "r[%d] max_acceptable_lsn[%llu] "                            \
                "r0[%d] checkpoint_lsn_0[%llu] checkpoint_count_0[%llu] "    \
                "r1[%d] checkpoint_lsn_1[%llu] checkpoint_count_1[%llu]\n",  \
                __FILE__,                                                    \
                __LINE__,                                                    \
                fn,                                                          \
                r,                                                           \
                (unsigned long long)max_acceptable_lsn.lsn,                  \
                r0,                                                          \
                (unsigned long long)checkpoint_lsn_0.lsn,                    \
                (unsigned long long)checkpoint_count_0,                      \
                r1,                                                          \
                (unsigned long long)checkpoint_lsn_1.lsn,                    \
                (unsigned long long)checkpoint_count_1);                     \
    } while (0)
673
674int toku_deserialize_ft_from(int fd,
675 const char *fn,
676 LSN max_acceptable_lsn,
677 FT *ft) {
678 struct rbuf rb_0;
679 struct rbuf rb_1;
680 uint64_t checkpoint_count_0 = 0;
681 uint64_t checkpoint_count_1 = 0;
682 LSN checkpoint_lsn_0;
683 LSN checkpoint_lsn_1;
684 uint32_t version_0 = 0, version_1 = 0, version = 0;
685 bool h0_acceptable = false;
686 bool h1_acceptable = false;
687 struct rbuf *rb = NULL;
688 int r0, r1, r = 0;
689
690 toku_off_t header_0_off = 0;
691 r0 = deserialize_ft_from_fd_into_rbuf(fd,
692 header_0_off,
693 &rb_0,
694 &checkpoint_count_0,
695 &checkpoint_lsn_0,
696 &version_0);
697 if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
698 h0_acceptable = true;
699 }
700
701 toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
702 r1 = deserialize_ft_from_fd_into_rbuf(fd,
703 header_1_off,
704 &rb_1,
705 &checkpoint_count_1,
706 &checkpoint_lsn_1,
707 &version_1);
708 if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
709 h1_acceptable = true;
710 }
711
712 // if either header is too new, the dictionary is unreadable
713 if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW ||
714 !(h0_acceptable || h1_acceptable)) {
715 // We were unable to read either header or at least one is too
716 // new. Certain errors are higher priority than others. Order of
717 // these if/else if is important.
718 if (r0 == TOKUDB_DICTIONARY_TOO_NEW ||
719 r1 == TOKUDB_DICTIONARY_TOO_NEW) {
720 r = TOKUDB_DICTIONARY_TOO_NEW;
721 } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD ||
722 r1 == TOKUDB_DICTIONARY_TOO_OLD) {
723 r = TOKUDB_DICTIONARY_TOO_OLD;
724 } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) {
725 fprintf(stderr, "Both header checksums failed.\n");
726 r = TOKUDB_BAD_CHECKSUM;
727 } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER ||
728 r1 == TOKUDB_DICTIONARY_NO_HEADER) {
729 r = TOKUDB_DICTIONARY_NO_HEADER;
730 } else {
731 r = r0 ? r0 : r1; // Arbitrarily report the error from the
732 // first header, unless it's readable
733 }
734
735 if (r != TOKUDB_DICTIONARY_NO_HEADER) {
736 dump_state_of_toku_deserialize_ft_from();
737 }
738
739 // it should not be possible for both headers to be later than the
740 // max_acceptable_lsn
741 invariant(
742 !((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
743 (r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
744 invariant(r != 0);
745 goto exit;
746 }
747
748 if (h0_acceptable && h1_acceptable) {
749 if (checkpoint_count_0 > checkpoint_count_1) {
750 if (!(checkpoint_count_0 == checkpoint_count_1 + 1) ||
751 !(version_0 >= version_1)) {
752 dump_state_of_toku_deserialize_ft_from();
753 }
754 invariant(checkpoint_count_0 == checkpoint_count_1 + 1);
755 invariant(version_0 >= version_1);
756 rb = &rb_0;
757 version = version_0;
758 } else {
759 if (!(checkpoint_count_1 == checkpoint_count_0 + 1) ||
760 !(version_1 >= version_0)) {
761 dump_state_of_toku_deserialize_ft_from();
762 }
763 invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
764 invariant(version_1 >= version_0);
765 rb = &rb_1;
766 version = version_1;
767 }
768 } else if (h0_acceptable) {
769 if (r1 == TOKUDB_BAD_CHECKSUM) {
770 // print something reassuring
771 fprintf(
772 stderr,
773 "Header 2 checksum failed, but header 1 ok. Proceeding.\n");
774 dump_state_of_toku_deserialize_ft_from();
775 }
776 rb = &rb_0;
777 version = version_0;
778 } else if (h1_acceptable) {
779 if (r0 == TOKUDB_BAD_CHECKSUM) {
780 // print something reassuring
781 fprintf(
782 stderr,
783 "Header 1 checksum failed, but header 2 ok. Proceeding.\n");
784 dump_state_of_toku_deserialize_ft_from();
785 }
786 rb = &rb_1;
787 version = version_1;
788 }
789
790 if (!rb) {
791 dump_state_of_toku_deserialize_ft_from();
792 }
793 paranoid_invariant(rb);
794 r = deserialize_ft_versioned(fd, rb, ft, version);
795
796exit:
797 if (rb_0.buf) {
798 toku_free(rb_0.buf);
799 }
800 if (rb_1.buf) {
801 toku_free(rb_1.buf);
802 }
803 return r;
804}
805
806size_t toku_serialize_ft_size(FT_HEADER h) {
807 size_t size = serialize_ft_min_size(h->layout_version);
808 // There is no dynamic data.
809 lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
810 return size;
811}
812
813void toku_serialize_ft_to_wbuf (
814 struct wbuf *wbuf,
815 FT_HEADER h,
816 DISKOFF translation_location_on_disk,
817 DISKOFF translation_size_on_disk
818 )
819{
820 wbuf_literal_bytes(wbuf, "tokudata", 8);
821 wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order
822 wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order
823 wbuf_network_int (wbuf, wbuf->size); //MUST be in network order regardless of disk order
824 wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order
825 wbuf_ulonglong(wbuf, h->checkpoint_count);
826 wbuf_LSN (wbuf, h->checkpoint_lsn);
827 wbuf_int (wbuf, h->nodesize);
828
829 wbuf_DISKOFF(wbuf, translation_location_on_disk);
830 wbuf_DISKOFF(wbuf, translation_size_on_disk);
831 wbuf_BLOCKNUM(wbuf, h->root_blocknum);
832 wbuf_int(wbuf, h->flags);
833 wbuf_int(wbuf, h->layout_version_original);
834 wbuf_int(wbuf, h->build_id_original);
835 wbuf_ulonglong(wbuf, h->time_of_creation);
836 wbuf_ulonglong(wbuf, h->time_of_last_modification);
837 wbuf_TXNID(wbuf, h->root_xid_that_created);
838 wbuf_int(wbuf, h->basementnodesize);
839 wbuf_ulonglong(wbuf, h->time_of_last_verification);
840 wbuf_ulonglong(wbuf, h->on_disk_stats.numrows);
841 wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes);
842 wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin);
843 wbuf_ulonglong(wbuf, h->time_of_last_optimize_end);
844 wbuf_int(wbuf, h->count_of_optimize_in_progress);
845 wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize);
846 wbuf_char(wbuf, (unsigned char) h->compression_method);
847 wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade);
848 wbuf_MSN(wbuf, h->max_msn_in_ft);
849 wbuf_int(wbuf, h->fanout);
850 wbuf_ulonglong(wbuf, h->on_disk_logical_rows);
851 uint32_t checksum = toku_x1764_finish(&wbuf->checksum);
852 wbuf_int(wbuf, checksum);
853 lazy_assert(wbuf->ndone == wbuf->size);
854}
855
856void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) {
857 lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS);
858 struct wbuf w_translation;
859 int64_t size_translation;
860 int64_t address_translation;
861
862 // Must serialize translation first, to get address,size for header.
863 bt->serialize_translation_to_wbuf(
864 fd, &w_translation, &address_translation, &size_translation);
865 invariant(size_translation == w_translation.ndone);
866
867 // the number of bytes available in the buffer is 0 mod 512, and those last
868 // bytes are all initialized.
869 invariant(w_translation.size % 512 == 0);
870
871 struct wbuf w_main;
872 size_t size_main = toku_serialize_ft_size(h);
873 size_t size_main_aligned = roundup_to_multiple(512, size_main);
874 invariant(size_main_aligned <
875 BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
876 char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
877 for (size_t i = size_main; i < size_main_aligned; i++)
878 mainbuf[i] = 0; // initialize the end of the buffer with zeros
879 wbuf_init(&w_main, mainbuf, size_main);
880 toku_serialize_ft_to_wbuf(
881 &w_main, h, address_translation, size_translation);
882 lazy_assert(w_main.ndone == size_main);
883
884 // Actually write translation table
885 // This write is guaranteed to read good data at the end of the buffer,
886 // since the
887 // w_translation.buf is padded with zeros to a 512-byte boundary.
888 toku_os_full_pwrite(fd,
889 w_translation.buf,
890 roundup_to_multiple(512, size_translation),
891 address_translation);
892
893 // Everything but the header MUST be on disk before header starts.
894 // Otherwise we will think the header is good and some blocks might not
895 // yet be on disk.
896 // If the header has a cachefile we need to do cachefile fsync (to
897 // prevent crash if we redirected to dev null)
898 // If there is no cachefile we still need to do an fsync.
899 if (cf) {
900 toku_cachefile_fsync(cf);
901 } else {
902 toku_file_fsync(fd);
903 }
904
905 // Alternate writing header to two locations:
906 // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
907 toku_off_t main_offset;
908 main_offset = (h->checkpoint_count & 0x1)
909 ? 0
910 : BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
911 toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
912 toku_free(w_main.buf);
913 toku_free(w_translation.buf);
914}
915