| 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
| 2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
| 3 | #ident "$Id$" |
| 4 | /*====== |
| 5 | This file is part of PerconaFT. |
| 6 | |
| 7 | |
| 8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
| 9 | |
| 10 | PerconaFT is free software: you can redistribute it and/or modify |
| 11 | it under the terms of the GNU General Public License, version 2, |
| 12 | as published by the Free Software Foundation. |
| 13 | |
| 14 | PerconaFT is distributed in the hope that it will be useful, |
| 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | GNU General Public License for more details. |
| 18 | |
| 19 | You should have received a copy of the GNU General Public License |
| 20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 21 | |
| 22 | ---------------------------------------- |
| 23 | |
| 24 | PerconaFT is free software: you can redistribute it and/or modify |
| 25 | it under the terms of the GNU Affero General Public License, version 3, |
| 26 | as published by the Free Software Foundation. |
| 27 | |
| 28 | PerconaFT is distributed in the hope that it will be useful, |
| 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 31 | GNU Affero General Public License for more details. |
| 32 | |
| 33 | You should have received a copy of the GNU Affero General Public License |
| 34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 35 | ======= */ |
| 36 | |
| 37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
| 38 | |
| 39 | #include "ft/ft.h" |
| 40 | #include "ft/ft-internal.h" |
| 41 | #include "ft/msg.h" |
| 42 | #include "ft/serialize/block_allocator.h" |
| 43 | #include "ft/serialize/block_table.h" |
| 44 | #include "ft/serialize/compress.h" |
| 45 | #include "ft/serialize/ft-serialize.h" |
| 46 | |
| 47 | // not version-sensitive because we only serialize a descriptor using the current layout_version |
| 48 | uint32_t |
| 49 | toku_serialize_descriptor_size(DESCRIPTOR desc) { |
| 50 | //Checksum NOT included in this. Checksum only exists in header's version. |
| 51 | uint32_t size = 4; // four bytes for size of descriptor |
| 52 | size += desc->dbt.size; |
| 53 | return size; |
| 54 | } |
| 55 | |
| 56 | static uint32_t |
| 57 | deserialize_descriptor_size(DESCRIPTOR desc, int layout_version) { |
| 58 | //Checksum NOT included in this. Checksum only exists in header's version. |
| 59 | uint32_t size = 4; // four bytes for size of descriptor |
| 60 | if (layout_version == FT_LAYOUT_VERSION_13) |
| 61 | size += 4; // for version 13, include four bytes of "version" |
| 62 | size += desc->dbt.size; |
| 63 | return size; |
| 64 | } |
| 65 | |
| 66 | void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc) { |
| 67 | wbuf_bytes(wb, desc->dbt.data, desc->dbt.size); |
| 68 | } |
| 69 | |
| 70 | //Descriptor is written to disk during toku_ft_handle_open iff we have a new (or changed) |
| 71 | //descriptor. |
| 72 | //Descriptors are NOT written during the header checkpoint process. |
| 73 | void |
| 74 | toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset) { |
| 75 | // make the checksum |
| 76 | int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum |
| 77 | int64_t size_aligned = roundup_to_multiple(512, size); |
| 78 | struct wbuf w; |
| 79 | char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf); |
| 80 | for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0; |
| 81 | wbuf_init(&w, aligned_buf, size); |
| 82 | toku_serialize_descriptor_contents_to_wbuf(&w, desc); |
| 83 | { |
| 84 | //Add checksum |
| 85 | uint32_t checksum = toku_x1764_finish(&w.checksum); |
| 86 | wbuf_int(&w, checksum); |
| 87 | } |
| 88 | lazy_assert(w.ndone==w.size); |
| 89 | { |
| 90 | //Actual Write translation table |
| 91 | toku_os_full_pwrite(fd, w.buf, size_aligned, offset); |
| 92 | } |
| 93 | toku_free(w.buf); |
| 94 | } |
| 95 | |
| 96 | static void |
| 97 | deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) { |
| 98 | if (layout_version <= FT_LAYOUT_VERSION_13) { |
| 99 | // in older versions of tokuft, the descriptor had a 4 byte |
| 100 | // version, which we skip over |
| 101 | (void) rbuf_int(rb); |
| 102 | } |
| 103 | |
| 104 | uint32_t size; |
| 105 | const void *data; |
| 106 | rbuf_bytes(rb, &data, &size); |
| 107 | toku_memdup_dbt(&desc->dbt, data, size); |
| 108 | } |
| 109 | |
| 110 | static int |
| 111 | deserialize_descriptor_from(int fd, block_table *bt, DESCRIPTOR desc, int layout_version) { |
| 112 | int r = 0; |
| 113 | DISKOFF offset; |
| 114 | DISKOFF size; |
| 115 | unsigned char *dbuf = nullptr; |
| 116 | bt->get_descriptor_offset_size(&offset, &size); |
| 117 | memset(desc, 0, sizeof(*desc)); |
| 118 | if (size > 0) { |
| 119 | lazy_assert(size>=4); //4 for checksum |
| 120 | { |
| 121 | ssize_t size_to_malloc = roundup_to_multiple(512, size); |
| 122 | XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf); |
| 123 | { |
| 124 | |
| 125 | ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset); |
| 126 | lazy_assert(sz_read==size_to_malloc); |
| 127 | } |
| 128 | { |
| 129 | // check the checksum |
| 130 | uint32_t x1764 = toku_x1764_memory(dbuf, size-4); |
| 131 | //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk); |
| 132 | uint32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4)); |
| 133 | if (x1764 != stored_x1764) { |
| 134 | fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n" , x1764, stored_x1764); |
| 135 | r = TOKUDB_BAD_CHECKSUM; |
| 136 | toku_free(dbuf); |
| 137 | goto exit; |
| 138 | } |
| 139 | } |
| 140 | |
| 141 | struct rbuf rb = { .buf = dbuf, .size = (unsigned int) size, .ndone = 0 }; |
| 142 | deserialize_descriptor_from_rbuf(&rb, desc, layout_version); |
| 143 | lazy_assert(deserialize_descriptor_size(desc, layout_version) + 4 == size); |
| 144 | toku_free(dbuf); |
| 145 | } |
| 146 | } |
| 147 | exit: |
| 148 | return r; |
| 149 | } |
| 150 | |
| 151 | int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version) |
| 152 | // Effect: Deserialize the ft header. |
| 153 | // We deserialize ft_header only once and then share everything with all the FTs. |
| 154 | { |
| 155 | int r; |
| 156 | FT ft = NULL; |
| 157 | paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION); |
| 158 | paranoid_invariant(version <= FT_LAYOUT_VERSION); |
| 159 | // We already know: |
| 160 | // we have an rbuf representing the header. |
| 161 | // The checksum has been validated |
| 162 | |
| 163 | //Verification of initial elements. |
| 164 | //Check magic number |
| 165 | const void *magic; |
| 166 | rbuf_literal_bytes(rb, &magic, 8); |
| 167 | lazy_assert(memcmp(magic,"tokudata" ,8)==0); |
| 168 | |
| 169 | XCALLOC(ft); |
| 170 | ft->checkpoint_header = NULL; |
| 171 | toku_list_init(&ft->live_ft_handles); |
| 172 | |
| 173 | //version MUST be in network order on disk regardless of disk order |
| 174 | ft->layout_version_read_from_disk = rbuf_network_int(rb); |
| 175 | invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION); |
| 176 | invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION); |
| 177 | |
| 178 | //build_id MUST be in network order on disk regardless of disk order |
| 179 | uint32_t build_id; |
| 180 | build_id = rbuf_network_int(rb); |
| 181 | |
| 182 | //Size MUST be in network order regardless of disk order. |
| 183 | uint32_t size; |
| 184 | size = rbuf_network_int(rb); |
| 185 | lazy_assert(size == rb->size); |
| 186 | |
| 187 | const void *tmp_byte_order_check; |
| 188 | lazy_assert((sizeof tmp_byte_order_check) >= 8); |
| 189 | rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order |
| 190 | int64_t byte_order_stored; |
| 191 | byte_order_stored = *(int64_t*)tmp_byte_order_check; |
| 192 | lazy_assert(byte_order_stored == toku_byte_order_host); |
| 193 | |
| 194 | uint64_t checkpoint_count; |
| 195 | checkpoint_count = rbuf_ulonglong(rb); |
| 196 | LSN checkpoint_lsn; |
| 197 | checkpoint_lsn = rbuf_LSN(rb); |
| 198 | unsigned nodesize; |
| 199 | nodesize = rbuf_int(rb); |
| 200 | DISKOFF translation_address_on_disk; |
| 201 | translation_address_on_disk = rbuf_DISKOFF(rb); |
| 202 | DISKOFF translation_size_on_disk; |
| 203 | translation_size_on_disk = rbuf_DISKOFF(rb); |
| 204 | lazy_assert(translation_address_on_disk > 0); |
| 205 | lazy_assert(translation_size_on_disk > 0); |
| 206 | |
| 207 | // initialize the tree lock |
| 208 | toku_ft_init_reflock(ft); |
| 209 | |
| 210 | //Load translation table |
| 211 | { |
| 212 | size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk); |
| 213 | unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf); |
| 214 | { |
| 215 | // This cast is messed up in 32-bits if the block translation |
| 216 | // table is ever more than 4GB. But in that case, the |
| 217 | // translation table itself won't fit in main memory. |
| 218 | ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read, |
| 219 | translation_address_on_disk); |
| 220 | invariant(readsz >= translation_size_on_disk); |
| 221 | invariant(readsz <= (ssize_t)size_to_read); |
| 222 | } |
| 223 | // Create table and read in data. |
| 224 | r = ft->blocktable.create_from_buffer(fd, |
| 225 | translation_address_on_disk, |
| 226 | translation_size_on_disk, |
| 227 | tbuf); |
| 228 | toku_free(tbuf); |
| 229 | if (r != 0) { |
| 230 | goto exit; |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | BLOCKNUM root_blocknum; |
| 235 | root_blocknum = rbuf_blocknum(rb); |
| 236 | unsigned flags; |
| 237 | flags = rbuf_int(rb); |
| 238 | if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) { |
| 239 | // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag |
| 240 | flags &= ~TOKU_DB_VALCMP_BUILTIN_13; |
| 241 | } |
| 242 | int layout_version_original; |
| 243 | layout_version_original = rbuf_int(rb); |
| 244 | uint32_t build_id_original; |
| 245 | build_id_original = rbuf_int(rb); |
| 246 | uint64_t time_of_creation; |
| 247 | time_of_creation = rbuf_ulonglong(rb); |
| 248 | uint64_t time_of_last_modification; |
| 249 | time_of_last_modification = rbuf_ulonglong(rb); |
| 250 | |
| 251 | if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) { |
| 252 | // 17 was the last version with these fields, we no longer store |
| 253 | // them, so read and discard them |
| 254 | (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13 |
| 255 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { |
| 256 | (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14 |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | // fake creation during the last checkpoint |
| 261 | TXNID root_xid_that_created; |
| 262 | root_xid_that_created = checkpoint_lsn.lsn; |
| 263 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) { |
| 264 | rbuf_TXNID(rb, &root_xid_that_created); |
| 265 | } |
| 266 | |
| 267 | // TODO(leif): get this to default to what's specified, not the |
| 268 | // hard-coded default |
| 269 | unsigned basementnodesize; |
| 270 | basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE; |
| 271 | uint64_t time_of_last_verification; |
| 272 | time_of_last_verification = 0; |
| 273 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { |
| 274 | basementnodesize = rbuf_int(rb); |
| 275 | time_of_last_verification = rbuf_ulonglong(rb); |
| 276 | } |
| 277 | |
| 278 | STAT64INFO_S on_disk_stats; |
| 279 | on_disk_stats = ZEROSTATS; |
| 280 | uint64_t time_of_last_optimize_begin; |
| 281 | time_of_last_optimize_begin = 0; |
| 282 | uint64_t time_of_last_optimize_end; |
| 283 | time_of_last_optimize_end = 0; |
| 284 | uint32_t count_of_optimize_in_progress; |
| 285 | count_of_optimize_in_progress = 0; |
| 286 | MSN msn_at_start_of_last_completed_optimize; |
| 287 | msn_at_start_of_last_completed_optimize = ZERO_MSN; |
| 288 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) { |
| 289 | on_disk_stats.numrows = rbuf_ulonglong(rb); |
| 290 | on_disk_stats.numbytes = rbuf_ulonglong(rb); |
| 291 | ft->in_memory_stats = on_disk_stats; |
| 292 | time_of_last_optimize_begin = rbuf_ulonglong(rb); |
| 293 | time_of_last_optimize_end = rbuf_ulonglong(rb); |
| 294 | count_of_optimize_in_progress = rbuf_int(rb); |
| 295 | msn_at_start_of_last_completed_optimize = rbuf_MSN(rb); |
| 296 | } |
| 297 | |
| 298 | enum toku_compression_method compression_method; |
| 299 | MSN highest_unused_msn_for_upgrade; |
| 300 | highest_unused_msn_for_upgrade.msn = (MIN_MSN.msn - 1); |
| 301 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) { |
| 302 | unsigned char method = rbuf_char(rb); |
| 303 | compression_method = (enum toku_compression_method) method; |
| 304 | highest_unused_msn_for_upgrade = rbuf_MSN(rb); |
| 305 | } else { |
| 306 | // we hard coded zlib until 5.2, then quicklz in 5.2 |
| 307 | if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { |
| 308 | compression_method = TOKU_ZLIB_METHOD; |
| 309 | } else { |
| 310 | compression_method = TOKU_QUICKLZ_METHOD; |
| 311 | } |
| 312 | } |
| 313 | |
| 314 | MSN max_msn_in_ft; |
| 315 | max_msn_in_ft = ZERO_MSN; // We'll upgrade it from the root node later if necessary |
| 316 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) { |
| 317 | max_msn_in_ft = rbuf_MSN(rb); |
| 318 | } |
| 319 | |
| 320 | unsigned fanout; |
| 321 | fanout = FT_DEFAULT_FANOUT; |
| 322 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_28) { |
| 323 | fanout = rbuf_int(rb); |
| 324 | } |
| 325 | |
| 326 | uint64_t on_disk_logical_rows; |
| 327 | on_disk_logical_rows = (uint64_t)-1; |
| 328 | if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_29) { |
| 329 | on_disk_logical_rows = rbuf_ulonglong(rb); |
| 330 | } |
| 331 | ft->in_memory_logical_rows = on_disk_logical_rows; |
| 332 | |
| 333 | (void) rbuf_int(rb); //Read in checksum and ignore (already verified). |
| 334 | if (rb->ndone != rb->size) { |
| 335 | fprintf(stderr, "Header size did not match contents.\n" ); |
| 336 | r = EINVAL; |
| 337 | goto exit; |
| 338 | } |
| 339 | |
| 340 | { |
| 341 | struct ft_header h = { |
| 342 | .type = FT_CURRENT, |
| 343 | .dirty = 0, |
| 344 | .checkpoint_count = checkpoint_count, |
| 345 | .checkpoint_lsn = checkpoint_lsn, |
| 346 | .layout_version = FT_LAYOUT_VERSION, |
| 347 | .layout_version_original = layout_version_original, |
| 348 | .build_id = build_id, |
| 349 | .build_id_original = build_id_original, |
| 350 | .time_of_creation = time_of_creation, |
| 351 | .root_xid_that_created = root_xid_that_created, |
| 352 | .time_of_last_modification = time_of_last_modification, |
| 353 | .time_of_last_verification = time_of_last_verification, |
| 354 | .root_blocknum = root_blocknum, |
| 355 | .flags = flags, |
| 356 | .nodesize = nodesize, |
| 357 | .basementnodesize = basementnodesize, |
| 358 | .compression_method = compression_method, |
| 359 | .fanout = fanout, |
| 360 | .highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade, |
| 361 | .max_msn_in_ft = max_msn_in_ft, |
| 362 | .time_of_last_optimize_begin = time_of_last_optimize_begin, |
| 363 | .time_of_last_optimize_end = time_of_last_optimize_end, |
| 364 | .count_of_optimize_in_progress = count_of_optimize_in_progress, |
| 365 | .count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress, |
| 366 | .msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize, |
| 367 | .on_disk_stats = on_disk_stats, |
| 368 | .on_disk_logical_rows = on_disk_logical_rows |
| 369 | }; |
| 370 | XMEMDUP(ft->h, &h); |
| 371 | } |
| 372 | |
| 373 | if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { |
| 374 | // This needs ft->h to be non-null, so we have to do it after we |
| 375 | // read everything else. |
| 376 | r = toku_upgrade_subtree_estimates_to_stat64info(fd, ft); |
| 377 | if (r != 0) { |
| 378 | goto exit; |
| 379 | } |
| 380 | } |
| 381 | if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) { |
| 382 | r = toku_upgrade_msn_from_root_to_header(fd, ft); |
| 383 | if (r != 0) { |
| 384 | goto exit; |
| 385 | } |
| 386 | } |
| 387 | |
| 388 | invariant((uint32_t) ft->layout_version_read_from_disk == version); |
| 389 | r = deserialize_descriptor_from(fd, &ft->blocktable, &ft->descriptor, version); |
| 390 | if (r != 0) { |
| 391 | goto exit; |
| 392 | } |
| 393 | |
| 394 | // initialize for svn #4541 |
| 395 | toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt); |
| 396 | |
| 397 | // Version 13 descriptors had an extra 4 bytes that we don't read |
| 398 | // anymore. Since the header is going to think it's the current |
| 399 | // version if it gets written out, we need to write the descriptor in |
| 400 | // the new format (without those bytes) before that happens. |
| 401 | if (version <= FT_LAYOUT_VERSION_13) { |
| 402 | toku_ft_update_descriptor_with_fd(ft, &ft->cmp_descriptor, fd); |
| 403 | } |
| 404 | r = 0; |
| 405 | exit: |
| 406 | if (r != 0 && ft != NULL) { |
| 407 | toku_free(ft); |
| 408 | ft = NULL; |
| 409 | } |
| 410 | *ftp = ft; |
| 411 | return r; |
| 412 | } |
| 413 | |
| 414 | static size_t serialize_ft_min_size(uint32_t version) { |
| 415 | size_t size = 0; |
| 416 | |
| 417 | switch (version) { |
| 418 | case FT_LAYOUT_VERSION_29: |
| 419 | size += sizeof(uint64_t); // logrows in ft |
| 420 | // fallthrough |
| 421 | case FT_LAYOUT_VERSION_28: |
| 422 | size += sizeof(uint32_t); // fanout in ft |
| 423 | // fallthrough |
| 424 | case FT_LAYOUT_VERSION_27: |
| 425 | case FT_LAYOUT_VERSION_26: |
| 426 | case FT_LAYOUT_VERSION_25: |
| 427 | case FT_LAYOUT_VERSION_24: |
| 428 | case FT_LAYOUT_VERSION_23: |
| 429 | case FT_LAYOUT_VERSION_22: |
| 430 | case FT_LAYOUT_VERSION_21: |
| 431 | size += sizeof(MSN); // max_msn_in_ft |
| 432 | // fallthrough |
| 433 | case FT_LAYOUT_VERSION_20: |
| 434 | case FT_LAYOUT_VERSION_19: |
| 435 | size += 1; // compression method |
| 436 | size += sizeof(MSN); // highest_unused_msn_for_upgrade |
| 437 | // fallthrough |
| 438 | case FT_LAYOUT_VERSION_18: |
| 439 | size += sizeof(uint64_t); // time_of_last_optimize_begin |
| 440 | size += sizeof(uint64_t); // time_of_last_optimize_end |
| 441 | size += sizeof(uint32_t); // count_of_optimize_in_progress |
| 442 | size += sizeof(MSN); // msn_at_start_of_last_completed_optimize |
| 443 | size -= 8; // removed num_blocks_to_upgrade_14 |
| 444 | size -= 8; // removed num_blocks_to_upgrade_13 |
| 445 | // fallthrough |
| 446 | case FT_LAYOUT_VERSION_17: |
| 447 | size += 16; |
| 448 | invariant(sizeof(STAT64INFO_S) == 16); |
| 449 | // fallthrough |
| 450 | case FT_LAYOUT_VERSION_16: |
| 451 | case FT_LAYOUT_VERSION_15: |
| 452 | size += 4; // basement node size |
| 453 | size += 8; // num_blocks_to_upgrade_14 (previously |
| 454 | // num_blocks_to_upgrade, now one int each for upgrade |
| 455 | // from 13, 14 |
| 456 | size += 8; // time of last verification |
| 457 | // fallthrough |
| 458 | case FT_LAYOUT_VERSION_14: |
| 459 | size += 8; // TXNID that created |
| 460 | // fallthrough |
| 461 | case FT_LAYOUT_VERSION_13: |
| 462 | size += (4 // build_id |
| 463 | + |
| 464 | 4 // build_id_original |
| 465 | + |
| 466 | 8 // time_of_creation |
| 467 | + |
| 468 | 8 // time_of_last_modification |
| 469 | ); |
| 470 | // fallthrough |
| 471 | case FT_LAYOUT_VERSION_12: |
| 472 | size += (+8 // "tokudata" |
| 473 | + |
| 474 | 4 // version |
| 475 | + |
| 476 | 4 // original_version |
| 477 | + |
| 478 | 4 // size |
| 479 | + |
| 480 | 8 // byte order verification |
| 481 | + |
| 482 | 8 // checkpoint_count |
| 483 | + |
| 484 | 8 // checkpoint_lsn |
| 485 | + |
| 486 | 4 // tree's nodesize |
| 487 | + |
| 488 | 8 // translation_size_on_disk |
| 489 | + |
| 490 | 8 // translation_address_on_disk |
| 491 | + |
| 492 | 4 // checksum |
| 493 | + |
| 494 | 8 // Number of blocks in old version. |
| 495 | + |
| 496 | 8 // diskoff |
| 497 | + |
| 498 | 4 // flags |
| 499 | ); |
| 500 | break; |
| 501 | default: |
| 502 | abort(); |
| 503 | } |
| 504 | |
| 505 | lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); |
| 506 | return size; |
| 507 | } |
| 508 | |
| 509 | int deserialize_ft_from_fd_into_rbuf(int fd, |
| 510 | toku_off_t , |
| 511 | struct rbuf *rb, |
| 512 | uint64_t *checkpoint_count, |
| 513 | LSN *checkpoint_lsn, |
| 514 | uint32_t *version_p) |
| 515 | // Effect: Read and parse the header of a fractalal tree |
| 516 | // |
| 517 | // Simply reading the raw bytes of the header into an rbuf is insensitive |
| 518 | // to disk format version. If that ever changes, then modify this. |
| 519 | // |
| 520 | // TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the |
| 521 | // file AND the header is useless |
| 522 | { |
| 523 | int r = 0; |
| 524 | const int64_t prefix_size = 8 + // magic ("tokudata") |
| 525 | 4 + // version |
| 526 | 4 + // build_id |
| 527 | 4; // size |
| 528 | const int64_t read_size = roundup_to_multiple(512, prefix_size); |
| 529 | unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix); |
| 530 | rb->buf = NULL; |
| 531 | int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header); |
| 532 | if (n != read_size) { |
| 533 | if (n == 0) { |
| 534 | r = TOKUDB_DICTIONARY_NO_HEADER; |
| 535 | } else if (n < 0) { |
| 536 | r = get_error_errno(); |
| 537 | } else { |
| 538 | r = EINVAL; |
| 539 | } |
| 540 | toku_free(prefix); |
| 541 | goto exit; |
| 542 | } |
| 543 | |
| 544 | rbuf_init(rb, prefix, prefix_size); |
| 545 | |
| 546 | // Check magic number |
| 547 | const void *magic; |
| 548 | rbuf_literal_bytes(rb, &magic, 8); |
| 549 | if (memcmp(magic, "tokudata" , 8) != 0) { |
| 550 | if ((*(uint64_t *)magic) == 0) { |
| 551 | r = TOKUDB_DICTIONARY_NO_HEADER; |
| 552 | } else { |
| 553 | r = EINVAL; // Not a tokudb file! Do not use. |
| 554 | } |
| 555 | goto exit; |
| 556 | } |
| 557 | |
| 558 | // Version MUST be in network order regardless of disk order. |
| 559 | uint32_t version; |
| 560 | version = rbuf_network_int(rb); |
| 561 | *version_p = version; |
| 562 | if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) { |
| 563 | r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use |
| 564 | goto exit; |
| 565 | } else if (version > FT_LAYOUT_VERSION) { |
| 566 | r = TOKUDB_DICTIONARY_TOO_NEW; // Cannot use |
| 567 | goto exit; |
| 568 | } |
| 569 | |
| 570 | // build_id MUST be in network order regardless of disk order. |
| 571 | uint32_t build_id __attribute__((__unused__)); |
| 572 | build_id = rbuf_network_int(rb); |
| 573 | int64_t ; |
| 574 | min_header_size = serialize_ft_min_size(version); |
| 575 | |
| 576 | // Size MUST be in network order regardless of disk order. |
| 577 | uint32_t size; |
| 578 | size = rbuf_network_int(rb); |
| 579 | // If too big, it is corrupt. We would probably notice during checksum |
| 580 | // but may have to do a multi-gigabyte malloc+read to find out. |
| 581 | // If its too small reading rbuf would crash, so verify. |
| 582 | if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE || |
| 583 | size < min_header_size) { |
| 584 | r = TOKUDB_DICTIONARY_NO_HEADER; |
| 585 | goto exit; |
| 586 | } |
| 587 | |
| 588 | lazy_assert(rb->ndone == prefix_size); |
| 589 | rb->size = size; |
| 590 | { |
| 591 | toku_free(rb->buf); |
| 592 | uint32_t size_to_read = roundup_to_multiple(512, size); |
| 593 | XMALLOC_N_ALIGNED(512, size_to_read, rb->buf); |
| 594 | |
| 595 | invariant(offset_of_header % 512 == 0); |
| 596 | n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header); |
| 597 | if (n != size_to_read) { |
| 598 | if (n < 0) { |
| 599 | r = get_error_errno(); |
| 600 | } else { |
| 601 | r = EINVAL; // Header might be useless (wrong size) or could be |
| 602 | // a disk read error. |
| 603 | } |
| 604 | goto exit; |
| 605 | } |
| 606 | } |
| 607 | // It's version 14 or later. Magic looks OK. |
| 608 | // We have an rbuf that represents the header. |
| 609 | // Size is within acceptable bounds. |
| 610 | |
| 611 | // Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function |
| 612 | // changed) |
| 613 | uint32_t calculated_x1764; |
| 614 | calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4); |
| 615 | uint32_t stored_x1764; |
| 616 | stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4)); |
| 617 | if (calculated_x1764 != stored_x1764) { |
| 618 | r = TOKUDB_BAD_CHECKSUM; // Header useless |
| 619 | fprintf(stderr, |
| 620 | "Header checksum failure: calc=0x%08x read=0x%08x\n" , |
| 621 | calculated_x1764, |
| 622 | stored_x1764); |
| 623 | goto exit; |
| 624 | } |
| 625 | |
| 626 | // Verify byte order |
| 627 | const void *tmp_byte_order_check; |
| 628 | lazy_assert((sizeof toku_byte_order_host) == 8); |
| 629 | rbuf_literal_bytes( |
| 630 | rb, &tmp_byte_order_check, 8); // Must not translate byte order |
| 631 | int64_t byte_order_stored; |
| 632 | byte_order_stored = *(int64_t *)tmp_byte_order_check; |
| 633 | if (byte_order_stored != toku_byte_order_host) { |
| 634 | r = TOKUDB_DICTIONARY_NO_HEADER; // Cannot use dictionary |
| 635 | goto exit; |
| 636 | } |
| 637 | |
| 638 | // Load checkpoint count |
| 639 | *checkpoint_count = rbuf_ulonglong(rb); |
| 640 | *checkpoint_lsn = rbuf_LSN(rb); |
| 641 | // Restart at beginning during regular deserialization |
| 642 | rb->ndone = 0; |
| 643 | |
| 644 | exit: |
| 645 | if (r != 0 && rb->buf != NULL) { |
| 646 | toku_free(rb->buf); |
| 647 | rb->buf = NULL; |
| 648 | } |
| 649 | return r; |
| 650 | } |
| 651 | |
| 652 | // Read ft from file into struct. Read both headers and use one. |
| 653 | // We want the latest acceptable header whose checkpoint_lsn is no later |
| 654 | // than max_acceptable_lsn. |
// Debug helper for toku_deserialize_ft_from(): prints the state of both
// header reads to stderr. It is not hygienic -- it expands to a reference to
// the locals fn, r, max_acceptable_lsn, r0, r1, checkpoint_lsn_0/1 and
// checkpoint_count_0/1 of the function it is used in.
// Wrapped in do { } while (0) so that `dump_state_of_toku_deserialize_ft_from();`
// is exactly one statement, also inside an unbraced if/else.
#define dump_state_of_toku_deserialize_ft_from()                           \
    do {                                                                   \
        fprintf(stderr,                                                    \
                "%s:%d toku_deserialize_ft_from: "                         \
                "filename[%s] "                                            \
                "r[%d] max_acceptable_lsn[%lu] "                           \
                "r0[%d] checkpoint_lsn_0[%lu] checkpoint_count_0[%lu] "    \
                "r1[%d] checkpoint_lsn_1[%lu] checkpoint_count_1[%lu]\n",  \
                __FILE__,                                                  \
                __LINE__,                                                  \
                fn,                                                        \
                r,                                                         \
                max_acceptable_lsn.lsn,                                    \
                r0,                                                        \
                checkpoint_lsn_0.lsn,                                      \
                checkpoint_count_0,                                        \
                r1,                                                        \
                checkpoint_lsn_1.lsn,                                      \
                checkpoint_count_1);                                       \
    } while (0)
| 673 | |
| 674 | int toku_deserialize_ft_from(int fd, |
| 675 | const char *fn, |
| 676 | LSN max_acceptable_lsn, |
| 677 | FT *ft) { |
| 678 | struct rbuf rb_0; |
| 679 | struct rbuf rb_1; |
| 680 | uint64_t checkpoint_count_0 = 0; |
| 681 | uint64_t checkpoint_count_1 = 0; |
| 682 | LSN checkpoint_lsn_0; |
| 683 | LSN checkpoint_lsn_1; |
| 684 | uint32_t version_0 = 0, version_1 = 0, version = 0; |
| 685 | bool h0_acceptable = false; |
| 686 | bool h1_acceptable = false; |
| 687 | struct rbuf *rb = NULL; |
| 688 | int r0, r1, r = 0; |
| 689 | |
| 690 | toku_off_t = 0; |
| 691 | r0 = deserialize_ft_from_fd_into_rbuf(fd, |
| 692 | header_0_off, |
| 693 | &rb_0, |
| 694 | &checkpoint_count_0, |
| 695 | &checkpoint_lsn_0, |
| 696 | &version_0); |
| 697 | if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) { |
| 698 | h0_acceptable = true; |
| 699 | } |
| 700 | |
| 701 | toku_off_t = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; |
| 702 | r1 = deserialize_ft_from_fd_into_rbuf(fd, |
| 703 | header_1_off, |
| 704 | &rb_1, |
| 705 | &checkpoint_count_1, |
| 706 | &checkpoint_lsn_1, |
| 707 | &version_1); |
| 708 | if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) { |
| 709 | h1_acceptable = true; |
| 710 | } |
| 711 | |
| 712 | // if either header is too new, the dictionary is unreadable |
| 713 | if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW || |
| 714 | !(h0_acceptable || h1_acceptable)) { |
| 715 | // We were unable to read either header or at least one is too |
| 716 | // new. Certain errors are higher priority than others. Order of |
| 717 | // these if/else if is important. |
| 718 | if (r0 == TOKUDB_DICTIONARY_TOO_NEW || |
| 719 | r1 == TOKUDB_DICTIONARY_TOO_NEW) { |
| 720 | r = TOKUDB_DICTIONARY_TOO_NEW; |
| 721 | } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || |
| 722 | r1 == TOKUDB_DICTIONARY_TOO_OLD) { |
| 723 | r = TOKUDB_DICTIONARY_TOO_OLD; |
| 724 | } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) { |
| 725 | fprintf(stderr, "Both header checksums failed.\n" ); |
| 726 | r = TOKUDB_BAD_CHECKSUM; |
| 727 | } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || |
| 728 | r1 == TOKUDB_DICTIONARY_NO_HEADER) { |
| 729 | r = TOKUDB_DICTIONARY_NO_HEADER; |
| 730 | } else { |
| 731 | r = r0 ? r0 : r1; // Arbitrarily report the error from the |
| 732 | // first header, unless it's readable |
| 733 | } |
| 734 | |
| 735 | if (r != TOKUDB_DICTIONARY_NO_HEADER) { |
| 736 | dump_state_of_toku_deserialize_ft_from(); |
| 737 | } |
| 738 | |
| 739 | // it should not be possible for both headers to be later than the |
| 740 | // max_acceptable_lsn |
| 741 | invariant( |
| 742 | !((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) && |
| 743 | (r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn))); |
| 744 | invariant(r != 0); |
| 745 | goto exit; |
| 746 | } |
| 747 | |
| 748 | if (h0_acceptable && h1_acceptable) { |
| 749 | if (checkpoint_count_0 > checkpoint_count_1) { |
| 750 | if (!(checkpoint_count_0 == checkpoint_count_1 + 1) || |
| 751 | !(version_0 >= version_1)) { |
| 752 | dump_state_of_toku_deserialize_ft_from(); |
| 753 | } |
| 754 | invariant(checkpoint_count_0 == checkpoint_count_1 + 1); |
| 755 | invariant(version_0 >= version_1); |
| 756 | rb = &rb_0; |
| 757 | version = version_0; |
| 758 | } else { |
| 759 | if (!(checkpoint_count_1 == checkpoint_count_0 + 1) || |
| 760 | !(version_1 >= version_0)) { |
| 761 | dump_state_of_toku_deserialize_ft_from(); |
| 762 | } |
| 763 | invariant(checkpoint_count_1 == checkpoint_count_0 + 1); |
| 764 | invariant(version_1 >= version_0); |
| 765 | rb = &rb_1; |
| 766 | version = version_1; |
| 767 | } |
| 768 | } else if (h0_acceptable) { |
| 769 | if (r1 == TOKUDB_BAD_CHECKSUM) { |
| 770 | // print something reassuring |
| 771 | fprintf( |
| 772 | stderr, |
| 773 | "Header 2 checksum failed, but header 1 ok. Proceeding.\n" ); |
| 774 | dump_state_of_toku_deserialize_ft_from(); |
| 775 | } |
| 776 | rb = &rb_0; |
| 777 | version = version_0; |
| 778 | } else if (h1_acceptable) { |
| 779 | if (r0 == TOKUDB_BAD_CHECKSUM) { |
| 780 | // print something reassuring |
| 781 | fprintf( |
| 782 | stderr, |
| 783 | "Header 1 checksum failed, but header 2 ok. Proceeding.\n" ); |
| 784 | dump_state_of_toku_deserialize_ft_from(); |
| 785 | } |
| 786 | rb = &rb_1; |
| 787 | version = version_1; |
| 788 | } |
| 789 | |
| 790 | if (!rb) { |
| 791 | dump_state_of_toku_deserialize_ft_from(); |
| 792 | } |
| 793 | paranoid_invariant(rb); |
| 794 | r = deserialize_ft_versioned(fd, rb, ft, version); |
| 795 | |
| 796 | exit: |
| 797 | if (rb_0.buf) { |
| 798 | toku_free(rb_0.buf); |
| 799 | } |
| 800 | if (rb_1.buf) { |
| 801 | toku_free(rb_1.buf); |
| 802 | } |
| 803 | return r; |
| 804 | } |
| 805 | |
| 806 | size_t (FT_HEADER h) { |
| 807 | size_t size = serialize_ft_min_size(h->layout_version); |
| 808 | // There is no dynamic data. |
| 809 | lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); |
| 810 | return size; |
| 811 | } |
| 812 | |
| 813 | void ( |
| 814 | struct wbuf *wbuf, |
| 815 | FT_HEADER h, |
| 816 | DISKOFF translation_location_on_disk, |
| 817 | DISKOFF translation_size_on_disk |
| 818 | ) |
| 819 | { |
| 820 | wbuf_literal_bytes(wbuf, "tokudata" , 8); |
| 821 | wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order |
| 822 | wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order |
| 823 | wbuf_network_int (wbuf, wbuf->size); //MUST be in network order regardless of disk order |
| 824 | wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order |
| 825 | wbuf_ulonglong(wbuf, h->checkpoint_count); |
| 826 | wbuf_LSN (wbuf, h->checkpoint_lsn); |
| 827 | wbuf_int (wbuf, h->nodesize); |
| 828 | |
| 829 | wbuf_DISKOFF(wbuf, translation_location_on_disk); |
| 830 | wbuf_DISKOFF(wbuf, translation_size_on_disk); |
| 831 | wbuf_BLOCKNUM(wbuf, h->root_blocknum); |
| 832 | wbuf_int(wbuf, h->flags); |
| 833 | wbuf_int(wbuf, h->layout_version_original); |
| 834 | wbuf_int(wbuf, h->build_id_original); |
| 835 | wbuf_ulonglong(wbuf, h->time_of_creation); |
| 836 | wbuf_ulonglong(wbuf, h->time_of_last_modification); |
| 837 | wbuf_TXNID(wbuf, h->root_xid_that_created); |
| 838 | wbuf_int(wbuf, h->basementnodesize); |
| 839 | wbuf_ulonglong(wbuf, h->time_of_last_verification); |
| 840 | wbuf_ulonglong(wbuf, h->on_disk_stats.numrows); |
| 841 | wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes); |
| 842 | wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin); |
| 843 | wbuf_ulonglong(wbuf, h->time_of_last_optimize_end); |
| 844 | wbuf_int(wbuf, h->count_of_optimize_in_progress); |
| 845 | wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize); |
| 846 | wbuf_char(wbuf, (unsigned char) h->compression_method); |
| 847 | wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade); |
| 848 | wbuf_MSN(wbuf, h->max_msn_in_ft); |
| 849 | wbuf_int(wbuf, h->fanout); |
| 850 | wbuf_ulonglong(wbuf, h->on_disk_logical_rows); |
| 851 | uint32_t checksum = toku_x1764_finish(&wbuf->checksum); |
| 852 | wbuf_int(wbuf, checksum); |
| 853 | lazy_assert(wbuf->ndone == wbuf->size); |
| 854 | } |
| 855 | |
| 856 | void (int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) { |
| 857 | lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS); |
| 858 | struct wbuf w_translation; |
| 859 | int64_t size_translation; |
| 860 | int64_t address_translation; |
| 861 | |
| 862 | // Must serialize translation first, to get address,size for header. |
| 863 | bt->serialize_translation_to_wbuf( |
| 864 | fd, &w_translation, &address_translation, &size_translation); |
| 865 | invariant(size_translation == w_translation.ndone); |
| 866 | |
| 867 | // the number of bytes available in the buffer is 0 mod 512, and those last |
| 868 | // bytes are all initialized. |
| 869 | invariant(w_translation.size % 512 == 0); |
| 870 | |
| 871 | struct wbuf w_main; |
| 872 | size_t size_main = toku_serialize_ft_size(h); |
| 873 | size_t size_main_aligned = roundup_to_multiple(512, size_main); |
| 874 | invariant(size_main_aligned < |
| 875 | BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); |
| 876 | char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf); |
| 877 | for (size_t i = size_main; i < size_main_aligned; i++) |
| 878 | mainbuf[i] = 0; // initialize the end of the buffer with zeros |
| 879 | wbuf_init(&w_main, mainbuf, size_main); |
| 880 | toku_serialize_ft_to_wbuf( |
| 881 | &w_main, h, address_translation, size_translation); |
| 882 | lazy_assert(w_main.ndone == size_main); |
| 883 | |
| 884 | // Actually write translation table |
| 885 | // This write is guaranteed to read good data at the end of the buffer, |
| 886 | // since the |
| 887 | // w_translation.buf is padded with zeros to a 512-byte boundary. |
| 888 | toku_os_full_pwrite(fd, |
| 889 | w_translation.buf, |
| 890 | roundup_to_multiple(512, size_translation), |
| 891 | address_translation); |
| 892 | |
| 893 | // Everything but the header MUST be on disk before header starts. |
| 894 | // Otherwise we will think the header is good and some blocks might not |
| 895 | // yet be on disk. |
| 896 | // If the header has a cachefile we need to do cachefile fsync (to |
| 897 | // prevent crash if we redirected to dev null) |
| 898 | // If there is no cachefile we still need to do an fsync. |
| 899 | if (cf) { |
| 900 | toku_cachefile_fsync(cf); |
| 901 | } else { |
| 902 | toku_file_fsync(fd); |
| 903 | } |
| 904 | |
| 905 | // Alternate writing header to two locations: |
| 906 | // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE |
| 907 | toku_off_t main_offset; |
| 908 | main_offset = (h->checkpoint_count & 0x1) |
| 909 | ? 0 |
| 910 | : BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; |
| 911 | toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset); |
| 912 | toku_free(w_main.buf); |
| 913 | toku_free(w_translation.buf); |
| 914 | } |
| 915 | |