/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.

Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include <my_global.h>
#include "portability/memory.h"
#include "portability/toku_assert.h"
#include "portability/toku_portability.h"
#include "portability/toku_pthread.h"

// ugly but pragmatic, need access to dirty bits while holding translation lock
// TODO: Refactor this (possibly with FT-301)
#include "ft/ft-internal.h"

// TODO: reorganize this dependency (FT-303)
#include "ft/ft-ops.h"  // for toku_maybe_truncate_file
#include "ft/serialize/block_table.h"
#include "ft/serialize/rbuf.h"
#include "ft/serialize/wbuf.h"
#include "ft/serialize/block_allocator.h"
#include "util/nb_mutex.h"
#include "util/scoped_malloc.h"

toku_instr_key *block_table_mutex_key;
toku_instr_key *safe_file_size_lock_mutex_key;
toku_instr_key *safe_file_size_lock_rwlock_key;

// indicates the end of a freelist
static const BLOCKNUM freelist_null = {-1};

// value of block_translation_pair.size if blocknum is unused
static const DISKOFF size_is_free = (DISKOFF)-1;

// value of block_translation_pair.u.diskoff if blocknum is used but does not
// yet have a diskblock
static const DISKOFF diskoff_unused = (DISKOFF)-2;
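
// Taken together, these sentinels give a block_translation_pair three
// possible states:
//   size == size_is_free                      blocknum is unused (linked into
//                                             the blocknum freelist)
//   size == 0 && u.diskoff == diskoff_unused  blocknum is in use, but no disk
//                                             block has been assigned yet
//   size > 0  && u.diskoff >= 0               blocknum is in use and maps to
//                                             an allocated disk block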

void block_table::_mutex_lock() { toku_mutex_lock(&_mutex); }

void block_table::_mutex_unlock() { toku_mutex_unlock(&_mutex); }

// TODO: Move lock to FT
void toku_ft_lock(FT ft) {
    block_table *bt = &ft->blocktable;
    bt->_mutex_lock();
}

// TODO: Move lock to FT
void toku_ft_unlock(FT ft) {
    block_table *bt = &ft->blocktable;
    toku_mutex_assert_locked(&bt->_mutex);
    bt->_mutex_unlock();
}

// There are two headers: the reserve must fit them both and be suitably
// aligned.
static_assert(BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE %
                      BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT ==
                  0,
              "Block allocator's header reserve must be suitably aligned");
static_assert(
    BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 ==
        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
    "Block allocator's total header reserve must exactly fit two headers");

// does NOT initialize the block allocator: the caller is responsible
void block_table::_create_internal() {
    memset(&_current, 0, sizeof(struct translation));
    memset(&_inprogress, 0, sizeof(struct translation));
    memset(&_checkpointed, 0, sizeof(struct translation));
    memset(&_mutex, 0, sizeof(_mutex));
    _bt_block_allocator = new BlockAllocator();
    toku_mutex_init(*block_table_mutex_key, &_mutex, nullptr);
    nb_mutex_init(*safe_file_size_lock_mutex_key,
                  *safe_file_size_lock_rwlock_key,
                  &_safe_file_size_lock);
}

// Fill in the checkpointed translation from the buffer, and copy the
// checkpointed translation to current. The one read from disk is the last
// known checkpointed one, so we keep it in place and then set current (which
// is never stored on disk) for current use. The translation_buffer holds the
// translation only; we create the rest of the block_table.
int block_table::create_from_buffer(
    int fd,
    DISKOFF location_on_disk,  // Location of translation_buffer
    DISKOFF size_on_disk,
    unsigned char *translation_buffer) {
    // Does not initialize the block allocator
    _create_internal();

    // Deserialize the translation and copy it to current
    int r = _translation_deserialize_from_buffer(
        &_checkpointed, location_on_disk, size_on_disk, translation_buffer);
    if (r != 0) {
        return r;
    }
    _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT);

    // Determine the file size
    int64_t file_size = 0;
    r = toku_os_get_file_size(fd, &file_size);
    lazy_assert_zero(r);
    invariant(file_size >= 0);
    _safe_file_size = file_size;

    // Gather the non-empty translations and use them to create the block
    // allocator
    toku::scoped_malloc pairs_buf(_checkpointed.smallest_never_used_blocknum.b *
                                  sizeof(struct BlockAllocator::BlockPair));
    struct BlockAllocator::BlockPair *CAST_FROM_VOIDP(pairs, pairs_buf.get());
    uint64_t n_pairs = 0;
    for (int64_t i = 0; i < _checkpointed.smallest_never_used_blocknum.b; i++) {
        struct block_translation_pair pair = _checkpointed.block_translation[i];
        if (pair.size > 0) {
            invariant(pair.u.diskoff != diskoff_unused);
            pairs[n_pairs++] =
                BlockAllocator::BlockPair(pair.u.diskoff, pair.size);
        }
    }

    _bt_block_allocator->CreateFromBlockPairs(
        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
        BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT,
        pairs,
        n_pairs);

    return 0;
}

void block_table::create() {
    // Does not initialize the block allocator
    _create_internal();

    _checkpointed.type = TRANSLATION_CHECKPOINTED;
    _checkpointed.smallest_never_used_blocknum =
        make_blocknum(RESERVED_BLOCKNUMS);
    _checkpointed.length_of_array =
        _checkpointed.smallest_never_used_blocknum.b;
    _checkpointed.blocknum_freelist_head = freelist_null;
    XMALLOC_N(_checkpointed.length_of_array, _checkpointed.block_translation);
    for (int64_t i = 0; i < _checkpointed.length_of_array; i++) {
        _checkpointed.block_translation[i].size = 0;
        _checkpointed.block_translation[i].u.diskoff = diskoff_unused;
    }

    // We just created a default checkpointed translation, now copy it to
    // current.
    _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT);

    // Create an empty block allocator.
    _bt_block_allocator->Create(
        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
        BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT);
}

// TODO: Refactor with FT-303
static void ft_set_dirty(FT ft, bool for_checkpoint) {
    invariant(ft->h->type == FT_CURRENT);
    if (for_checkpoint) {
        invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
        ft->checkpoint_header->dirty = 1;
    } else {
        ft->h->dirty = 1;
    }
}

void block_table::_maybe_truncate_file(int fd, uint64_t size_needed_before) {
    toku_mutex_assert_locked(&_mutex);
    uint64_t new_size_needed = _bt_block_allocator->AllocatedLimit();
    // Save a call to toku_os_get_file_size (kernel call) if unlikely to be
    // useful.
    if (new_size_needed < size_needed_before &&
        new_size_needed < _safe_file_size) {
        nb_mutex_lock(&_safe_file_size_lock, &_mutex);

        // Must hold _safe_file_size_lock to change _safe_file_size.
        if (new_size_needed < _safe_file_size) {
            int64_t safe_file_size_before = _safe_file_size;
            // Not safe to use the 'to-be-truncated' portion until truncate is
            // done.
            _safe_file_size = new_size_needed;
            _mutex_unlock();

            uint64_t size_after;
            toku_maybe_truncate_file(
                fd, new_size_needed, safe_file_size_before, &size_after);
            _mutex_lock();

            _safe_file_size = size_after;
        }
        nb_mutex_unlock(&_safe_file_size_lock);
    }
}

void block_table::maybe_truncate_file_on_open(int fd) {
    _mutex_lock();
    _maybe_truncate_file(fd, _safe_file_size);
    _mutex_unlock();
}

void block_table::_copy_translation(struct translation *dst,
                                    struct translation *src,
                                    enum translation_type newtype) {
    // We intend to malloc a fresh block, so the incoming translation should be
    // empty
    invariant_null(dst->block_translation);

    invariant(src->length_of_array >= src->smallest_never_used_blocknum.b);
    invariant(newtype == TRANSLATION_DEBUG ||
              (src->type == TRANSLATION_CURRENT &&
               newtype == TRANSLATION_INPROGRESS) ||
              (src->type == TRANSLATION_CHECKPOINTED &&
               newtype == TRANSLATION_CURRENT));
    dst->type = newtype;
    dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum;
    dst->blocknum_freelist_head = src->blocknum_freelist_head;

    // The destination btt is of fixed size. Allocate + memcpy the exact
    // length necessary.
    dst->length_of_array = dst->smallest_never_used_blocknum.b;
    XMALLOC_N(dst->length_of_array, dst->block_translation);
    memcpy(dst->block_translation,
           src->block_translation,
           dst->length_of_array * sizeof(*dst->block_translation));

    // New version of btt is not yet stored on disk.
    dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size = 0;
    dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff =
        diskoff_unused;
}

int64_t block_table::get_blocks_in_use_unlocked() {
    BLOCKNUM b;
    struct translation *t = &_current;
    int64_t num_blocks = 0;
    {
        // Reserved blocknums do not get upgraded; they are part of the header.
        for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b;
             b.b++) {
            if (t->block_translation[b.b].size != size_is_free) {
                num_blocks++;
            }
        }
    }
    return num_blocks;
}

void block_table::_maybe_optimize_translation(struct translation *t) {
    // Reduce 'smallest_never_used_blocknum.b' (completely free those
    // blocknums instead of just leaving them on a free list). Doing so
    // requires us to regenerate the free list. That is O(n) work, so do it
    // only when we are already doing O(n) work anyway.

    BLOCKNUM b;
    paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
    // Calculate how large the free suffix is.
    int64_t freed;
    {
        for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS;
             b.b--) {
            if (t->block_translation[b.b - 1].size != size_is_free) {
                break;
            }
        }
        freed = t->smallest_never_used_blocknum.b - b.b;
    }
    if (freed > 0) {
        t->smallest_never_used_blocknum.b = b.b;
        if (t->length_of_array / 4 > t->smallest_never_used_blocknum.b) {
            // We're using more memory than necessary to represent this now.
            // Reduce.
            uint64_t new_length = t->smallest_never_used_blocknum.b * 2;
            XREALLOC_N(new_length, t->block_translation);
            t->length_of_array = new_length;
            // No need to zero anything out.
        }

        // Regenerate free list.
        t->blocknum_freelist_head.b = freelist_null.b;
        for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b;
             b.b++) {
            if (t->block_translation[b.b].size == size_is_free) {
                t->block_translation[b.b].u.next_free_blocknum =
                    t->blocknum_freelist_head;
                t->blocknum_freelist_head = b;
            }
        }
    }
}
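
// Illustration: if the tail blocknums [k, smallest_never_used_blocknum) are
// all free, smallest_never_used_blocknum drops to k, the array may shrink to
// half its length once it is more than 4x oversized, and the freelist is
// rebuilt from the surviving free blocknums below k.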

// block table must be locked by caller of this function
void block_table::note_start_checkpoint_unlocked() {
    toku_mutex_assert_locked(&_mutex);

    // We're going to do O(n) work to copy the translation, so we
    // can afford to do O(n) work by optimizing the translation
    _maybe_optimize_translation(&_current);

    // Copy current translation to inprogress translation.
    _copy_translation(&_inprogress, &_current, TRANSLATION_INPROGRESS);

    _checkpoint_skipped = false;
}

// Purpose: alert the block translation that the checkpoint was skipped, e.g.
// because of a non-dirty header
void block_table::note_skipped_checkpoint() {
    _mutex_lock();
    paranoid_invariant_notnull(_inprogress.block_translation);
    _checkpoint_skipped = true;
    _mutex_unlock();
}

// Purpose: free any disk space used by the previous checkpoint that isn't in
// use by either
//  - the current state, or
//  - the in-progress checkpoint,
// then capture inprogress as the new checkpointed translation:
//   For each entry in the checkpointed btt
//     if offset does not match offset in inprogress
//       assert offset does not match offset in current
//       free (offset, len) from checkpoint
//   move inprogress to checkpoint (resetting type)
//   inprogress = NULL
void block_table::note_end_checkpoint(int fd) {
    // Free unused blocks
    _mutex_lock();
    uint64_t allocated_limit_at_start = _bt_block_allocator->AllocatedLimit();
    paranoid_invariant_notnull(_inprogress.block_translation);
    if (_checkpoint_skipped) {
        toku_free(_inprogress.block_translation);
        memset(&_inprogress, 0, sizeof(_inprogress));
        goto end;
    }

    // Make certain inprogress was allocated space on disk
    invariant(
        _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0);
    invariant(
        _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff >
        0);

    {
        struct translation *t = &_checkpointed;
        for (int64_t i = 0; i < t->length_of_array; i++) {
            struct block_translation_pair *pair = &t->block_translation[i];
            if (pair->size > 0 &&
                !_translation_prevents_freeing(
                    &_inprogress, make_blocknum(i), pair)) {
                invariant(!_translation_prevents_freeing(
                    &_current, make_blocknum(i), pair));
                _bt_block_allocator->FreeBlock(pair->u.diskoff, pair->size);
            }
        }
        toku_free(_checkpointed.block_translation);
        _checkpointed = _inprogress;
        _checkpointed.type = TRANSLATION_CHECKPOINTED;
        memset(&_inprogress, 0, sizeof(_inprogress));
        _maybe_truncate_file(fd, allocated_limit_at_start);
    }
end:
    _mutex_unlock();
}

bool block_table::_is_valid_blocknum(struct translation *t, BLOCKNUM b) {
    invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
    return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b;
}

void block_table::_verify_valid_blocknum(struct translation *UU(t),
                                         BLOCKNUM UU(b)) {
    invariant(_is_valid_blocknum(t, b));
}

bool block_table::_is_valid_freeable_blocknum(struct translation *t,
                                              BLOCKNUM b) {
    invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
    return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b;
}

// should be freeable
void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t),
                                                  BLOCKNUM UU(b)) {
    invariant(_is_valid_freeable_blocknum(t, b));
}

// Also used only in ft-serialize-test.
void block_table::block_free(uint64_t offset, uint64_t size) {
    _mutex_lock();
    _bt_block_allocator->FreeBlock(offset, size);
    _mutex_unlock();
}
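
// The serialized translation has a fixed layout (written by
// serialize_translation_to_wbuf and read back by
// _translation_deserialize_from_buffer): two 8-byte fields
// (smallest_never_used_blocknum, blocknum_freelist_head), then one
// (diskoff, size) pair of 8-byte fields per blocknum, then a 4-byte x1764
// checksum.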
int64_t block_table::_calculate_size_on_disk(struct translation *t) {
    return 8 +  // smallest_never_used_blocknum
           8 +  // blocknum_freelist_head
           t->smallest_never_used_blocknum.b * 16 +  // Array
           4;  // 4 for checksum
}

// We cannot free the disk space allocated to this blocknum if it is still in
// use by the given translation table.
bool block_table::_translation_prevents_freeing(
    struct translation *t,
    BLOCKNUM b,
    struct block_translation_pair *old_pair) {
    return t->block_translation && b.b < t->smallest_never_used_blocknum.b &&
           old_pair->u.diskoff == t->block_translation[b.b].u.diskoff;
}
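
// For example, if the checkpointed translation still maps blocknum b to the
// same disk offset, that offset may still be read as part of the last
// consistent checkpoint, so the old block cannot be recycled yet.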

void block_table::_realloc_on_disk_internal(BLOCKNUM b,
                                            DISKOFF size,
                                            DISKOFF *offset,
                                            FT ft,
                                            bool for_checkpoint) {
    toku_mutex_assert_locked(&_mutex);
    ft_set_dirty(ft, for_checkpoint);

    struct translation *t = &_current;
    struct block_translation_pair old_pair = t->block_translation[b.b];
    // Free the old block if it is not still in use by the checkpoint in
    // progress or the previous checkpoint
    bool cannot_free =
        (!for_checkpoint &&
         _translation_prevents_freeing(&_inprogress, b, &old_pair)) ||
        _translation_prevents_freeing(&_checkpointed, b, &old_pair);
    if (!cannot_free && old_pair.u.diskoff != diskoff_unused) {
        _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size);
    }

    uint64_t allocator_offset = diskoff_unused;
    t->block_translation[b.b].size = size;
    if (size > 0) {
        // Allocate a new block if the size is greater than 0;
        // if the size is 0, offset will be set to diskoff_unused
        _bt_block_allocator->AllocBlock(size, &allocator_offset);
    }
    t->block_translation[b.b].u.diskoff = allocator_offset;
    *offset = allocator_offset;

    // Update the inprogress btt if appropriate (if called because the Pending
    // bit is set).
    if (for_checkpoint) {
        paranoid_invariant(b.b < _inprogress.length_of_array);
        _inprogress.block_translation[b.b] = t->block_translation[b.b];
    }
}

void block_table::_ensure_safe_write_unlocked(int fd,
                                              DISKOFF block_size,
                                              DISKOFF block_offset) {
    // Requires: holding _mutex
    uint64_t size_needed = block_size + block_offset;
    if (size_needed > _safe_file_size) {
        // Must hold _safe_file_size_lock to change _safe_file_size.
        nb_mutex_lock(&_safe_file_size_lock, &_mutex);
        if (size_needed > _safe_file_size) {
            _mutex_unlock();

            int64_t size_after;
            toku_maybe_preallocate_in_file(
                fd, size_needed, _safe_file_size, &size_after);

            _mutex_lock();
            _safe_file_size = size_after;
        }
        nb_mutex_unlock(&_safe_file_size_lock);
    }
}
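
// For example, writing a 4096-byte block at offset 1048576 needs
// _safe_file_size >= 1052672; if the file is shorter, it is preallocated (and
// _safe_file_size raised) before the caller is allowed to issue the write.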

void block_table::realloc_on_disk(BLOCKNUM b,
                                  DISKOFF size,
                                  DISKOFF *offset,
                                  FT ft,
                                  int fd,
                                  bool for_checkpoint) {
    _mutex_lock();
    struct translation *t = &_current;
    _verify_valid_freeable_blocknum(t, b);
    _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint);

    _ensure_safe_write_unlocked(fd, size, *offset);
    _mutex_unlock();
}

bool block_table::_pair_is_unallocated(struct block_translation_pair *pair) {
    return pair->size == 0 && pair->u.diskoff == diskoff_unused;
}

// Effect: figure out where to put the inprogress btt on disk, and allocate
// space for it there. The space must be 512-byte aligned (both the starting
// address and the size). As a result, the allocated space may be a little
// bigger (up to the next 512-byte boundary) than the actual btt.
void block_table::_alloc_inprogress_translation_on_disk_unlocked() {
    toku_mutex_assert_locked(&_mutex);

    struct translation *t = &_inprogress;
    paranoid_invariant_notnull(t->block_translation);
    BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
    // Each inprogress is allocated only once
    paranoid_invariant(_pair_is_unallocated(&t->block_translation[b.b]));

    // Allocate a new block
    int64_t size = _calculate_size_on_disk(t);
    uint64_t offset;
    _bt_block_allocator->AllocBlock(size, &offset);
    t->block_translation[b.b].u.diskoff = offset;
    t->block_translation[b.b].size = size;
}

// Effect: Serializes the blocktable to a wbuf (which starts uninitialized).
// A clean shutdown runs checkpoint start so that current and inprogress are
// copies. The wbuf buffer is guaranteed to be 512-byte aligned and its total
// length is a multiple of 512 (we pad with zeros at the end if needed). The
// returned address is 512-byte aligned, but the returned size is not; it *is*
// guaranteed, however, that we can read up to the next 512-byte boundary.
void block_table::serialize_translation_to_wbuf(int fd,
                                                struct wbuf *w,
                                                int64_t *address,
                                                int64_t *size) {
    _mutex_lock();
    struct translation *t = &_inprogress;

    BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
    // The allocated block must be 512-byte aligned to make O_DIRECT happy.
    _alloc_inprogress_translation_on_disk_unlocked();
    uint64_t size_translation = _calculate_size_on_disk(t);
    uint64_t size_aligned = roundup_to_multiple(512, size_translation);
    invariant((int64_t)size_translation == t->block_translation[b.b].size);
    {
        // Init wbuf
        if (0)
            printf(
                "%s:%d writing translation table of size_translation %" PRIu64
                " at %" PRId64 "\n",
                __FILE__,
                __LINE__,
                size_translation,
                t->block_translation[b.b].u.diskoff);
        char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
        // Fill in the end of the buffer with zeros.
        for (uint64_t i = size_translation; i < size_aligned; i++)
            buf[i] = 0;
        wbuf_init(w, buf, size_aligned);
    }
    wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum);
    wbuf_BLOCKNUM(w, t->blocknum_freelist_head);
    int64_t i;
    for (i = 0; i < t->smallest_never_used_blocknum.b; i++) {
        if (0)
            printf("%s:%d %" PRId64 ",%" PRId64 "\n",
                   __FILE__,
                   __LINE__,
                   t->block_translation[i].u.diskoff,
                   t->block_translation[i].size);
        wbuf_DISKOFF(w, t->block_translation[i].u.diskoff);
        wbuf_DISKOFF(w, t->block_translation[i].size);
    }
    uint32_t checksum = toku_x1764_finish(&w->checksum);
    wbuf_int(w, checksum);
    *address = t->block_translation[b.b].u.diskoff;
    *size = size_translation;
    invariant((*address) % 512 == 0);

    _ensure_safe_write_unlocked(fd, size_aligned, *address);
    _mutex_unlock();
}

// Perhaps rename: the purpose is to get the disk address of a block, given
// its blocknum (blockid?)
void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b,
                                                              DISKOFF *offset,
                                                              DISKOFF *size) {
    struct translation *t = &_current;
    _verify_valid_blocknum(t, b);
    if (offset) {
        *offset = t->block_translation[b.b].u.diskoff;
    }
    if (size) {
        *size = t->block_translation[b.b].size;
    }
}

// Perhaps rename: the purpose is to get the disk address of a block, given
// its blocknum (blockid?)
void block_table::translate_blocknum_to_offset_size(BLOCKNUM b,
                                                    DISKOFF *offset,
                                                    DISKOFF *size) {
    _mutex_lock();
    _translate_blocknum_to_offset_size_unlocked(b, offset, size);
    _mutex_unlock();
}

// Only called by toku_allocate_blocknum
// Effect: expand the array to maintain the size invariant,
// given that one more never-used blocknum will soon be used.
void block_table::_maybe_expand_translation(struct translation *t) {
    if (t->length_of_array <= t->smallest_never_used_blocknum.b) {
        // expansion is necessary
        uint64_t new_length = t->smallest_never_used_blocknum.b * 2;
        XREALLOC_N(new_length, t->block_translation);
        uint64_t i;
        for (i = t->length_of_array; i < new_length; i++) {
            t->block_translation[i].u.next_free_blocknum = freelist_null;
            t->block_translation[i].size = size_is_free;
        }
        t->length_of_array = new_length;
    }
}
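
// Note that the new tail entries are marked free but are not linked into the
// freelist; blocknums at or above smallest_never_used_blocknum are handed out
// by advancing that counter in _allocate_blocknum_unlocked below.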

void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) {
    toku_mutex_assert_locked(&_mutex);
    BLOCKNUM result;
    struct translation *t = &_current;
    if (t->blocknum_freelist_head.b == freelist_null.b) {
        // No previously used blocknums are available, so use a never-used
        // blocknum. Ensure one is available:
        _maybe_expand_translation(t);
        result = t->smallest_never_used_blocknum;
        t->smallest_never_used_blocknum.b++;
    } else {  // reuse a previously used blocknum
        result = t->blocknum_freelist_head;
        BLOCKNUM next = t->block_translation[result.b].u.next_free_blocknum;
        t->blocknum_freelist_head = next;
    }
    // Verify the blocknum is free
    paranoid_invariant(t->block_translation[result.b].size == size_is_free);
    // The blocknum is not free anymore
    t->block_translation[result.b].u.diskoff = diskoff_unused;
    t->block_translation[result.b].size = 0;
    _verify_valid_freeable_blocknum(t, result);
    *res = result;
    ft_set_dirty(ft, false);
}

void block_table::allocate_blocknum(BLOCKNUM *res, FT ft) {
    _mutex_lock();
    _allocate_blocknum_unlocked(res, ft);
    _mutex_unlock();
}
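
// A minimal usage sketch (hypothetical caller code, not part of this file):
// a caller allocates a blocknum, assigns it disk space when writing a block,
// and eventually frees it, with `size` and `fd` assumed to be in scope:
//
//     BLOCKNUM b;
//     DISKOFF offset;
//     ft->blocktable.allocate_blocknum(&b, ft);  // b -> (diskoff_unused, 0)
//     ft->blocktable.realloc_on_disk(b, size, &offset, ft, fd, false);
//     ft->blocktable.free_blocknum(&b, ft, false);  // b back on the freelist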

void block_table::_free_blocknum_in_translation(struct translation *t,
                                                BLOCKNUM b) {
    _verify_valid_freeable_blocknum(t, b);
    paranoid_invariant(t->block_translation[b.b].size != size_is_free);

    t->block_translation[b.b].size = size_is_free;
    t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head;
    t->blocknum_freelist_head = b;
}

// Effect: Free a blocknum.
// If the blocknum holds the only reference to a block on disk, free that
// block.
void block_table::_free_blocknum_unlocked(BLOCKNUM *bp,
                                          FT ft,
                                          bool for_checkpoint) {
    toku_mutex_assert_locked(&_mutex);
    BLOCKNUM b = *bp;
    bp->b = 0;  // Remove caller's reference.

    struct block_translation_pair old_pair = _current.block_translation[b.b];

    _free_blocknum_in_translation(&_current, b);
    if (for_checkpoint) {
        paranoid_invariant(ft->checkpoint_header->type ==
                           FT_CHECKPOINT_INPROGRESS);
        _free_blocknum_in_translation(&_inprogress, b);
    }

    // If the size is 0, no disk block has ever been assigned to this blocknum.
    if (old_pair.size > 0) {
        // Free the old block if it is not still in use by the checkpoint in
        // progress or the previous checkpoint
        bool cannot_free =
            _translation_prevents_freeing(&_inprogress, b, &old_pair) ||
            _translation_prevents_freeing(&_checkpointed, b, &old_pair);
        if (!cannot_free) {
            _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size);
        }
    } else {
        paranoid_invariant(old_pair.size == 0);
        paranoid_invariant(old_pair.u.diskoff == diskoff_unused);
    }
    ft_set_dirty(ft, for_checkpoint);
}

void block_table::free_blocknum(BLOCKNUM *bp, FT ft, bool for_checkpoint) {
    _mutex_lock();
    _free_blocknum_unlocked(bp, ft, for_checkpoint);
    _mutex_unlock();
}

// Verify there are no free blocks.
void block_table::verify_no_free_blocknums() {
    invariant(_current.blocknum_freelist_head.b == freelist_null.b);
}

// Frees blocknums that have a size of 0 and an unused diskoff.
// Currently used for eliminating unused cached rollback log nodes.
void block_table::free_unused_blocknums(BLOCKNUM root) {
    _mutex_lock();
    int64_t smallest = _current.smallest_never_used_blocknum.b;
    for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) {
        if (i == root.b) {
            continue;
        }
        BLOCKNUM b = make_blocknum(i);
        if (_current.block_translation[b.b].size == 0) {
            invariant(_current.block_translation[b.b].u.diskoff ==
                      diskoff_unused);
            _free_blocknum_in_translation(&_current, b);
        }
    }
    _mutex_unlock();
}

bool block_table::_no_data_blocks_except_root(BLOCKNUM root) {
    bool ok = true;
    _mutex_lock();
    int64_t smallest = _current.smallest_never_used_blocknum.b;
    if (root.b < RESERVED_BLOCKNUMS) {
        ok = false;
        goto cleanup;
    }
    for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) {
        if (i == root.b) {
            continue;
        }
        BLOCKNUM b = make_blocknum(i);
        if (_current.block_translation[b.b].size != size_is_free) {
            ok = false;
            goto cleanup;
        }
    }
cleanup:
    _mutex_unlock();
    return ok;
}

// Verify there are no data blocks except root.
// TODO(leif): This actually takes a lock, but I don't want to fix all the
// callers right now.
void block_table::verify_no_data_blocks_except_root(BLOCKNUM UU(root)) {
    paranoid_invariant(_no_data_blocks_except_root(root));
}

bool block_table::_blocknum_allocated(BLOCKNUM b) {
    _mutex_lock();
    struct translation *t = &_current;
    _verify_valid_blocknum(t, b);
    bool ok = t->block_translation[b.b].size != size_is_free;
    _mutex_unlock();
    return ok;
}

// Verify a blocknum is currently allocated.
void block_table::verify_blocknum_allocated(BLOCKNUM UU(b)) {
    paranoid_invariant(_blocknum_allocated(b));
}

// Only used by toku_dump_translation_table (debug info)
void block_table::_dump_translation_internal(FILE *f, struct translation *t) {
    if (t->block_translation) {
        BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
        fprintf(f, " length_of_array[%" PRId64 "]", t->length_of_array);
        fprintf(f,
                " smallest_never_used_blocknum[%" PRId64 "]",
                t->smallest_never_used_blocknum.b);
        fprintf(f,
                " blocknum_free_list_head[%" PRId64 "]",
                t->blocknum_freelist_head.b);
        fprintf(
            f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size);
        fprintf(f,
                " location_on_disk[%" PRId64 "]\n",
                t->block_translation[b.b].u.diskoff);
        int64_t i;
        for (i = 0; i < t->length_of_array; i++) {
            fprintf(f,
                    " %" PRId64 ": %" PRId64 " %" PRId64 "\n",
                    i,
                    t->block_translation[i].u.diskoff,
                    t->block_translation[i].size);
        }
        fprintf(f, "\n");
    } else {
        fprintf(f, " does not exist\n");
    }
}

// Only used by toku_ft_dump, which is only for debugging purposes.
// "pretty" just means we use tabs so we can parse the output more easily
// later.
void block_table::dump_translation_table_pretty(FILE *f) {
    _mutex_lock();
    struct translation *t = &_checkpointed;
    invariant(t->block_translation != nullptr);
    for (int64_t i = 0; i < t->length_of_array; ++i) {
        fprintf(f,
                "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n",
                i,
                t->block_translation[i].u.diskoff,
                t->block_translation[i].size);
    }
    _mutex_unlock();
}

// Only used by toku_ft_dump, which is only for debugging purposes.
void block_table::dump_translation_table(FILE *f) {
    _mutex_lock();
    fprintf(f, "Current block translation:");
    _dump_translation_internal(f, &_current);
    fprintf(f, "Checkpoint in progress block translation:");
    _dump_translation_internal(f, &_inprogress);
    fprintf(f, "Checkpointed block translation:");
    _dump_translation_internal(f, &_checkpointed);
    _mutex_unlock();
}

// Only used by ftdump
void block_table::blocknum_dump_translation(BLOCKNUM b) {
    _mutex_lock();

    struct translation *t = &_current;
    if (b.b < t->length_of_array) {
        struct block_translation_pair *bx = &t->block_translation[b.b];
        printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n",
               b.b,
               bx->u.diskoff,
               bx->size);
    }
    _mutex_unlock();
}

// Must not call this function when anything else is using the blocktable.
// No one may use the blocktable afterwards.
void block_table::destroy(void) {
    // TODO: translation.destroy();
    toku_free(_current.block_translation);
    toku_free(_inprogress.block_translation);
    toku_free(_checkpointed.block_translation);

    _bt_block_allocator->Destroy();
    delete _bt_block_allocator;
    toku_mutex_destroy(&_mutex);
    nb_mutex_destroy(&_safe_file_size_lock);
}

int block_table::_translation_deserialize_from_buffer(
    struct translation *t,
    DISKOFF location_on_disk,
    uint64_t size_on_disk,
    // in: buffer with serialized translation
    unsigned char *translation_buffer) {
    int r = 0;
    invariant(location_on_disk != 0);
    t->type = TRANSLATION_CHECKPOINTED;

    // Check the checksum
    uint32_t x1764 = toku_x1764_memory(translation_buffer, size_on_disk - 4);
    uint64_t offset = size_on_disk - 4;
    uint32_t stored_x1764 = toku_dtoh32(*(int *)(translation_buffer + offset));
    if (x1764 != stored_x1764) {
        fprintf(stderr,
                "Translation table checksum failure: calc=0x%08x read=0x%08x\n",
                x1764,
                stored_x1764);
        r = TOKUDB_BAD_CHECKSUM;
        goto exit;
    }

    struct rbuf rb;
    rb.buf = translation_buffer;
    rb.ndone = 0;
    rb.size = size_on_disk - 4;  // 4 == checksum

    t->smallest_never_used_blocknum = rbuf_blocknum(&rb);
    t->length_of_array = t->smallest_never_used_blocknum.b;
    invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
    t->blocknum_freelist_head = rbuf_blocknum(&rb);
    XMALLOC_N(t->length_of_array, t->block_translation);
    for (int64_t i = 0; i < t->length_of_array; i++) {
        t->block_translation[i].u.diskoff = rbuf_DISKOFF(&rb);
        t->block_translation[i].size = rbuf_DISKOFF(&rb);
    }
    invariant(_calculate_size_on_disk(t) == (int64_t)size_on_disk);
    invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size ==
              (int64_t)size_on_disk);
    invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff ==
              location_on_disk);

exit:
    return r;
}

int block_table::iterate(enum translation_type type,
                         BLOCKTABLE_CALLBACK f,
                         void *extra,
                         bool data_only,
                         bool used_only) {
    struct translation *src;

    int r = 0;
    switch (type) {
        case TRANSLATION_CURRENT:
            src = &_current;
            break;
        case TRANSLATION_INPROGRESS:
            src = &_inprogress;
            break;
        case TRANSLATION_CHECKPOINTED:
            src = &_checkpointed;
            break;
        default:
            r = EINVAL;
    }

    struct translation fakecurrent;
    memset(&fakecurrent, 0, sizeof(struct translation));

    struct translation *t = &fakecurrent;
    if (r == 0) {
        _mutex_lock();
        _copy_translation(t, src, TRANSLATION_DEBUG);
        t->block_translation[RESERVED_BLOCKNUM_TRANSLATION] =
            src->block_translation[RESERVED_BLOCKNUM_TRANSLATION];
        _mutex_unlock();
        int64_t i;
        for (i = 0; i < t->smallest_never_used_blocknum.b; i++) {
            struct block_translation_pair pair = t->block_translation[i];
            if (data_only && i < RESERVED_BLOCKNUMS)
                continue;
            if (used_only && pair.size <= 0)
                continue;
            r = f(make_blocknum(i), pair.size, pair.u.diskoff, extra);
            if (r != 0)
                break;
        }
        toku_free(t->block_translation);
    }
    return r;
}

typedef struct {
    int64_t used_space;
    int64_t total_space;
} frag_extra;

static int frag_helper(BLOCKNUM UU(b),
                       int64_t size,
                       int64_t address,
                       void *extra) {
    frag_extra *info = (frag_extra *)extra;

    if (size + address > info->total_space)
        info->total_space = size + address;
    info->used_space += size;
    return 0;
}

void block_table::internal_fragmentation(int64_t *total_sizep,
                                         int64_t *used_sizep) {
    frag_extra info = {0, 0};
    int r = iterate(TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true);
    invariant_zero(r);

    if (total_sizep)
        *total_sizep = info.total_space;
    if (used_sizep)
        *used_sizep = info.used_space;
}

void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size,
                                                       DISKOFF *offset,
                                                       FT ft) {
    toku_mutex_assert_locked(&_mutex);
    BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR);
    _realloc_on_disk_internal(b, size, offset, ft, false);
}

void block_table::realloc_descriptor_on_disk(DISKOFF size,
                                             DISKOFF *offset,
                                             FT ft,
                                             int fd) {
    _mutex_lock();
    _realloc_descriptor_on_disk_unlocked(size, offset, ft);
    _ensure_safe_write_unlocked(fd, size, *offset);
    _mutex_unlock();
}

void block_table::get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size) {
    _mutex_lock();
    BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR);
    _translate_blocknum_to_offset_size_unlocked(b, offset, size);
    _mutex_unlock();
}

void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) {
    // Requires: blocktable lock is held.
    // Requires: report->file_size_bytes is already filled in.

    // Count the headers.
    report->data_bytes = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
    report->data_blocks = 1;
    report->checkpoint_bytes_additional =
        BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
    report->checkpoint_blocks_additional = 1;

    struct translation *current = &_current;
    for (int64_t i = 0; i < current->length_of_array; i++) {
        struct block_translation_pair *pair = &current->block_translation[i];
        if (pair->size > 0) {
            report->data_bytes += pair->size;
            report->data_blocks++;
        }
    }

    struct translation *checkpointed = &_checkpointed;
    for (int64_t i = 0; i < checkpointed->length_of_array; i++) {
        struct block_translation_pair *pair =
            &checkpointed->block_translation[i];
        if (pair->size > 0 &&
            !(i < current->length_of_array &&
              current->block_translation[i].size > 0 &&
              current->block_translation[i].u.diskoff == pair->u.diskoff)) {
            report->checkpoint_bytes_additional += pair->size;
            report->checkpoint_blocks_additional++;
        }
    }

    struct translation *inprogress = &_inprogress;
    for (int64_t i = 0; i < inprogress->length_of_array; i++) {
        struct block_translation_pair *pair = &inprogress->block_translation[i];
        if (pair->size > 0 &&
            !(i < current->length_of_array &&
              current->block_translation[i].size > 0 &&
              current->block_translation[i].u.diskoff == pair->u.diskoff) &&
            !(i < checkpointed->length_of_array &&
              checkpointed->block_translation[i].size > 0 &&
              checkpointed->block_translation[i].u.diskoff ==
                  pair->u.diskoff)) {
            report->checkpoint_bytes_additional += pair->size;
            report->checkpoint_blocks_additional++;
        }
    }

    _bt_block_allocator->UnusedStatistics(report);
}

void block_table::get_info64(struct ftinfo64 *s) {
    _mutex_lock();

    struct translation *current = &_current;
    s->num_blocks_allocated = current->length_of_array;
    s->num_blocks_in_use = 0;
    s->size_allocated = 0;
    s->size_in_use = 0;

    for (int64_t i = 0; i < current->length_of_array; ++i) {
        struct block_translation_pair *block = &current->block_translation[i];
        if (block->size != size_is_free) {
            ++s->num_blocks_in_use;
            s->size_in_use += block->size;
            if (block->u.diskoff != diskoff_unused) {
                uint64_t limit = block->u.diskoff + block->size;
                if (limit > s->size_allocated) {
                    s->size_allocated = limit;
                }
            }
        }
    }

    _mutex_unlock();
}

int block_table::iterate_translation_tables(
    uint64_t checkpoint_count,
    int (*iter)(uint64_t checkpoint_count,
                int64_t total_num_rows,
                int64_t blocknum,
                int64_t diskoff,
                int64_t size,
                void *extra),
    void *iter_extra) {
    int error = 0;
    _mutex_lock();

    int64_t total_num_rows =
        _current.length_of_array + _checkpointed.length_of_array;
    for (int64_t i = 0; error == 0 && i < _current.length_of_array; ++i) {
        struct block_translation_pair *block = &_current.block_translation[i];
        error = iter(checkpoint_count,
                     total_num_rows,
                     i,
                     block->u.diskoff,
                     block->size,
                     iter_extra);
    }
    for (int64_t i = 0; error == 0 && i < _checkpointed.length_of_array; ++i) {
        struct block_translation_pair *block =
            &_checkpointed.block_translation[i];
        error = iter(checkpoint_count - 1,
                     total_num_rows,
                     i,
                     block->u.diskoff,
                     block->size,
                     iter_extra);
    }

    _mutex_unlock();
    return error;
}