#include "duckdb/storage/single_file_block_manager.hpp"

#include "duckdb/common/allocator.hpp"
#include "duckdb/common/checksum.hpp"
#include "duckdb/common/exception.hpp"
#include "duckdb/common/serializer/buffered_deserializer.hpp"
#include "duckdb/common/serializer/buffered_serializer.hpp"
#include "duckdb/common/field_writer.hpp"
#include "duckdb/storage/meta_block_reader.hpp"
#include "duckdb/storage/meta_block_writer.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "duckdb/main/config.hpp"

#include <algorithm>
#include <cstring>

namespace duckdb {
const char MainHeader::MAGIC_BYTES[] = "DUCK";

void MainHeader::Serialize(Serializer &ser) {
	ser.WriteData(const_data_ptr_cast(MAGIC_BYTES), MAGIC_BYTE_SIZE);
	ser.Write<uint64_t>(version_number);
	FieldWriter writer(ser);
	for (idx_t i = 0; i < FLAG_COUNT; i++) {
		writer.WriteField<uint64_t>(flags[i]);
	}
	writer.Finalize();
}

void MainHeader::CheckMagicBytes(FileHandle &handle) {
	data_t magic_bytes[MAGIC_BYTE_SIZE];
	if (handle.GetFileSize() < MainHeader::MAGIC_BYTE_SIZE + MainHeader::MAGIC_BYTE_OFFSET) {
		throw IOException("The file \"%s\" exists, but it is not a valid DuckDB database file!", handle.path);
	}
	handle.Read(magic_bytes, MainHeader::MAGIC_BYTE_SIZE, MainHeader::MAGIC_BYTE_OFFSET);
	if (memcmp(magic_bytes, MainHeader::MAGIC_BYTES, MainHeader::MAGIC_BYTE_SIZE) != 0) {
		throw IOException("The file \"%s\" exists, but it is not a valid DuckDB database file!", handle.path);
	}
}

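// Reads back the main header: validates the magic bytes, checks that the version number matches the version this
// build can read, and loads the feature flags.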
MainHeader MainHeader::Deserialize(Deserializer &source) {
	data_t magic_bytes[MAGIC_BYTE_SIZE];
	MainHeader header;
	source.ReadData(magic_bytes, MainHeader::MAGIC_BYTE_SIZE);
	if (memcmp(magic_bytes, MainHeader::MAGIC_BYTES, MainHeader::MAGIC_BYTE_SIZE) != 0) {
		throw IOException("The file is not a valid DuckDB database file!");
	}
	header.version_number = source.Read<uint64_t>();
	// check the version number
	if (header.version_number != VERSION_NUMBER) {
		auto version = GetDuckDBVersion(header.version_number);
		string version_text;
		if (version) {
			// known version
			version_text = "DuckDB version " + string(version);
		} else {
			version_text = string("an ") + (VERSION_NUMBER > header.version_number ? "older development" : "newer") +
			               string(" version of DuckDB");
		}
		throw IOException(
		    "Trying to read a database file with version number %lld, but we can only read version %lld.\n"
		    "The database file was created with %s.\n\n"
		    "The storage of DuckDB is not yet stable; newer versions of DuckDB cannot read old database files and "
		    "vice versa.\n"
		    "The storage will be stabilized when version 1.0 releases.\n\n"
		    "For now, we recommend that you load the database file in a supported version of DuckDB, and use the "
		    "EXPORT DATABASE command "
		    "followed by IMPORT DATABASE on the current version of DuckDB.\n\n"
		    "See the storage page for more information: https://duckdb.org/internals/storage",
		    header.version_number, VERSION_NUMBER, version_text);
	}
	// read the flags
	FieldReader reader(source);
	for (idx_t i = 0; i < FLAG_COUNT; i++) {
		header.flags[i] = reader.ReadRequired<uint64_t>();
	}
	reader.Finalize();
	return header;
}

void DatabaseHeader::Serialize(Serializer &ser) {
	ser.Write<uint64_t>(iteration);
	ser.Write<block_id_t>(meta_block);
	ser.Write<block_id_t>(free_list);
	ser.Write<uint64_t>(block_count);
}

DatabaseHeader DatabaseHeader::Deserialize(Deserializer &source) {
	DatabaseHeader header;
	header.iteration = source.Read<uint64_t>();
	header.meta_block = source.Read<block_id_t>();
	header.free_list = source.Read<block_id_t>();
	header.block_count = source.Read<uint64_t>();
	return header;
}

template <class T>
void SerializeHeaderStructure(T header, data_ptr_t ptr) {
	BufferedSerializer ser(ptr, Storage::FILE_HEADER_SIZE);
	header.Serialize(ser);
}

template <class T>
T DeserializeHeaderStructure(data_ptr_t ptr) {
	BufferedDeserializer source(ptr, Storage::FILE_HEADER_SIZE);
	return T::Deserialize(source);
}

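// A single-file database starts with three headers, each occupying FILE_HEADER_SIZE bytes: the main header (magic
// bytes, version number, flags) followed by two database headers. The two database headers are written alternately;
// the one with the higher iteration count is the active one. The actual blocks follow the headers, starting at
// BLOCK_START.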
SingleFileBlockManager::SingleFileBlockManager(AttachedDatabase &db, string path_p, StorageManagerOptions options)
    : BlockManager(BufferManager::GetBufferManager(db)), db(db), path(std::move(path_p)),
      header_buffer(Allocator::Get(db), FileBufferType::MANAGED_BUFFER,
                    Storage::FILE_HEADER_SIZE - Storage::BLOCK_HEADER_SIZE),
      iteration_count(0), options(options) {
}

void SingleFileBlockManager::GetFileFlags(uint8_t &flags, FileLockType &lock, bool create_new) {
	if (options.read_only) {
		D_ASSERT(!create_new);
		flags = FileFlags::FILE_FLAGS_READ;
		lock = FileLockType::READ_LOCK;
	} else {
		flags = FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_READ;
		lock = FileLockType::WRITE_LOCK;
		if (create_new) {
			flags |= FileFlags::FILE_FLAGS_FILE_CREATE;
		}
	}
	if (options.use_direct_io) {
		flags |= FileFlags::FILE_FLAGS_DIRECT_IO;
	}
}

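// Creates a fresh database file: writes the main header at offset 0 and two empty database headers directly after
// it, then syncs so the file is in a consistent state before any blocks are written.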
void SingleFileBlockManager::CreateNewDatabase() {
	uint8_t flags;
	FileLockType lock;
	GetFileFlags(flags, lock, true);

	// open the RDBMS handle
	auto &fs = FileSystem::Get(db);
	handle = fs.OpenFile(path, flags, lock);

	// if we create a new file, we fill the metadata of the file
	// first fill in the new header
	header_buffer.Clear();

	MainHeader main_header;
	main_header.version_number = VERSION_NUMBER;
	memset(main_header.flags, 0, sizeof(uint64_t) * 4);

	SerializeHeaderStructure<MainHeader>(main_header, header_buffer.buffer);
	// now write the header to the file
	ChecksumAndWrite(header_buffer, 0);
	header_buffer.Clear();

	// write the database headers
	// initialize meta_block and free_list to INVALID_BLOCK because the database file does not contain any actual
	// content yet
	DatabaseHeader h1, h2;
	// header 1
	h1.iteration = 0;
	h1.meta_block = INVALID_BLOCK;
	h1.free_list = INVALID_BLOCK;
	h1.block_count = 0;
	SerializeHeaderStructure<DatabaseHeader>(h1, header_buffer.buffer);
	ChecksumAndWrite(header_buffer, Storage::FILE_HEADER_SIZE);
	// header 2
	h2.iteration = 0;
	h2.meta_block = INVALID_BLOCK;
	h2.free_list = INVALID_BLOCK;
	h2.block_count = 0;
	SerializeHeaderStructure<DatabaseHeader>(h2, header_buffer.buffer);
	ChecksumAndWrite(header_buffer, Storage::FILE_HEADER_SIZE * 2ULL);
	// ensure that writing to disk is completed before returning
	handle->Sync();
	// we start with h2 as active_header, this way our initial write will be in h1
	iteration_count = 0;
	active_header = 1;
	max_block = 0;
}

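// Opens an existing database file: validates the main header, reads both database headers and picks whichever one
// has the higher iteration count as the active header. This is what keeps the alternating header writes crash-safe:
// an interrupted header write leaves the other (older but complete) header in place.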
void SingleFileBlockManager::LoadExistingDatabase() {
	uint8_t flags;
	FileLockType lock;
	GetFileFlags(flags, lock, false);

	// open the RDBMS handle
	auto &fs = FileSystem::Get(db);
	handle = fs.OpenFile(path, flags, lock);

	MainHeader::CheckMagicBytes(*handle);
	// otherwise, we check the metadata of the file
	ReadAndChecksum(header_buffer, 0);
	DeserializeHeaderStructure<MainHeader>(header_buffer.buffer);

	// read the database headers from disk
	DatabaseHeader h1, h2;
	ReadAndChecksum(header_buffer, Storage::FILE_HEADER_SIZE);
	h1 = DeserializeHeaderStructure<DatabaseHeader>(header_buffer.buffer);
	ReadAndChecksum(header_buffer, Storage::FILE_HEADER_SIZE * 2ULL);
	h2 = DeserializeHeaderStructure<DatabaseHeader>(header_buffer.buffer);
	// check the header with the highest iteration count
	if (h1.iteration > h2.iteration) {
		// h1 is active header
		active_header = 0;
		Initialize(h1);
	} else {
		// h2 is active header
		active_header = 1;
		Initialize(h2);
	}
	LoadFreeList();
}

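// Every header and block is stored with a checksum in the first sizeof(uint64_t) bytes of its on-disk
// representation (the FileBuffer's internal buffer); the checksum itself covers the usable buffer contents.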
void SingleFileBlockManager::ReadAndChecksum(FileBuffer &block, uint64_t location) const {
	// read the buffer from disk
	block.Read(*handle, location);
	// compute the checksum
	auto stored_checksum = Load<uint64_t>(block.InternalBuffer());
	uint64_t computed_checksum = Checksum(block.buffer, block.size);
	// verify the checksum
	if (stored_checksum != computed_checksum) {
		throw IOException("Corrupt database file: computed checksum %llu does not match stored checksum %llu in block",
		                  computed_checksum, stored_checksum);
	}
}

void SingleFileBlockManager::ChecksumAndWrite(FileBuffer &block, uint64_t location) const {
	// compute the checksum and write it to the start of the buffer (if not temp buffer)
	uint64_t checksum = Checksum(block.buffer, block.size);
	Store<uint64_t>(checksum, block.InternalBuffer());
	// now write the buffer
	block.Write(*handle, location);
}

void SingleFileBlockManager::Initialize(DatabaseHeader &header) {
	free_list_id = header.free_list;
	meta_block = header.meta_block;
	iteration_count = header.iteration;
	max_block = header.block_count;
}

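// The free list is serialized through a MetaBlockWriter as: [free_list_count][block ids...]
// [multi_use_blocks_count][(block id, usage count) pairs...]; this mirrors the format written in WriteHeader.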
void SingleFileBlockManager::LoadFreeList() {
	if (free_list_id == INVALID_BLOCK) {
		// no free list
		return;
	}
	MetaBlockReader reader(*this, free_list_id);
	auto free_list_count = reader.Read<uint64_t>();
	free_list.clear();
	for (idx_t i = 0; i < free_list_count; i++) {
		free_list.insert(reader.Read<block_id_t>());
	}
	auto multi_use_blocks_count = reader.Read<uint64_t>();
	multi_use_blocks.clear();
	for (idx_t i = 0; i < multi_use_blocks_count; i++) {
		auto block_id = reader.Read<block_id_t>();
		auto usage_count = reader.Read<uint32_t>();
		multi_use_blocks[block_id] = usage_count;
	}
}

bool SingleFileBlockManager::IsRootBlock(block_id_t root) {
	return root == meta_block;
}

block_id_t SingleFileBlockManager::GetFreeBlockId() {
	lock_guard<mutex> lock(block_lock);
	block_id_t block;
	if (!free_list.empty()) {
		// free list is non-empty
		// take an entry from the free list
		block = *free_list.begin();
		// erase the entry from the free list again
		free_list.erase(free_list.begin());
	} else {
		block = max_block++;
	}
	return block;
}

void SingleFileBlockManager::MarkBlockAsFree(block_id_t block_id) {
	lock_guard<mutex> lock(block_lock);
	D_ASSERT(block_id >= 0);
	D_ASSERT(block_id < max_block);
	if (free_list.find(block_id) != free_list.end()) {
		throw InternalException("MarkBlockAsFree called but block %llu was already freed!", block_id);
	}
	multi_use_blocks.erase(block_id);
	free_list.insert(block_id);
}

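// Marks a block as modified so that it can be reused after the next successful checkpoint. Blocks that are used by
// more than one structure carry a reference count in multi_use_blocks; for such a block, marking it modified merely
// decrements the count, and the block is only actually released once the count drops back down.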
void SingleFileBlockManager::MarkBlockAsModified(block_id_t block_id) {
	lock_guard<mutex> lock(block_lock);
	D_ASSERT(block_id >= 0);
	D_ASSERT(block_id < max_block);

	// check if the block is a multi-use block
	auto entry = multi_use_blocks.find(block_id);
	if (entry != multi_use_blocks.end()) {
		// it is! reduce the reference count of the block
		entry->second--;
		// check the reference count: is the block still a multi-use block?
		if (entry->second <= 1) {
			// no longer a multi-use block!
			multi_use_blocks.erase(entry);
		}
		return;
	}
	// Check for multi-free
	// TODO: Fix the bug that causes this assert to fire, then uncomment it.
	// D_ASSERT(modified_blocks.find(block_id) == modified_blocks.end());
	D_ASSERT(free_list.find(block_id) == free_list.end());
	modified_blocks.insert(block_id);
}

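// Registers an additional user of a block: a block starts out with an implicit reference count of one, so the first
// call stores a count of 2 in multi_use_blocks and subsequent calls increment it.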
void SingleFileBlockManager::IncreaseBlockReferenceCount(block_id_t block_id) {
	lock_guard<mutex> lock(block_lock);
	D_ASSERT(block_id >= 0);
	D_ASSERT(block_id < max_block);
	D_ASSERT(free_list.find(block_id) == free_list.end());
	auto entry = multi_use_blocks.find(block_id);
	if (entry != multi_use_blocks.end()) {
		entry->second++;
	} else {
		multi_use_blocks[block_id] = 2;
	}
}

block_id_t SingleFileBlockManager::GetMetaBlock() {
	return meta_block;
}

idx_t SingleFileBlockManager::TotalBlocks() {
	lock_guard<mutex> lock(block_lock);
	return max_block;
}

idx_t SingleFileBlockManager::FreeBlocks() {
	lock_guard<mutex> lock(block_lock);
	return free_list.size();
}

unique_ptr<Block> SingleFileBlockManager::ConvertBlock(block_id_t block_id, FileBuffer &source_buffer) {
	D_ASSERT(source_buffer.AllocSize() == Storage::BLOCK_ALLOC_SIZE);
	return make_uniq<Block>(source_buffer, block_id);
}

unique_ptr<Block> SingleFileBlockManager::CreateBlock(block_id_t block_id, FileBuffer *source_buffer) {
	unique_ptr<Block> result;
	if (source_buffer) {
		result = ConvertBlock(block_id, *source_buffer);
	} else {
		result = make_uniq<Block>(Allocator::Get(db), block_id);
	}
	result->Initialize(options.debug_initialize);
	return result;
}

void SingleFileBlockManager::Read(Block &block) {
	D_ASSERT(block.id >= 0);
	D_ASSERT(std::find(free_list.begin(), free_list.end(), block.id) == free_list.end());
	ReadAndChecksum(block, BLOCK_START + block.id * Storage::BLOCK_ALLOC_SIZE);
}

void SingleFileBlockManager::Write(FileBuffer &buffer, block_id_t block_id) {
	D_ASSERT(block_id >= 0);
	ChecksumAndWrite(buffer, BLOCK_START + block_id * Storage::BLOCK_ALLOC_SIZE);
}

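// Estimates how many blocks are needed to serialize the free list, the modified blocks (which are about to be added
// to the free list) and the multi-use block counts, and reserves that many block ids up front. Because the free list
// itself is what is being written, the writer cannot allocate blocks lazily from it; the reserved blocks are handed
// to the FreeListBlockWriter below instead.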
vector<block_id_t> SingleFileBlockManager::GetFreeListBlocks() {
	vector<block_id_t> free_list_blocks;

	if (!free_list.empty() || !multi_use_blocks.empty() || !modified_blocks.empty()) {
		// there are blocks in the free list or multi_use_blocks
		// figure out how many blocks we need to write these to the file
		auto free_list_size = sizeof(uint64_t) + sizeof(block_id_t) * (free_list.size() + modified_blocks.size());
		auto multi_use_blocks_size =
		    sizeof(uint64_t) + (sizeof(block_id_t) + sizeof(uint32_t)) * multi_use_blocks.size();
		auto total_size = free_list_size + multi_use_blocks_size;
		// because of potential alignment issues and needing to store a next pointer in a block we subtract
		// a bit from the max block size
		auto space_in_block = Storage::BLOCK_SIZE - 4 * sizeof(block_id_t);
		auto total_blocks = (total_size + space_in_block - 1) / space_in_block;
		D_ASSERT(total_size > 0);
		D_ASSERT(total_blocks > 0);

		// reserve the blocks that we are going to write
		// since these blocks are no longer free we cannot just include them in the free list!
		for (idx_t i = 0; i < total_blocks; i++) {
			auto block_id = GetFreeBlockId();
			free_list_blocks.push_back(block_id);
		}
	}

	return free_list_blocks;
}

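//! A MetaBlockWriter that writes exclusively to the set of blocks reserved by GetFreeListBlocks, instead of
//! allocating new blocks from the free list while that same free list is being serialized.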
class FreeListBlockWriter : public MetaBlockWriter {
public:
	FreeListBlockWriter(BlockManager &block_manager, vector<block_id_t> &free_list_blocks_p)
	    : MetaBlockWriter(block_manager, free_list_blocks_p[0]), free_list_blocks(free_list_blocks_p), index(1) {
	}

	vector<block_id_t> &free_list_blocks;
	idx_t index;

protected:
	block_id_t GetNextBlockId() override {
		if (index >= free_list_blocks.size()) {
			throw InternalException(
			    "Free List Block Writer ran out of blocks, this means not enough blocks were allocated up front");
		}
		return free_list_blocks[index++];
	}
};

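// Finishes a checkpoint: serializes the (updated) free list, then writes the new database header into whichever of
// the two header slots is currently inactive. The fsync before the header write ensures all block data is durable
// before the header that references it; the fsync afterwards ensures the header switch itself reaches disk.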
void SingleFileBlockManager::WriteHeader(DatabaseHeader header) {
	// set the iteration count
	header.iteration = ++iteration_count;

	vector<block_id_t> free_list_blocks = GetFreeListBlocks();

	// now handle the free list
	// add all modified blocks to the free list: they can now be written to again
	for (auto &block : modified_blocks) {
		free_list.insert(block);
	}
	modified_blocks.clear();

	if (!free_list_blocks.empty()) {
		// there are blocks to write, either in the free_list or in the modified_blocks
		// we write these blocks specifically to the free_list_blocks
		// a normal MetaBlockWriter will fetch blocks to use from the free_list
		// but since we are WRITING the free_list, this behavior is sub-optimal

		FreeListBlockWriter writer(*this, free_list_blocks);

		auto ptr = writer.GetBlockPointer();
		D_ASSERT(ptr.block_id == free_list_blocks[0]);
		header.free_list = ptr.block_id;
		for (auto &block_id : free_list_blocks) {
			modified_blocks.insert(block_id);
		}

		writer.Write<uint64_t>(free_list.size());
		for (auto &block_id : free_list) {
			writer.Write<block_id_t>(block_id);
		}
		writer.Write<uint64_t>(multi_use_blocks.size());
		for (auto &entry : multi_use_blocks) {
			writer.Write<block_id_t>(entry.first);
			writer.Write<uint32_t>(entry.second);
		}
		writer.Flush();
	} else {
		// no blocks in the free list
		header.free_list = INVALID_BLOCK;
	}
	header.block_count = max_block;

	auto &config = DBConfig::Get(db);
	if (config.options.checkpoint_abort == CheckpointAbort::DEBUG_ABORT_AFTER_FREE_LIST_WRITE) {
		throw FatalException("Checkpoint aborted after free list write because of PRAGMA checkpoint_abort flag");
	}

	if (!options.use_direct_io) {
		// if we are not using Direct IO we need to fsync BEFORE we write the header to ensure that all the previous
		// blocks are written as well
		handle->Sync();
	}
	// set the header inside the buffer
	header_buffer.Clear();
	Store<DatabaseHeader>(header, header_buffer.buffer);
	// now write the header to the file, active_header determines whether we write to h1 or h2
	// note that if active_header is h1 we write to h2, and vice versa
	ChecksumAndWrite(header_buffer, active_header == 1 ? Storage::FILE_HEADER_SIZE : Storage::FILE_HEADER_SIZE * 2);
	// switch active header to the other header
	active_header = 1 - active_header;
	//! Ensure the header write ends up on disk
	handle->Sync();
}

} // namespace duckdb