1 | #include "duckdb/storage/single_file_block_manager.hpp" |
2 | |
3 | #include "duckdb/common/allocator.hpp" |
4 | #include "duckdb/common/checksum.hpp" |
5 | #include "duckdb/common/exception.hpp" |
6 | #include "duckdb/common/serializer/buffered_deserializer.hpp" |
7 | #include "duckdb/common/serializer/buffered_serializer.hpp" |
8 | #include "duckdb/common/field_writer.hpp" |
9 | #include "duckdb/storage/meta_block_reader.hpp" |
10 | #include "duckdb/storage/meta_block_writer.hpp" |
11 | #include "duckdb/storage/buffer_manager.hpp" |
12 | #include "duckdb/main/config.hpp" |
13 | |
14 | #include <algorithm> |
15 | #include <cstring> |
16 | |
17 | namespace duckdb { |
18 | |
19 | const char MainHeader::MAGIC_BYTES[] = "DUCK" ; |
20 | |
21 | void MainHeader::Serialize(Serializer &ser) { |
22 | ser.WriteData(buffer: const_data_ptr_cast(src: MAGIC_BYTES), write_size: MAGIC_BYTE_SIZE); |
23 | ser.Write<uint64_t>(element: version_number); |
24 | FieldWriter writer(ser); |
25 | for (idx_t i = 0; i < FLAG_COUNT; i++) { |
26 | writer.WriteField<uint64_t>(element: flags[i]); |
27 | } |
28 | writer.Finalize(); |
29 | } |
30 | |
31 | void MainHeader::CheckMagicBytes(FileHandle &handle) { |
32 | data_t magic_bytes[MAGIC_BYTE_SIZE]; |
33 | if (handle.GetFileSize() < MainHeader::MAGIC_BYTE_SIZE + MainHeader::MAGIC_BYTE_OFFSET) { |
34 | throw IOException("The file \"%s\" exists, but it is not a valid DuckDB database file!" , handle.path); |
35 | } |
36 | handle.Read(buffer: magic_bytes, nr_bytes: MainHeader::MAGIC_BYTE_SIZE, location: MainHeader::MAGIC_BYTE_OFFSET); |
37 | if (memcmp(s1: magic_bytes, s2: MainHeader::MAGIC_BYTES, n: MainHeader::MAGIC_BYTE_SIZE) != 0) { |
38 | throw IOException("The file \"%s\" exists, but it is not a valid DuckDB database file!" , handle.path); |
39 | } |
40 | } |
41 | |
42 | MainHeader MainHeader::Deserialize(Deserializer &source) { |
43 | data_t magic_bytes[MAGIC_BYTE_SIZE]; |
44 | MainHeader ; |
45 | source.ReadData(buffer: magic_bytes, read_size: MainHeader::MAGIC_BYTE_SIZE); |
46 | if (memcmp(s1: magic_bytes, s2: MainHeader::MAGIC_BYTES, n: MainHeader::MAGIC_BYTE_SIZE) != 0) { |
47 | throw IOException("The file is not a valid DuckDB database file!" ); |
48 | } |
49 | header.version_number = source.Read<uint64_t>(); |
50 | // check the version number |
51 | if (header.version_number != VERSION_NUMBER) { |
52 | auto version = GetDuckDBVersion(version_number: header.version_number); |
53 | string version_text; |
54 | if (version) { |
55 | // known version |
56 | version_text = "DuckDB version " + string(version); |
57 | } else { |
58 | version_text = string("an " ) + (VERSION_NUMBER > header.version_number ? "older development" : "newer" ) + |
59 | string(" version of DuckDB" ); |
60 | } |
61 | throw IOException( |
62 | "Trying to read a database file with version number %lld, but we can only read version %lld.\n" |
63 | "The database file was created with %s.\n\n" |
64 | "The storage of DuckDB is not yet stable; newer versions of DuckDB cannot read old database files and " |
65 | "vice versa.\n" |
66 | "The storage will be stabilized when version 1.0 releases.\n\n" |
67 | "For now, we recommend that you load the database file in a supported version of DuckDB, and use the " |
68 | "EXPORT DATABASE command " |
69 | "followed by IMPORT DATABASE on the current version of DuckDB.\n\n" |
70 | "See the storage page for more information: https://duckdb.org/internals/storage" , |
71 | header.version_number, VERSION_NUMBER, version_text); |
72 | } |
73 | // read the flags |
74 | FieldReader reader(source); |
75 | for (idx_t i = 0; i < FLAG_COUNT; i++) { |
76 | header.flags[i] = reader.ReadRequired<uint64_t>(); |
77 | } |
78 | reader.Finalize(); |
79 | return header; |
80 | } |
81 | |
82 | void DatabaseHeader::(Serializer &ser) { |
83 | ser.Write<uint64_t>(element: iteration); |
84 | ser.Write<block_id_t>(element: meta_block); |
85 | ser.Write<block_id_t>(element: free_list); |
86 | ser.Write<uint64_t>(element: block_count); |
87 | } |
88 | |
89 | DatabaseHeader DatabaseHeader::(Deserializer &source) { |
90 | DatabaseHeader ; |
91 | header.iteration = source.Read<uint64_t>(); |
92 | header.meta_block = source.Read<block_id_t>(); |
93 | header.free_list = source.Read<block_id_t>(); |
94 | header.block_count = source.Read<uint64_t>(); |
95 | return header; |
96 | } |
97 | |
98 | template <class T> |
99 | void (T , data_ptr_t ptr) { |
100 | BufferedSerializer ser(ptr, Storage::FILE_HEADER_SIZE); |
101 | header.Serialize(ser); |
102 | } |
103 | |
104 | template <class T> |
105 | T (data_ptr_t ptr) { |
106 | BufferedDeserializer source(ptr, Storage::FILE_HEADER_SIZE); |
107 | return T::Deserialize(source); |
108 | } |
109 | |
110 | SingleFileBlockManager::SingleFileBlockManager(AttachedDatabase &db, string path_p, StorageManagerOptions options) |
111 | : BlockManager(BufferManager::GetBufferManager(db)), db(db), path(std::move(path_p)), |
112 | header_buffer(Allocator::Get(db), FileBufferType::MANAGED_BUFFER, |
113 | Storage::FILE_HEADER_SIZE - Storage::BLOCK_HEADER_SIZE), |
114 | iteration_count(0), options(options) { |
115 | } |
116 | |
117 | void SingleFileBlockManager::GetFileFlags(uint8_t &flags, FileLockType &lock, bool create_new) { |
118 | if (options.read_only) { |
119 | D_ASSERT(!create_new); |
120 | flags = FileFlags::FILE_FLAGS_READ; |
121 | lock = FileLockType::READ_LOCK; |
122 | } else { |
123 | flags = FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_READ; |
124 | lock = FileLockType::WRITE_LOCK; |
125 | if (create_new) { |
126 | flags |= FileFlags::FILE_FLAGS_FILE_CREATE; |
127 | } |
128 | } |
129 | if (options.use_direct_io) { |
130 | flags |= FileFlags::FILE_FLAGS_DIRECT_IO; |
131 | } |
132 | } |
133 | |
134 | void SingleFileBlockManager::CreateNewDatabase() { |
135 | uint8_t flags; |
136 | FileLockType lock; |
137 | GetFileFlags(flags, lock, create_new: true); |
138 | |
139 | // open the RDBMS handle |
140 | auto &fs = FileSystem::Get(db); |
141 | handle = fs.OpenFile(path, flags, lock); |
142 | |
143 | // if we create a new file, we fill the metadata of the file |
144 | // first fill in the new header |
145 | header_buffer.Clear(); |
146 | |
147 | MainHeader main_header; |
148 | main_header.version_number = VERSION_NUMBER; |
149 | memset(s: main_header.flags, c: 0, n: sizeof(uint64_t) * 4); |
150 | |
151 | SerializeHeaderStructure<MainHeader>(header: main_header, ptr: header_buffer.buffer); |
152 | // now write the header to the file |
153 | ChecksumAndWrite(handle&: header_buffer, location: 0); |
154 | header_buffer.Clear(); |
155 | |
156 | // write the database headers |
157 | // initialize meta_block and free_list to INVALID_BLOCK because the database file does not contain any actual |
158 | // content yet |
159 | DatabaseHeader h1, h2; |
160 | // header 1 |
161 | h1.iteration = 0; |
162 | h1.meta_block = INVALID_BLOCK; |
163 | h1.free_list = INVALID_BLOCK; |
164 | h1.block_count = 0; |
165 | SerializeHeaderStructure<DatabaseHeader>(header: h1, ptr: header_buffer.buffer); |
166 | ChecksumAndWrite(handle&: header_buffer, location: Storage::FILE_HEADER_SIZE); |
167 | // header 2 |
168 | h2.iteration = 0; |
169 | h2.meta_block = INVALID_BLOCK; |
170 | h2.free_list = INVALID_BLOCK; |
171 | h2.block_count = 0; |
172 | SerializeHeaderStructure<DatabaseHeader>(header: h2, ptr: header_buffer.buffer); |
173 | ChecksumAndWrite(handle&: header_buffer, location: Storage::FILE_HEADER_SIZE * 2ULL); |
174 | // ensure that writing to disk is completed before returning |
175 | handle->Sync(); |
176 | // we start with h2 as active_header, this way our initial write will be in h1 |
177 | iteration_count = 0; |
178 | active_header = 1; |
179 | max_block = 0; |
180 | } |
181 | |
182 | void SingleFileBlockManager::LoadExistingDatabase() { |
183 | uint8_t flags; |
184 | FileLockType lock; |
185 | GetFileFlags(flags, lock, create_new: false); |
186 | |
187 | // open the RDBMS handle |
188 | auto &fs = FileSystem::Get(db); |
189 | handle = fs.OpenFile(path, flags, lock); |
190 | |
191 | MainHeader::CheckMagicBytes(handle&: *handle); |
192 | // otherwise, we check the metadata of the file |
193 | ReadAndChecksum(handle&: header_buffer, location: 0); |
194 | DeserializeHeaderStructure<MainHeader>(ptr: header_buffer.buffer); |
195 | |
196 | // read the database headers from disk |
197 | DatabaseHeader h1, h2; |
198 | ReadAndChecksum(handle&: header_buffer, location: Storage::FILE_HEADER_SIZE); |
199 | h1 = DeserializeHeaderStructure<DatabaseHeader>(ptr: header_buffer.buffer); |
200 | ReadAndChecksum(handle&: header_buffer, location: Storage::FILE_HEADER_SIZE * 2ULL); |
201 | h2 = DeserializeHeaderStructure<DatabaseHeader>(ptr: header_buffer.buffer); |
202 | // check the header with the highest iteration count |
203 | if (h1.iteration > h2.iteration) { |
204 | // h1 is active header |
205 | active_header = 0; |
206 | Initialize(header&: h1); |
207 | } else { |
208 | // h2 is active header |
209 | active_header = 1; |
210 | Initialize(header&: h2); |
211 | } |
212 | LoadFreeList(); |
213 | } |
214 | |
215 | void SingleFileBlockManager::ReadAndChecksum(FileBuffer &block, uint64_t location) const { |
216 | // read the buffer from disk |
217 | block.Read(handle&: *handle, location); |
218 | // compute the checksum |
219 | auto stored_checksum = Load<uint64_t>(ptr: block.InternalBuffer()); |
220 | uint64_t computed_checksum = Checksum(buffer: block.buffer, size: block.size); |
221 | // verify the checksum |
222 | if (stored_checksum != computed_checksum) { |
223 | throw IOException("Corrupt database file: computed checksum %llu does not match stored checksum %llu in block" , |
224 | computed_checksum, stored_checksum); |
225 | } |
226 | } |
227 | |
228 | void SingleFileBlockManager::ChecksumAndWrite(FileBuffer &block, uint64_t location) const { |
229 | // compute the checksum and write it to the start of the buffer (if not temp buffer) |
230 | uint64_t checksum = Checksum(buffer: block.buffer, size: block.size); |
231 | Store<uint64_t>(val: checksum, ptr: block.InternalBuffer()); |
232 | // now write the buffer |
233 | block.Write(handle&: *handle, location); |
234 | } |
235 | |
236 | void SingleFileBlockManager::(DatabaseHeader &) { |
237 | free_list_id = header.free_list; |
238 | meta_block = header.meta_block; |
239 | iteration_count = header.iteration; |
240 | max_block = header.block_count; |
241 | } |
242 | |
243 | void SingleFileBlockManager::LoadFreeList() { |
244 | if (free_list_id == INVALID_BLOCK) { |
245 | // no free list |
246 | return; |
247 | } |
248 | MetaBlockReader reader(*this, free_list_id); |
249 | auto free_list_count = reader.Read<uint64_t>(); |
250 | free_list.clear(); |
251 | for (idx_t i = 0; i < free_list_count; i++) { |
252 | free_list.insert(x: reader.Read<block_id_t>()); |
253 | } |
254 | auto multi_use_blocks_count = reader.Read<uint64_t>(); |
255 | multi_use_blocks.clear(); |
256 | for (idx_t i = 0; i < multi_use_blocks_count; i++) { |
257 | auto block_id = reader.Read<block_id_t>(); |
258 | auto usage_count = reader.Read<uint32_t>(); |
259 | multi_use_blocks[block_id] = usage_count; |
260 | } |
261 | } |
262 | |
263 | bool SingleFileBlockManager::IsRootBlock(block_id_t root) { |
264 | return root == meta_block; |
265 | } |
266 | |
267 | block_id_t SingleFileBlockManager::GetFreeBlockId() { |
268 | lock_guard<mutex> lock(block_lock); |
269 | block_id_t block; |
270 | if (!free_list.empty()) { |
271 | // free list is non empty |
272 | // take an entry from the free list |
273 | block = *free_list.begin(); |
274 | // erase the entry from the free list again |
275 | free_list.erase(position: free_list.begin()); |
276 | } else { |
277 | block = max_block++; |
278 | } |
279 | return block; |
280 | } |
281 | |
282 | void SingleFileBlockManager::MarkBlockAsFree(block_id_t block_id) { |
283 | lock_guard<mutex> lock(block_lock); |
284 | D_ASSERT(block_id >= 0); |
285 | D_ASSERT(block_id < max_block); |
286 | if (free_list.find(x: block_id) != free_list.end()) { |
287 | throw InternalException("MarkBlockAsFree called but block %llu was already freed!" , block_id); |
288 | } |
289 | multi_use_blocks.erase(x: block_id); |
290 | free_list.insert(x: block_id); |
291 | } |
292 | |
293 | void SingleFileBlockManager::MarkBlockAsModified(block_id_t block_id) { |
294 | lock_guard<mutex> lock(block_lock); |
295 | D_ASSERT(block_id >= 0); |
296 | D_ASSERT(block_id < max_block); |
297 | |
298 | // check if the block is a multi-use block |
299 | auto entry = multi_use_blocks.find(x: block_id); |
300 | if (entry != multi_use_blocks.end()) { |
301 | // it is! reduce the reference count of the block |
302 | entry->second--; |
303 | // check the reference count: is the block still a multi-use block? |
304 | if (entry->second <= 1) { |
305 | // no longer a multi-use block! |
306 | multi_use_blocks.erase(position: entry); |
307 | } |
308 | return; |
309 | } |
310 | // Check for multi-free |
311 | // TODO: Fix the bug that causes this assert to fire, then uncomment it. |
312 | // D_ASSERT(modified_blocks.find(block_id) == modified_blocks.end()); |
313 | D_ASSERT(free_list.find(block_id) == free_list.end()); |
314 | modified_blocks.insert(x: block_id); |
315 | } |
316 | |
317 | void SingleFileBlockManager::IncreaseBlockReferenceCount(block_id_t block_id) { |
318 | lock_guard<mutex> lock(block_lock); |
319 | D_ASSERT(block_id >= 0); |
320 | D_ASSERT(block_id < max_block); |
321 | D_ASSERT(free_list.find(block_id) == free_list.end()); |
322 | auto entry = multi_use_blocks.find(x: block_id); |
323 | if (entry != multi_use_blocks.end()) { |
324 | entry->second++; |
325 | } else { |
326 | multi_use_blocks[block_id] = 2; |
327 | } |
328 | } |
329 | |
330 | block_id_t SingleFileBlockManager::GetMetaBlock() { |
331 | return meta_block; |
332 | } |
333 | |
334 | idx_t SingleFileBlockManager::TotalBlocks() { |
335 | lock_guard<mutex> lock(block_lock); |
336 | return max_block; |
337 | } |
338 | |
339 | idx_t SingleFileBlockManager::FreeBlocks() { |
340 | lock_guard<mutex> lock(block_lock); |
341 | return free_list.size(); |
342 | } |
343 | |
344 | unique_ptr<Block> SingleFileBlockManager::ConvertBlock(block_id_t block_id, FileBuffer &source_buffer) { |
345 | D_ASSERT(source_buffer.AllocSize() == Storage::BLOCK_ALLOC_SIZE); |
346 | return make_uniq<Block>(args&: source_buffer, args&: block_id); |
347 | } |
348 | |
349 | unique_ptr<Block> SingleFileBlockManager::CreateBlock(block_id_t block_id, FileBuffer *source_buffer) { |
350 | unique_ptr<Block> result; |
351 | if (source_buffer) { |
352 | result = ConvertBlock(block_id, source_buffer&: *source_buffer); |
353 | } else { |
354 | result = make_uniq<Block>(args&: Allocator::Get(db), args&: block_id); |
355 | } |
356 | result->Initialize(info: options.debug_initialize); |
357 | return result; |
358 | } |
359 | |
360 | void SingleFileBlockManager::Read(Block &block) { |
361 | D_ASSERT(block.id >= 0); |
362 | D_ASSERT(std::find(free_list.begin(), free_list.end(), block.id) == free_list.end()); |
363 | ReadAndChecksum(block, location: BLOCK_START + block.id * Storage::BLOCK_ALLOC_SIZE); |
364 | } |
365 | |
366 | void SingleFileBlockManager::Write(FileBuffer &buffer, block_id_t block_id) { |
367 | D_ASSERT(block_id >= 0); |
368 | ChecksumAndWrite(block&: buffer, location: BLOCK_START + block_id * Storage::BLOCK_ALLOC_SIZE); |
369 | } |
370 | |
371 | vector<block_id_t> SingleFileBlockManager::GetFreeListBlocks() { |
372 | vector<block_id_t> free_list_blocks; |
373 | |
374 | if (!free_list.empty() || !multi_use_blocks.empty() || !modified_blocks.empty()) { |
375 | // there are blocks in the free list or multi_use_blocks |
376 | // figure out how many blocks we need to write these to the file |
377 | auto free_list_size = sizeof(uint64_t) + sizeof(block_id_t) * (free_list.size() + modified_blocks.size()); |
378 | auto multi_use_blocks_size = |
379 | sizeof(uint64_t) + (sizeof(block_id_t) + sizeof(uint32_t)) * multi_use_blocks.size(); |
380 | auto total_size = free_list_size + multi_use_blocks_size; |
381 | // because of potential alignment issues and needing to store a next pointer in a block we subtract |
382 | // a bit from the max block size |
383 | auto space_in_block = Storage::BLOCK_SIZE - 4 * sizeof(block_id_t); |
384 | auto total_blocks = (total_size + space_in_block - 1) / space_in_block; |
385 | D_ASSERT(total_size > 0); |
386 | D_ASSERT(total_blocks > 0); |
387 | |
388 | // reserve the blocks that we are going to write |
389 | // since these blocks are no longer free we cannot just include them in the free list! |
390 | for (idx_t i = 0; i < total_blocks; i++) { |
391 | auto block_id = GetFreeBlockId(); |
392 | free_list_blocks.push_back(x: block_id); |
393 | } |
394 | } |
395 | |
396 | return free_list_blocks; |
397 | } |
398 | |
399 | class FreeListBlockWriter : public MetaBlockWriter { |
400 | public: |
401 | FreeListBlockWriter(BlockManager &block_manager, vector<block_id_t> &free_list_blocks_p) |
402 | : MetaBlockWriter(block_manager, free_list_blocks_p[0]), free_list_blocks(free_list_blocks_p), index(1) { |
403 | } |
404 | |
405 | vector<block_id_t> &free_list_blocks; |
406 | idx_t index; |
407 | |
408 | protected: |
409 | block_id_t GetNextBlockId() override { |
410 | if (index >= free_list_blocks.size()) { |
411 | throw InternalException( |
412 | "Free List Block Writer ran out of blocks, this means not enough blocks were allocated up front" ); |
413 | } |
414 | return free_list_blocks[index++]; |
415 | } |
416 | }; |
417 | |
418 | void SingleFileBlockManager::(DatabaseHeader ) { |
419 | // set the iteration count |
420 | header.iteration = ++iteration_count; |
421 | |
422 | vector<block_id_t> free_list_blocks = GetFreeListBlocks(); |
423 | |
424 | // now handle the free list |
425 | // add all modified blocks to the free list: they can now be written to again |
426 | for (auto &block : modified_blocks) { |
427 | free_list.insert(x: block); |
428 | } |
429 | modified_blocks.clear(); |
430 | |
431 | if (!free_list_blocks.empty()) { |
432 | // there are blocks to write, either in the free_list or in the modified_blocks |
433 | // we write these blocks specifically to the free_list_blocks |
434 | // a normal MetaBlockWriter will fetch blocks to use from the free_list |
435 | // but since we are WRITING the free_list, this behavior is sub-optimal |
436 | |
437 | FreeListBlockWriter writer(*this, free_list_blocks); |
438 | |
439 | auto ptr = writer.GetBlockPointer(); |
440 | D_ASSERT(ptr.block_id == free_list_blocks[0]); |
441 | header.free_list = ptr.block_id; |
442 | for (auto &block_id : free_list_blocks) { |
443 | modified_blocks.insert(x: block_id); |
444 | } |
445 | |
446 | writer.Write<uint64_t>(element: free_list.size()); |
447 | for (auto &block_id : free_list) { |
448 | writer.Write<block_id_t>(element: block_id); |
449 | } |
450 | writer.Write<uint64_t>(element: multi_use_blocks.size()); |
451 | for (auto &entry : multi_use_blocks) { |
452 | writer.Write<block_id_t>(element: entry.first); |
453 | writer.Write<uint32_t>(element: entry.second); |
454 | } |
455 | writer.Flush(); |
456 | } else { |
457 | // no blocks in the free list |
458 | header.free_list = INVALID_BLOCK; |
459 | } |
460 | header.block_count = max_block; |
461 | |
462 | auto &config = DBConfig::Get(db); |
463 | if (config.options.checkpoint_abort == CheckpointAbort::DEBUG_ABORT_AFTER_FREE_LIST_WRITE) { |
464 | throw FatalException("Checkpoint aborted after free list write because of PRAGMA checkpoint_abort flag" ); |
465 | } |
466 | |
467 | if (!options.use_direct_io) { |
468 | // if we are not using Direct IO we need to fsync BEFORE we write the header to ensure that all the previous |
469 | // blocks are written as well |
470 | handle->Sync(); |
471 | } |
472 | // set the header inside the buffer |
473 | header_buffer.Clear(); |
474 | Store<DatabaseHeader>(val: header, ptr: header_buffer.buffer); |
475 | // now write the header to the file, active_header determines whether we write to h1 or h2 |
476 | // note that if active_header is h1 we write to h2, and vice versa |
477 | ChecksumAndWrite(block&: header_buffer, location: active_header == 1 ? Storage::FILE_HEADER_SIZE : Storage::FILE_HEADER_SIZE * 2); |
478 | // switch active header to the other header |
479 | active_header = 1 - active_header; |
480 | //! Ensure the header write ends up on disk |
481 | handle->Sync(); |
482 | } |
483 | |
484 | } // namespace duckdb |
485 | |