| 1 | /* | 
| 2 |    Copyright (c) 2016, Facebook, Inc. | 
| 3 |  | 
| 4 |    This program is free software; you can redistribute it and/or modify | 
| 5 |    it under the terms of the GNU General Public License as published by | 
| 6 |    the Free Software Foundation; version 2 of the License. | 
| 7 |  | 
| 8 |    This program is distributed in the hope that it will be useful, | 
| 9 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
| 10 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
| 11 |    GNU General Public License for more details. | 
| 12 |  | 
| 13 |    You should have received a copy of the GNU General Public License | 
| 14 |    along with this program; if not, write to the Free Software | 
| 15 |    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */ | 
| 16 |  | 
| 17 | #include <my_global.h> | 
| 18 |  | 
| 19 | /* This C++ file's header file */ | 
| 20 | #include "./rdb_index_merge.h" | 
| 21 |  | 
| 22 | /* MySQL header files */ | 
| 23 | #include "../sql/sql_class.h" | 
| 24 |  | 
| 25 | /* MyRocks header files */ | 
| 26 | #include "./ha_rocksdb.h" | 
| 27 | #include "./rdb_datadic.h" | 
| 28 |  | 
| 29 | namespace myrocks { | 
| 30 |  | 
/*
  Construct a merge context for fast secondary index creation.

  Only stores the configuration; no file descriptor is opened and no
  buffers are allocated here. init() must be called before use (it creates
  the temp merge file and allocates m_rec_buf_unsorted / m_output_buf,
  which start out as nullptr).
*/
Rdb_index_merge::Rdb_index_merge(const char *const tmpfile_path,
                                 const ulonglong &merge_buf_size,
                                 const ulonglong &merge_combine_read_size,
                                 const ulonglong &merge_tmp_file_removal_delay,
                                 rocksdb::ColumnFamilyHandle *cf)
    : m_tmpfile_path(tmpfile_path), m_merge_buf_size(merge_buf_size),
      m_merge_combine_read_size(merge_combine_read_size),
      m_merge_tmp_file_removal_delay(merge_tmp_file_removal_delay),
      m_cf_handle(cf), m_rec_buf_unsorted(nullptr), m_output_buf(nullptr) {}
| 40 |  | 
| 41 | Rdb_index_merge::~Rdb_index_merge() { | 
| 42 |   /* | 
| 43 |     If merge_tmp_file_removal_delay is set, sleep between calls to chsize. | 
| 44 |  | 
| 45 |     This helps mitigate potential trim stalls on flash when large files are | 
| 46 |     being deleted too quickly. | 
| 47 |   */ | 
| 48 |   if (m_merge_tmp_file_removal_delay > 0) { | 
| 49 |     uint64 curr_size = m_merge_buf_size * m_merge_file.m_num_sort_buffers; | 
| 50 |     for (uint i = 0; i < m_merge_file.m_num_sort_buffers; i++) { | 
| 51 |       if (my_chsize(m_merge_file.m_fd, curr_size, 0, MYF(MY_WME))) { | 
| 52 |         // NO_LINT_DEBUG | 
| 53 |         sql_print_error("Error truncating file during fast index creation." ); | 
| 54 |       } | 
| 55 |  | 
| 56 |       my_sleep(m_merge_tmp_file_removal_delay * 1000); | 
| 57 |       curr_size -= m_merge_buf_size; | 
| 58 |     } | 
| 59 |   } | 
| 60 |  | 
| 61 |   /* | 
| 62 |     Close file descriptor, we don't need to worry about deletion, | 
| 63 |     mysql handles it. | 
| 64 |   */ | 
| 65 |   my_close(m_merge_file.m_fd, MYF(MY_WME)); | 
| 66 | } | 
| 67 |  | 
| 68 | int Rdb_index_merge::init() { | 
| 69 |   /* | 
| 70 |     Create a temporary merge file on disk to store sorted chunks during | 
| 71 |     inplace index creation. | 
| 72 |   */ | 
| 73 |   if (merge_file_create()) { | 
| 74 |     return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 75 |   } | 
| 76 |  | 
| 77 |   /* | 
| 78 |     Then, allocate buffer to store unsorted records before they are written | 
| 79 |     to disk. They will be written to disk sorted. A sorted tree is used to | 
| 80 |     keep track of the offset of each record within the unsorted buffer. | 
| 81 |   */ | 
| 82 |   m_rec_buf_unsorted = | 
| 83 |       std::shared_ptr<merge_buf_info>(new merge_buf_info(m_merge_buf_size)); | 
| 84 |  | 
| 85 |   /* | 
| 86 |     Allocate output buffer that will contain sorted block that is written to | 
| 87 |     disk. | 
| 88 |   */ | 
| 89 |   m_output_buf = | 
| 90 |       std::shared_ptr<merge_buf_info>(new merge_buf_info(m_merge_buf_size)); | 
| 91 |  | 
| 92 |   return HA_EXIT_SUCCESS; | 
| 93 | } | 
| 94 |  | 
| 95 | /** | 
| 96 |   Create a merge file in the given location. | 
| 97 | */ | 
| 98 | int Rdb_index_merge::merge_file_create() { | 
| 99 |   DBUG_ASSERT(m_merge_file.m_fd == -1); | 
| 100 |  | 
| 101 |   int fd; | 
| 102 | #ifdef MARIAROCKS_NOT_YET // mysql_tmpfile_path use  | 
| 103 |   /* If no path set for tmpfile, use mysql_tmpdir by default */ | 
| 104 |   if (m_tmpfile_path == nullptr) { | 
| 105 |     fd = mysql_tmpfile("myrocks" ); | 
| 106 |   } else { | 
| 107 |     fd = mysql_tmpfile_path(m_tmpfile_path, "myrocks" ); | 
| 108 |   } | 
| 109 | #else | 
| 110 |   fd = mysql_tmpfile("myrocks" ); | 
| 111 | #endif | 
| 112 |   if (fd < 0) { | 
| 113 |     // NO_LINT_DEBUG | 
| 114 |     sql_print_error("Failed to create temp file during fast index creation." ); | 
| 115 |     return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 116 |   } | 
| 117 |  | 
| 118 |   m_merge_file.m_fd = fd; | 
| 119 |   m_merge_file.m_num_sort_buffers = 0; | 
| 120 |  | 
| 121 |   return HA_EXIT_SUCCESS; | 
| 122 | } | 
| 123 |  | 
| 124 | /** | 
| 125 |   Add record to offset tree (and unsorted merge buffer) in preparation for | 
| 126 |   writing out to disk in sorted chunks. | 
| 127 |  | 
| 128 |   If buffer in memory is full, write the buffer out to disk sorted using the | 
| 129 |   offset tree, and clear the tree. (Happens in merge_buf_write) | 
| 130 | */ | 
| 131 | int Rdb_index_merge::add(const rocksdb::Slice &key, const rocksdb::Slice &val) { | 
| 132 |   /* Adding a record after heap is already created results in error */ | 
| 133 |   DBUG_ASSERT(m_merge_min_heap.empty()); | 
| 134 |  | 
| 135 |   /* | 
| 136 |     Check if sort buffer is going to be out of space, if so write it | 
| 137 |     out to disk in sorted order using offset tree. | 
| 138 |   */ | 
| 139 |   const uint total_offset = RDB_MERGE_CHUNK_LEN + | 
| 140 |                             m_rec_buf_unsorted->m_curr_offset + | 
| 141 |                             RDB_MERGE_KEY_DELIMITER + RDB_MERGE_VAL_DELIMITER + | 
| 142 |                             key.size() + val.size(); | 
| 143 |   if (total_offset >= m_rec_buf_unsorted->m_total_size) { | 
| 144 |     /* | 
| 145 |       If the offset tree is empty here, that means that the proposed key to | 
| 146 |       add is too large for the buffer. | 
| 147 |     */ | 
| 148 |     if (m_offset_tree.empty()) { | 
| 149 |       // NO_LINT_DEBUG | 
| 150 |       sql_print_error("Sort buffer size is too small to process merge. "  | 
| 151 |                       "Please set merge buffer size to a higher value." ); | 
| 152 |       return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 153 |     } | 
| 154 |  | 
| 155 |     if (merge_buf_write()) { | 
| 156 |       // NO_LINT_DEBUG | 
| 157 |       sql_print_error("Error writing sort buffer to disk." ); | 
| 158 |       return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 159 |     } | 
| 160 |   } | 
| 161 |  | 
| 162 |   const ulonglong rec_offset = m_rec_buf_unsorted->m_curr_offset; | 
| 163 |  | 
| 164 |   /* | 
| 165 |     Store key and value in temporary unsorted in memory buffer pointed to by | 
| 166 |     offset tree. | 
| 167 |   */ | 
| 168 |   m_rec_buf_unsorted->store_key_value(key, val); | 
| 169 |  | 
| 170 |   /* Find sort order of the new record */ | 
| 171 |   auto res = | 
| 172 |       m_offset_tree.emplace(m_rec_buf_unsorted->m_block.get() + rec_offset, | 
| 173 |                             m_cf_handle->GetComparator()); | 
| 174 |   if (!res.second) { | 
| 175 |     my_printf_error(ER_DUP_ENTRY, | 
| 176 |                     "Failed to insert the record: the key already exists" , | 
| 177 |                     MYF(0)); | 
| 178 |     return ER_DUP_ENTRY; | 
| 179 |   } | 
| 180 |  | 
| 181 |   return HA_EXIT_SUCCESS; | 
| 182 | } | 
| 183 |  | 
| 184 | /** | 
| 185 |   Sort + write merge buffer chunk out to disk. | 
| 186 | */ | 
| 187 | int Rdb_index_merge::merge_buf_write() { | 
| 188 |   DBUG_ASSERT(m_merge_file.m_fd != -1); | 
| 189 |   DBUG_ASSERT(m_rec_buf_unsorted != nullptr); | 
| 190 |   DBUG_ASSERT(m_output_buf != nullptr); | 
| 191 |   DBUG_ASSERT(!m_offset_tree.empty()); | 
| 192 |  | 
| 193 |   /* Write actual chunk size to first 8 bytes of the merge buffer */ | 
| 194 |   merge_store_uint64(m_output_buf->m_block.get(), | 
| 195 |                      m_rec_buf_unsorted->m_curr_offset + RDB_MERGE_CHUNK_LEN); | 
| 196 |   m_output_buf->m_curr_offset += RDB_MERGE_CHUNK_LEN; | 
| 197 |  | 
| 198 |   /* | 
| 199 |     Iterate through the offset tree.  Should be ordered by the secondary key | 
| 200 |     at this point. | 
| 201 |   */ | 
| 202 |   for (const auto &rec : m_offset_tree) { | 
| 203 |     DBUG_ASSERT(m_output_buf->m_curr_offset <= m_merge_buf_size); | 
| 204 |  | 
| 205 |     /* Read record from offset (should never fail) */ | 
| 206 |     rocksdb::Slice key; | 
| 207 |     rocksdb::Slice val; | 
| 208 |     merge_read_rec(rec.m_block, &key, &val); | 
| 209 |  | 
| 210 |     /* Store key and value into sorted output buffer */ | 
| 211 |     m_output_buf->store_key_value(key, val); | 
| 212 |   } | 
| 213 |  | 
| 214 |   DBUG_ASSERT(m_output_buf->m_curr_offset <= m_output_buf->m_total_size); | 
| 215 |  | 
| 216 |   /* | 
| 217 |     Write output buffer to disk. | 
| 218 |  | 
| 219 |     Need to position cursor to the chunk it needs to be at on filesystem | 
| 220 |     then write into the respective merge buffer. | 
| 221 |   */ | 
| 222 |   if (my_seek(m_merge_file.m_fd, | 
| 223 |               m_merge_file.m_num_sort_buffers * m_merge_buf_size, SEEK_SET, | 
| 224 |               MYF(0)) == MY_FILEPOS_ERROR) { | 
| 225 |     // NO_LINT_DEBUG | 
| 226 |     sql_print_error("Error seeking to location in merge file on disk." ); | 
| 227 |     return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 228 |   } | 
| 229 |  | 
| 230 |   /* | 
| 231 |     Add a file sync call here to flush the data out. Otherwise, the filesystem | 
| 232 |     cache can flush out all of the files at the same time, causing a write | 
| 233 |     burst. | 
| 234 |   */ | 
| 235 |   if (my_write(m_merge_file.m_fd, m_output_buf->m_block.get(), | 
| 236 |                m_output_buf->m_total_size, MYF(MY_WME | MY_NABP)) || | 
| 237 |       mysql_file_sync(m_merge_file.m_fd, MYF(MY_WME))) { | 
| 238 |     // NO_LINT_DEBUG | 
| 239 |     sql_print_error("Error writing sorted merge buffer to disk." ); | 
| 240 |     return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 241 |   } | 
| 242 |  | 
| 243 |   /* Increment merge file offset to track number of merge buffers written */ | 
| 244 |   m_merge_file.m_num_sort_buffers += 1; | 
| 245 |  | 
| 246 |   /* Reset everything for next run */ | 
| 247 |   merge_reset(); | 
| 248 |  | 
| 249 |   return HA_EXIT_SUCCESS; | 
| 250 | } | 
| 251 |  | 
| 252 | /** | 
| 253 |   Prepare n-way merge of n sorted buffers on disk, using a heap sorted by | 
| 254 |   secondary key records. | 
| 255 | */ | 
| 256 | int Rdb_index_merge::merge_heap_prepare() { | 
| 257 |   DBUG_ASSERT(m_merge_min_heap.empty()); | 
| 258 |  | 
| 259 |   /* | 
| 260 |     If the offset tree is not empty, there are still some records that need to | 
| 261 |     be written to disk. Write them out now. | 
| 262 |   */ | 
| 263 |   if (!m_offset_tree.empty() && merge_buf_write()) { | 
| 264 |     return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 265 |   } | 
| 266 |  | 
| 267 |   DBUG_ASSERT(m_merge_file.m_num_sort_buffers > 0); | 
| 268 |  | 
| 269 |   /* | 
| 270 |     For an n-way merge, we need to read chunks of each merge file | 
| 271 |     simultaneously. | 
| 272 |   */ | 
| 273 |   ulonglong chunk_size = | 
| 274 |       m_merge_combine_read_size / m_merge_file.m_num_sort_buffers; | 
| 275 |   if (chunk_size >= m_merge_buf_size) { | 
| 276 |     chunk_size = m_merge_buf_size; | 
| 277 |   } | 
| 278 |  | 
| 279 |   /* Allocate buffers for each chunk */ | 
| 280 |   for (ulonglong i = 0; i < m_merge_file.m_num_sort_buffers; i++) { | 
| 281 |     const auto entry = | 
| 282 |         std::make_shared<merge_heap_entry>(m_cf_handle->GetComparator()); | 
| 283 |  | 
| 284 |     /* | 
| 285 |       Read chunk_size bytes from each chunk on disk, and place inside | 
| 286 |       respective chunk buffer. | 
| 287 |     */ | 
| 288 |     const size_t total_size = | 
| 289 |         entry->prepare(m_merge_file.m_fd, i * m_merge_buf_size, chunk_size); | 
| 290 |  | 
| 291 |     if (total_size == (size_t)-1) { | 
| 292 |       return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 293 |     } | 
| 294 |  | 
| 295 |     /* Can reach this condition if an index was added on table w/ no rows */ | 
| 296 |     if (total_size - RDB_MERGE_CHUNK_LEN == 0) { | 
| 297 |       break; | 
| 298 |     } | 
| 299 |  | 
| 300 |     /* Read the first record from each buffer to initially populate the heap */ | 
| 301 |     if (entry->read_rec(&entry->m_key, &entry->m_val)) { | 
| 302 |       // NO_LINT_DEBUG | 
| 303 |       sql_print_error("Chunk size is too small to process merge." ); | 
| 304 |       return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 305 |     } | 
| 306 |  | 
| 307 |     m_merge_min_heap.push(std::move(entry)); | 
| 308 |   } | 
| 309 |  | 
| 310 |   return HA_EXIT_SUCCESS; | 
| 311 | } | 
| 312 |  | 
| 313 | /** | 
| 314 |   Create and/or iterate through keys in the merge heap. | 
| 315 | */ | 
| 316 | int Rdb_index_merge::next(rocksdb::Slice *const key, | 
| 317 |                           rocksdb::Slice *const val) { | 
| 318 |   /* | 
| 319 |     If table fits in one sort buffer, we can optimize by writing | 
| 320 |     the sort buffer directly through to the sstfilewriter instead of | 
| 321 |     needing to create tmp files/heap to merge the sort buffers. | 
| 322 |  | 
| 323 |     If there are no sort buffer records (alters on empty tables), | 
| 324 |     also exit here. | 
| 325 |   */ | 
| 326 |   if (m_merge_file.m_num_sort_buffers == 0) { | 
| 327 |     if (m_offset_tree.empty()) { | 
| 328 |       return -1; | 
| 329 |     } | 
| 330 |  | 
| 331 |     const auto rec = m_offset_tree.begin(); | 
| 332 |  | 
| 333 |     /* Read record from offset */ | 
| 334 |     merge_read_rec(rec->m_block, key, val); | 
| 335 |  | 
| 336 |     m_offset_tree.erase(rec); | 
| 337 |     return HA_EXIT_SUCCESS; | 
| 338 |   } | 
| 339 |  | 
| 340 |   int res; | 
| 341 |  | 
| 342 |   /* | 
| 343 |     If heap and heap chunk info are empty, we must be beginning the merge phase | 
| 344 |     of the external sort. Populate the heap with initial values from each | 
| 345 |     disk chunk. | 
| 346 |   */ | 
| 347 |   if (m_merge_min_heap.empty()) { | 
| 348 |     if ((res = merge_heap_prepare())) { | 
| 349 |       // NO_LINT_DEBUG | 
| 350 |       sql_print_error("Error during preparation of heap." ); | 
| 351 |       return res; | 
| 352 |     } | 
| 353 |  | 
| 354 |     /* | 
| 355 |       Return the first top record without popping, as we haven't put this | 
| 356 |       inside the SST file yet. | 
| 357 |     */ | 
| 358 |     merge_heap_top(key, val); | 
| 359 |     return HA_EXIT_SUCCESS; | 
| 360 |   } | 
| 361 |  | 
| 362 |   DBUG_ASSERT(!m_merge_min_heap.empty()); | 
| 363 |   return merge_heap_pop_and_get_next(key, val); | 
| 364 | } | 
| 365 |  | 
| 366 | /** | 
| 367 |   Get current top record from the heap. | 
| 368 | */ | 
| 369 | void Rdb_index_merge::merge_heap_top(rocksdb::Slice *const key, | 
| 370 |                                      rocksdb::Slice *const val) { | 
| 371 |   DBUG_ASSERT(!m_merge_min_heap.empty()); | 
| 372 |  | 
| 373 |   const std::shared_ptr<merge_heap_entry> &entry = m_merge_min_heap.top(); | 
| 374 |   *key = entry->m_key; | 
| 375 |   *val = entry->m_val; | 
| 376 | } | 
| 377 |  | 
| 378 | /** | 
| 379 |   Pops the top record, and uses it to read next record from the | 
| 380 |   corresponding sort buffer and push onto the heap. | 
| 381 |  | 
| 382 |   Returns -1 when there are no more records in the heap. | 
| 383 | */ | 
| 384 | int Rdb_index_merge::merge_heap_pop_and_get_next(rocksdb::Slice *const key, | 
| 385 |                                                  rocksdb::Slice *const val) { | 
| 386 |   /* | 
| 387 |     Make a new reference to shared ptr so it doesn't get destroyed | 
| 388 |     during pop(). We are going to push this entry back onto the heap. | 
| 389 |   */ | 
| 390 |   const std::shared_ptr<merge_heap_entry> entry = m_merge_min_heap.top(); | 
| 391 |   m_merge_min_heap.pop(); | 
| 392 |  | 
| 393 |   /* | 
| 394 |     We are finished w/ current chunk if: | 
| 395 |     current_offset + disk_offset == m_total_size | 
| 396 |  | 
| 397 |     Return without adding entry back onto heap. | 
| 398 |     If heap is also empty, we must be finished with merge. | 
| 399 |   */ | 
| 400 |   if (entry->m_chunk_info->is_chunk_finished()) { | 
| 401 |     if (m_merge_min_heap.empty()) { | 
| 402 |       return -1; | 
| 403 |     } | 
| 404 |  | 
| 405 |     merge_heap_top(key, val); | 
| 406 |     return HA_EXIT_SUCCESS; | 
| 407 |   } | 
| 408 |  | 
| 409 |   /* | 
| 410 |     Make sure we haven't reached the end of the chunk. | 
| 411 |   */ | 
| 412 |   DBUG_ASSERT(!entry->m_chunk_info->is_chunk_finished()); | 
| 413 |  | 
| 414 |   /* | 
| 415 |     If merge_read_rec fails, it means the either the chunk was cut off | 
| 416 |     or we've reached the end of the respective chunk. | 
| 417 |   */ | 
| 418 |   if (entry->read_rec(&entry->m_key, &entry->m_val)) { | 
| 419 |     if (entry->read_next_chunk_from_disk(m_merge_file.m_fd)) { | 
| 420 |       return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 421 |     } | 
| 422 |  | 
| 423 |     /* Try reading record again, should never fail. */ | 
| 424 |     if (entry->read_rec(&entry->m_key, &entry->m_val)) { | 
| 425 |       return HA_ERR_ROCKSDB_MERGE_FILE_ERR; | 
| 426 |     } | 
| 427 |   } | 
| 428 |  | 
| 429 |   /* Push entry back on to the heap w/ updated buffer + offset ptr */ | 
| 430 |   m_merge_min_heap.push(std::move(entry)); | 
| 431 |  | 
| 432 |   /* Return the current top record on heap */ | 
| 433 |   merge_heap_top(key, val); | 
| 434 |   return HA_EXIT_SUCCESS; | 
| 435 | } | 
| 436 |  | 
| 437 | int Rdb_index_merge::merge_heap_entry::read_next_chunk_from_disk(File fd) { | 
| 438 |   if (m_chunk_info->read_next_chunk_from_disk(fd)) { | 
| 439 |     return HA_EXIT_FAILURE; | 
| 440 |   } | 
| 441 |  | 
| 442 |   m_block = m_chunk_info->m_block.get(); | 
| 443 |   return HA_EXIT_SUCCESS; | 
| 444 | } | 
| 445 |  | 
/*
  Load the next window of this chunk from disk into m_block, advancing the
  on-disk position by however many bytes of the previous window were
  consumed (m_curr_offset).
*/
int Rdb_index_merge::merge_buf_info::read_next_chunk_from_disk(File fd) {
  m_disk_curr_offset += m_curr_offset;

  if (my_seek(fd, m_disk_curr_offset, SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR) {
    // NO_LINT_DEBUG
    sql_print_error("Error seeking to location in merge file on disk." );
    return HA_EXIT_FAILURE;
  }

  /* Overwrite the old block */
  const size_t bytes_read =
      my_read(fd, m_block.get(), m_block_len, MYF(MY_WME));
  if (bytes_read == (size_t)-1) {
    // NO_LINT_DEBUG
    sql_print_error("Error reading merge file from disk." );
    return HA_EXIT_FAILURE;
  }

  /*
    NOTE(review): a short read (bytes_read < m_block_len) is not treated as
    an error here -- presumably the chunk-size bookkeeping prevents reading
    past valid data; confirm against is_chunk_finished()/has_space().
  */
  m_curr_offset = 0;
  return HA_EXIT_SUCCESS;
}
| 467 |  | 
| 468 | /** | 
| 469 |   Get records from offset within sort buffer and compare them. | 
| 470 |   Sort by least to greatest. | 
| 471 | */ | 
| 472 | int Rdb_index_merge::merge_record_compare( | 
| 473 |     const uchar *const a_block, const uchar *const b_block, | 
| 474 |     const rocksdb::Comparator *const comparator) { | 
| 475 |   return comparator->Compare(as_slice(a_block), as_slice(b_block)); | 
| 476 | } | 
| 477 |  | 
| 478 | /** | 
| 479 |   Given an offset in a merge sort buffer, read out the keys + values. | 
| 480 |   After this, block will point to the next record in the buffer. | 
| 481 | **/ | 
| 482 | void Rdb_index_merge::merge_read_rec(const uchar *const block, | 
| 483 |                                      rocksdb::Slice *const key, | 
| 484 |                                      rocksdb::Slice *const val) { | 
| 485 |   /* Read key at block offset into key slice and the value into value slice*/ | 
| 486 |   read_slice(key, block); | 
| 487 |   read_slice(val, block + RDB_MERGE_REC_DELIMITER + key->size()); | 
| 488 | } | 
| 489 |  | 
| 490 | void Rdb_index_merge::read_slice(rocksdb::Slice *slice, | 
| 491 |                                  const uchar *block_ptr) { | 
| 492 |   uint64 slice_len; | 
| 493 |   merge_read_uint64(&block_ptr, &slice_len); | 
| 494 |  | 
| 495 |   *slice = rocksdb::Slice(reinterpret_cast<const char *>(block_ptr), slice_len); | 
| 496 | } | 
| 497 |  | 
/*
  Read the next key/value record from this entry's chunk buffer, advancing
  the chunk offset and block cursor past it.

  Reads are transactional: if the value does not fit in the remaining chunk
  window, the offset/cursor are rolled back to their pre-key state so the
  caller can refill the chunk from disk and re-read the whole record.
*/
int Rdb_index_merge::merge_heap_entry::read_rec(rocksdb::Slice *const key,
                                                rocksdb::Slice *const val) {
  const uchar *block_ptr = m_block;
  /* Snapshot state so a partial (key-only) read can be undone. */
  const auto orig_offset = m_chunk_info->m_curr_offset;
  const auto orig_block = m_block;

  /* Read key at block offset into key slice and the value into value slice*/
  if (read_slice(key, &block_ptr) != 0) {
    return HA_EXIT_FAILURE;
  }

  /* Commit the key: advance by exactly the bytes read_slice consumed. */
  m_chunk_info->m_curr_offset += (uintptr_t)block_ptr - (uintptr_t)m_block;
  m_block += (uintptr_t)block_ptr - (uintptr_t)m_block;

  if (read_slice(val, &block_ptr) != 0) {
    /* Value truncated by the chunk window: roll back to pre-key state. */
    m_chunk_info->m_curr_offset = orig_offset;
    m_block = orig_block;
    return HA_EXIT_FAILURE;
  }

  /* Commit the value the same way. */
  m_chunk_info->m_curr_offset += (uintptr_t)block_ptr - (uintptr_t)m_block;
  m_block += (uintptr_t)block_ptr - (uintptr_t)m_block;

  return HA_EXIT_SUCCESS;
}
| 523 |  | 
/*
  Bounds-checked variant of read_slice for chunk buffers: fails (instead of
  over-reading) when the record extends past the data currently loaded in
  the chunk, advancing *block_ptr past the record on success.
*/
int Rdb_index_merge::merge_heap_entry::read_slice(rocksdb::Slice *const slice,
                                                  const uchar **block_ptr) {
  /* First make sure the length delimiter itself is within the chunk. */
  if (!m_chunk_info->has_space(RDB_MERGE_REC_DELIMITER)) {
    return HA_EXIT_FAILURE;
  }

  uint64 slice_len;
  merge_read_uint64(block_ptr, &slice_len);
  /* ...then that delimiter + payload fit before handing out a pointer. */
  if (!m_chunk_info->has_space(RDB_MERGE_REC_DELIMITER + slice_len)) {
    return HA_EXIT_FAILURE;
  }

  /* Slice aliases the chunk buffer directly; no copy is made. */
  *slice =
      rocksdb::Slice(reinterpret_cast<const char *>(*block_ptr), slice_len);
  *block_ptr += slice_len;
  return HA_EXIT_SUCCESS;
}
| 541 |  | 
| 542 | size_t Rdb_index_merge::merge_heap_entry::prepare(File fd, ulonglong f_offset, | 
| 543 |                                                   ulonglong chunk_size) { | 
| 544 |   m_chunk_info = std::make_shared<merge_buf_info>(chunk_size); | 
| 545 |   const size_t res = m_chunk_info->prepare(fd, f_offset); | 
| 546 |   if (res != (size_t)-1) { | 
| 547 |     m_block = m_chunk_info->m_block.get() + RDB_MERGE_CHUNK_LEN; | 
| 548 |   } | 
| 549 |  | 
| 550 |   return res; | 
| 551 | } | 
| 552 |  | 
/*
  Fill this buffer from the merge file starting at f_offset and decode the
  chunk header.

  Side effect: m_total_size is REPURPOSED here -- it changes from the
  allocated buffer capacity to the actual chunk payload size that
  merge_buf_write() stored in the first 8 bytes.
*/
size_t Rdb_index_merge::merge_buf_info::prepare(File fd, ulonglong f_offset) {
  m_disk_start_offset = f_offset;
  m_disk_curr_offset = f_offset;

  /*
    Need to position cursor to the chunk it needs to be at on filesystem
    then read 'chunk_size' bytes into the respective chunk buffer.
  */
  if (my_seek(fd, f_offset, SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR) {
    // NO_LINT_DEBUG
    sql_print_error("Error seeking to location in merge file on disk." );
    return (size_t)-1;
  }

  /*
    NOTE(review): only a hard read error ((size_t)-1) is detected; a short
    read is accepted -- presumably safe because the embedded chunk size
    below bounds consumption; confirm with has_space()/is_chunk_finished().
  */
  const size_t bytes_read =
      my_read(fd, m_block.get(), m_total_size, MYF(MY_WME));
  if (bytes_read == (size_t)-1) {
    // NO_LINT_DEBUG
    sql_print_error("Error reading merge file from disk." );
    return (size_t)-1;
  }

  /*
    Read the first 8 bytes of each chunk, this gives us the actual
    size of each chunk.
  */
  const uchar *block_ptr = m_block.get();
  merge_read_uint64(&block_ptr, &m_total_size);
  m_curr_offset += RDB_MERGE_CHUNK_LEN;
  return m_total_size;
}
| 584 |  | 
| 585 | /* Store key and value w/ their respective delimiters at the given offset */ | 
| 586 | void Rdb_index_merge::merge_buf_info::store_key_value( | 
| 587 |     const rocksdb::Slice &key, const rocksdb::Slice &val) { | 
| 588 |   store_slice(key); | 
| 589 |   store_slice(val); | 
| 590 | } | 
| 591 |  | 
| 592 | void Rdb_index_merge::merge_buf_info::store_slice(const rocksdb::Slice &slice) { | 
| 593 |   /* Store length delimiter */ | 
| 594 |   merge_store_uint64(&m_block[m_curr_offset], slice.size()); | 
| 595 |  | 
| 596 |   /* Store slice data */ | 
| 597 |   memcpy(&m_block[m_curr_offset + RDB_MERGE_REC_DELIMITER], slice.data(), | 
| 598 |          slice.size()); | 
| 599 |  | 
| 600 |   m_curr_offset += slice.size() + RDB_MERGE_REC_DELIMITER; | 
| 601 | } | 
| 602 |  | 
| 603 | void Rdb_index_merge::merge_reset() { | 
| 604 |   /* | 
| 605 |     Either error, or all values in the sort buffer have been written to disk, | 
| 606 |     so we need to clear the offset tree. | 
| 607 |   */ | 
| 608 |   m_offset_tree.clear(); | 
| 609 |  | 
| 610 |   /* Reset sort buffer block */ | 
| 611 |   if (m_rec_buf_unsorted && m_rec_buf_unsorted->m_block) { | 
| 612 |     m_rec_buf_unsorted->m_curr_offset = 0; | 
| 613 |   } | 
| 614 |  | 
| 615 |   /* Reset output buf */ | 
| 616 |   if (m_output_buf && m_output_buf->m_block) { | 
| 617 |     m_output_buf->m_curr_offset = 0; | 
| 618 |   } | 
| 619 | } | 
| 620 |  | 
| 621 | } // namespace myrocks | 
| 622 |  |