/*
   Copyright (c) 2016, Facebook, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */

#include <my_global.h>

/* This C++ file's header file */
#include "./rdb_index_merge.h"

/* MySQL header files */
#include "../sql/sql_class.h"

/* MyRocks header files */
#include "./ha_rocksdb.h"
#include "./rdb_datadic.h"

namespace myrocks {

Rdb_index_merge::Rdb_index_merge(const char *const tmpfile_path,
                                 const ulonglong &merge_buf_size,
                                 const ulonglong &merge_combine_read_size,
                                 const ulonglong &merge_tmp_file_removal_delay,
                                 rocksdb::ColumnFamilyHandle *cf)
    : m_tmpfile_path(tmpfile_path), m_merge_buf_size(merge_buf_size),
      m_merge_combine_read_size(merge_combine_read_size),
      m_merge_tmp_file_removal_delay(merge_tmp_file_removal_delay),
      m_cf_handle(cf), m_rec_buf_unsorted(nullptr), m_output_buf(nullptr) {}

Rdb_index_merge::~Rdb_index_merge() {
  /*
    If merge_tmp_file_removal_delay is set, sleep between calls to my_chsize.

    This helps mitigate potential trim stalls on flash when large files are
    being deleted too quickly.
  */
  if (m_merge_tmp_file_removal_delay > 0) {
    uint64 curr_size = m_merge_buf_size * m_merge_file.m_num_sort_buffers;
    for (uint i = 0; i < m_merge_file.m_num_sort_buffers; i++) {
      if (my_chsize(m_merge_file.m_fd, curr_size, 0, MYF(MY_WME))) {
        // NO_LINT_DEBUG
        sql_print_error("Error truncating file during fast index creation.");
      }

      my_sleep(m_merge_tmp_file_removal_delay * 1000);
      curr_size -= m_merge_buf_size;
    }
  }

  /*
    Close the file descriptor. We don't need to worry about deletion;
    MySQL handles it.
  */
  my_close(m_merge_file.m_fd, MYF(MY_WME));
}

int Rdb_index_merge::init() {
  /*
    Create a temporary merge file on disk to store sorted chunks during
    inplace index creation.
  */
  if (merge_file_create()) {
    return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
  }

  /*
    Then, allocate a buffer to store unsorted records before they are written
    to disk. They will be written to disk sorted. A sorted tree is used to
    keep track of the offset of each record within the unsorted buffer.
  */
  m_rec_buf_unsorted =
      std::shared_ptr<merge_buf_info>(new merge_buf_info(m_merge_buf_size));

  /*
    Allocate the output buffer that will contain the sorted block that is
    written to disk.
  */
  m_output_buf =
      std::shared_ptr<merge_buf_info>(new merge_buf_info(m_merge_buf_size));

  return HA_EXIT_SUCCESS;
}
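
/*
  Illustrative usage sketch (not part of the build): roughly how a caller is
  expected to drive this class during inplace index creation. The real call
  sites live in ha_rocksdb.cc; names such as `report_error` below are
  placeholders, not functions defined in MyRocks.

    Rdb_index_merge merge(tmpfile_path, merge_buf_size,
                          merge_combine_read_size,
                          merge_tmp_file_removal_delay, cf_handle);
    if (merge.init()) return report_error();

    // Feed every (key, value) pair produced by the table scan.
    for each row:
      if (merge.add(key, val)) return report_error();

    // Drain the merged, globally sorted stream; next() returns -1 when done.
    rocksdb::Slice merge_key;
    rocksdb::Slice merge_val;
    while (merge.next(&merge_key, &merge_val) == 0) {
      // hand (merge_key, merge_val) to the SST file writer
    }
*/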

/**
  Create a merge file in the given location.
*/
int Rdb_index_merge::merge_file_create() {
  DBUG_ASSERT(m_merge_file.m_fd == -1);

  int fd;
#ifdef MARIAROCKS_NOT_YET // mysql_tmpfile_path use
  /* If no path is set for the tmpfile, use mysql_tmpdir by default */
  if (m_tmpfile_path == nullptr) {
    fd = mysql_tmpfile("myrocks");
  } else {
    fd = mysql_tmpfile_path(m_tmpfile_path, "myrocks");
  }
#else
  fd = mysql_tmpfile("myrocks");
#endif
  if (fd < 0) {
    // NO_LINT_DEBUG
    sql_print_error("Failed to create temp file during fast index creation.");
    return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
  }

  m_merge_file.m_fd = fd;
  m_merge_file.m_num_sort_buffers = 0;

  return HA_EXIT_SUCCESS;
}

/**
  Add a record to the offset tree (and unsorted merge buffer) in preparation
  for writing it out to disk in sorted chunks.

  If the in-memory buffer is full, write the buffer out to disk sorted using
  the offset tree, and clear the tree. (Happens in merge_buf_write)
*/
int Rdb_index_merge::add(const rocksdb::Slice &key, const rocksdb::Slice &val) {
  /* Adding a record after the heap has been created results in an error */
  DBUG_ASSERT(m_merge_min_heap.empty());

  /*
    Check if the sort buffer is going to run out of space; if so, write it
    out to disk in sorted order using the offset tree.
  */
  const uint total_offset = RDB_MERGE_CHUNK_LEN +
                            m_rec_buf_unsorted->m_curr_offset +
                            RDB_MERGE_KEY_DELIMITER + RDB_MERGE_VAL_DELIMITER +
                            key.size() + val.size();
  if (total_offset >= m_rec_buf_unsorted->m_total_size) {
    /*
      If the offset tree is empty here, that means that the proposed key to
      add is too large for the buffer.
    */
    if (m_offset_tree.empty()) {
      // NO_LINT_DEBUG
      sql_print_error("Sort buffer size is too small to process merge. "
                      "Please set merge buffer size to a higher value.");
      return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
    }

    if (merge_buf_write()) {
      // NO_LINT_DEBUG
      sql_print_error("Error writing sort buffer to disk.");
      return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
    }
  }

  const ulonglong rec_offset = m_rec_buf_unsorted->m_curr_offset;

  /*
    Store the key and value in the temporary unsorted in-memory buffer; the
    offset tree points at each record through its offset in that buffer.
  */
  m_rec_buf_unsorted->store_key_value(key, val);

  /* Find the sort order of the new record */
  auto res =
      m_offset_tree.emplace(m_rec_buf_unsorted->m_block.get() + rec_offset,
                            m_cf_handle->GetComparator());
  if (!res.second) {
    my_printf_error(ER_DUP_ENTRY,
                    "Failed to insert the record: the key already exists",
                    MYF(0));
    return ER_DUP_ENTRY;
  }

  return HA_EXIT_SUCCESS;
}

/**
  Sort + write merge buffer chunk out to disk.
*/
int Rdb_index_merge::merge_buf_write() {
  DBUG_ASSERT(m_merge_file.m_fd != -1);
  DBUG_ASSERT(m_rec_buf_unsorted != nullptr);
  DBUG_ASSERT(m_output_buf != nullptr);
  DBUG_ASSERT(!m_offset_tree.empty());

  /* Write the actual chunk size to the first 8 bytes of the merge buffer */
  merge_store_uint64(m_output_buf->m_block.get(),
                     m_rec_buf_unsorted->m_curr_offset + RDB_MERGE_CHUNK_LEN);
  m_output_buf->m_curr_offset += RDB_MERGE_CHUNK_LEN;

  /*
    Iterate through the offset tree. Records should be ordered by the
    secondary key at this point.
  */
  for (const auto &rec : m_offset_tree) {
    DBUG_ASSERT(m_output_buf->m_curr_offset <= m_merge_buf_size);

    /* Read the record from its offset (should never fail) */
    rocksdb::Slice key;
    rocksdb::Slice val;
    merge_read_rec(rec.m_block, &key, &val);

    /* Store the key and value into the sorted output buffer */
    m_output_buf->store_key_value(key, val);
  }

  DBUG_ASSERT(m_output_buf->m_curr_offset <= m_output_buf->m_total_size);

  /*
    Write the output buffer to disk.

    We need to position the cursor at the on-disk offset of this chunk,
    then write the sorted buffer there.
  */
  if (my_seek(m_merge_file.m_fd,
              m_merge_file.m_num_sort_buffers * m_merge_buf_size, SEEK_SET,
              MYF(0)) == MY_FILEPOS_ERROR) {
    // NO_LINT_DEBUG
    sql_print_error("Error seeking to location in merge file on disk.");
    return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
  }

  /*
    Sync the file after each write to flush the data out. Otherwise, the
    filesystem cache could flush out all of the files at the same time,
    causing a write burst.
  */
  if (my_write(m_merge_file.m_fd, m_output_buf->m_block.get(),
               m_output_buf->m_total_size, MYF(MY_WME | MY_NABP)) ||
      mysql_file_sync(m_merge_file.m_fd, MYF(MY_WME))) {
    // NO_LINT_DEBUG
    sql_print_error("Error writing sorted merge buffer to disk.");
    return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
  }

  /* Increment the count of sort buffers written out to disk */
  m_merge_file.m_num_sort_buffers += 1;

  /* Reset everything for the next run */
  merge_reset();

  return HA_EXIT_SUCCESS;
}
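
/*
  On-disk layout note (illustrative only, inferred from merge_buf_write above;
  this assumes RDB_MERGE_CHUNK_LEN is the 8-byte size written by
  merge_store_uint64): each sorted chunk occupies a fixed slot of
  m_merge_buf_size bytes at offset (chunk index) * m_merge_buf_size, e.g.

    [chunk 0 @ 0]                    [chunk 1 @ m_merge_buf_size]      ...
    [len | rec | rec | ... | slack]  [len | rec | rec | ... | slack]   ...

  where "len" is the number of bytes actually used in the chunk (including the
  length prefix itself) and "slack" is the unused remainder of the slot, since
  the whole output buffer is written each time.
*/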

/**
  Prepare an n-way merge of n sorted buffers on disk, using a heap sorted by
  secondary key records.
*/
int Rdb_index_merge::merge_heap_prepare() {
  DBUG_ASSERT(m_merge_min_heap.empty());

  /*
    If the offset tree is not empty, there are still some records that need
    to be written to disk. Write them out now.
  */
  if (!m_offset_tree.empty() && merge_buf_write()) {
    return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
  }

  DBUG_ASSERT(m_merge_file.m_num_sort_buffers > 0);

  /*
    For an n-way merge, we need to read chunks from each sorted buffer on
    disk simultaneously.
  */
  ulonglong chunk_size =
      m_merge_combine_read_size / m_merge_file.m_num_sort_buffers;
  if (chunk_size >= m_merge_buf_size) {
    chunk_size = m_merge_buf_size;
  }

  /* Allocate buffers for each chunk */
  for (ulonglong i = 0; i < m_merge_file.m_num_sort_buffers; i++) {
    const auto entry =
        std::make_shared<merge_heap_entry>(m_cf_handle->GetComparator());

    /*
      Read chunk_size bytes from each chunk on disk, and place them inside
      the respective chunk buffer.
    */
    const size_t total_size =
        entry->prepare(m_merge_file.m_fd, i * m_merge_buf_size, chunk_size);

    if (total_size == (size_t)-1) {
      return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
    }

    /* Can reach this condition if an index was added on a table with no rows */
    if (total_size - RDB_MERGE_CHUNK_LEN == 0) {
      break;
    }

    /* Read the first record from each buffer to initially populate the heap */
    if (entry->read_rec(&entry->m_key, &entry->m_val)) {
      // NO_LINT_DEBUG
      sql_print_error("Chunk size is too small to process merge.");
      return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
    }

    m_merge_min_heap.push(std::move(entry));
  }

  return HA_EXIT_SUCCESS;
}
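
/*
  Merge-phase note (illustrative sketch only): the min-heap holds one
  merge_heap_entry per on-disk chunk, ordered by each entry's current key
  using the column family comparator. next() then performs a classic k-way
  merge, conceptually:

    pop the entry with the smallest current key;
    emit (entry->m_key, entry->m_val);
    advance that entry to its next record, refilling its chunk buffer from
    disk if the current buffer is exhausted;
    push the entry back onto the heap unless its chunk is finished.
*/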

/**
  Create and/or iterate through keys in the merge heap.
*/
int Rdb_index_merge::next(rocksdb::Slice *const key,
                          rocksdb::Slice *const val) {
  /*
    If the table fits in one sort buffer, we can optimize by feeding the sort
    buffer directly to the sstfilewriter instead of creating tmp files and a
    heap to merge the sort buffers.

    If there are no sort buffer records (alters on empty tables), also exit
    here.
  */
  if (m_merge_file.m_num_sort_buffers == 0) {
    if (m_offset_tree.empty()) {
      return -1;
    }

    const auto rec = m_offset_tree.begin();

    /* Read the record from its offset */
    merge_read_rec(rec->m_block, key, val);

    m_offset_tree.erase(rec);
    return HA_EXIT_SUCCESS;
  }

  int res;

  /*
    If the heap and heap chunk info are empty, we must be beginning the merge
    phase of the external sort. Populate the heap with initial values from
    each disk chunk.
  */
  if (m_merge_min_heap.empty()) {
    if ((res = merge_heap_prepare())) {
      // NO_LINT_DEBUG
      sql_print_error("Error during preparation of heap.");
      return res;
    }

    /*
      Return the first top record without popping, as we haven't put it
      inside the SST file yet.
    */
    merge_heap_top(key, val);
    return HA_EXIT_SUCCESS;
  }

  DBUG_ASSERT(!m_merge_min_heap.empty());
  return merge_heap_pop_and_get_next(key, val);
}

/**
  Get the current top record from the heap.
*/
void Rdb_index_merge::merge_heap_top(rocksdb::Slice *const key,
                                     rocksdb::Slice *const val) {
  DBUG_ASSERT(!m_merge_min_heap.empty());

  const std::shared_ptr<merge_heap_entry> &entry = m_merge_min_heap.top();
  *key = entry->m_key;
  *val = entry->m_val;
}

/**
  Pops the top record, and uses it to read the next record from the
  corresponding sort buffer and push it onto the heap.

  Returns -1 when there are no more records in the heap.
*/
int Rdb_index_merge::merge_heap_pop_and_get_next(rocksdb::Slice *const key,
                                                 rocksdb::Slice *const val) {
  /*
    Make a new reference to the shared ptr so it doesn't get destroyed
    during pop(). We are going to push this entry back onto the heap.
  */
  const std::shared_ptr<merge_heap_entry> entry = m_merge_min_heap.top();
  m_merge_min_heap.pop();

  /*
    We are finished with the current chunk if:
    current_offset + disk_offset == m_total_size

    Return without adding the entry back onto the heap.
    If the heap is also empty, we must be finished with the merge.
  */
  if (entry->m_chunk_info->is_chunk_finished()) {
    if (m_merge_min_heap.empty()) {
      return -1;
    }

    merge_heap_top(key, val);
    return HA_EXIT_SUCCESS;
  }

  /*
    Make sure we haven't reached the end of the chunk.
  */
  DBUG_ASSERT(!entry->m_chunk_info->is_chunk_finished());

  /*
    If read_rec fails, it means either the chunk was cut off or we've
    reached the end of the respective chunk.
  */
  if (entry->read_rec(&entry->m_key, &entry->m_val)) {
    if (entry->read_next_chunk_from_disk(m_merge_file.m_fd)) {
      return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
    }

    /* Try reading the record again; this should never fail. */
    if (entry->read_rec(&entry->m_key, &entry->m_val)) {
      return HA_ERR_ROCKSDB_MERGE_FILE_ERR;
    }
  }

  /* Push the entry back onto the heap with updated buffer + offset ptr */
  m_merge_min_heap.push(std::move(entry));

  /* Return the current top record on the heap */
  merge_heap_top(key, val);
  return HA_EXIT_SUCCESS;
}

int Rdb_index_merge::merge_heap_entry::read_next_chunk_from_disk(File fd) {
  if (m_chunk_info->read_next_chunk_from_disk(fd)) {
    return HA_EXIT_FAILURE;
  }

  m_block = m_chunk_info->m_block.get();
  return HA_EXIT_SUCCESS;
}

int Rdb_index_merge::merge_buf_info::read_next_chunk_from_disk(File fd) {
  m_disk_curr_offset += m_curr_offset;

  if (my_seek(fd, m_disk_curr_offset, SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR) {
    // NO_LINT_DEBUG
    sql_print_error("Error seeking to location in merge file on disk.");
    return HA_EXIT_FAILURE;
  }

  /* Overwrite the old block */
  const size_t bytes_read =
      my_read(fd, m_block.get(), m_block_len, MYF(MY_WME));
  if (bytes_read == (size_t)-1) {
    // NO_LINT_DEBUG
    sql_print_error("Error reading merge file from disk.");
    return HA_EXIT_FAILURE;
  }

  m_curr_offset = 0;
  return HA_EXIT_SUCCESS;
}

/**
  Get records from their offsets within the sort buffer and compare them.
  Sorts from least to greatest.
*/
int Rdb_index_merge::merge_record_compare(
    const uchar *const a_block, const uchar *const b_block,
    const rocksdb::Comparator *const comparator) {
  return comparator->Compare(as_slice(a_block), as_slice(b_block));
}

/**
  Given an offset in a merge sort buffer, read out the key and value.
  The returned slices point directly into the buffer.
*/
void Rdb_index_merge::merge_read_rec(const uchar *const block,
                                     rocksdb::Slice *const key,
                                     rocksdb::Slice *const val) {
  /* Read key at block offset into key slice and the value into value slice */
  read_slice(key, block);
  read_slice(val, block + RDB_MERGE_REC_DELIMITER + key->size());
}

void Rdb_index_merge::read_slice(rocksdb::Slice *slice,
                                 const uchar *block_ptr) {
  uint64 slice_len;
  merge_read_uint64(&block_ptr, &slice_len);

  *slice = rocksdb::Slice(reinterpret_cast<const char *>(block_ptr), slice_len);
}

int Rdb_index_merge::merge_heap_entry::read_rec(rocksdb::Slice *const key,
                                                rocksdb::Slice *const val) {
  const uchar *block_ptr = m_block;
  const auto orig_offset = m_chunk_info->m_curr_offset;
  const auto orig_block = m_block;

  /* Read key at block offset into key slice and the value into value slice */
  if (read_slice(key, &block_ptr) != 0) {
    return HA_EXIT_FAILURE;
  }

  m_chunk_info->m_curr_offset += (uintptr_t)block_ptr - (uintptr_t)m_block;
  m_block += (uintptr_t)block_ptr - (uintptr_t)m_block;

  if (read_slice(val, &block_ptr) != 0) {
    m_chunk_info->m_curr_offset = orig_offset;
    m_block = orig_block;
    return HA_EXIT_FAILURE;
  }

  m_chunk_info->m_curr_offset += (uintptr_t)block_ptr - (uintptr_t)m_block;
  m_block += (uintptr_t)block_ptr - (uintptr_t)m_block;

  return HA_EXIT_SUCCESS;
}

int Rdb_index_merge::merge_heap_entry::read_slice(rocksdb::Slice *const slice,
                                                  const uchar **block_ptr) {
  if (!m_chunk_info->has_space(RDB_MERGE_REC_DELIMITER)) {
    return HA_EXIT_FAILURE;
  }

  uint64 slice_len;
  merge_read_uint64(block_ptr, &slice_len);
  if (!m_chunk_info->has_space(RDB_MERGE_REC_DELIMITER + slice_len)) {
    return HA_EXIT_FAILURE;
  }

  *slice =
      rocksdb::Slice(reinterpret_cast<const char *>(*block_ptr), slice_len);
  *block_ptr += slice_len;
  return HA_EXIT_SUCCESS;
}

size_t Rdb_index_merge::merge_heap_entry::prepare(File fd, ulonglong f_offset,
                                                  ulonglong chunk_size) {
  m_chunk_info = std::make_shared<merge_buf_info>(chunk_size);
  const size_t res = m_chunk_info->prepare(fd, f_offset);
  if (res != (size_t)-1) {
    m_block = m_chunk_info->m_block.get() + RDB_MERGE_CHUNK_LEN;
  }

  return res;
}

size_t Rdb_index_merge::merge_buf_info::prepare(File fd, ulonglong f_offset) {
  m_disk_start_offset = f_offset;
  m_disk_curr_offset = f_offset;

  /*
    We need to position the cursor at the on-disk offset of this chunk,
    then read 'chunk_size' bytes into the respective chunk buffer.
  */
  if (my_seek(fd, f_offset, SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR) {
    // NO_LINT_DEBUG
    sql_print_error("Error seeking to location in merge file on disk.");
    return (size_t)-1;
  }

  const size_t bytes_read =
      my_read(fd, m_block.get(), m_total_size, MYF(MY_WME));
  if (bytes_read == (size_t)-1) {
    // NO_LINT_DEBUG
    sql_print_error("Error reading merge file from disk.");
    return (size_t)-1;
  }

  /*
    Read the first 8 bytes of the chunk; this gives us the actual size of
    the chunk.
  */
  const uchar *block_ptr = m_block.get();
  merge_read_uint64(&block_ptr, &m_total_size);
  m_curr_offset += RDB_MERGE_CHUNK_LEN;
  return m_total_size;
}

/* Store key and value with their length delimiters at the current offset */
void Rdb_index_merge::merge_buf_info::store_key_value(
    const rocksdb::Slice &key, const rocksdb::Slice &val) {
  store_slice(key);
  store_slice(val);
}

void Rdb_index_merge::merge_buf_info::store_slice(const rocksdb::Slice &slice) {
  /* Store the length delimiter */
  merge_store_uint64(&m_block[m_curr_offset], slice.size());

  /* Store the slice data */
  memcpy(&m_block[m_curr_offset + RDB_MERGE_REC_DELIMITER], slice.data(),
         slice.size());

  m_curr_offset += slice.size() + RDB_MERGE_REC_DELIMITER;
}
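
/*
  Buffer layout note (illustrative only, inferred from store_slice above;
  this assumes RDB_MERGE_REC_DELIMITER is the size of the uint64 length
  prefix written by merge_store_uint64): a record placed by store_key_value()
  looks like

    [key_len (8)] [key bytes ...] [val_len (8)] [val bytes ...]

  and read_slice() / merge_read_rec() parse exactly this layout back out.
*/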

void Rdb_index_merge::merge_reset() {
  /*
    Either an error occurred, or all values in the sort buffer have been
    written to disk; in both cases we need to clear the offset tree.
  */
  m_offset_tree.clear();

  /* Reset the sort buffer block */
  if (m_rec_buf_unsorted && m_rec_buf_unsorted->m_block) {
    m_rec_buf_unsorted->m_curr_offset = 0;
  }

  /* Reset the output buf */
  if (m_output_buf && m_output_buf->m_block) {
    m_output_buf->m_curr_offset = 0;
  }
}

} // namespace myrocks