| 1 | /*********************************************************************** |
| 2 | |
| 3 | Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2009, Percona Inc. |
| 5 | Copyright (c) 2013, 2018, MariaDB Corporation. |
| 6 | |
| 7 | Portions of this file contain modifications contributed and copyrighted |
| 8 | by Percona Inc.. Those modifications are |
| 9 | gratefully acknowledged and are described briefly in the InnoDB |
| 10 | documentation. The contributions by Percona Inc. are incorporated with |
| 11 | their permission, and subject to the conditions contained in the file |
| 12 | COPYING.Percona. |
| 13 | |
| 14 | This program is free software; you can redistribute it and/or modify it |
| 15 | under the terms of the GNU General Public License as published by the |
| 16 | Free Software Foundation; version 2 of the License. |
| 17 | |
| 18 | This program is distributed in the hope that it will be useful, but |
| 19 | WITHOUT ANY WARRANTY; without even the implied warranty of |
| 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General |
| 21 | Public License for more details. |
| 22 | |
| 23 | You should have received a copy of the GNU General Public License along with |
| 24 | this program; if not, write to the Free Software Foundation, Inc., |
| 25 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 26 | |
| 27 | ***********************************************************************/ |
| 28 | |
| 29 | /**************************************************//** |
| 30 | @file os/os0file.cc |
| 31 | The interface to the operating system file i/o primitives |
| 32 | |
| 33 | Created 10/21/1995 Heikki Tuuri |
| 34 | *******************************************************/ |
| 35 | |
| 36 | #ifndef UNIV_INNOCHECKSUM |
| 37 | |
| 38 | #include "ha_prototypes.h" |
| 39 | #include "sql_const.h" |
| 40 | |
| 41 | #include "os0file.h" |
| 42 | |
| 43 | #ifdef UNIV_LINUX |
| 44 | #include <sys/types.h> |
| 45 | #include <sys/stat.h> |
| 46 | #endif |
| 47 | |
| 48 | #include "srv0srv.h" |
| 49 | #include "srv0start.h" |
| 50 | #include "fil0fil.h" |
| 51 | #include "fil0crypt.h" |
| 52 | #include "fsp0fsp.h" |
| 53 | #include "fil0pagecompress.h" |
| 54 | #include "srv0srv.h" |
| 55 | #ifdef HAVE_LINUX_UNISTD_H |
| 56 | #include "unistd.h" |
| 57 | #endif |
| 58 | #include "os0event.h" |
| 59 | #include "os0thread.h" |
| 60 | |
| 61 | #include <vector> |
| 62 | |
| 63 | #ifdef LINUX_NATIVE_AIO |
| 64 | #include <libaio.h> |
| 65 | #endif /* LINUX_NATIVE_AIO */ |
| 66 | |
| 67 | #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE |
| 68 | # include <fcntl.h> |
| 69 | # include <linux/falloc.h> |
| 70 | #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ |
| 71 | |
| 72 | #if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) |
| 73 | # include <sys/ioctl.h> |
| 74 | # ifndef DFS_IOCTL_ATOMIC_WRITE_SET |
| 75 | # define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) |
| 76 | # endif |
| 77 | #endif |
| 78 | |
| 79 | #if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) |
| 80 | #include <sys/statvfs.h> |
| 81 | #endif |
| 82 | |
| 83 | #if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H) |
| 84 | #include <linux/falloc.h> |
| 85 | #endif |
| 86 | |
| 87 | #ifdef _WIN32 |
| 88 | #include <winioctl.h> |
| 89 | #endif |
| 90 | |
| 91 | /** Insert buffer segment id */ |
| 92 | static const ulint IO_IBUF_SEGMENT = 0; |
| 93 | |
| 94 | /** Log segment id */ |
| 95 | static const ulint IO_LOG_SEGMENT = 1; |
| 96 | |
| 97 | /** Number of retries for partial I/O's */ |
| 98 | static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10; |
| 99 | |
| 100 | /* This specifies the file permissions InnoDB uses when it creates files in |
| 101 | Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to |
| 102 | my_umask */ |
| 103 | |
| 104 | #ifndef _WIN32 |
| 105 | /** Umask for creating files */ |
| 106 | static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; |
| 107 | #else |
| 108 | /** Umask for creating files */ |
| 109 | static ulint os_innodb_umask = 0; |
| 110 | static HANDLE data_completion_port; |
| 111 | static HANDLE log_completion_port; |
| 112 | |
| 113 | static DWORD fls_sync_io = FLS_OUT_OF_INDEXES; |
| 114 | #define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1 |
| 115 | #endif /* _WIN32 */ |
| 116 | |
| 117 | /** In simulated aio, merge at most this many consecutive i/os */ |
| 118 | static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64; |
| 119 | |
| 120 | /** Flag indicating if the page_cleaner is in active state. */ |
| 121 | extern bool buf_page_cleaner_is_active; |
| 122 | |
| 123 | #ifdef WITH_INNODB_DISALLOW_WRITES |
| 124 | #define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) |
| 125 | #else |
| 126 | #define WAIT_ALLOW_WRITES() do { } while (0) |
| 127 | #endif /* WITH_INNODB_DISALLOW_WRITES */ |
| 128 | |
| 129 | /********************************************************************** |
| 130 | |
| 131 | InnoDB AIO Implementation: |
| 132 | ========================= |
| 133 | |
| 134 | We support native AIO for Windows and Linux. For rest of the platforms |
| 135 | we simulate AIO by special IO-threads servicing the IO-requests. |
| 136 | |
| 137 | Simulated AIO: |
| 138 | ============== |
| 139 | |
| 140 | On platforms where we 'simulate' AIO, the following is a rough explanation |
| 141 | of the high level design. |
| 142 | There are four io-threads (for ibuf, log, read, write). |
| 143 | All synchronous IO requests are serviced by the calling thread using |
| 144 | os_file_write/os_file_read. The Asynchronous requests are queued up |
| 145 | in an array (there are four such arrays) by the calling thread. |
| 146 | Later these requests are picked up by the IO-thread and are serviced |
| 147 | synchronously. |
| 148 | |
| 149 | Windows native AIO: |
| 150 | ================== |
| 151 | |
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO, no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
| 157 | There are innodb_file_io_threads helper threads. These threads work |
| 158 | on the four arrays mentioned above in Simulated AIO. No thread is |
| 159 | required for the sync array. |
| 160 | If a synchronous IO request is made, it is first queued in the sync |
| 161 | array. Then the calling thread itself waits on the request, thus |
| 162 | making the call synchronous. |
| 163 | If an AIO request is made the calling thread not only queues it in the |
| 164 | array but also submits the requests. The helper thread then collects |
| 165 | the completed IO request and calls completion routine on it. |
| 166 | |
| 167 | Linux native AIO: |
| 168 | ================= |
| 169 | |
| 170 | If we have libaio installed on the system and innodb_use_native_aio |
| 171 | is set to true we follow the code path of native AIO, otherwise we |
| 172 | do simulated AIO. |
| 173 | There are innodb_file_io_threads helper threads. These threads work |
| 174 | on the four arrays mentioned above in Simulated AIO. |
| 175 | If a synchronous IO request is made, it is handled by calling |
| 176 | os_file_write/os_file_read. |
| 177 | If an AIO request is made the calling thread not only queues it in the |
| 178 | array but also submits the requests. The helper thread then collects |
| 179 | the completed IO request and calls completion routine on it. |
| 180 | |
| 181 | **********************************************************************/ |
| 182 | |
| 183 | |
#ifdef UNIV_PFS_IO
/* Performance schema keys under which InnoDB registers its file I/O:
one each for data files, log files and temporary files. */
mysql_pfs_key_t	innodb_data_file_key,
		innodb_log_file_key,
		innodb_temp_file_key;
#endif /* UNIV_PFS_IO */
| 190 | |
| 191 | class AIO; |
| 192 | |
| 193 | /** The asynchronous I/O context */ |
| 194 | struct Slot { |
| 195 | |
| 196 | #ifdef WIN_ASYNC_IO |
| 197 | /** Windows control block for the aio request |
| 198 | must be at the very start of Slot, so we can |
| 199 | cast Slot* to OVERLAPPED* |
| 200 | */ |
| 201 | OVERLAPPED control; |
| 202 | #endif |
| 203 | |
| 204 | /** index of the slot in the aio array */ |
| 205 | uint16_t pos; |
| 206 | |
| 207 | /** true if this slot is reserved */ |
| 208 | bool is_reserved; |
| 209 | |
| 210 | /** time when reserved */ |
| 211 | time_t reservation_time; |
| 212 | |
| 213 | /** buffer used in i/o */ |
| 214 | byte* buf; |
| 215 | |
| 216 | /** Buffer pointer used for actual IO. We advance this |
| 217 | when partial IO is required and not buf */ |
| 218 | byte* ptr; |
| 219 | |
| 220 | /** OS_FILE_READ or OS_FILE_WRITE */ |
| 221 | IORequest type; |
| 222 | |
| 223 | /** file offset in bytes */ |
| 224 | os_offset_t offset; |
| 225 | |
| 226 | /** file where to read or write */ |
| 227 | pfs_os_file_t file; |
| 228 | |
| 229 | /** file name or path */ |
| 230 | const char* name; |
| 231 | |
| 232 | /** used only in simulated aio: true if the physical i/o |
| 233 | already made and only the slot message needs to be passed |
| 234 | to the caller of os_aio_simulated_handle */ |
| 235 | bool io_already_done; |
| 236 | |
| 237 | /*!< file block size */ |
| 238 | ulint file_block_size; |
| 239 | |
| 240 | /** The file node for which the IO is requested. */ |
| 241 | fil_node_t* m1; |
| 242 | |
| 243 | /** the requester of an aio operation and which can be used |
| 244 | to identify which pending aio operation was completed */ |
| 245 | void* m2; |
| 246 | |
| 247 | /** AIO completion status */ |
| 248 | dberr_t err; |
| 249 | |
| 250 | #ifdef WIN_ASYNC_IO |
| 251 | |
| 252 | /** bytes written/read */ |
| 253 | DWORD n_bytes; |
| 254 | |
| 255 | /** length of the block to read or write */ |
| 256 | DWORD len; |
| 257 | |
| 258 | /** aio array containing this slot */ |
| 259 | AIO *array; |
| 260 | #elif defined(LINUX_NATIVE_AIO) |
| 261 | /** Linux control block for aio */ |
| 262 | struct iocb control; |
| 263 | |
| 264 | /** AIO return code */ |
| 265 | int ret; |
| 266 | |
| 267 | /** bytes written/read. */ |
| 268 | ssize_t n_bytes; |
| 269 | |
| 270 | /** length of the block to read or write */ |
| 271 | ulint len; |
| 272 | #else |
| 273 | /** length of the block to read or write */ |
| 274 | ulint len; |
| 275 | |
| 276 | /** bytes written/read. */ |
| 277 | ulint n_bytes; |
| 278 | #endif /* WIN_ASYNC_IO */ |
| 279 | |
| 280 | /** Length of the block before it was compressed */ |
| 281 | uint32 original_len; |
| 282 | |
| 283 | }; |
| 284 | |
| 285 | /** The asynchronous i/o array structure */ |
| 286 | class AIO { |
| 287 | public: |
| 288 | /** Constructor |
| 289 | @param[in] id Latch ID |
| 290 | @param[in] n_slots Number of slots to configure |
| 291 | @param[in] segments Number of segments to configure */ |
| 292 | AIO(latch_id_t id, ulint n_slots, ulint segments); |
| 293 | |
| 294 | /** Destructor */ |
| 295 | ~AIO(); |
| 296 | |
| 297 | /** Initialize the instance |
| 298 | @return DB_SUCCESS or error code */ |
| 299 | dberr_t init(); |
| 300 | |
| 301 | /** Requests for a slot in the aio array. If no slot is available, waits |
| 302 | until not_full-event becomes signaled. |
| 303 | |
| 304 | @param[in] type IO context |
| 305 | @param[in,out] m1 message to be passed along with the AIO |
| 306 | operation |
| 307 | @param[in,out] m2 message to be passed along with the AIO |
| 308 | operation |
| 309 | @param[in] file file handle |
| 310 | @param[in] name name of the file or path as a null-terminated |
| 311 | string |
| 312 | @param[in,out] buf buffer where to read or from which to write |
| 313 | @param[in] offset file offset, where to read from or start writing |
| 314 | @param[in] len length of the block to read or write |
| 315 | @return pointer to slot */ |
| 316 | Slot* reserve_slot( |
| 317 | const IORequest& type, |
| 318 | fil_node_t* m1, |
| 319 | void* m2, |
| 320 | pfs_os_file_t file, |
| 321 | const char* name, |
| 322 | void* buf, |
| 323 | os_offset_t offset, |
| 324 | ulint len) |
| 325 | MY_ATTRIBUTE((warn_unused_result)); |
| 326 | |
| 327 | /** @return number of reserved slots */ |
| 328 | ulint pending_io_count() const; |
| 329 | |
| 330 | /** Returns a pointer to the nth slot in the aio array. |
| 331 | @param[in] index Index of the slot in the array |
| 332 | @return pointer to slot */ |
| 333 | const Slot* at(ulint i) const |
| 334 | MY_ATTRIBUTE((warn_unused_result)) |
| 335 | { |
| 336 | ut_a(i < m_slots.size()); |
| 337 | |
| 338 | return(&m_slots[i]); |
| 339 | } |
| 340 | |
| 341 | /** Non const version */ |
| 342 | Slot* at(ulint i) |
| 343 | MY_ATTRIBUTE((warn_unused_result)) |
| 344 | { |
| 345 | ut_a(i < m_slots.size()); |
| 346 | |
| 347 | return(&m_slots[i]); |
| 348 | } |
| 349 | |
| 350 | /** Frees a slot in the AIO array, assumes caller owns the mutex. |
| 351 | @param[in,out] slot Slot to release */ |
| 352 | void release(Slot* slot); |
| 353 | |
| 354 | /** Frees a slot in the AIO array, assumes caller doesn't own the mutex. |
| 355 | @param[in,out] slot Slot to release */ |
| 356 | void release_with_mutex(Slot* slot); |
| 357 | |
| 358 | /** Prints info about the aio array. |
| 359 | @param[in,out] file Where to print */ |
| 360 | void print(FILE* file); |
| 361 | |
| 362 | /** @return the number of slots per segment */ |
| 363 | ulint slots_per_segment() const |
| 364 | MY_ATTRIBUTE((warn_unused_result)) |
| 365 | { |
| 366 | return(m_slots.size() / m_n_segments); |
| 367 | } |
| 368 | |
| 369 | /** @return accessor for n_segments */ |
| 370 | ulint get_n_segments() const |
| 371 | MY_ATTRIBUTE((warn_unused_result)) |
| 372 | { |
| 373 | return(m_n_segments); |
| 374 | } |
| 375 | |
| 376 | #ifdef UNIV_DEBUG |
| 377 | /** @return true if the thread owns the mutex */ |
| 378 | bool is_mutex_owned() const |
| 379 | MY_ATTRIBUTE((warn_unused_result)) |
| 380 | { |
| 381 | return(mutex_own(&m_mutex)); |
| 382 | } |
| 383 | #endif /* UNIV_DEBUG */ |
| 384 | |
| 385 | /** Acquire the mutex */ |
| 386 | void acquire() const |
| 387 | { |
| 388 | mutex_enter(&m_mutex); |
| 389 | } |
| 390 | |
| 391 | /** Release the mutex */ |
| 392 | void release() const |
| 393 | { |
| 394 | mutex_exit(&m_mutex); |
| 395 | } |
| 396 | |
| 397 | /** Write out the state to the file/stream |
| 398 | @param[in, out] file File to write to */ |
| 399 | void to_file(FILE* file) const; |
| 400 | |
| 401 | #ifdef LINUX_NATIVE_AIO |
| 402 | /** Dispatch an AIO request to the kernel. |
| 403 | @param[in,out] slot an already reserved slot |
| 404 | @return true on success. */ |
| 405 | bool linux_dispatch(Slot* slot) |
| 406 | MY_ATTRIBUTE((warn_unused_result)); |
| 407 | |
| 408 | /** Accessor for an AIO event |
| 409 | @param[in] index Index into the array |
| 410 | @return the event at the index */ |
| 411 | io_event* io_events(ulint index) |
| 412 | MY_ATTRIBUTE((warn_unused_result)) |
| 413 | { |
| 414 | ut_a(index < m_events.size()); |
| 415 | |
| 416 | return(&m_events[index]); |
| 417 | } |
| 418 | |
| 419 | /** Accessor for the AIO context |
| 420 | @param[in] segment Segment for which to get the context |
| 421 | @return the AIO context for the segment */ |
| 422 | io_context* io_ctx(ulint segment) |
| 423 | MY_ATTRIBUTE((warn_unused_result)) |
| 424 | { |
| 425 | ut_ad(segment < get_n_segments()); |
| 426 | |
| 427 | return(m_aio_ctx[segment]); |
| 428 | } |
| 429 | |
| 430 | /** Creates an io_context for native linux AIO. |
| 431 | @param[in] max_events number of events |
| 432 | @param[out] io_ctx io_ctx to initialize. |
| 433 | @return true on success. */ |
| 434 | static bool linux_create_io_ctx(unsigned max_events, io_context_t* io_ctx) |
| 435 | MY_ATTRIBUTE((warn_unused_result)); |
| 436 | |
| 437 | /** Checks if the system supports native linux aio. On some kernel |
| 438 | versions where native aio is supported it won't work on tmpfs. In such |
| 439 | cases we can't use native aio as it is not possible to mix simulated |
| 440 | and native aio. |
| 441 | @return true if supported, false otherwise. */ |
| 442 | static bool is_linux_native_aio_supported() |
| 443 | MY_ATTRIBUTE((warn_unused_result)); |
| 444 | #endif /* LINUX_NATIVE_AIO */ |
| 445 | |
| 446 | #ifdef WIN_ASYNC_IO |
| 447 | HANDLE m_completion_port; |
| 448 | /** Wake up all AIO threads in Windows native aio */ |
| 449 | static void wake_at_shutdown() { |
| 450 | AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf }; |
| 451 | for (size_t i = 0; i < array_elements(all_arrays); i++) { |
| 452 | AIO *a = all_arrays[i]; |
| 453 | if (a) { |
| 454 | PostQueuedCompletionStatus(a->m_completion_port, 0, |
| 455 | IOCP_SHUTDOWN_KEY, 0); |
| 456 | } |
| 457 | } |
| 458 | } |
| 459 | #endif /* WIN_ASYNC_IO */ |
| 460 | |
| 461 | #ifdef _WIN32 |
| 462 | /** This function can be called if one wants to post a batch of reads |
| 463 | and prefers an I/O - handler thread to handle them all at once later.You |
| 464 | must call os_aio_simulated_wake_handler_threads later to ensure the |
| 465 | threads are not left sleeping! */ |
| 466 | static void simulated_put_read_threads_to_sleep(); |
| 467 | #endif /* _WIN32 */ |
| 468 | |
| 469 | /** Create an instance using new(std::nothrow) |
| 470 | @param[in] id Latch ID |
| 471 | @param[in] n_slots The number of AIO request slots |
| 472 | @param[in] segments The number of segments |
| 473 | @return a new AIO instance */ |
| 474 | static AIO* create( |
| 475 | latch_id_t id, |
| 476 | ulint n_slots, |
| 477 | ulint segments) |
| 478 | MY_ATTRIBUTE((warn_unused_result)); |
| 479 | |
| 480 | /** Initializes the asynchronous io system. Creates one array each |
| 481 | for ibuf and log I/O. Also creates one array each for read and write |
| 482 | where each array is divided logically into n_readers and n_writers |
| 483 | respectively. The caller must create an i/o handler thread for each |
| 484 | segment in these arrays. This function also creates the sync array. |
| 485 | No I/O handler thread needs to be created for that |
| 486 | @param[in] n_per_seg maximum number of pending aio |
| 487 | operations allowed per segment |
| 488 | @param[in] n_readers number of reader threads |
| 489 | @param[in] n_writers number of writer threads |
| 490 | @param[in] n_slots_sync number of slots in the sync aio array |
| 491 | @return true if AIO sub-system was started successfully */ |
| 492 | static bool start( |
| 493 | ulint n_per_seg, |
| 494 | ulint n_readers, |
| 495 | ulint n_writers, |
| 496 | ulint n_slots_sync) |
| 497 | MY_ATTRIBUTE((warn_unused_result)); |
| 498 | |
| 499 | /** Free the AIO arrays */ |
| 500 | static void shutdown(); |
| 501 | |
| 502 | /** Print all the AIO segments |
| 503 | @param[in,out] file Where to print */ |
| 504 | static void print_all(FILE* file); |
| 505 | |
| 506 | /** Calculates local segment number and aio array from global |
| 507 | segment number. |
| 508 | @param[out] array AIO wait array |
| 509 | @param[in] segment global segment number |
| 510 | @return local segment number within the aio array */ |
| 511 | static ulint get_array_and_local_segment( |
| 512 | AIO** array, |
| 513 | ulint segment) |
| 514 | MY_ATTRIBUTE((warn_unused_result)); |
| 515 | |
| 516 | /** Select the IO slot array |
| 517 | @param[in,out] type Type of IO, READ or WRITE |
| 518 | @param[in] read_only true if running in read-only mode |
| 519 | @param[in] mode IO mode |
| 520 | @return slot array or NULL if invalid mode specified */ |
| 521 | static AIO* select_slot_array( |
| 522 | IORequest& type, |
| 523 | bool read_only, |
| 524 | ulint mode) |
| 525 | MY_ATTRIBUTE((warn_unused_result)); |
| 526 | |
| 527 | /** Calculates segment number for a slot. |
| 528 | @param[in] array AIO wait array |
| 529 | @param[in] slot slot in this array |
| 530 | @return segment number (which is the number used by, for example, |
| 531 | I/O handler threads) */ |
| 532 | static ulint get_segment_no_from_slot( |
| 533 | const AIO* array, |
| 534 | const Slot* slot) |
| 535 | MY_ATTRIBUTE((warn_unused_result)); |
| 536 | |
| 537 | /** Wakes up a simulated AIO I/O-handler thread if it has something |
| 538 | to do. |
| 539 | @param[in] global_segment the number of the segment in the |
| 540 | AIO arrays */ |
| 541 | static void wake_simulated_handler_thread(ulint global_segment); |
| 542 | |
| 543 | /** Check if it is a read request |
| 544 | @param[in] aio The AIO instance to check |
| 545 | @return true if the AIO instance is for reading. */ |
| 546 | static bool is_read(const AIO* aio) |
| 547 | MY_ATTRIBUTE((warn_unused_result)) |
| 548 | { |
| 549 | return(s_reads == aio); |
| 550 | } |
| 551 | |
| 552 | /** Wait on an event until no pending writes */ |
| 553 | static void wait_until_no_pending_writes() |
| 554 | { |
| 555 | os_event_wait(AIO::s_writes->m_is_empty); |
| 556 | } |
| 557 | |
| 558 | /** Print to file |
| 559 | @param[in] file File to write to */ |
| 560 | static void print_to_file(FILE* file); |
| 561 | |
| 562 | /** Check for pending IO. Gets the count and also validates the |
| 563 | data structures. |
| 564 | @return count of pending IO requests */ |
| 565 | static ulint total_pending_io_count(); |
| 566 | |
| 567 | private: |
| 568 | /** Initialise the slots |
| 569 | @return DB_SUCCESS or error code */ |
| 570 | dberr_t init_slots() |
| 571 | MY_ATTRIBUTE((warn_unused_result)); |
| 572 | |
| 573 | /** Wakes up a simulated AIO I/O-handler thread if it has something |
| 574 | to do for a local segment in the AIO array. |
| 575 | @param[in] global_segment the number of the segment in the |
| 576 | AIO arrays |
| 577 | @param[in] segment the local segment in the AIO array */ |
| 578 | void wake_simulated_handler_thread(ulint global_segment, ulint segment); |
| 579 | |
| 580 | /** Prints pending IO requests per segment of an aio array. |
| 581 | We probably don't need per segment statistics but they can help us |
| 582 | during development phase to see if the IO requests are being |
| 583 | distributed as expected. |
| 584 | @param[in,out] file file where to print |
| 585 | @param[in] segments pending IO array */ |
| 586 | void print_segment_info( |
| 587 | FILE* file, |
| 588 | const ulint* segments); |
| 589 | |
| 590 | #ifdef LINUX_NATIVE_AIO |
| 591 | /** Initialise the Linux native AIO data structures |
| 592 | @return DB_SUCCESS or error code */ |
| 593 | dberr_t init_linux_native_aio() |
| 594 | MY_ATTRIBUTE((warn_unused_result)); |
| 595 | #endif /* LINUX_NATIVE_AIO */ |
| 596 | |
| 597 | private: |
| 598 | typedef std::vector<Slot> Slots; |
| 599 | |
| 600 | /** the mutex protecting the aio array */ |
| 601 | mutable SysMutex m_mutex; |
| 602 | |
| 603 | /** Pointer to the slots in the array. |
| 604 | Number of elements must be divisible by n_threads. */ |
| 605 | Slots m_slots; |
| 606 | |
| 607 | /** Number of segments in the aio array of pending aio requests. |
| 608 | A thread can wait separately for any one of the segments. */ |
| 609 | ulint m_n_segments; |
| 610 | |
| 611 | /** The event which is set to the signaled state when |
| 612 | there is space in the aio outside the ibuf segment; |
| 613 | os_event_set() and os_event_reset() are protected by AIO::m_mutex */ |
| 614 | os_event_t m_not_full; |
| 615 | |
| 616 | /** The event which is set to the signaled state when |
| 617 | there are no pending i/os in this array; |
| 618 | os_event_set() and os_event_reset() are protected by AIO::m_mutex */ |
| 619 | os_event_t m_is_empty; |
| 620 | |
| 621 | /** Number of reserved slots in the AIO array outside |
| 622 | the ibuf segment */ |
| 623 | ulint m_n_reserved; |
| 624 | |
| 625 | |
| 626 | #if defined(LINUX_NATIVE_AIO) |
| 627 | typedef std::vector<io_event> IOEvents; |
| 628 | |
| 629 | /** completion queue for IO. There is one such queue per |
| 630 | segment. Each thread will work on one ctx exclusively. */ |
| 631 | io_context_t* m_aio_ctx; |
| 632 | |
| 633 | /** The array to collect completed IOs. There is one such |
| 634 | event for each possible pending IO. The size of the array |
| 635 | is equal to m_slots.size(). */ |
| 636 | IOEvents m_events; |
| 637 | #endif /* LINUX_NATIV_AIO */ |
| 638 | |
| 639 | /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as |
| 640 | sync AIO. These are NULL when the module has not yet been |
| 641 | initialized. */ |
| 642 | |
| 643 | /** Insert buffer */ |
| 644 | static AIO* s_ibuf; |
| 645 | |
| 646 | /** Redo log */ |
| 647 | static AIO* s_log; |
| 648 | |
| 649 | /** Reads */ |
| 650 | static AIO* s_reads; |
| 651 | |
| 652 | /** Writes */ |
| 653 | static AIO* s_writes; |
| 654 | |
| 655 | /** Synchronous I/O */ |
| 656 | static AIO* s_sync; |
| 657 | }; |
| 658 | |
| 659 | /** Static declarations */ |
| 660 | AIO* AIO::s_reads; |
| 661 | AIO* AIO::s_writes; |
| 662 | AIO* AIO::s_ibuf; |
| 663 | AIO* AIO::s_log; |
| 664 | AIO* AIO::s_sync; |
| 665 | |
#ifdef LINUX_NATIVE_AIO
/** How many times io_setup() is attempted before giving up. */
static const int	OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;

/** Sleep time, in microseconds, after io_setup() returned EAGAIN. */
static const ulint	OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** Timeout of a single io_getevents() call = 500ms. */
static const ulint	OS_AIO_REAP_TIMEOUT = 500000000UL;
#endif /* LINUX_NATIVE_AIO */
| 676 | |
| 677 | /** Array of events used in simulated AIO */ |
| 678 | static os_event_t* os_aio_segment_wait_events; |
| 679 | |
| 680 | /** Number of asynchronous I/O segments. Set by os_aio_init(). */ |
| 681 | static ulint os_aio_n_segments = ULINT_UNDEFINED; |
| 682 | |
| 683 | /** If the following is true, read i/o handler threads try to |
| 684 | wait until a batch of new read requests have been posted */ |
| 685 | static bool os_aio_recommend_sleep_for_read_threads; |
| 686 | |
| 687 | ulint os_n_file_reads; |
| 688 | static ulint os_bytes_read_since_printout; |
| 689 | ulint os_n_file_writes; |
| 690 | ulint os_n_fsyncs; |
| 691 | static ulint os_n_file_reads_old; |
| 692 | static ulint os_n_file_writes_old; |
| 693 | static ulint os_n_fsyncs_old; |
| 694 | |
| 695 | static time_t os_last_printout; |
| 696 | bool os_has_said_disk_full; |
| 697 | |
| 698 | /** Default Zip compression level */ |
| 699 | extern uint page_zip_level; |
| 700 | |
/** Checks that the aio system is in a consistent state.
@return true if ok */
static bool os_aio_validate();
| 706 | |
| 707 | /** Handle errors for file operations. |
| 708 | @param[in] name name of a file or NULL |
| 709 | @param[in] operation operation |
| 710 | @param[in] should_abort whether to abort on an unknown error |
| 711 | @param[in] on_error_silent whether to suppress reports of non-fatal errors |
| 712 | @return true if we should retry the operation */ |
| 713 | static MY_ATTRIBUTE((warn_unused_result)) |
| 714 | bool |
| 715 | os_file_handle_error_cond_exit( |
| 716 | const char* name, |
| 717 | const char* operation, |
| 718 | bool should_abort, |
| 719 | bool on_error_silent); |
| 720 | |
| 721 | /** Does error handling when a file operation fails. |
| 722 | @param[in] name name of a file or NULL |
| 723 | @param[in] operation operation name that failed |
| 724 | @return true if we should retry the operation */ |
| 725 | static |
| 726 | bool |
| 727 | os_file_handle_error( |
| 728 | const char* name, |
| 729 | const char* operation) |
| 730 | { |
| 731 | /* Exit in case of unknown error */ |
| 732 | return(os_file_handle_error_cond_exit(name, operation, true, false)); |
| 733 | } |
| 734 | |
| 735 | /** Does error handling when a file operation fails. |
| 736 | @param[in] name name of a file or NULL |
| 737 | @param[in] operation operation name that failed |
| 738 | @param[in] on_error_silent if true then don't print any message to the log. |
| 739 | @return true if we should retry the operation */ |
| 740 | static |
| 741 | bool |
| 742 | os_file_handle_error_no_exit( |
| 743 | const char* name, |
| 744 | const char* operation, |
| 745 | bool on_error_silent) |
| 746 | { |
| 747 | /* Don't exit in case of unknown error */ |
| 748 | return(os_file_handle_error_cond_exit( |
| 749 | name, operation, false, on_error_silent)); |
| 750 | } |
| 751 | |
| 752 | /** Does simulated AIO. This function should be called by an i/o-handler |
| 753 | thread. |
| 754 | |
| 755 | @param[in] segment The number of the segment in the aio arrays to wait |
| 756 | for; segment 0 is the ibuf i/o thread, segment 1 the |
| 757 | log i/o thread, then follow the non-ibuf read threads, |
| 758 | and as the last are the non-ibuf write threads |
| 759 | @param[out] m1 the messages passed with the AIO request; note that |
| 760 | also in the case where the AIO operation failed, these |
| 761 | output parameters are valid and can be used to restart |
| 762 | the operation, for example |
| 763 | @param[out] m2 Callback argument |
| 764 | @param[in] type IO context |
| 765 | @return DB_SUCCESS or error code */ |
| 766 | static |
| 767 | dberr_t |
| 768 | os_aio_simulated_handler( |
| 769 | ulint global_segment, |
| 770 | fil_node_t** m1, |
| 771 | void** m2, |
| 772 | IORequest* type); |
| 773 | |
#ifdef _WIN32
static HANDLE win_get_syncio_event();

/** Synchronous wrapper around the Windows DeviceIoControl() function.

The request is issued with an OVERLAPPED structure whose event comes from
win_get_syncio_event(), and the call then waits for the result, so it also
behaves synchronously on a handle opened for async access (i.e. with
FILE_FLAG_OVERLAPPED).

Takes the same parameters as DeviceIoControl(), except for the last one
(OVERLAPPED).
@return the result of GetOverlappedResult() if the request completed or
was left pending, otherwise the failed result of DeviceIoControl() */
static
BOOL
os_win32_device_io_control(
	HANDLE	handle,
	DWORD	code,
	LPVOID	inbuf,
	DWORD	inbuf_size,
	LPVOID	outbuf,
	DWORD	outbuf_size,
	LPDWORD	bytes_returned
)
{
	OVERLAPPED	ov = { 0 };

	ov.hEvent = win_get_syncio_event();

	const BOOL	started = DeviceIoControl(
		handle, code, inbuf, inbuf_size,
		outbuf, outbuf_size, NULL, &ov);

	/* A plain failure (neither success nor "still pending") is handed
	straight back to the caller. */
	if (!started && GetLastError() != ERROR_IO_PENDING) {
		return started;
	}

	/* Wait for async io to complete */
	return GetOverlappedResult(handle, &ov, bytes_returned, TRUE);
}

#endif /* _WIN32 */
| 814 | |
| 815 | /***********************************************************************//** |
| 816 | Try to get number of bytes per sector from file system. |
| 817 | @return file block size */ |
| 818 | UNIV_INTERN |
| 819 | ulint |
| 820 | os_file_get_block_size( |
| 821 | /*===================*/ |
| 822 | os_file_t file, /*!< in: handle to a file */ |
| 823 | const char* name) /*!< in: file name */ |
| 824 | { |
| 825 | ulint fblock_size = 512; |
| 826 | |
| 827 | #if defined(UNIV_LINUX) |
| 828 | struct stat local_stat; |
| 829 | int err; |
| 830 | |
| 831 | err = fstat((int)file, &local_stat); |
| 832 | |
| 833 | if (err != 0) { |
| 834 | os_file_handle_error_no_exit(name, "fstat()" , FALSE); |
| 835 | } else { |
| 836 | fblock_size = local_stat.st_blksize; |
| 837 | } |
| 838 | #endif /* UNIV_LINUX */ |
| 839 | #ifdef _WIN32 |
| 840 | |
| 841 | fblock_size = 0; |
| 842 | BOOL result = false; |
| 843 | size_t len = 0; |
| 844 | // Open volume for this file, find out it "physical bytes per sector" |
| 845 | |
| 846 | HANDLE volume_handle = INVALID_HANDLE_VALUE; |
| 847 | char volume[MAX_PATH + 4]="\\\\.\\" ; // Special prefix required for volume names. |
| 848 | if (!GetVolumePathName(name , volume + 4, MAX_PATH)) { |
| 849 | os_file_handle_error_no_exit(name, |
| 850 | "GetVolumePathName()" , FALSE); |
| 851 | goto end; |
| 852 | } |
| 853 | |
| 854 | len = strlen(volume); |
| 855 | if (volume[len - 1] == '\\') { |
| 856 | // Trim trailing backslash from volume name. |
| 857 | volume[len - 1] = 0; |
| 858 | } |
| 859 | |
| 860 | volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES, |
| 861 | FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, |
| 862 | 0, OPEN_EXISTING, 0, 0); |
| 863 | |
| 864 | if (volume_handle == INVALID_HANDLE_VALUE) { |
| 865 | if (GetLastError() != ERROR_ACCESS_DENIED) { |
| 866 | os_file_handle_error_no_exit(volume, |
| 867 | "CreateFile()" , FALSE); |
| 868 | } |
| 869 | goto end; |
| 870 | } |
| 871 | |
| 872 | DWORD tmp; |
| 873 | STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment; |
| 874 | |
| 875 | STORAGE_PROPERTY_QUERY storage_query; |
| 876 | memset(&storage_query, 0, sizeof(storage_query)); |
| 877 | storage_query.PropertyId = StorageAccessAlignmentProperty; |
| 878 | storage_query.QueryType = PropertyStandardQuery; |
| 879 | |
| 880 | result = os_win32_device_io_control(volume_handle, |
| 881 | IOCTL_STORAGE_QUERY_PROPERTY, |
| 882 | &storage_query, |
| 883 | sizeof(storage_query), |
| 884 | &disk_alignment, |
| 885 | sizeof(disk_alignment), |
| 886 | &tmp); |
| 887 | |
| 888 | if (!result) { |
| 889 | DWORD err = GetLastError(); |
| 890 | if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) { |
| 891 | os_file_handle_error_no_exit(volume, |
| 892 | "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)" , FALSE); |
| 893 | } |
| 894 | goto end; |
| 895 | } |
| 896 | |
| 897 | fblock_size = disk_alignment.BytesPerPhysicalSector; |
| 898 | |
| 899 | end: |
| 900 | if (volume_handle != INVALID_HANDLE_VALUE) { |
| 901 | CloseHandle(volume_handle); |
| 902 | } |
| 903 | #endif /* _WIN32 */ |
| 904 | |
| 905 | /* Currently we support file block size up to 4Kb */ |
| 906 | if (fblock_size > 4096 || fblock_size < 512) { |
| 907 | if (fblock_size < 512) { |
| 908 | fblock_size = 512; |
| 909 | } else { |
| 910 | fblock_size = 4096; |
| 911 | } |
| 912 | } |
| 913 | |
| 914 | return fblock_size; |
| 915 | } |
| 916 | |
| 917 | #ifdef WIN_ASYNC_IO |
| 918 | /** This function is only used in Windows asynchronous i/o. |
| 919 | Waits for an aio operation to complete. This function is used to wait |
| 920 | for completed requests. The aio array of pending requests is divided |
| 921 | into segments. The thread specifies which segment or slot it wants to wait |
| 922 | for. NOTE: this function will also take care of freeing the aio slot, |
| 923 | therefore no other thread is allowed to do the freeing! |
| 924 | @param[in] segment The number of the segment in the aio arrays to |
| 925 | wait for; segment 0 is the ibuf I/O thread, |
| 926 | segment 1 the log I/O thread, then follow the |
| 927 | non-ibuf read threads, and as the last are the |
| 928 | non-ibuf write threads; if this is |
| 929 | ULINT_UNDEFINED, then it means that sync AIO |
| 930 | is used, and this parameter is ignored |
| 931 | @param[in] pos this parameter is used only in sync AIO: |
| 932 | wait for the aio slot at this position |
| 933 | @param[out] m1 the messages passed with the AIO request; note |
| 934 | that also in the case where the AIO operation |
| 935 | failed, these output parameters are valid and |
| 936 | can be used to restart the operation, |
| 937 | for example |
| 938 | @param[out] m2 callback message |
| 939 | @param[out] type OS_FILE_WRITE or ..._READ |
| 940 | @return DB_SUCCESS or error code */ |
| 941 | static |
| 942 | dberr_t |
| 943 | os_aio_windows_handler( |
| 944 | ulint segment, |
| 945 | ulint pos, |
| 946 | fil_node_t** m1, |
| 947 | void** m2, |
| 948 | IORequest* type); |
| 949 | #endif /* WIN_ASYNC_IO */ |
| 950 | |
| 951 | /** Generic AIO Handler methods. Currently handles IO post processing. */ |
| 952 | class AIOHandler { |
| 953 | public: |
| 954 | /** Do any post processing after a read/write |
| 955 | @return DB_SUCCESS or error code. */ |
| 956 | static dberr_t post_io_processing(Slot* slot); |
| 957 | }; |
| 958 | |
| 959 | /** Helper class for doing synchronous file IO. Currently, the objective |
| 960 | is to hide the OS specific code, so that the higher level functions aren't |
| 961 | peppered with #ifdef. Makes the code flow difficult to follow. */ |
| 962 | class SyncFileIO { |
| 963 | public: |
| 964 | /** Constructor |
| 965 | @param[in] fh File handle |
| 966 | @param[in,out] buf Buffer to read/write |
| 967 | @param[in] n Number of bytes to read/write |
| 968 | @param[in] offset Offset where to read or write */ |
| 969 | SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset) |
| 970 | : |
| 971 | m_fh(fh), |
| 972 | m_buf(buf), |
| 973 | m_n(static_cast<ssize_t>(n)), |
| 974 | m_offset(offset) |
| 975 | { |
| 976 | ut_ad(m_n > 0); |
| 977 | } |
| 978 | |
| 979 | /** Destructor */ |
| 980 | ~SyncFileIO() |
| 981 | { |
| 982 | /* No op */ |
| 983 | } |
| 984 | |
| 985 | /** Do the read/write |
| 986 | @param[in] request The IO context and type |
| 987 | @return the number of bytes read/written or negative value on error */ |
| 988 | ssize_t execute(const IORequest& request); |
| 989 | |
| 990 | /** Do the read/write |
| 991 | @param[in,out] slot The IO slot, it has the IO context |
| 992 | @return the number of bytes read/written or negative value on error */ |
| 993 | static ssize_t execute(Slot* slot); |
| 994 | |
| 995 | /** Move the read/write offset up to where the partial IO succeeded. |
| 996 | @param[in] n_bytes The number of bytes to advance */ |
| 997 | void advance(ssize_t n_bytes) |
| 998 | { |
| 999 | m_offset += n_bytes; |
| 1000 | |
| 1001 | ut_ad(m_n >= n_bytes); |
| 1002 | |
| 1003 | m_n -= n_bytes; |
| 1004 | |
| 1005 | m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes; |
| 1006 | } |
| 1007 | |
| 1008 | private: |
| 1009 | /** Open file handle */ |
| 1010 | os_file_t m_fh; |
| 1011 | |
| 1012 | /** Buffer to read/write */ |
| 1013 | void* m_buf; |
| 1014 | |
| 1015 | /** Number of bytes to read/write */ |
| 1016 | ssize_t m_n; |
| 1017 | |
| 1018 | /** Offset from where to read/write */ |
| 1019 | os_offset_t m_offset; |
| 1020 | }; |
| 1021 | |
| 1022 | /** Do any post processing after a read/write |
| 1023 | @return DB_SUCCESS or error code. */ |
| 1024 | dberr_t |
| 1025 | AIOHandler::post_io_processing(Slot* slot) |
| 1026 | { |
| 1027 | ut_ad(slot->is_reserved); |
| 1028 | |
| 1029 | /* Total bytes read so far */ |
| 1030 | ulint n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes; |
| 1031 | |
| 1032 | return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL); |
| 1033 | } |
| 1034 | |
| 1035 | /** Count the number of free slots |
| 1036 | @return number of reserved slots */ |
| 1037 | ulint |
| 1038 | AIO::pending_io_count() const |
| 1039 | { |
| 1040 | acquire(); |
| 1041 | |
| 1042 | #ifdef UNIV_DEBUG |
| 1043 | ut_a(m_n_segments > 0); |
| 1044 | ut_a(!m_slots.empty()); |
| 1045 | |
| 1046 | ulint count = 0; |
| 1047 | |
| 1048 | for (ulint i = 0; i < m_slots.size(); ++i) { |
| 1049 | |
| 1050 | const Slot& slot = m_slots[i]; |
| 1051 | |
| 1052 | if (slot.is_reserved) { |
| 1053 | ++count; |
| 1054 | ut_a(slot.len > 0); |
| 1055 | } |
| 1056 | } |
| 1057 | |
| 1058 | ut_a(m_n_reserved == count); |
| 1059 | #endif /* UNIV_DEBUG */ |
| 1060 | |
| 1061 | ulint reserved = m_n_reserved; |
| 1062 | |
| 1063 | release(); |
| 1064 | |
| 1065 | return(reserved); |
| 1066 | } |
| 1067 | |
| 1068 | #ifdef UNIV_DEBUG |
| 1069 | /** Validates the consistency the aio system some of the time. |
| 1070 | @return true if ok or the check was skipped */ |
| 1071 | static |
| 1072 | bool |
| 1073 | os_aio_validate_skip() |
| 1074 | { |
| 1075 | /** Try os_aio_validate() every this many times */ |
| 1076 | # define OS_AIO_VALIDATE_SKIP 13 |
| 1077 | |
| 1078 | /** The os_aio_validate() call skip counter. |
| 1079 | Use a signed type because of the race condition below. */ |
| 1080 | static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; |
| 1081 | |
| 1082 | /* There is a race condition below, but it does not matter, |
| 1083 | because this call is only for heuristic purposes. We want to |
| 1084 | reduce the call frequency of the costly os_aio_validate() |
| 1085 | check in debug builds. */ |
| 1086 | --os_aio_validate_count; |
| 1087 | |
| 1088 | if (os_aio_validate_count > 0) { |
| 1089 | return(true); |
| 1090 | } |
| 1091 | |
| 1092 | os_aio_validate_count = OS_AIO_VALIDATE_SKIP; |
| 1093 | return(os_aio_validate()); |
| 1094 | } |
| 1095 | #endif /* UNIV_DEBUG */ |
| 1096 | |
| 1097 | #undef USE_FILE_LOCK |
| 1098 | #ifndef _WIN32 |
| 1099 | /* On Windows, mandatory locking is used */ |
| 1100 | # define USE_FILE_LOCK |
| 1101 | #endif |
| 1102 | #ifdef USE_FILE_LOCK |
| 1103 | /** Obtain an exclusive lock on a file. |
| 1104 | @param[in] fd file descriptor |
| 1105 | @param[in] name file name |
| 1106 | @return 0 on success */ |
| 1107 | static |
| 1108 | int |
| 1109 | os_file_lock( |
| 1110 | int fd, |
| 1111 | const char* name) |
| 1112 | { |
| 1113 | struct flock lk; |
| 1114 | |
| 1115 | lk.l_type = F_WRLCK; |
| 1116 | lk.l_whence = SEEK_SET; |
| 1117 | lk.l_start = lk.l_len = 0; |
| 1118 | |
| 1119 | if (fcntl(fd, F_SETLK, &lk) == -1) { |
| 1120 | |
| 1121 | ib::error() |
| 1122 | << "Unable to lock " << name |
| 1123 | << " error: " << errno; |
| 1124 | |
| 1125 | if (errno == EAGAIN || errno == EACCES) { |
| 1126 | |
| 1127 | ib::info() |
| 1128 | << "Check that you do not already have" |
| 1129 | " another mysqld process using the" |
| 1130 | " same InnoDB data or log files." ; |
| 1131 | } |
| 1132 | |
| 1133 | return(-1); |
| 1134 | } |
| 1135 | |
| 1136 | return(0); |
| 1137 | } |
| 1138 | #endif /* USE_FILE_LOCK */ |
| 1139 | |
| 1140 | /** Calculates local segment number and aio array from global segment number. |
| 1141 | @param[out] array aio wait array |
| 1142 | @param[in] segment global segment number |
| 1143 | @return local segment number within the aio array */ |
| 1144 | ulint |
| 1145 | AIO::get_array_and_local_segment( |
| 1146 | AIO** array, |
| 1147 | ulint segment) |
| 1148 | { |
| 1149 | ulint local_segment; |
| 1150 | ulint = (srv_read_only_mode) ? 0 : 2; |
| 1151 | |
| 1152 | ut_a(segment < os_aio_n_segments); |
| 1153 | |
| 1154 | if (!srv_read_only_mode && segment < n_extra_segs) { |
| 1155 | |
| 1156 | /* We don't support ibuf/log IO during read only mode. */ |
| 1157 | |
| 1158 | if (segment == IO_IBUF_SEGMENT) { |
| 1159 | |
| 1160 | *array = s_ibuf; |
| 1161 | |
| 1162 | } else if (segment == IO_LOG_SEGMENT) { |
| 1163 | |
| 1164 | *array = s_log; |
| 1165 | |
| 1166 | } else { |
| 1167 | *array = NULL; |
| 1168 | } |
| 1169 | |
| 1170 | local_segment = 0; |
| 1171 | |
| 1172 | } else if (segment < s_reads->m_n_segments + n_extra_segs) { |
| 1173 | |
| 1174 | *array = s_reads; |
| 1175 | local_segment = segment - n_extra_segs; |
| 1176 | |
| 1177 | } else { |
| 1178 | *array = s_writes; |
| 1179 | |
| 1180 | local_segment = segment |
| 1181 | - (s_reads->m_n_segments + n_extra_segs); |
| 1182 | } |
| 1183 | |
| 1184 | return(local_segment); |
| 1185 | } |
| 1186 | |
| 1187 | /** Frees a slot in the aio array. Assumes caller owns the mutex. |
| 1188 | @param[in,out] slot Slot to release */ |
| 1189 | void |
| 1190 | AIO::release(Slot* slot) |
| 1191 | { |
| 1192 | ut_ad(is_mutex_owned()); |
| 1193 | |
| 1194 | ut_ad(slot->is_reserved); |
| 1195 | |
| 1196 | slot->is_reserved = false; |
| 1197 | |
| 1198 | --m_n_reserved; |
| 1199 | |
| 1200 | if (m_n_reserved == m_slots.size() - 1) { |
| 1201 | os_event_set(m_not_full); |
| 1202 | } |
| 1203 | |
| 1204 | if (m_n_reserved == 0) { |
| 1205 | os_event_set(m_is_empty); |
| 1206 | } |
| 1207 | |
| 1208 | #if defined(LINUX_NATIVE_AIO) |
| 1209 | |
| 1210 | if (srv_use_native_aio) { |
| 1211 | memset(&slot->control, 0x0, sizeof(slot->control)); |
| 1212 | slot->ret = 0; |
| 1213 | slot->n_bytes = 0; |
| 1214 | } else { |
| 1215 | /* These fields should not be used if we are not |
| 1216 | using native AIO. */ |
| 1217 | ut_ad(slot->n_bytes == 0); |
| 1218 | ut_ad(slot->ret == 0); |
| 1219 | } |
| 1220 | |
| 1221 | #endif /* WIN_ASYNC_IO */ |
| 1222 | } |
| 1223 | |
| 1224 | /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex. |
| 1225 | @param[in,out] slot Slot to release */ |
| 1226 | void |
| 1227 | AIO::release_with_mutex(Slot* slot) |
| 1228 | { |
| 1229 | acquire(); |
| 1230 | |
| 1231 | release(slot); |
| 1232 | |
| 1233 | release(); |
| 1234 | } |
| 1235 | |
| 1236 | /** Create a temporary file. This function is like tmpfile(3), but |
| 1237 | the temporary file is created in the in the mysql server configuration |
| 1238 | parameter (--tmpdir). |
| 1239 | @return temporary file handle, or NULL on error */ |
| 1240 | FILE* |
| 1241 | os_file_create_tmpfile() |
| 1242 | { |
| 1243 | FILE* file = NULL; |
| 1244 | WAIT_ALLOW_WRITES(); |
| 1245 | os_file_t fd = innobase_mysql_tmpfile(NULL); |
| 1246 | |
| 1247 | if (fd != OS_FILE_CLOSED) { |
| 1248 | #ifdef _WIN32 |
| 1249 | int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0); |
| 1250 | if (crt_fd != -1) { |
| 1251 | file = fdopen(crt_fd, "w+b" ); |
| 1252 | if (!file) { |
| 1253 | close(crt_fd); |
| 1254 | } |
| 1255 | } |
| 1256 | #else |
| 1257 | file = fdopen(fd, "w+b" ); |
| 1258 | if (!file) { |
| 1259 | close(fd); |
| 1260 | } |
| 1261 | #endif |
| 1262 | } |
| 1263 | |
| 1264 | if (file == NULL) { |
| 1265 | |
| 1266 | ib::error() |
| 1267 | << "Unable to create temporary file; errno: " |
| 1268 | << errno; |
| 1269 | } |
| 1270 | |
| 1271 | return(file); |
| 1272 | } |
| 1273 | |
| 1274 | /** Rewind file to its start, read at most size - 1 bytes from it to str, and |
| 1275 | NUL-terminate str. All errors are silently ignored. This function is |
| 1276 | mostly meant to be used with temporary files. |
| 1277 | @param[in,out] file File to read from |
| 1278 | @param[in,out] str Buffer where to read |
| 1279 | @param[in] size Size of buffer */ |
| 1280 | void |
| 1281 | os_file_read_string( |
| 1282 | FILE* file, |
| 1283 | char* str, |
| 1284 | ulint size) |
| 1285 | { |
| 1286 | if (size != 0) { |
| 1287 | rewind(file); |
| 1288 | |
| 1289 | size_t flen = fread(str, 1, size - 1, file); |
| 1290 | |
| 1291 | str[flen] = '\0'; |
| 1292 | } |
| 1293 | } |
| 1294 | |
| 1295 | /** This function returns a new path name after replacing the basename |
| 1296 | in an old path with a new basename. The old_path is a full path |
| 1297 | name including the extension. The tablename is in the normal |
| 1298 | form "databasename/tablename". The new base name is found after |
| 1299 | the forward slash. Both input strings are null terminated. |
| 1300 | |
| 1301 | This function allocates memory to be returned. It is the callers |
| 1302 | responsibility to free the return value after it is no longer needed. |
| 1303 | |
| 1304 | @param[in] old_path Pathname |
| 1305 | @param[in] tablename Contains new base name |
| 1306 | @return own: new full pathname */ |
| 1307 | char* |
| 1308 | os_file_make_new_pathname( |
| 1309 | const char* old_path, |
| 1310 | const char* tablename) |
| 1311 | { |
| 1312 | ulint dir_len; |
| 1313 | char* last_slash; |
| 1314 | char* base_name; |
| 1315 | char* new_path; |
| 1316 | ulint new_path_len; |
| 1317 | |
| 1318 | /* Split the tablename into its database and table name components. |
| 1319 | They are separated by a '/'. */ |
| 1320 | last_slash = strrchr((char*) tablename, '/'); |
| 1321 | base_name = last_slash ? last_slash + 1 : (char*) tablename; |
| 1322 | |
| 1323 | /* Find the offset of the last slash. We will strip off the |
| 1324 | old basename.ibd which starts after that slash. */ |
| 1325 | last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR); |
| 1326 | dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path); |
| 1327 | |
| 1328 | /* allocate a new path and move the old directory path to it. */ |
| 1329 | new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd" ; |
| 1330 | new_path = static_cast<char*>(ut_malloc_nokey(new_path_len)); |
| 1331 | memcpy(new_path, old_path, dir_len); |
| 1332 | |
| 1333 | snprintf(new_path + dir_len, new_path_len - dir_len, |
| 1334 | "%c%s.ibd" , OS_PATH_SEPARATOR, base_name); |
| 1335 | |
| 1336 | return(new_path); |
| 1337 | } |
| 1338 | |
| 1339 | /** This function reduces a null-terminated full remote path name into |
| 1340 | the path that is sent by MySQL for DATA DIRECTORY clause. It replaces |
| 1341 | the 'databasename/tablename.ibd' found at the end of the path with just |
| 1342 | 'tablename'. |
| 1343 | |
| 1344 | Since the result is always smaller than the path sent in, no new memory |
| 1345 | is allocated. The caller should allocate memory for the path sent in. |
| 1346 | This function manipulates that path in place. |
| 1347 | |
| 1348 | If the path format is not as expected, just return. The result is used |
| 1349 | to inform a SHOW CREATE TABLE command. |
| 1350 | @param[in,out] data_dir_path Full path/data_dir_path */ |
| 1351 | void |
| 1352 | os_file_make_data_dir_path( |
| 1353 | char* data_dir_path) |
| 1354 | { |
| 1355 | /* Replace the period before the extension with a null byte. */ |
| 1356 | char* ptr = strrchr((char*) data_dir_path, '.'); |
| 1357 | |
| 1358 | if (ptr == NULL) { |
| 1359 | return; |
| 1360 | } |
| 1361 | |
| 1362 | ptr[0] = '\0'; |
| 1363 | |
| 1364 | /* The tablename starts after the last slash. */ |
| 1365 | ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR); |
| 1366 | |
| 1367 | if (ptr == NULL) { |
| 1368 | return; |
| 1369 | } |
| 1370 | |
| 1371 | ptr[0] = '\0'; |
| 1372 | |
| 1373 | char* tablename = ptr + 1; |
| 1374 | |
| 1375 | /* The databasename starts after the next to last slash. */ |
| 1376 | ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR); |
| 1377 | |
| 1378 | if (ptr == NULL) { |
| 1379 | return; |
| 1380 | } |
| 1381 | |
| 1382 | ulint tablename_len = ut_strlen(tablename); |
| 1383 | |
| 1384 | ut_memmove(++ptr, tablename, tablename_len); |
| 1385 | |
| 1386 | ptr[tablename_len] = '\0'; |
| 1387 | } |
| 1388 | |
| 1389 | /** Check if the path refers to the root of a drive using a pointer |
| 1390 | to the last directory separator that the caller has fixed. |
| 1391 | @param[in] path path name |
| 1392 | @param[in] path last directory separator in the path |
| 1393 | @return true if this path is a drive root, false if not */ |
| 1394 | UNIV_INLINE |
| 1395 | bool |
| 1396 | os_file_is_root( |
| 1397 | const char* path, |
| 1398 | const char* last_slash) |
| 1399 | { |
| 1400 | return( |
| 1401 | #ifdef _WIN32 |
| 1402 | (last_slash == path + 2 && path[1] == ':') || |
| 1403 | #endif /* _WIN32 */ |
| 1404 | last_slash == path); |
| 1405 | } |
| 1406 | |
| 1407 | /** Return the parent directory component of a null-terminated path. |
| 1408 | Return a new buffer containing the string up to, but not including, |
| 1409 | the final component of the path. |
| 1410 | The path returned will not contain a trailing separator. |
| 1411 | Do not return a root path, return NULL instead. |
| 1412 | The final component trimmed off may be a filename or a directory name. |
| 1413 | If the final component is the only component of the path, return NULL. |
| 1414 | It is the caller's responsibility to free the returned string after it |
| 1415 | is no longer needed. |
| 1416 | @param[in] path Path name |
| 1417 | @return own: parent directory of the path */ |
| 1418 | static |
| 1419 | char* |
| 1420 | os_file_get_parent_dir( |
| 1421 | const char* path) |
| 1422 | { |
| 1423 | bool has_trailing_slash = false; |
| 1424 | |
| 1425 | /* Find the offset of the last slash */ |
| 1426 | const char* last_slash = strrchr(path, OS_PATH_SEPARATOR); |
| 1427 | |
| 1428 | if (!last_slash) { |
| 1429 | /* No slash in the path, return NULL */ |
| 1430 | return(NULL); |
| 1431 | } |
| 1432 | |
| 1433 | /* Ok, there is a slash. Is there anything after it? */ |
| 1434 | if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) { |
| 1435 | has_trailing_slash = true; |
| 1436 | } |
| 1437 | |
| 1438 | /* Reduce repetative slashes. */ |
| 1439 | while (last_slash > path |
| 1440 | && last_slash[-1] == OS_PATH_SEPARATOR) { |
| 1441 | last_slash--; |
| 1442 | } |
| 1443 | |
| 1444 | /* Check for the root of a drive. */ |
| 1445 | if (os_file_is_root(path, last_slash)) { |
| 1446 | return(NULL); |
| 1447 | } |
| 1448 | |
| 1449 | /* If a trailing slash prevented the first strrchr() from trimming |
| 1450 | the last component of the path, trim that component now. */ |
| 1451 | if (has_trailing_slash) { |
| 1452 | /* Back up to the previous slash. */ |
| 1453 | last_slash--; |
| 1454 | while (last_slash > path |
| 1455 | && last_slash[0] != OS_PATH_SEPARATOR) { |
| 1456 | last_slash--; |
| 1457 | } |
| 1458 | |
| 1459 | /* Reduce repetative slashes. */ |
| 1460 | while (last_slash > path |
| 1461 | && last_slash[-1] == OS_PATH_SEPARATOR) { |
| 1462 | last_slash--; |
| 1463 | } |
| 1464 | } |
| 1465 | |
| 1466 | /* Check for the root of a drive. */ |
| 1467 | if (os_file_is_root(path, last_slash)) { |
| 1468 | return(NULL); |
| 1469 | } |
| 1470 | |
| 1471 | /* Non-trivial directory component */ |
| 1472 | |
| 1473 | return(mem_strdupl(path, ulint(last_slash - path))); |
| 1474 | } |
| 1475 | #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR |
| 1476 | |
| 1477 | /* Test the function os_file_get_parent_dir. */ |
| 1478 | void |
| 1479 | test_os_file_get_parent_dir( |
| 1480 | const char* child_dir, |
| 1481 | const char* expected_dir) |
| 1482 | { |
| 1483 | char* child = mem_strdup(child_dir); |
| 1484 | char* expected = expected_dir == NULL ? NULL |
| 1485 | : mem_strdup(expected_dir); |
| 1486 | |
| 1487 | /* os_file_get_parent_dir() assumes that separators are |
| 1488 | converted to OS_PATH_SEPARATOR. */ |
| 1489 | os_normalize_path(child); |
| 1490 | os_normalize_path(expected); |
| 1491 | |
| 1492 | char* parent = os_file_get_parent_dir(child); |
| 1493 | |
| 1494 | bool unexpected = (expected == NULL |
| 1495 | ? (parent != NULL) |
| 1496 | : (0 != strcmp(parent, expected))); |
| 1497 | if (unexpected) { |
| 1498 | ib::fatal() << "os_file_get_parent_dir('" << child |
| 1499 | << "') returned '" << parent |
| 1500 | << "', instead of '" << expected << "'." ; |
| 1501 | } |
| 1502 | ut_free(parent); |
| 1503 | ut_free(child); |
| 1504 | ut_free(expected); |
| 1505 | } |
| 1506 | |
| 1507 | /* Test the function os_file_get_parent_dir. */ |
| 1508 | void |
| 1509 | unit_test_os_file_get_parent_dir() |
| 1510 | { |
| 1511 | test_os_file_get_parent_dir("/usr/lib/a" , "/usr/lib" ); |
| 1512 | test_os_file_get_parent_dir("/usr/" , NULL); |
| 1513 | test_os_file_get_parent_dir("//usr//" , NULL); |
| 1514 | test_os_file_get_parent_dir("usr" , NULL); |
| 1515 | test_os_file_get_parent_dir("usr//" , NULL); |
| 1516 | test_os_file_get_parent_dir("/" , NULL); |
| 1517 | test_os_file_get_parent_dir("//" , NULL); |
| 1518 | test_os_file_get_parent_dir("." , NULL); |
| 1519 | test_os_file_get_parent_dir(".." , NULL); |
| 1520 | # ifdef _WIN32 |
| 1521 | test_os_file_get_parent_dir("D:" , NULL); |
| 1522 | test_os_file_get_parent_dir("D:/" , NULL); |
| 1523 | test_os_file_get_parent_dir("D:\\" , NULL); |
| 1524 | test_os_file_get_parent_dir("D:/data" , NULL); |
| 1525 | test_os_file_get_parent_dir("D:/data/" , NULL); |
| 1526 | test_os_file_get_parent_dir("D:\\data\\" , NULL); |
| 1527 | test_os_file_get_parent_dir("D:///data/////" , NULL); |
| 1528 | test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\" , NULL); |
| 1529 | test_os_file_get_parent_dir("D:/data//a" , "D:/data" ); |
| 1530 | test_os_file_get_parent_dir("D:\\data\\\\a" , "D:\\data" ); |
| 1531 | test_os_file_get_parent_dir("D:///data//a///b/" , "D:///data//a" ); |
| 1532 | test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\" , "D:\\\\\\data\\\\a" ); |
| 1533 | #endif /* _WIN32 */ |
| 1534 | } |
| 1535 | #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */ |
| 1536 | |
| 1537 | |
| 1538 | /** Creates all missing subdirectories along the given path. |
| 1539 | @param[in] path Path name |
| 1540 | @return DB_SUCCESS if OK, otherwise error code. */ |
| 1541 | dberr_t |
| 1542 | os_file_create_subdirs_if_needed( |
| 1543 | const char* path) |
| 1544 | { |
| 1545 | if (srv_read_only_mode) { |
| 1546 | |
| 1547 | ib::error() |
| 1548 | << "read only mode set. Can't create " |
| 1549 | << "subdirectories '" << path << "'" ; |
| 1550 | |
| 1551 | return(DB_READ_ONLY); |
| 1552 | |
| 1553 | } |
| 1554 | |
| 1555 | char* subdir = os_file_get_parent_dir(path); |
| 1556 | |
| 1557 | if (subdir == NULL) { |
| 1558 | /* subdir is root or cwd, nothing to do */ |
| 1559 | return(DB_SUCCESS); |
| 1560 | } |
| 1561 | |
| 1562 | /* Test if subdir exists */ |
| 1563 | os_file_type_t type; |
| 1564 | bool subdir_exists; |
| 1565 | bool success = os_file_status(subdir, &subdir_exists, &type); |
| 1566 | |
| 1567 | if (success && !subdir_exists) { |
| 1568 | |
| 1569 | /* Subdir does not exist, create it */ |
| 1570 | dberr_t err = os_file_create_subdirs_if_needed(subdir); |
| 1571 | |
| 1572 | if (err != DB_SUCCESS) { |
| 1573 | |
| 1574 | ut_free(subdir); |
| 1575 | |
| 1576 | return(err); |
| 1577 | } |
| 1578 | |
| 1579 | success = os_file_create_directory(subdir, false); |
| 1580 | } |
| 1581 | |
| 1582 | ut_free(subdir); |
| 1583 | |
| 1584 | return(success ? DB_SUCCESS : DB_ERROR); |
| 1585 | } |
| 1586 | |
| 1587 | #ifndef _WIN32 |
| 1588 | |
| 1589 | /** Do the read/write |
| 1590 | @param[in] request The IO context and type |
| 1591 | @return the number of bytes read/written or negative value on error */ |
| 1592 | ssize_t |
| 1593 | SyncFileIO::execute(const IORequest& request) |
| 1594 | { |
| 1595 | ssize_t n_bytes; |
| 1596 | |
| 1597 | if (request.is_read()) { |
| 1598 | n_bytes = pread(m_fh, m_buf, m_n, m_offset); |
| 1599 | } else { |
| 1600 | ut_ad(request.is_write()); |
| 1601 | n_bytes = pwrite(m_fh, m_buf, m_n, m_offset); |
| 1602 | } |
| 1603 | |
| 1604 | return(n_bytes); |
| 1605 | } |
| 1606 | /** Free storage space associated with a section of the file. |
| 1607 | @param[in] fh Open file handle |
| 1608 | @param[in] off Starting offset (SEEK_SET) |
| 1609 | @param[in] len Size of the hole |
| 1610 | @return DB_SUCCESS or error code */ |
| 1611 | static |
| 1612 | dberr_t |
| 1613 | os_file_punch_hole_posix( |
| 1614 | os_file_t fh, |
| 1615 | os_offset_t off, |
| 1616 | os_offset_t len) |
| 1617 | { |
| 1618 | |
| 1619 | #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE |
| 1620 | const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; |
| 1621 | |
| 1622 | int ret = fallocate(fh, mode, off, len); |
| 1623 | |
| 1624 | if (ret == 0) { |
| 1625 | return(DB_SUCCESS); |
| 1626 | } |
| 1627 | |
| 1628 | if (errno == ENOTSUP) { |
| 1629 | return(DB_IO_NO_PUNCH_HOLE); |
| 1630 | } |
| 1631 | |
| 1632 | ib::warn() |
| 1633 | << "fallocate(" |
| 1634 | <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, " |
| 1635 | << off << ", " << len << ") returned errno: " |
| 1636 | << errno; |
| 1637 | |
| 1638 | return(DB_IO_ERROR); |
| 1639 | |
| 1640 | #elif defined(UNIV_SOLARIS) |
| 1641 | |
| 1642 | // Use F_FREESP |
| 1643 | |
| 1644 | #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ |
| 1645 | |
| 1646 | return(DB_IO_NO_PUNCH_HOLE); |
| 1647 | } |
| 1648 | |
| 1649 | #if defined(LINUX_NATIVE_AIO) |
| 1650 | |
| 1651 | /** Linux native AIO handler */ |
| 1652 | class LinuxAIOHandler { |
| 1653 | public: |
| 1654 | /** |
| 1655 | @param[in] global_segment The global segment*/ |
| 1656 | LinuxAIOHandler(ulint global_segment) |
| 1657 | : |
| 1658 | m_global_segment(global_segment) |
| 1659 | { |
| 1660 | /* Should never be doing Sync IO here. */ |
| 1661 | ut_a(m_global_segment != ULINT_UNDEFINED); |
| 1662 | |
| 1663 | /* Find the array and the local segment. */ |
| 1664 | |
| 1665 | m_segment = AIO::get_array_and_local_segment( |
| 1666 | &m_array, m_global_segment); |
| 1667 | |
| 1668 | m_n_slots = m_array->slots_per_segment(); |
| 1669 | } |
| 1670 | |
| 1671 | /** Destructor */ |
| 1672 | ~LinuxAIOHandler() |
| 1673 | { |
| 1674 | // No op |
| 1675 | } |
| 1676 | |
| 1677 | /** |
| 1678 | Process a Linux AIO request |
| 1679 | @param[out] m1 the messages passed with the |
| 1680 | @param[out] m2 AIO request; note that in case the |
| 1681 | AIO operation failed, these output |
| 1682 | parameters are valid and can be used to |
| 1683 | restart the operation. |
| 1684 | @param[out] request IO context |
| 1685 | @return DB_SUCCESS or error code */ |
| 1686 | dberr_t poll(fil_node_t** m1, void** m2, IORequest* request); |
| 1687 | |
| 1688 | private: |
| 1689 | /** Resubmit an IO request that was only partially successful |
| 1690 | @param[in,out] slot Request to resubmit |
| 1691 | @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */ |
| 1692 | dberr_t resubmit(Slot* slot); |
| 1693 | |
| 1694 | /** Check if the AIO succeeded |
| 1695 | @param[in,out] slot The slot to check |
| 1696 | @return DB_SUCCESS, DB_FAIL if the operation should be retried or |
| 1697 | DB_IO_ERROR on all other errors */ |
| 1698 | dberr_t check_state(Slot* slot); |
| 1699 | |
| 1700 | /** @return true if a shutdown was detected */ |
| 1701 | bool is_shutdown() const |
| 1702 | { |
| 1703 | return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS |
| 1704 | && !buf_page_cleaner_is_active); |
| 1705 | } |
| 1706 | |
| 1707 | /** If no slot was found then the m_array->m_mutex will be released. |
| 1708 | @param[out] n_pending The number of pending IOs |
| 1709 | @return NULL or a slot that has completed IO */ |
| 1710 | Slot* find_completed_slot(ulint* n_pending); |
| 1711 | |
| 1712 | /** This is called from within the IO-thread. If there are no completed |
| 1713 | IO requests in the slot array, the thread calls this function to |
| 1714 | collect more requests from the Linux kernel. |
| 1715 | The IO-thread waits on io_getevents(), which is a blocking call, with |
| 1716 | a timeout value. Unless the system is very heavy loaded, keeping the |
| 1717 | IO-thread very busy, the io-thread will spend most of its time waiting |
| 1718 | in this function. |
| 1719 | The IO-thread also exits in this function. It checks server status at |
| 1720 | each wakeup and that is why we use timed wait in io_getevents(). */ |
| 1721 | void collect(); |
| 1722 | |
| 1723 | private: |
| 1724 | /** Slot array */ |
| 1725 | AIO* m_array; |
| 1726 | |
| 1727 | /** Number of slots inthe local segment */ |
| 1728 | ulint m_n_slots; |
| 1729 | |
| 1730 | /** The local segment to check */ |
| 1731 | ulint m_segment; |
| 1732 | |
| 1733 | /** The global segment */ |
| 1734 | ulint m_global_segment; |
| 1735 | }; |
| 1736 | |
| 1737 | /** Resubmit an IO request that was only partially successful |
| 1738 | @param[in,out] slot Request to resubmit |
| 1739 | @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */ |
| 1740 | dberr_t |
| 1741 | LinuxAIOHandler::resubmit(Slot* slot) |
| 1742 | { |
| 1743 | #ifdef UNIV_DEBUG |
| 1744 | /* Bytes already read/written out */ |
| 1745 | ulint n_bytes = slot->ptr - slot->buf; |
| 1746 | |
| 1747 | ut_ad(m_array->is_mutex_owned()); |
| 1748 | |
| 1749 | ut_ad(n_bytes < slot->original_len); |
| 1750 | ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes); |
| 1751 | /* Partial read or write scenario */ |
| 1752 | ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes)); |
| 1753 | #endif /* UNIV_DEBUG */ |
| 1754 | |
| 1755 | slot->len -= slot->n_bytes; |
| 1756 | slot->ptr += slot->n_bytes; |
| 1757 | slot->offset += slot->n_bytes; |
| 1758 | |
| 1759 | /* Resetting the bytes read/written */ |
| 1760 | slot->n_bytes = 0; |
| 1761 | slot->io_already_done = false; |
| 1762 | |
| 1763 | struct iocb* iocb = &slot->control; |
| 1764 | |
| 1765 | if (slot->type.is_read()) { |
| 1766 | |
| 1767 | io_prep_pread( |
| 1768 | iocb, |
| 1769 | slot->file, |
| 1770 | slot->ptr, |
| 1771 | slot->len, |
| 1772 | static_cast<off_t>(slot->offset)); |
| 1773 | } else { |
| 1774 | |
| 1775 | ut_a(slot->type.is_write()); |
| 1776 | |
| 1777 | io_prep_pwrite( |
| 1778 | iocb, |
| 1779 | slot->file, |
| 1780 | slot->ptr, |
| 1781 | slot->len, |
| 1782 | static_cast<off_t>(slot->offset)); |
| 1783 | } |
| 1784 | |
| 1785 | iocb->data = slot; |
| 1786 | |
| 1787 | /* Resubmit an I/O request */ |
| 1788 | int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb); |
| 1789 | |
| 1790 | if (ret < -1) { |
| 1791 | errno = -ret; |
| 1792 | } |
| 1793 | |
| 1794 | return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS); |
| 1795 | } |
| 1796 | |
| 1797 | /** Check if the AIO succeeded |
| 1798 | @param[in,out] slot The slot to check |
| 1799 | @return DB_SUCCESS, DB_FAIL if the operation should be retried or |
| 1800 | DB_IO_ERROR on all other errors */ |
| 1801 | dberr_t |
| 1802 | LinuxAIOHandler::check_state(Slot* slot) |
| 1803 | { |
| 1804 | ut_ad(m_array->is_mutex_owned()); |
| 1805 | |
| 1806 | /* Note that it may be that there is more then one completed |
| 1807 | IO requests. We process them one at a time. We may have a case |
| 1808 | here to improve the performance slightly by dealing with all |
| 1809 | requests in one sweep. */ |
| 1810 | |
| 1811 | srv_set_io_thread_op_info( |
| 1812 | m_global_segment, "processing completed aio requests" ); |
| 1813 | |
| 1814 | ut_ad(slot->io_already_done); |
| 1815 | |
| 1816 | dberr_t err = DB_SUCCESS; |
| 1817 | |
| 1818 | if (slot->ret == 0) { |
| 1819 | |
| 1820 | err = AIOHandler::post_io_processing(slot); |
| 1821 | |
| 1822 | } else { |
| 1823 | errno = -slot->ret; |
| 1824 | |
| 1825 | /* os_file_handle_error does tell us if we should retry |
| 1826 | this IO. As it stands now, we don't do this retry when |
| 1827 | reaping requests from a different context than |
| 1828 | the dispatcher. This non-retry logic is the same for |
| 1829 | Windows and Linux native AIO. |
| 1830 | We should probably look into this to transparently |
| 1831 | re-submit the IO. */ |
| 1832 | os_file_handle_error(slot->name, "Linux aio" ); |
| 1833 | |
| 1834 | err = DB_IO_ERROR; |
| 1835 | } |
| 1836 | |
| 1837 | return(err); |
| 1838 | } |
| 1839 | |
| 1840 | /** If no slot was found then the m_array->m_mutex will be released. |
| 1841 | @param[out] n_pending The number of pending IOs |
| 1842 | @return NULL or a slot that has completed IO */ |
| 1843 | Slot* |
| 1844 | LinuxAIOHandler::find_completed_slot(ulint* n_pending) |
| 1845 | { |
| 1846 | ulint offset = m_n_slots * m_segment; |
| 1847 | |
| 1848 | *n_pending = 0; |
| 1849 | |
| 1850 | m_array->acquire(); |
| 1851 | |
| 1852 | Slot* slot = m_array->at(offset); |
| 1853 | |
| 1854 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
| 1855 | |
| 1856 | if (slot->is_reserved) { |
| 1857 | |
| 1858 | ++*n_pending; |
| 1859 | |
| 1860 | if (slot->io_already_done) { |
| 1861 | |
| 1862 | /* Something for us to work on. |
| 1863 | Note: We don't release the mutex. */ |
| 1864 | return(slot); |
| 1865 | } |
| 1866 | } |
| 1867 | } |
| 1868 | |
| 1869 | m_array->release(); |
| 1870 | |
| 1871 | return(NULL); |
| 1872 | } |
| 1873 | |
| 1874 | /** This function is only used in Linux native asynchronous i/o. This is |
| 1875 | called from within the io-thread. If there are no completed IO requests |
| 1876 | in the slot array, the thread calls this function to collect more |
| 1877 | requests from the kernel. |
| 1878 | The io-thread waits on io_getevents(), which is a blocking call, with |
| 1879 | a timeout value. Unless the system is very heavy loaded, keeping the |
| 1880 | io-thread very busy, the io-thread will spend most of its time waiting |
| 1881 | in this function. |
| 1882 | The io-thread also exits in this function. It checks server status at |
| 1883 | each wakeup and that is why we use timed wait in io_getevents(). */ |
| 1884 | void |
| 1885 | LinuxAIOHandler::collect() |
| 1886 | { |
| 1887 | ut_ad(m_n_slots > 0); |
| 1888 | ut_ad(m_array != NULL); |
| 1889 | ut_ad(m_segment < m_array->get_n_segments()); |
| 1890 | |
| 1891 | /* Which io_context we are going to use. */ |
| 1892 | io_context* io_ctx = m_array->io_ctx(m_segment); |
| 1893 | |
| 1894 | /* Starting point of the m_segment we will be working on. */ |
| 1895 | ulint start_pos = m_segment * m_n_slots; |
| 1896 | |
| 1897 | /* End point. */ |
| 1898 | ulint end_pos = start_pos + m_n_slots; |
| 1899 | |
| 1900 | for (;;) { |
| 1901 | struct io_event* events; |
| 1902 | |
| 1903 | /* Which part of event array we are going to work on. */ |
| 1904 | events = m_array->io_events(m_segment * m_n_slots); |
| 1905 | |
| 1906 | /* Initialize the events. */ |
| 1907 | memset(events, 0, sizeof(*events) * m_n_slots); |
| 1908 | |
| 1909 | /* The timeout value is arbitrary. We probably need |
| 1910 | to experiment with it a little. */ |
| 1911 | struct timespec timeout; |
| 1912 | |
| 1913 | timeout.tv_sec = 0; |
| 1914 | timeout.tv_nsec = OS_AIO_REAP_TIMEOUT; |
| 1915 | |
| 1916 | int ret; |
| 1917 | |
| 1918 | ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout); |
| 1919 | |
| 1920 | for (int i = 0; i < ret; ++i) { |
| 1921 | |
| 1922 | struct iocb* iocb; |
| 1923 | |
| 1924 | iocb = reinterpret_cast<struct iocb*>(events[i].obj); |
| 1925 | ut_a(iocb != NULL); |
| 1926 | |
| 1927 | Slot* slot = reinterpret_cast<Slot*>(iocb->data); |
| 1928 | |
| 1929 | /* Some sanity checks. */ |
| 1930 | ut_a(slot != NULL); |
| 1931 | ut_a(slot->is_reserved); |
| 1932 | |
| 1933 | /* We are not scribbling previous segment. */ |
| 1934 | ut_a(slot->pos >= start_pos); |
| 1935 | |
| 1936 | /* We have not overstepped to next segment. */ |
| 1937 | ut_a(slot->pos < end_pos); |
| 1938 | |
| 1939 | /* Deallocate unused blocks from file system. |
| 1940 | This is newer done to page 0 or to log files.*/ |
| 1941 | if (slot->offset > 0 |
| 1942 | && !slot->type.is_log() |
| 1943 | && slot->type.is_write() |
| 1944 | && slot->type.punch_hole()) { |
| 1945 | |
| 1946 | slot->err = slot->type.punch_hole( |
| 1947 | slot->file, |
| 1948 | slot->offset, slot->len); |
| 1949 | } else { |
| 1950 | slot->err = DB_SUCCESS; |
| 1951 | } |
| 1952 | |
| 1953 | /* Mark this request as completed. The error handling |
| 1954 | will be done in the calling function. */ |
| 1955 | m_array->acquire(); |
| 1956 | |
| 1957 | slot->ret = events[i].res2; |
| 1958 | slot->io_already_done = true; |
| 1959 | slot->n_bytes = events[i].res; |
| 1960 | |
| 1961 | m_array->release(); |
| 1962 | } |
| 1963 | |
| 1964 | if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS |
| 1965 | || !buf_page_cleaner_is_active |
| 1966 | || ret > 0) { |
| 1967 | |
| 1968 | break; |
| 1969 | } |
| 1970 | |
| 1971 | /* This error handling is for any error in collecting the |
| 1972 | IO requests. The errors, if any, for any particular IO |
| 1973 | request are simply passed on to the calling routine. */ |
| 1974 | |
| 1975 | switch (ret) { |
| 1976 | case -EAGAIN: |
| 1977 | /* Not enough resources! Try again. */ |
| 1978 | |
| 1979 | case -EINTR: |
| 1980 | /* Interrupted! The behaviour in case of an interrupt. |
| 1981 | If we have some completed IOs available then the |
| 1982 | return code will be the number of IOs. We get EINTR |
| 1983 | only if there are no completed IOs and we have been |
| 1984 | interrupted. */ |
| 1985 | |
| 1986 | case 0: |
| 1987 | /* No pending request! Go back and check again. */ |
| 1988 | |
| 1989 | continue; |
| 1990 | } |
| 1991 | |
| 1992 | /* All other errors should cause a trap for now. */ |
| 1993 | ib::fatal() |
| 1994 | << "Unexpected ret_code[" << ret |
| 1995 | << "] from io_getevents()!" ; |
| 1996 | |
| 1997 | break; |
| 1998 | } |
| 1999 | } |
| 2000 | |
| 2001 | /** Process a Linux AIO request |
| 2002 | @param[out] m1 the messages passed with the |
| 2003 | @param[out] m2 AIO request; note that in case the |
| 2004 | AIO operation failed, these output |
| 2005 | parameters are valid and can be used to |
| 2006 | restart the operation. |
| 2007 | @param[out] request IO context |
| 2008 | @return DB_SUCCESS or error code */ |
| 2009 | dberr_t |
| 2010 | LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request) |
| 2011 | { |
| 2012 | dberr_t err = DB_SUCCESS; |
| 2013 | Slot* slot; |
| 2014 | |
| 2015 | /* Loop until we have found a completed request. */ |
| 2016 | for (;;) { |
| 2017 | |
| 2018 | ulint n_pending; |
| 2019 | |
| 2020 | slot = find_completed_slot(&n_pending); |
| 2021 | |
| 2022 | if (slot != NULL) { |
| 2023 | |
| 2024 | ut_ad(m_array->is_mutex_owned()); |
| 2025 | |
| 2026 | err = check_state(slot); |
| 2027 | |
| 2028 | /* DB_FAIL is not a hard error, we should retry */ |
| 2029 | if (err != DB_FAIL) { |
| 2030 | break; |
| 2031 | } |
| 2032 | |
| 2033 | /* Partial IO, resubmit request for |
| 2034 | remaining bytes to read/write */ |
| 2035 | err = resubmit(slot); |
| 2036 | |
| 2037 | if (err != DB_SUCCESS) { |
| 2038 | break; |
| 2039 | } |
| 2040 | |
| 2041 | m_array->release(); |
| 2042 | |
| 2043 | } else if (is_shutdown() && n_pending == 0) { |
| 2044 | |
| 2045 | /* There is no completed request. If there is |
| 2046 | no pending request at all, and the system is |
| 2047 | being shut down, exit. */ |
| 2048 | |
| 2049 | *m1 = NULL; |
| 2050 | *m2 = NULL; |
| 2051 | |
| 2052 | return(DB_SUCCESS); |
| 2053 | |
| 2054 | } else { |
| 2055 | |
| 2056 | /* Wait for some request. Note that we return |
| 2057 | from wait if we have found a request. */ |
| 2058 | |
| 2059 | srv_set_io_thread_op_info( |
| 2060 | m_global_segment, |
| 2061 | "waiting for completed aio requests" ); |
| 2062 | |
| 2063 | collect(); |
| 2064 | } |
| 2065 | } |
| 2066 | |
| 2067 | if (err == DB_IO_PARTIAL_FAILED) { |
| 2068 | /* Aborting in case of submit failure */ |
| 2069 | ib::fatal() |
| 2070 | << "Native Linux AIO interface. " |
| 2071 | "io_submit() call failed when " |
| 2072 | "resubmitting a partial I/O " |
| 2073 | "request on the file " << slot->name |
| 2074 | << "." ; |
| 2075 | } |
| 2076 | |
| 2077 | *m1 = slot->m1; |
| 2078 | *m2 = slot->m2; |
| 2079 | |
| 2080 | *request = slot->type; |
| 2081 | |
| 2082 | m_array->release(slot); |
| 2083 | |
| 2084 | m_array->release(); |
| 2085 | |
| 2086 | return(err); |
| 2087 | } |
| 2088 | |
| 2089 | /** This function is only used in Linux native asynchronous i/o. |
| 2090 | Waits for an aio operation to complete. This function is used to wait for |
| 2091 | the completed requests. The aio array of pending requests is divided |
| 2092 | into segments. The thread specifies which segment or slot it wants to wait |
| 2093 | for. NOTE: this function will also take care of freeing the aio slot, |
| 2094 | therefore no other thread is allowed to do the freeing! |
| 2095 | |
| 2096 | @param[in] global_seg segment number in the aio array |
| 2097 | to wait for; segment 0 is the ibuf |
| 2098 | i/o thread, segment 1 is log i/o thread, |
| 2099 | then follow the non-ibuf read threads, |
| 2100 | and the last are the non-ibuf write |
| 2101 | threads. |
| 2102 | @param[out] m1 the messages passed with the |
| 2103 | @param[out] m2 AIO request; note that in case the |
| 2104 | AIO operation failed, these output |
| 2105 | parameters are valid and can be used to |
| 2106 | restart the operation. |
| 2107 | @param[out]xi request IO context |
| 2108 | @return DB_SUCCESS if the IO was successful */ |
| 2109 | static |
| 2110 | dberr_t |
| 2111 | os_aio_linux_handler( |
| 2112 | ulint global_segment, |
| 2113 | fil_node_t** m1, |
| 2114 | void** m2, |
| 2115 | IORequest* request) |
| 2116 | { |
| 2117 | return LinuxAIOHandler(global_segment).poll(m1, m2, request); |
| 2118 | } |
| 2119 | |
| 2120 | /** Dispatch an AIO request to the kernel. |
| 2121 | @param[in,out] slot an already reserved slot |
| 2122 | @return true on success. */ |
| 2123 | bool |
| 2124 | AIO::linux_dispatch(Slot* slot) |
| 2125 | { |
| 2126 | ut_a(slot->is_reserved); |
| 2127 | ut_ad(slot->type.validate()); |
| 2128 | |
| 2129 | /* Find out what we are going to work with. |
| 2130 | The iocb struct is directly in the slot. |
| 2131 | The io_context is one per segment. */ |
| 2132 | |
| 2133 | ulint io_ctx_index; |
| 2134 | struct iocb* iocb = &slot->control; |
| 2135 | |
| 2136 | io_ctx_index = (slot->pos * m_n_segments) / m_slots.size(); |
| 2137 | |
| 2138 | int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb); |
| 2139 | |
| 2140 | /* io_submit() returns number of successfully queued requests |
| 2141 | or -errno. */ |
| 2142 | |
| 2143 | if (ret != 1) { |
| 2144 | errno = -ret; |
| 2145 | } |
| 2146 | |
| 2147 | return(ret == 1); |
| 2148 | } |
| 2149 | |
| 2150 | /** Creates an io_context for native linux AIO. |
| 2151 | @param[in] max_events number of events |
| 2152 | @param[out] io_ctx io_ctx to initialize. |
| 2153 | @return true on success. */ |
| 2154 | bool |
| 2155 | AIO::linux_create_io_ctx( |
| 2156 | unsigned max_events, |
| 2157 | io_context_t* io_ctx) |
| 2158 | { |
| 2159 | ssize_t n_retries = 0; |
| 2160 | |
| 2161 | for (;;) { |
| 2162 | |
| 2163 | memset(io_ctx, 0x0, sizeof(*io_ctx)); |
| 2164 | |
| 2165 | /* Initialize the io_ctx. Tell it how many pending |
| 2166 | IO requests this context will handle. */ |
| 2167 | |
| 2168 | int ret = io_setup(max_events, io_ctx); |
| 2169 | |
| 2170 | if (ret == 0) { |
| 2171 | /* Success. Return now. */ |
| 2172 | return(true); |
| 2173 | } |
| 2174 | |
| 2175 | /* If we hit EAGAIN we'll make a few attempts before failing. */ |
| 2176 | |
| 2177 | switch (ret) { |
| 2178 | case -EAGAIN: |
| 2179 | if (n_retries == 0) { |
| 2180 | /* First time around. */ |
| 2181 | ib::warn() |
| 2182 | << "io_setup() failed with EAGAIN." |
| 2183 | " Will make " |
| 2184 | << OS_AIO_IO_SETUP_RETRY_ATTEMPTS |
| 2185 | << " attempts before giving up." ; |
| 2186 | } |
| 2187 | |
| 2188 | if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) { |
| 2189 | |
| 2190 | ++n_retries; |
| 2191 | |
| 2192 | ib::warn() |
| 2193 | << "io_setup() attempt " |
| 2194 | << n_retries << "." ; |
| 2195 | |
| 2196 | os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP); |
| 2197 | |
| 2198 | continue; |
| 2199 | } |
| 2200 | |
| 2201 | /* Have tried enough. Better call it a day. */ |
| 2202 | ib::error() |
| 2203 | << "io_setup() failed with EAGAIN after " |
| 2204 | << OS_AIO_IO_SETUP_RETRY_ATTEMPTS |
| 2205 | << " attempts." ; |
| 2206 | break; |
| 2207 | |
| 2208 | case -ENOSYS: |
| 2209 | ib::error() |
| 2210 | << "Linux Native AIO interface" |
| 2211 | " is not supported on this platform. Please" |
| 2212 | " check your OS documentation and install" |
| 2213 | " appropriate binary of InnoDB." ; |
| 2214 | |
| 2215 | break; |
| 2216 | |
| 2217 | default: |
| 2218 | ib::error() |
| 2219 | << "Linux Native AIO setup" |
| 2220 | << " returned following error[" |
| 2221 | << ret << "]" ; |
| 2222 | break; |
| 2223 | } |
| 2224 | |
| 2225 | ib::info() |
| 2226 | << "You can disable Linux Native AIO by" |
| 2227 | " setting innodb_use_native_aio = 0 in my.cnf" ; |
| 2228 | |
| 2229 | break; |
| 2230 | } |
| 2231 | |
| 2232 | return(false); |
| 2233 | } |
| 2234 | |
| 2235 | /** Checks if the system supports native linux aio. On some kernel |
| 2236 | versions where native aio is supported it won't work on tmpfs. In such |
| 2237 | cases we can't use native aio as it is not possible to mix simulated |
| 2238 | and native aio. |
| 2239 | @return: true if supported, false otherwise. */ |
| 2240 | bool |
| 2241 | AIO::is_linux_native_aio_supported() |
| 2242 | { |
| 2243 | int fd; |
| 2244 | io_context_t io_ctx; |
| 2245 | char name[1000]; |
| 2246 | |
| 2247 | if (!linux_create_io_ctx(1, &io_ctx)) { |
| 2248 | |
| 2249 | /* The platform does not support native aio. */ |
| 2250 | |
| 2251 | return(false); |
| 2252 | |
| 2253 | } else if (!srv_read_only_mode) { |
| 2254 | |
| 2255 | /* Now check if tmpdir supports native aio ops. */ |
| 2256 | fd = innobase_mysql_tmpfile(NULL); |
| 2257 | |
| 2258 | if (fd < 0) { |
| 2259 | ib::warn() |
| 2260 | << "Unable to create temp file to check" |
| 2261 | " native AIO support." ; |
| 2262 | |
| 2263 | return(false); |
| 2264 | } |
| 2265 | } else { |
| 2266 | |
| 2267 | os_normalize_path(srv_log_group_home_dir); |
| 2268 | |
| 2269 | ulint dirnamelen = strlen(srv_log_group_home_dir); |
| 2270 | |
| 2271 | ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile" ); |
| 2272 | |
| 2273 | memcpy(name, srv_log_group_home_dir, dirnamelen); |
| 2274 | |
| 2275 | /* Add a path separator if needed. */ |
| 2276 | if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) { |
| 2277 | |
| 2278 | name[dirnamelen++] = OS_PATH_SEPARATOR; |
| 2279 | } |
| 2280 | |
| 2281 | strcpy(name + dirnamelen, "ib_logfile0" ); |
| 2282 | |
| 2283 | fd = open(name, O_RDONLY | O_CLOEXEC); |
| 2284 | |
| 2285 | if (fd == -1) { |
| 2286 | |
| 2287 | ib::warn() |
| 2288 | << "Unable to open" |
| 2289 | << " \"" << name << "\" to check native" |
| 2290 | << " AIO read support." ; |
| 2291 | |
| 2292 | return(false); |
| 2293 | } |
| 2294 | } |
| 2295 | |
| 2296 | struct io_event io_event; |
| 2297 | |
| 2298 | memset(&io_event, 0x0, sizeof(io_event)); |
| 2299 | |
| 2300 | byte* buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2)); |
| 2301 | byte* ptr = static_cast<byte*>(ut_align(buf, srv_page_size)); |
| 2302 | |
| 2303 | struct iocb iocb; |
| 2304 | |
| 2305 | /* Suppress valgrind warning. */ |
| 2306 | memset(buf, 0x00, srv_page_size * 2); |
| 2307 | memset(&iocb, 0x0, sizeof(iocb)); |
| 2308 | |
| 2309 | struct iocb* p_iocb = &iocb; |
| 2310 | |
| 2311 | if (!srv_read_only_mode) { |
| 2312 | |
| 2313 | io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0); |
| 2314 | |
| 2315 | } else { |
| 2316 | ut_a(srv_page_size >= 512); |
| 2317 | io_prep_pread(p_iocb, fd, ptr, 512, 0); |
| 2318 | } |
| 2319 | |
| 2320 | int err = io_submit(io_ctx, 1, &p_iocb); |
| 2321 | |
| 2322 | if (err >= 1) { |
| 2323 | /* Now collect the submitted IO request. */ |
| 2324 | err = io_getevents(io_ctx, 1, 1, &io_event, NULL); |
| 2325 | } |
| 2326 | |
| 2327 | ut_free(buf); |
| 2328 | close(fd); |
| 2329 | |
| 2330 | switch (err) { |
| 2331 | case 1: |
| 2332 | return(true); |
| 2333 | |
| 2334 | case -EINVAL: |
| 2335 | case -ENOSYS: |
| 2336 | ib::error() |
| 2337 | << "Linux Native AIO not supported. You can either" |
| 2338 | " move " |
| 2339 | << (srv_read_only_mode ? name : "tmpdir" ) |
| 2340 | << " to a file system that supports native" |
| 2341 | " AIO or you can set innodb_use_native_aio to" |
| 2342 | " FALSE to avoid this message." ; |
| 2343 | |
| 2344 | /* fall through. */ |
| 2345 | default: |
| 2346 | ib::error() |
| 2347 | << "Linux Native AIO check on " |
| 2348 | << (srv_read_only_mode ? name : "tmpdir" ) |
| 2349 | << "returned error[" << -err << "]" ; |
| 2350 | } |
| 2351 | |
| 2352 | return(false); |
| 2353 | } |
| 2354 | |
| 2355 | #endif /* LINUX_NATIVE_AIO */ |
| 2356 | |
| 2357 | /** Retrieves the last error number if an error occurs in a file io function. |
| 2358 | The number should be retrieved before any other OS calls (because they may |
| 2359 | overwrite the error number). If the number is not known to this program, |
| 2360 | the OS error number + 100 is returned. |
| 2361 | @param[in] report_all_errors true if we want an error message |
| 2362 | printed of all errors |
| 2363 | @param[in] on_error_silent true then don't print any diagnostic |
| 2364 | to the log |
| 2365 | @return error number, or OS error number + 100 */ |
| 2366 | static |
| 2367 | ulint |
| 2368 | os_file_get_last_error_low( |
| 2369 | bool report_all_errors, |
| 2370 | bool on_error_silent) |
| 2371 | { |
| 2372 | int err = errno; |
| 2373 | |
| 2374 | if (err == 0) { |
| 2375 | return(0); |
| 2376 | } |
| 2377 | |
| 2378 | if (report_all_errors |
| 2379 | || (err != ENOSPC && err != EEXIST && !on_error_silent)) { |
| 2380 | |
| 2381 | ib::error() |
| 2382 | << "Operating system error number " |
| 2383 | << err |
| 2384 | << " in a file operation." ; |
| 2385 | |
| 2386 | if (err == ENOENT) { |
| 2387 | |
| 2388 | ib::error() |
| 2389 | << "The error means the system" |
| 2390 | " cannot find the path specified." ; |
| 2391 | |
| 2392 | if (srv_is_being_started) { |
| 2393 | |
| 2394 | ib::error() |
| 2395 | << "If you are installing InnoDB," |
| 2396 | " remember that you must create" |
| 2397 | " directories yourself, InnoDB" |
| 2398 | " does not create them." ; |
| 2399 | } |
| 2400 | } else if (err == EACCES) { |
| 2401 | |
| 2402 | ib::error() |
| 2403 | << "The error means mysqld does not have" |
| 2404 | " the access rights to the directory." ; |
| 2405 | |
| 2406 | } else { |
| 2407 | if (strerror(err) != NULL) { |
| 2408 | |
| 2409 | ib::error() |
| 2410 | << "Error number " << err << " means '" |
| 2411 | << strerror(err) << "'" ; |
| 2412 | } |
| 2413 | |
| 2414 | ib::info() << OPERATING_SYSTEM_ERROR_MSG; |
| 2415 | } |
| 2416 | } |
| 2417 | |
| 2418 | switch (err) { |
| 2419 | case ENOSPC: |
| 2420 | return(OS_FILE_DISK_FULL); |
| 2421 | case ENOENT: |
| 2422 | return(OS_FILE_NOT_FOUND); |
| 2423 | case EEXIST: |
| 2424 | return(OS_FILE_ALREADY_EXISTS); |
| 2425 | case EXDEV: |
| 2426 | case ENOTDIR: |
| 2427 | case EISDIR: |
| 2428 | return(OS_FILE_PATH_ERROR); |
| 2429 | case EAGAIN: |
| 2430 | if (srv_use_native_aio) { |
| 2431 | return(OS_FILE_AIO_RESOURCES_RESERVED); |
| 2432 | } |
| 2433 | break; |
| 2434 | case EINTR: |
| 2435 | if (srv_use_native_aio) { |
| 2436 | return(OS_FILE_AIO_INTERRUPTED); |
| 2437 | } |
| 2438 | break; |
| 2439 | case EACCES: |
| 2440 | return(OS_FILE_ACCESS_VIOLATION); |
| 2441 | } |
| 2442 | return(OS_FILE_ERROR_MAX + err); |
| 2443 | } |
| 2444 | |
| 2445 | /** Wrapper to fsync(2) that retries the call on some errors. |
| 2446 | Returns the value 0 if successful; otherwise the value -1 is returned and |
| 2447 | the global variable errno is set to indicate the error. |
| 2448 | @param[in] file open file handle |
| 2449 | @return 0 if success, -1 otherwise */ |
| 2450 | static |
| 2451 | int |
| 2452 | os_file_fsync_posix( |
| 2453 | os_file_t file) |
| 2454 | { |
| 2455 | ulint failures = 0; |
| 2456 | |
| 2457 | for (;;) { |
| 2458 | |
| 2459 | ++os_n_fsyncs; |
| 2460 | |
| 2461 | int ret = fsync(file); |
| 2462 | |
| 2463 | if (ret == 0) { |
| 2464 | return(ret); |
| 2465 | } |
| 2466 | |
| 2467 | switch(errno) { |
| 2468 | case ENOLCK: |
| 2469 | |
| 2470 | ++failures; |
| 2471 | ut_a(failures < 1000); |
| 2472 | |
| 2473 | if (!(failures % 100)) { |
| 2474 | |
| 2475 | ib::warn() |
| 2476 | << "fsync(): " |
| 2477 | << "No locks available; retrying" ; |
| 2478 | } |
| 2479 | |
| 2480 | /* 0.2 sec */ |
| 2481 | os_thread_sleep(200000); |
| 2482 | break; |
| 2483 | |
| 2484 | case EIO: |
| 2485 | |
| 2486 | ++failures; |
| 2487 | ut_a(failures < 1000); |
| 2488 | |
| 2489 | if (!(failures % 100)) { |
| 2490 | |
| 2491 | ib::warn() |
| 2492 | << "fsync(): " |
| 2493 | << "An error occurred during " |
| 2494 | << "synchronization," |
| 2495 | << " retrying" ; |
| 2496 | } |
| 2497 | |
| 2498 | /* 0.2 sec */ |
| 2499 | os_thread_sleep(200000); |
| 2500 | break; |
| 2501 | |
| 2502 | case EINTR: |
| 2503 | |
| 2504 | ++failures; |
| 2505 | ut_a(failures < 2000); |
| 2506 | break; |
| 2507 | |
| 2508 | default: |
| 2509 | ut_error; |
| 2510 | break; |
| 2511 | } |
| 2512 | } |
| 2513 | |
| 2514 | ut_error; |
| 2515 | |
| 2516 | return(-1); |
| 2517 | } |
| 2518 | |
| 2519 | /** Check the existence and type of the given file. |
| 2520 | @param[in] path path name of file |
| 2521 | @param[out] exists true if the file exists |
| 2522 | @param[out] type Type of the file, if it exists |
| 2523 | @return true if call succeeded */ |
| 2524 | static |
| 2525 | bool |
| 2526 | os_file_status_posix( |
| 2527 | const char* path, |
| 2528 | bool* exists, |
| 2529 | os_file_type_t* type) |
| 2530 | { |
| 2531 | struct stat statinfo; |
| 2532 | |
| 2533 | int ret = stat(path, &statinfo); |
| 2534 | |
| 2535 | *exists = !ret; |
| 2536 | |
| 2537 | if (!ret) { |
| 2538 | /* file exists, everything OK */ |
| 2539 | |
| 2540 | } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { |
| 2541 | /* file does not exist */ |
| 2542 | return(true); |
| 2543 | |
| 2544 | } else { |
| 2545 | /* file exists, but stat call failed */ |
| 2546 | os_file_handle_error_no_exit(path, "stat" , false); |
| 2547 | return(false); |
| 2548 | } |
| 2549 | |
| 2550 | if (S_ISDIR(statinfo.st_mode)) { |
| 2551 | *type = OS_FILE_TYPE_DIR; |
| 2552 | |
| 2553 | } else if (S_ISLNK(statinfo.st_mode)) { |
| 2554 | *type = OS_FILE_TYPE_LINK; |
| 2555 | |
| 2556 | } else if (S_ISREG(statinfo.st_mode)) { |
| 2557 | *type = OS_FILE_TYPE_FILE; |
| 2558 | } else { |
| 2559 | *type = OS_FILE_TYPE_UNKNOWN; |
| 2560 | } |
| 2561 | |
| 2562 | return(true); |
| 2563 | } |
| 2564 | |
| 2565 | /** NOTE! Use the corresponding macro os_file_flush(), not directly this |
| 2566 | function! |
| 2567 | Flushes the write buffers of a given file to the disk. |
| 2568 | @param[in] file handle to a file |
| 2569 | @return true if success */ |
| 2570 | bool |
| 2571 | os_file_flush_func( |
| 2572 | os_file_t file) |
| 2573 | { |
| 2574 | int ret; |
| 2575 | |
| 2576 | WAIT_ALLOW_WRITES(); |
| 2577 | ret = os_file_fsync_posix(file); |
| 2578 | |
| 2579 | if (ret == 0) { |
| 2580 | return(true); |
| 2581 | } |
| 2582 | |
| 2583 | /* Since Linux returns EINVAL if the 'file' is actually a raw device, |
| 2584 | we choose to ignore that error if we are using raw disks */ |
| 2585 | |
| 2586 | if (srv_start_raw_disk_in_use && errno == EINVAL) { |
| 2587 | |
| 2588 | return(true); |
| 2589 | } |
| 2590 | |
| 2591 | ib::error() << "The OS said file flush did not succeed" ; |
| 2592 | |
| 2593 | os_file_handle_error(NULL, "flush" ); |
| 2594 | |
| 2595 | /* It is a fatal error if a file flush does not succeed, because then |
| 2596 | the database can get corrupt on disk */ |
| 2597 | ut_error; |
| 2598 | |
| 2599 | return(false); |
| 2600 | } |
| 2601 | |
| 2602 | /** NOTE! Use the corresponding macro os_file_create_simple(), not directly |
| 2603 | this function! |
| 2604 | A simple function to open or create a file. |
| 2605 | @param[in] name name of the file or path as a null-terminated |
| 2606 | string |
| 2607 | @param[in] create_mode create mode |
| 2608 | @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE |
| 2609 | @param[in] read_only if true, read only checks are enforced |
| 2610 | @param[out] success true if succeed, false if error |
| 2611 | @return handle to the file, not defined if error, error number |
| 2612 | can be retrieved with os_file_get_last_error */ |
| 2613 | pfs_os_file_t |
| 2614 | os_file_create_simple_func( |
| 2615 | const char* name, |
| 2616 | ulint create_mode, |
| 2617 | ulint access_type, |
| 2618 | bool read_only, |
| 2619 | bool* success) |
| 2620 | { |
| 2621 | pfs_os_file_t file; |
| 2622 | |
| 2623 | *success = false; |
| 2624 | |
| 2625 | int create_flag; |
| 2626 | const char* mode_str = NULL; |
| 2627 | |
| 2628 | if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { |
| 2629 | WAIT_ALLOW_WRITES(); |
| 2630 | } |
| 2631 | |
| 2632 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
| 2633 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
| 2634 | |
| 2635 | if (create_mode == OS_FILE_OPEN) { |
| 2636 | mode_str = "OPEN" ; |
| 2637 | |
| 2638 | if (access_type == OS_FILE_READ_ONLY) { |
| 2639 | |
| 2640 | create_flag = O_RDONLY; |
| 2641 | |
| 2642 | } else if (read_only) { |
| 2643 | |
| 2644 | create_flag = O_RDONLY; |
| 2645 | |
| 2646 | } else { |
| 2647 | create_flag = O_RDWR; |
| 2648 | } |
| 2649 | |
| 2650 | } else if (read_only) { |
| 2651 | |
| 2652 | mode_str = "OPEN" ; |
| 2653 | create_flag = O_RDONLY; |
| 2654 | |
| 2655 | } else if (create_mode == OS_FILE_CREATE) { |
| 2656 | |
| 2657 | mode_str = "CREATE" ; |
| 2658 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
| 2659 | |
| 2660 | } else if (create_mode == OS_FILE_CREATE_PATH) { |
| 2661 | |
| 2662 | mode_str = "CREATE PATH" ; |
| 2663 | /* Create subdirs along the path if needed. */ |
| 2664 | |
| 2665 | *success = os_file_create_subdirs_if_needed(name); |
| 2666 | |
| 2667 | if (!*success) { |
| 2668 | |
| 2669 | ib::error() |
| 2670 | << "Unable to create subdirectories '" |
| 2671 | << name << "'" ; |
| 2672 | |
| 2673 | return(OS_FILE_CLOSED); |
| 2674 | } |
| 2675 | |
| 2676 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
| 2677 | create_mode = OS_FILE_CREATE; |
| 2678 | } else { |
| 2679 | |
| 2680 | ib::error() |
| 2681 | << "Unknown file create mode (" |
| 2682 | << create_mode |
| 2683 | << " for file '" << name << "'" ; |
| 2684 | |
| 2685 | return(OS_FILE_CLOSED); |
| 2686 | } |
| 2687 | |
| 2688 | bool retry; |
| 2689 | |
| 2690 | do { |
| 2691 | file = open(name, create_flag, os_innodb_umask); |
| 2692 | |
| 2693 | if (file == -1) { |
| 2694 | *success = false; |
| 2695 | retry = os_file_handle_error( |
| 2696 | name, |
| 2697 | create_mode == OS_FILE_OPEN |
| 2698 | ? "open" : "create" ); |
| 2699 | } else { |
| 2700 | *success = true; |
| 2701 | retry = false; |
| 2702 | } |
| 2703 | |
| 2704 | } while (retry); |
| 2705 | |
| 2706 | /* This function is always called for data files, we should disable |
| 2707 | OS caching (O_DIRECT) here as we do in os_file_create_func(), so |
| 2708 | we open the same file in the same mode, see man page of open(2). */ |
| 2709 | if (!srv_read_only_mode |
| 2710 | && *success |
| 2711 | && (srv_file_flush_method == SRV_O_DIRECT |
| 2712 | || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { |
| 2713 | |
| 2714 | os_file_set_nocache(file, name, mode_str); |
| 2715 | } |
| 2716 | |
| 2717 | #ifdef USE_FILE_LOCK |
| 2718 | if (!read_only |
| 2719 | && *success |
| 2720 | && (access_type == OS_FILE_READ_WRITE) |
| 2721 | && os_file_lock(file, name)) { |
| 2722 | |
| 2723 | *success = false; |
| 2724 | close(file); |
| 2725 | file = -1; |
| 2726 | } |
| 2727 | #endif /* USE_FILE_LOCK */ |
| 2728 | |
| 2729 | return(file); |
| 2730 | } |
| 2731 | |
| 2732 | /** This function attempts to create a directory named pathname. The new |
| 2733 | directory gets default permissions. On Unix the permissions are |
| 2734 | (0770 & ~umask). If the directory exists already, nothing is done and |
| 2735 | the call succeeds, unless the fail_if_exists arguments is true. |
| 2736 | If another error occurs, such as a permission error, this does not crash, |
| 2737 | but reports the error and returns false. |
| 2738 | @param[in] pathname directory name as null-terminated string |
| 2739 | @param[in] fail_if_exists if true, pre-existing directory is treated as |
| 2740 | an error. |
| 2741 | @return true if call succeeds, false on error */ |
| 2742 | bool |
| 2743 | os_file_create_directory( |
| 2744 | const char* pathname, |
| 2745 | bool fail_if_exists) |
| 2746 | { |
| 2747 | int rcode; |
| 2748 | |
| 2749 | WAIT_ALLOW_WRITES(); |
| 2750 | rcode = mkdir(pathname, 0770); |
| 2751 | |
| 2752 | if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { |
| 2753 | /* failure */ |
| 2754 | os_file_handle_error_no_exit(pathname, "mkdir" , false); |
| 2755 | |
| 2756 | return(false); |
| 2757 | } |
| 2758 | |
| 2759 | return(true); |
| 2760 | } |
| 2761 | |
| 2762 | /** |
| 2763 | The os_file_opendir() function opens a directory stream corresponding to the |
| 2764 | directory named by the dirname argument. The directory stream is positioned |
| 2765 | at the first entry. In both Unix and Windows we automatically skip the '.' |
| 2766 | and '..' items at the start of the directory listing. |
| 2767 | @param[in] dirname directory name; it must not contain a trailing |
| 2768 | '\' or '/' |
| 2769 | @param[in] is_fatal true if we should treat an error as a fatal |
| 2770 | error; if we try to open symlinks then we do |
| 2771 | not wish a fatal error if it happens not to be |
| 2772 | a directory |
| 2773 | @return directory stream, NULL if error */ |
| 2774 | os_file_dir_t |
| 2775 | os_file_opendir( |
| 2776 | const char* dirname, |
| 2777 | bool error_is_fatal) |
| 2778 | { |
| 2779 | os_file_dir_t dir; |
| 2780 | dir = opendir(dirname); |
| 2781 | |
| 2782 | if (dir == NULL && error_is_fatal) { |
| 2783 | os_file_handle_error(dirname, "opendir" ); |
| 2784 | } |
| 2785 | |
| 2786 | return(dir); |
| 2787 | } |
| 2788 | |
| 2789 | /** Closes a directory stream. |
| 2790 | @param[in] dir directory stream |
| 2791 | @return 0 if success, -1 if failure */ |
| 2792 | int |
| 2793 | os_file_closedir( |
| 2794 | os_file_dir_t dir) |
| 2795 | { |
| 2796 | int ret = closedir(dir); |
| 2797 | |
| 2798 | if (ret != 0) { |
| 2799 | os_file_handle_error_no_exit(NULL, "closedir" , false); |
| 2800 | } |
| 2801 | |
| 2802 | return(ret); |
| 2803 | } |
| 2804 | |
| 2805 | /** This function returns information of the next file in the directory. We jump |
| 2806 | over the '.' and '..' entries in the directory. |
| 2807 | @param[in] dirname directory name or path |
| 2808 | @param[in] dir directory stream |
| 2809 | @param[out] info buffer where the info is returned |
| 2810 | @return 0 if ok, -1 if error, 1 if at the end of the directory */ |
| 2811 | int |
| 2812 | os_file_readdir_next_file( |
| 2813 | const char* dirname, |
| 2814 | os_file_dir_t dir, |
| 2815 | os_file_stat_t* info) |
| 2816 | { |
| 2817 | struct dirent* ent; |
| 2818 | char* full_path; |
| 2819 | int ret; |
| 2820 | struct stat statinfo; |
| 2821 | |
| 2822 | next_file: |
| 2823 | |
| 2824 | ent = readdir(dir); |
| 2825 | |
| 2826 | if (ent == NULL) { |
| 2827 | |
| 2828 | return(1); |
| 2829 | } |
| 2830 | |
| 2831 | ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); |
| 2832 | |
| 2833 | if (strcmp(ent->d_name, "." ) == 0 || strcmp(ent->d_name, ".." ) == 0) { |
| 2834 | |
| 2835 | goto next_file; |
| 2836 | } |
| 2837 | |
| 2838 | strcpy(info->name, ent->d_name); |
| 2839 | |
| 2840 | full_path = static_cast<char*>( |
| 2841 | ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10)); |
| 2842 | |
| 2843 | sprintf(full_path, "%s/%s" , dirname, ent->d_name); |
| 2844 | |
| 2845 | ret = stat(full_path, &statinfo); |
| 2846 | |
| 2847 | if (ret) { |
| 2848 | |
| 2849 | if (errno == ENOENT) { |
| 2850 | /* readdir() returned a file that does not exist, |
| 2851 | it must have been deleted in the meantime. Do what |
| 2852 | would have happened if the file was deleted before |
| 2853 | readdir() - ignore and go to the next entry. |
| 2854 | If this is the last entry then info->name will still |
| 2855 | contain the name of the deleted file when this |
| 2856 | function returns, but this is not an issue since the |
| 2857 | caller shouldn't be looking at info when end of |
| 2858 | directory is returned. */ |
| 2859 | |
| 2860 | ut_free(full_path); |
| 2861 | |
| 2862 | goto next_file; |
| 2863 | } |
| 2864 | |
| 2865 | os_file_handle_error_no_exit(full_path, "stat" , false); |
| 2866 | |
| 2867 | ut_free(full_path); |
| 2868 | |
| 2869 | return(-1); |
| 2870 | } |
| 2871 | |
| 2872 | info->size = statinfo.st_size; |
| 2873 | |
| 2874 | if (S_ISDIR(statinfo.st_mode)) { |
| 2875 | info->type = OS_FILE_TYPE_DIR; |
| 2876 | } else if (S_ISLNK(statinfo.st_mode)) { |
| 2877 | info->type = OS_FILE_TYPE_LINK; |
| 2878 | } else if (S_ISREG(statinfo.st_mode)) { |
| 2879 | info->type = OS_FILE_TYPE_FILE; |
| 2880 | } else { |
| 2881 | info->type = OS_FILE_TYPE_UNKNOWN; |
| 2882 | } |
| 2883 | |
| 2884 | ut_free(full_path); |
| 2885 | |
| 2886 | return(0); |
| 2887 | } |
| 2888 | |
| 2889 | /** NOTE! Use the corresponding macro os_file_create(), not directly |
| 2890 | this function! |
| 2891 | Opens an existing file or creates a new. |
| 2892 | @param[in] name name of the file or path as a null-terminated |
| 2893 | string |
| 2894 | @param[in] create_mode create mode |
| 2895 | @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O |
| 2896 | is desired, OS_FILE_NORMAL, if any normal file; |
| 2897 | NOTE that it also depends on type, os_aio_.. |
| 2898 | and srv_.. variables whether we really use async |
| 2899 | I/O or unbuffered I/O: look in the function |
| 2900 | source code for the exact rules |
| 2901 | @param[in] type OS_DATA_FILE or OS_LOG_FILE |
| 2902 | @param[in] read_only true, if read only checks should be enforcedm |
| 2903 | @param[in] success true if succeeded |
| 2904 | @return handle to the file, not defined if error, error number |
| 2905 | can be retrieved with os_file_get_last_error */ |
| 2906 | pfs_os_file_t |
| 2907 | os_file_create_func( |
| 2908 | const char* name, |
| 2909 | ulint create_mode, |
| 2910 | ulint purpose, |
| 2911 | ulint type, |
| 2912 | bool read_only, |
| 2913 | bool* success) |
| 2914 | { |
| 2915 | bool on_error_no_exit; |
| 2916 | bool on_error_silent; |
| 2917 | |
| 2918 | *success = false; |
| 2919 | |
| 2920 | DBUG_EXECUTE_IF( |
| 2921 | "ib_create_table_fail_disk_full" , |
| 2922 | *success = false; |
| 2923 | errno = ENOSPC; |
| 2924 | return(OS_FILE_CLOSED); |
| 2925 | ); |
| 2926 | |
| 2927 | int create_flag; |
| 2928 | const char* mode_str = NULL; |
| 2929 | |
| 2930 | on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT |
| 2931 | ? true : false; |
| 2932 | on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT |
| 2933 | ? true : false; |
| 2934 | |
| 2935 | create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT |
| 2936 | | OS_FILE_ON_ERROR_SILENT)); |
| 2937 | |
| 2938 | if (create_mode == OS_FILE_OPEN |
| 2939 | || create_mode == OS_FILE_OPEN_RAW |
| 2940 | || create_mode == OS_FILE_OPEN_RETRY) { |
| 2941 | |
| 2942 | mode_str = "OPEN" ; |
| 2943 | |
| 2944 | create_flag = read_only ? O_RDONLY : O_RDWR; |
| 2945 | |
| 2946 | } else if (read_only) { |
| 2947 | |
| 2948 | mode_str = "OPEN" ; |
| 2949 | |
| 2950 | create_flag = O_RDONLY; |
| 2951 | |
| 2952 | } else if (create_mode == OS_FILE_CREATE) { |
| 2953 | |
| 2954 | mode_str = "CREATE" ; |
| 2955 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
| 2956 | |
| 2957 | } else if (create_mode == OS_FILE_OVERWRITE) { |
| 2958 | |
| 2959 | mode_str = "OVERWRITE" ; |
| 2960 | create_flag = O_RDWR | O_CREAT | O_TRUNC; |
| 2961 | |
| 2962 | } else { |
| 2963 | ib::error() |
| 2964 | << "Unknown file create mode (" << create_mode << ")" |
| 2965 | << " for file '" << name << "'" ; |
| 2966 | |
| 2967 | return(OS_FILE_CLOSED); |
| 2968 | } |
| 2969 | |
| 2970 | ut_a(type == OS_LOG_FILE |
| 2971 | || type == OS_DATA_FILE |
| 2972 | || type == OS_DATA_TEMP_FILE); |
| 2973 | |
| 2974 | ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); |
| 2975 | |
| 2976 | #ifdef O_SYNC |
| 2977 | /* We let O_SYNC only affect log files; note that we map O_DSYNC to |
| 2978 | O_SYNC because the datasync options seemed to corrupt files in 2001 |
| 2979 | in both Linux and Solaris */ |
| 2980 | |
| 2981 | if (!read_only |
| 2982 | && type == OS_LOG_FILE |
| 2983 | && srv_file_flush_method == SRV_O_DSYNC) { |
| 2984 | |
| 2985 | create_flag |= O_SYNC; |
| 2986 | } |
| 2987 | #endif /* O_SYNC */ |
| 2988 | |
| 2989 | os_file_t file; |
| 2990 | bool retry; |
| 2991 | |
| 2992 | do { |
| 2993 | file = open(name, create_flag, os_innodb_umask); |
| 2994 | |
| 2995 | if (file == -1) { |
| 2996 | const char* operation; |
| 2997 | |
| 2998 | operation = (create_mode == OS_FILE_CREATE |
| 2999 | && !read_only) ? "create" : "open" ; |
| 3000 | |
| 3001 | *success = false; |
| 3002 | |
| 3003 | if (on_error_no_exit) { |
| 3004 | retry = os_file_handle_error_no_exit( |
| 3005 | name, operation, on_error_silent); |
| 3006 | } else { |
| 3007 | retry = os_file_handle_error(name, operation); |
| 3008 | } |
| 3009 | } else { |
| 3010 | *success = true; |
| 3011 | retry = false; |
| 3012 | } |
| 3013 | |
| 3014 | } while (retry); |
| 3015 | |
| 3016 | /* We disable OS caching (O_DIRECT) only on data files */ |
| 3017 | if (!read_only |
| 3018 | && *success |
| 3019 | && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE) |
| 3020 | && (srv_file_flush_method == SRV_O_DIRECT |
| 3021 | || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { |
| 3022 | |
| 3023 | os_file_set_nocache(file, name, mode_str); |
| 3024 | } |
| 3025 | |
| 3026 | #ifdef USE_FILE_LOCK |
| 3027 | if (!read_only |
| 3028 | && *success |
| 3029 | && create_mode != OS_FILE_OPEN_RAW |
| 3030 | && os_file_lock(file, name)) { |
| 3031 | |
| 3032 | if (create_mode == OS_FILE_OPEN_RETRY) { |
| 3033 | |
| 3034 | ib::info() |
| 3035 | << "Retrying to lock the first data file" ; |
| 3036 | |
| 3037 | for (int i = 0; i < 100; i++) { |
| 3038 | os_thread_sleep(1000000); |
| 3039 | |
| 3040 | if (!os_file_lock(file, name)) { |
| 3041 | *success = true; |
| 3042 | return(file); |
| 3043 | } |
| 3044 | } |
| 3045 | |
| 3046 | ib::info() |
| 3047 | << "Unable to open the first data file" ; |
| 3048 | } |
| 3049 | |
| 3050 | *success = false; |
| 3051 | close(file); |
| 3052 | file = -1; |
| 3053 | } |
| 3054 | #endif /* USE_FILE_LOCK */ |
| 3055 | |
| 3056 | return(file); |
| 3057 | } |
| 3058 | |
| 3059 | /** NOTE! Use the corresponding macro |
| 3060 | os_file_create_simple_no_error_handling(), not directly this function! |
| 3061 | A simple function to open or create a file. |
| 3062 | @param[in] name name of the file or path as a null-terminated |
| 3063 | string |
| 3064 | @param[in] create_mode create mode |
| 3065 | @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or |
| 3066 | OS_FILE_READ_ALLOW_DELETE; the last option |
| 3067 | is used by a backup program reading the file |
| 3068 | @param[in] read_only if true read only mode checks are enforced |
| 3069 | @param[out] success true if succeeded |
| 3070 | @return own: handle to the file, not defined if error, error number |
| 3071 | can be retrieved with os_file_get_last_error */ |
| 3072 | pfs_os_file_t |
| 3073 | os_file_create_simple_no_error_handling_func( |
| 3074 | const char* name, |
| 3075 | ulint create_mode, |
| 3076 | ulint access_type, |
| 3077 | bool read_only, |
| 3078 | bool* success) |
| 3079 | { |
| 3080 | os_file_t file; |
| 3081 | int create_flag; |
| 3082 | |
| 3083 | if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { |
| 3084 | WAIT_ALLOW_WRITES(); |
| 3085 | } |
| 3086 | |
| 3087 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
| 3088 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
| 3089 | |
| 3090 | *success = false; |
| 3091 | |
| 3092 | if (create_mode == OS_FILE_OPEN) { |
| 3093 | |
| 3094 | if (access_type == OS_FILE_READ_ONLY) { |
| 3095 | |
| 3096 | create_flag = O_RDONLY; |
| 3097 | |
| 3098 | } else if (read_only) { |
| 3099 | |
| 3100 | create_flag = O_RDONLY; |
| 3101 | |
| 3102 | } else { |
| 3103 | |
| 3104 | ut_a(access_type == OS_FILE_READ_WRITE |
| 3105 | || access_type == OS_FILE_READ_ALLOW_DELETE); |
| 3106 | |
| 3107 | create_flag = O_RDWR; |
| 3108 | } |
| 3109 | |
| 3110 | } else if (read_only) { |
| 3111 | |
| 3112 | create_flag = O_RDONLY; |
| 3113 | |
| 3114 | } else if (create_mode == OS_FILE_CREATE) { |
| 3115 | |
| 3116 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
| 3117 | |
| 3118 | } else { |
| 3119 | |
| 3120 | ib::error() |
| 3121 | << "Unknown file create mode " |
| 3122 | << create_mode << " for file '" << name << "'" ; |
| 3123 | |
| 3124 | return(OS_FILE_CLOSED); |
| 3125 | } |
| 3126 | |
| 3127 | file = open(name, create_flag, os_innodb_umask); |
| 3128 | |
| 3129 | *success = (file != -1); |
| 3130 | |
| 3131 | #ifdef USE_FILE_LOCK |
| 3132 | if (!read_only |
| 3133 | && *success |
| 3134 | && access_type == OS_FILE_READ_WRITE |
| 3135 | && os_file_lock(file, name)) { |
| 3136 | |
| 3137 | *success = false; |
| 3138 | close(file); |
| 3139 | file = -1; |
| 3140 | |
| 3141 | } |
| 3142 | #endif /* USE_FILE_LOCK */ |
| 3143 | |
| 3144 | return(file); |
| 3145 | } |
| 3146 | |
| 3147 | /** Deletes a file if it exists. The file has to be closed before calling this. |
| 3148 | @param[in] name file path as a null-terminated string |
| 3149 | @param[out] exist indicate if file pre-exist |
| 3150 | @return true if success */ |
| 3151 | bool |
| 3152 | os_file_delete_if_exists_func( |
| 3153 | const char* name, |
| 3154 | bool* exist) |
| 3155 | { |
| 3156 | if (exist != NULL) { |
| 3157 | *exist = true; |
| 3158 | } |
| 3159 | |
| 3160 | int ret; |
| 3161 | WAIT_ALLOW_WRITES(); |
| 3162 | |
| 3163 | ret = unlink(name); |
| 3164 | |
| 3165 | if (ret != 0 && errno == ENOENT) { |
| 3166 | if (exist != NULL) { |
| 3167 | *exist = false; |
| 3168 | } |
| 3169 | } else if (ret != 0 && errno != ENOENT) { |
| 3170 | os_file_handle_error_no_exit(name, "delete" , false); |
| 3171 | |
| 3172 | return(false); |
| 3173 | } |
| 3174 | |
| 3175 | return(true); |
| 3176 | } |
| 3177 | |
| 3178 | /** Deletes a file. The file has to be closed before calling this. |
| 3179 | @param[in] name file path as a null-terminated string |
| 3180 | @return true if success */ |
| 3181 | bool |
| 3182 | os_file_delete_func( |
| 3183 | const char* name) |
| 3184 | { |
| 3185 | int ret; |
| 3186 | WAIT_ALLOW_WRITES(); |
| 3187 | |
| 3188 | ret = unlink(name); |
| 3189 | |
| 3190 | if (ret != 0) { |
| 3191 | os_file_handle_error_no_exit(name, "delete" , FALSE); |
| 3192 | |
| 3193 | return(false); |
| 3194 | } |
| 3195 | |
| 3196 | return(true); |
| 3197 | } |
| 3198 | |
| 3199 | /** NOTE! Use the corresponding macro os_file_rename(), not directly this |
| 3200 | function! |
| 3201 | Renames a file (can also move it to another directory). It is safest that the |
| 3202 | file is closed before calling this function. |
| 3203 | @param[in] oldpath old file path as a null-terminated string |
| 3204 | @param[in] newpath new file path |
| 3205 | @return true if success */ |
| 3206 | bool |
| 3207 | os_file_rename_func( |
| 3208 | const char* oldpath, |
| 3209 | const char* newpath) |
| 3210 | { |
| 3211 | #ifdef UNIV_DEBUG |
| 3212 | os_file_type_t type; |
| 3213 | bool exists; |
| 3214 | |
| 3215 | /* New path must not exist. */ |
| 3216 | ut_ad(os_file_status(newpath, &exists, &type)); |
| 3217 | ut_ad(!exists); |
| 3218 | |
| 3219 | /* Old path must exist. */ |
| 3220 | ut_ad(os_file_status(oldpath, &exists, &type)); |
| 3221 | ut_ad(exists); |
| 3222 | #endif /* UNIV_DEBUG */ |
| 3223 | |
| 3224 | int ret; |
| 3225 | WAIT_ALLOW_WRITES(); |
| 3226 | |
| 3227 | ret = rename(oldpath, newpath); |
| 3228 | |
| 3229 | if (ret != 0) { |
| 3230 | os_file_handle_error_no_exit(oldpath, "rename" , FALSE); |
| 3231 | |
| 3232 | return(false); |
| 3233 | } |
| 3234 | |
| 3235 | return(true); |
| 3236 | } |
| 3237 | |
| 3238 | /** NOTE! Use the corresponding macro os_file_close(), not directly this |
| 3239 | function! |
| 3240 | Closes a file handle. In case of error, error number can be retrieved with |
| 3241 | os_file_get_last_error. |
| 3242 | @param[in] file Handle to close |
| 3243 | @return true if success */ |
| 3244 | bool |
| 3245 | os_file_close_func( |
| 3246 | os_file_t file) |
| 3247 | { |
| 3248 | int ret = close(file); |
| 3249 | |
| 3250 | if (ret == -1) { |
| 3251 | os_file_handle_error(NULL, "close" ); |
| 3252 | |
| 3253 | return(false); |
| 3254 | } |
| 3255 | |
| 3256 | return(true); |
| 3257 | } |
| 3258 | |
| 3259 | /** Gets a file size. |
| 3260 | @param[in] file handle to an open file |
| 3261 | @return file size, or (os_offset_t) -1 on failure */ |
| 3262 | os_offset_t |
| 3263 | os_file_get_size(os_file_t file) |
| 3264 | { |
| 3265 | struct stat statbuf; |
| 3266 | return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size; |
| 3267 | } |
| 3268 | |
| 3269 | /** Gets a file size. |
| 3270 | @param[in] filename Full path to the filename to check |
| 3271 | @return file size if OK, else set m_total_size to ~0 and m_alloc_size to |
| 3272 | errno */ |
| 3273 | os_file_size_t |
| 3274 | os_file_get_size( |
| 3275 | const char* filename) |
| 3276 | { |
| 3277 | struct stat s; |
| 3278 | os_file_size_t file_size; |
| 3279 | |
| 3280 | int ret = stat(filename, &s); |
| 3281 | |
| 3282 | if (ret == 0) { |
| 3283 | file_size.m_total_size = s.st_size; |
| 3284 | /* st_blocks is in 512 byte sized blocks */ |
| 3285 | file_size.m_alloc_size = s.st_blocks * 512; |
| 3286 | } else { |
| 3287 | file_size.m_total_size = ~0U; |
| 3288 | file_size.m_alloc_size = (os_offset_t) errno; |
| 3289 | } |
| 3290 | |
| 3291 | return(file_size); |
| 3292 | } |
| 3293 | |
| 3294 | /** This function returns information about the specified file |
| 3295 | @param[in] path pathname of the file |
| 3296 | @param[out] stat_info information of a file in a directory |
| 3297 | @param[in,out] statinfo information of a file in a directory |
| 3298 | @param[in] check_rw_perm for testing whether the file can be opened |
| 3299 | in RW mode |
| 3300 | @param[in] read_only if true read only mode checks are enforced |
| 3301 | @return DB_SUCCESS if all OK */ |
| 3302 | static |
| 3303 | dberr_t |
| 3304 | os_file_get_status_posix( |
| 3305 | const char* path, |
| 3306 | os_file_stat_t* stat_info, |
| 3307 | struct stat* statinfo, |
| 3308 | bool check_rw_perm, |
| 3309 | bool read_only) |
| 3310 | { |
| 3311 | int ret = stat(path, statinfo); |
| 3312 | |
| 3313 | if (ret && (errno == ENOENT || errno == ENOTDIR)) { |
| 3314 | /* file does not exist */ |
| 3315 | |
| 3316 | return(DB_NOT_FOUND); |
| 3317 | |
| 3318 | } else if (ret) { |
| 3319 | /* file exists, but stat call failed */ |
| 3320 | |
| 3321 | os_file_handle_error_no_exit(path, "stat" , false); |
| 3322 | |
| 3323 | return(DB_FAIL); |
| 3324 | } |
| 3325 | |
| 3326 | switch (statinfo->st_mode & S_IFMT) { |
| 3327 | case S_IFDIR: |
| 3328 | stat_info->type = OS_FILE_TYPE_DIR; |
| 3329 | break; |
| 3330 | case S_IFLNK: |
| 3331 | stat_info->type = OS_FILE_TYPE_LINK; |
| 3332 | break; |
| 3333 | case S_IFBLK: |
| 3334 | /* Handle block device as regular file. */ |
| 3335 | case S_IFCHR: |
| 3336 | /* Handle character device as regular file. */ |
| 3337 | case S_IFREG: |
| 3338 | stat_info->type = OS_FILE_TYPE_FILE; |
| 3339 | break; |
| 3340 | default: |
| 3341 | stat_info->type = OS_FILE_TYPE_UNKNOWN; |
| 3342 | } |
| 3343 | |
| 3344 | stat_info->size = statinfo->st_size; |
| 3345 | stat_info->block_size = statinfo->st_blksize; |
| 3346 | stat_info->alloc_size = statinfo->st_blocks * 512; |
| 3347 | |
| 3348 | if (check_rw_perm |
| 3349 | && (stat_info->type == OS_FILE_TYPE_FILE |
| 3350 | || stat_info->type == OS_FILE_TYPE_BLOCK)) { |
| 3351 | |
| 3352 | stat_info->rw_perm = !access(path, read_only |
| 3353 | ? R_OK : R_OK | W_OK); |
| 3354 | } |
| 3355 | |
| 3356 | return(DB_SUCCESS); |
| 3357 | } |
| 3358 | |
| 3359 | /** Truncates a file to a specified size in bytes. |
| 3360 | Do nothing if the size to preserve is greater or equal to the current |
| 3361 | size of the file. |
| 3362 | @param[in] pathname file path |
| 3363 | @param[in] file file to be truncated |
| 3364 | @param[in] size size to preserve in bytes |
| 3365 | @return true if success */ |
| 3366 | static |
| 3367 | bool |
| 3368 | os_file_truncate_posix( |
| 3369 | const char* pathname, |
| 3370 | os_file_t file, |
| 3371 | os_offset_t size) |
| 3372 | { |
| 3373 | int res = ftruncate(file, size); |
| 3374 | |
| 3375 | if (res == -1) { |
| 3376 | |
| 3377 | bool retry; |
| 3378 | |
| 3379 | retry = os_file_handle_error_no_exit( |
| 3380 | pathname, "truncate" , false); |
| 3381 | |
| 3382 | if (retry) { |
| 3383 | ib::warn() |
| 3384 | << "Truncate failed for '" |
| 3385 | << pathname << "'" ; |
| 3386 | } |
| 3387 | } |
| 3388 | |
| 3389 | return(res == 0); |
| 3390 | } |
| 3391 | |
| 3392 | /** Truncates a file at its current position. |
| 3393 | @return true if success */ |
| 3394 | bool |
| 3395 | os_file_set_eof( |
| 3396 | FILE* file) /*!< in: file to be truncated */ |
| 3397 | { |
| 3398 | WAIT_ALLOW_WRITES(); |
| 3399 | return(!ftruncate(fileno(file), ftell(file))); |
| 3400 | } |
| 3401 | |
| 3402 | #else /* !_WIN32 */ |
| 3403 | |
| 3404 | #include <WinIoCtl.h> |
| 3405 | |
| 3406 | /* |
| 3407 | Windows : Handling synchronous IO on files opened asynchronously. |
| 3408 | |
| 3409 | If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to |
| 3410 | a completion port, then every IO on this file would normally be enqueued to the |
| 3411 | completion port. Sometimes however we would like to do a synchronous IO. This is |
possible if we initialize overlapped.hEvent with a valid event and set its
| 3413 | lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info) |
| 3414 | |
| 3415 | We'll create this special event once for each thread and store in thread local |
| 3416 | storage. |
| 3417 | */ |
| 3418 | |
| 3419 | |
/* Callback registered with FlsAlloc(): closes the event handle stored in
the fls_sync_io slot, if any. */
static void __stdcall win_free_syncio_event(void *data) {
	if (data == NULL) {
		return;
	}

	CloseHandle(static_cast<HANDLE>(data));
}
| 3425 | |
| 3426 | |
/*
Retrieve the per-thread event used for doing synchronous I/O on
asynchronously opened files. The event is created on first use and cached
in the fls_sync_io slot.
*/
static HANDLE win_get_syncio_event()
{
	HANDLE	cached = (HANDLE)FlsGetValue(fls_sync_io);

	if (cached != NULL) {
		return cached;
	}

	HANDLE	event = CreateEventA(NULL, FALSE, FALSE, NULL);
	ut_a(event);

	/* Setting the low-order bit keeps the I/O completion from being
	queued to the completion port. */
	HANDLE	tagged = (HANDLE)((uintptr_t)event | 1);

	FlsSetValue(fls_sync_io, tagged);

	return tagged;
}
| 3445 | |
| 3446 | |
| 3447 | /** Do the read/write |
| 3448 | @param[in] request The IO context and type |
| 3449 | @return the number of bytes read/written or negative value on error */ |
ssize_t
SyncFileIO::execute(const IORequest& request)
{
	/* The file offset is passed through the OVERLAPPED structure,
	split into its low and high 32-bit halves. */
	OVERLAPPED	seek;

	memset(&seek, 0x0, sizeof(seek));

	seek.hEvent = win_get_syncio_event();
	seek.Offset = static_cast<DWORD>(m_offset & 0xFFFFFFFF);
	seek.OffsetHigh = static_cast<DWORD>(m_offset >> 32);

	const DWORD	len = static_cast<DWORD>(m_n);
	BOOL		ok;

	if (request.is_read()) {
		ok = ReadFile(m_fh, m_buf, len, NULL, &seek);
	} else {
		ut_ad(request.is_write());
		ok = WriteFile(m_fh, m_buf, len, NULL, &seek);
	}

	DWORD		n_bytes;

	if (ok || GetLastError() == ERROR_IO_PENDING) {
		/* Wait for async io to complete */
		ok = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
	}

	if (!ok) {
		return(-1);
	}

	return(static_cast<ssize_t>(n_bytes));
}
| 3480 | |
| 3481 | /** Do the read/write |
| 3482 | @param[in,out] slot The IO slot, it has the IO context |
| 3483 | @return the number of bytes read/written or negative value on error */ |
ssize_t
SyncFileIO::execute(Slot* slot)
{
	/* The per-thread event makes the request on this slot complete
	synchronously with respect to the calling thread. */
	slot->control.hEvent = win_get_syncio_event();

	BOOL	ok;

	if (slot->type.is_read()) {
		ok = ReadFile(
			slot->file, slot->ptr, slot->len,
			NULL, &slot->control);
	} else {
		ut_ad(slot->type.is_write());

		ok = WriteFile(
			slot->file, slot->ptr, slot->len,
			NULL, &slot->control);
	}

	if (ok || GetLastError() == ERROR_IO_PENDING) {
		/* Wait for async io to complete */
		ok = GetOverlappedResult(
			slot->file, &slot->control, &slot->n_bytes, TRUE);
	}

	if (!ok) {
		return(-1);
	}

	return(static_cast<ssize_t>(slot->n_bytes));
}
| 3510 | |
| 3511 | /* Startup/shutdown */ |
| 3512 | |
/* Ensures proper initialization and shutdown: the single static instance
allocates the fls_sync_io slot (with win_free_syncio_event as its
callback) when constructed and frees the slot when destroyed. */
static struct WinIoInit
{
	WinIoInit()
	{
		fls_sync_io = FlsAlloc(win_free_syncio_event);
		ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
	}

	~WinIoInit()
	{
		FlsFree(fls_sync_io);
	}
} win_io_init;
| 3527 | |
| 3528 | |
/** Free storage space associated with a section of the file.
@param[in]	fh		Open file handle
@param[in]	off		Starting offset of the hole
@param[in]	len		Size of the hole
@return DB_SUCCESS, or DB_IO_NO_PUNCH_HOLE if the request failed */
static
dberr_t
os_file_punch_hole_win32(
	os_file_t	fh,
	os_offset_t	off,
	os_offset_t	len)
{
	/* The range to zero is [FileOffset, BeyondFinalZero). */
	FILE_ZERO_DATA_INFORMATION	punch;

	punch.FileOffset.QuadPart = off;
	punch.BeyondFinalZero.QuadPart = off + len;

	/* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
	therefore we pass a dummy parameter. */
	DWORD	temp;

	if (os_win32_device_io_control(
		    fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
		    NULL, 0, &temp)) {

		return(DB_SUCCESS);
	}

	return(DB_IO_NO_PUNCH_HOLE);
}
| 3557 | |
| 3558 | /** Check the existence and type of the given file. |
| 3559 | @param[in] path path name of file |
| 3560 | @param[out] exists true if the file exists |
| 3561 | @param[out] type Type of the file, if it exists |
| 3562 | @return true if call succeeded */ |
| 3563 | static |
| 3564 | bool |
| 3565 | os_file_status_win32( |
| 3566 | const char* path, |
| 3567 | bool* exists, |
| 3568 | os_file_type_t* type) |
| 3569 | { |
| 3570 | int ret; |
| 3571 | struct _stat64 statinfo; |
| 3572 | |
| 3573 | ret = _stat64(path, &statinfo); |
| 3574 | |
| 3575 | *exists = !ret; |
| 3576 | |
| 3577 | if (!ret) { |
| 3578 | /* file exists, everything OK */ |
| 3579 | |
| 3580 | } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { |
| 3581 | /* file does not exist */ |
| 3582 | return(true); |
| 3583 | |
| 3584 | } else { |
| 3585 | /* file exists, but stat call failed */ |
| 3586 | os_file_handle_error_no_exit(path, "stat" , false); |
| 3587 | return(false); |
| 3588 | } |
| 3589 | |
| 3590 | if (_S_IFDIR & statinfo.st_mode) { |
| 3591 | *type = OS_FILE_TYPE_DIR; |
| 3592 | |
| 3593 | } else if (_S_IFREG & statinfo.st_mode) { |
| 3594 | *type = OS_FILE_TYPE_FILE; |
| 3595 | |
| 3596 | } else { |
| 3597 | *type = OS_FILE_TYPE_UNKNOWN; |
| 3598 | } |
| 3599 | |
| 3600 | return(true); |
| 3601 | } |
| 3602 | |
| 3603 | /** NOTE! Use the corresponding macro os_file_flush(), not directly this |
| 3604 | function! |
| 3605 | Flushes the write buffers of a given file to the disk. |
| 3606 | @param[in] file handle to a file |
| 3607 | @return true if success */ |
bool
os_file_flush_func(
	os_file_t	file)
{
	++os_n_fsyncs;

	if (FlushFileBuffers(file)) {
		return(true);
	}

	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
	actually a raw device, we choose to ignore that error if we are using
	raw disks */
	const bool	raw_device_quirk = srv_start_raw_disk_in_use
		&& GetLastError() == ERROR_INVALID_FUNCTION;

	if (raw_device_quirk) {
		return(true);
	}

	os_file_handle_error(NULL, "flush");

	/* It is a fatal error if a file flush does not succeed, because then
	the database can get corrupt on disk */
	ut_error;

	return(false);
}
| 3637 | |
| 3638 | /** Retrieves the last error number if an error occurs in a file io function. |
| 3639 | The number should be retrieved before any other OS calls (because they may |
| 3640 | overwrite the error number). If the number is not known to this program, |
| 3641 | the OS error number + 100 is returned. |
| 3642 | @param[in] report_all_errors true if we want an error message printed |
| 3643 | of all errors |
| 3644 | @param[in] on_error_silent true then don't print any diagnostic |
| 3645 | to the log |
| 3646 | @return error number, or OS error number + 100 */ |
static
ulint
os_file_get_last_error_low(
	bool	report_all_errors,
	bool	on_error_silent)
{
	const ulint	err = (ulint) GetLastError();

	if (err == ERROR_SUCCESS) {
		return(0);
	}

	/* "Disk full" and "file exists" are not logged unless the caller
	asked for every error to be reported. */
	const bool	quiet_code = (err == ERROR_DISK_FULL
				      || err == ERROR_FILE_EXISTS);

	if (report_all_errors || (!on_error_silent && !quiet_code)) {

		ib::error()
			<< "Operating system error number " << err
			<< " in a file operation.";

		/* Follow up with a hint for the better-known codes. */
		switch (err) {
		case ERROR_PATH_NOT_FOUND:
			ib::error()
				<< "The error means the system"
				" cannot find the path specified.";

			if (srv_is_being_started) {
				ib::error()
					<< "If you are installing InnoDB,"
					" remember that you must create"
					" directories yourself, InnoDB"
					" does not create them.";
			}
			break;

		case ERROR_ACCESS_DENIED:
			ib::error()
				<< "The error means mysqld does not have"
				" the access rights to"
				" the directory. It may also be"
				" you have created a subdirectory"
				" of the same name as a data file.";
			break;

		case ERROR_SHARING_VIOLATION:
		case ERROR_LOCK_VIOLATION:
			ib::error()
				<< "The error means that another program"
				" is using InnoDB's files."
				" This might be a backup or antivirus"
				" software or another instance"
				" of MySQL."
				" Please close it to get rid of this error.";
			break;

		case ERROR_WORKING_SET_QUOTA:
		case ERROR_NO_SYSTEM_RESOURCES:
			ib::error()
				<< "The error means that there are no"
				" sufficient system resources or quota to"
				" complete the operation.";
			break;

		case ERROR_OPERATION_ABORTED:
			ib::error()
				<< "The error means that the I/O"
				" operation has been aborted"
				" because of either a thread exit"
				" or an application request."
				" Retry attempt is made.";
			break;

		default:
			ib::info() << OPERATING_SYSTEM_ERROR_MSG;
		}
	}

	/* Translate the Windows code into the portable OS_FILE_* value. */
	switch (err) {
	case ERROR_FILE_NOT_FOUND:
		return(OS_FILE_NOT_FOUND);

	case ERROR_DISK_FULL:
		return(OS_FILE_DISK_FULL);

	case ERROR_FILE_EXISTS:
		return(OS_FILE_ALREADY_EXISTS);

	case ERROR_SHARING_VIOLATION:
	case ERROR_LOCK_VIOLATION:
		return(OS_FILE_SHARING_VIOLATION);

	case ERROR_WORKING_SET_QUOTA:
	case ERROR_NO_SYSTEM_RESOURCES:
		return(OS_FILE_INSUFFICIENT_RESOURCE);

	case ERROR_OPERATION_ABORTED:
		return(OS_FILE_OPERATION_ABORTED);

	case ERROR_ACCESS_DENIED:
		return(OS_FILE_ACCESS_VIOLATION);

	default:
		break;
	}

	/* Codes without a dedicated mapping are returned offset by
	OS_FILE_ERROR_MAX. */
	return(OS_FILE_ERROR_MAX + err);
}
| 3743 | |
| 3744 | |
| 3745 | /** NOTE! Use the corresponding macro os_file_create_simple(), not directly |
| 3746 | this function! |
| 3747 | A simple function to open or create a file. |
| 3748 | @param[in] name name of the file or path as a null-terminated |
| 3749 | string |
| 3750 | @param[in] create_mode create mode |
| 3751 | @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE |
| 3752 | @param[in] read_only if true read only mode checks are enforced |
| 3753 | @param[out] success true if succeed, false if error |
| 3754 | @return handle to the file, not defined if error, error number |
| 3755 | can be retrieved with os_file_get_last_error */ |
| 3756 | pfs_os_file_t |
| 3757 | os_file_create_simple_func( |
| 3758 | const char* name, |
| 3759 | ulint create_mode, |
| 3760 | ulint access_type, |
| 3761 | bool read_only, |
| 3762 | bool* success) |
| 3763 | { |
| 3764 | os_file_t file; |
| 3765 | |
| 3766 | *success = false; |
| 3767 | |
| 3768 | DWORD access; |
| 3769 | DWORD create_flag; |
| 3770 | DWORD attributes = 0; |
| 3771 | |
| 3772 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
| 3773 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
| 3774 | ut_ad(srv_operation == SRV_OPERATION_NORMAL); |
| 3775 | |
| 3776 | if (create_mode == OS_FILE_OPEN) { |
| 3777 | |
| 3778 | create_flag = OPEN_EXISTING; |
| 3779 | |
| 3780 | } else if (read_only) { |
| 3781 | |
| 3782 | create_flag = OPEN_EXISTING; |
| 3783 | |
| 3784 | } else if (create_mode == OS_FILE_CREATE) { |
| 3785 | |
| 3786 | create_flag = CREATE_NEW; |
| 3787 | |
| 3788 | } else if (create_mode == OS_FILE_CREATE_PATH) { |
| 3789 | |
| 3790 | /* Create subdirs along the path if needed. */ |
| 3791 | *success = os_file_create_subdirs_if_needed(name); |
| 3792 | |
| 3793 | if (!*success) { |
| 3794 | |
| 3795 | ib::error() |
| 3796 | << "Unable to create subdirectories '" |
| 3797 | << name << "'" ; |
| 3798 | |
| 3799 | return(OS_FILE_CLOSED); |
| 3800 | } |
| 3801 | |
| 3802 | create_flag = CREATE_NEW; |
| 3803 | create_mode = OS_FILE_CREATE; |
| 3804 | |
| 3805 | } else { |
| 3806 | |
| 3807 | ib::error() |
| 3808 | << "Unknown file create mode (" |
| 3809 | << create_mode << ") for file '" |
| 3810 | << name << "'" ; |
| 3811 | |
| 3812 | return(OS_FILE_CLOSED); |
| 3813 | } |
| 3814 | |
| 3815 | if (access_type == OS_FILE_READ_ONLY) { |
| 3816 | |
| 3817 | access = GENERIC_READ; |
| 3818 | |
| 3819 | } else if (read_only) { |
| 3820 | |
| 3821 | ib::info() |
| 3822 | << "Read only mode set. Unable to" |
| 3823 | " open file '" << name << "' in RW mode, " |
| 3824 | << "trying RO mode" ; |
| 3825 | |
| 3826 | access = GENERIC_READ; |
| 3827 | |
| 3828 | } else if (access_type == OS_FILE_READ_WRITE) { |
| 3829 | |
| 3830 | access = GENERIC_READ | GENERIC_WRITE; |
| 3831 | |
| 3832 | } else { |
| 3833 | |
| 3834 | ib::error() |
| 3835 | << "Unknown file access type (" << access_type << ") " |
| 3836 | "for file '" << name << "'" ; |
| 3837 | |
| 3838 | return(OS_FILE_CLOSED); |
| 3839 | } |
| 3840 | |
| 3841 | bool retry; |
| 3842 | |
| 3843 | do { |
| 3844 | /* Use default security attributes and no template file. */ |
| 3845 | |
| 3846 | file = CreateFile( |
| 3847 | (LPCTSTR) name, access, FILE_SHARE_READ, NULL, |
| 3848 | create_flag, attributes, NULL); |
| 3849 | |
| 3850 | if (file == INVALID_HANDLE_VALUE) { |
| 3851 | |
| 3852 | *success = false; |
| 3853 | |
| 3854 | retry = os_file_handle_error( |
| 3855 | name, create_mode == OS_FILE_OPEN ? |
| 3856 | "open" : "create" ); |
| 3857 | |
| 3858 | } else { |
| 3859 | |
| 3860 | retry = false; |
| 3861 | |
| 3862 | *success = true; |
| 3863 | } |
| 3864 | |
| 3865 | } while (retry); |
| 3866 | |
| 3867 | return(file); |
| 3868 | } |
| 3869 | |
| 3870 | /** This function attempts to create a directory named pathname. The new |
| 3871 | directory gets default permissions. On Unix the permissions are |
| 3872 | (0770 & ~umask). If the directory exists already, nothing is done and |
| 3873 | the call succeeds, unless the fail_if_exists arguments is true. |
| 3874 | If another error occurs, such as a permission error, this does not crash, |
| 3875 | but reports the error and returns false. |
| 3876 | @param[in] pathname directory name as null-terminated string |
| 3877 | @param[in] fail_if_exists if true, pre-existing directory is treated |
| 3878 | as an error. |
| 3879 | @return true if call succeeds, false on error */ |
| 3880 | bool |
| 3881 | os_file_create_directory( |
| 3882 | const char* pathname, |
| 3883 | bool fail_if_exists) |
| 3884 | { |
| 3885 | BOOL rcode; |
| 3886 | |
| 3887 | rcode = CreateDirectory((LPCTSTR) pathname, NULL); |
| 3888 | if (!(rcode != 0 |
| 3889 | || (GetLastError() == ERROR_ALREADY_EXISTS |
| 3890 | && !fail_if_exists))) { |
| 3891 | |
| 3892 | os_file_handle_error_no_exit( |
| 3893 | pathname, "CreateDirectory" , false); |
| 3894 | |
| 3895 | return(false); |
| 3896 | } |
| 3897 | |
| 3898 | return(true); |
| 3899 | } |
| 3900 | |
| 3901 | /** The os_file_opendir() function opens a directory stream corresponding to the |
| 3902 | directory named by the dirname argument. The directory stream is positioned |
| 3903 | at the first entry. In both Unix and Windows we automatically skip the '.' |
| 3904 | and '..' items at the start of the directory listing. |
| 3905 | @param[in] dirname directory name; it must not contain a trailing |
| 3906 | '\' or '/' |
| 3907 | @param[in] is_fatal true if we should treat an error as a fatal |
| 3908 | error; if we try to open symlinks then we do |
| 3909 | not wish a fatal error if it happens not to |
| 3910 | be a directory |
| 3911 | @return directory stream, NULL if error */ |
| 3912 | os_file_dir_t |
| 3913 | os_file_opendir( |
| 3914 | const char* dirname, |
| 3915 | bool error_is_fatal) |
| 3916 | { |
| 3917 | os_file_dir_t dir; |
| 3918 | LPWIN32_FIND_DATA lpFindFileData; |
| 3919 | char path[OS_FILE_MAX_PATH + 3]; |
| 3920 | |
| 3921 | ut_a(strlen(dirname) < OS_FILE_MAX_PATH); |
| 3922 | |
| 3923 | strcpy(path, dirname); |
| 3924 | strcpy(path + strlen(path), "\\*" ); |
| 3925 | |
| 3926 | /* Note that in Windows opening the 'directory stream' also retrieves |
| 3927 | the first entry in the directory. Since it is '.', that is no problem, |
| 3928 | as we will skip over the '.' and '..' entries anyway. */ |
| 3929 | |
| 3930 | lpFindFileData = static_cast<LPWIN32_FIND_DATA>( |
| 3931 | ut_malloc_nokey(sizeof(WIN32_FIND_DATA))); |
| 3932 | |
| 3933 | dir = FindFirstFile((LPCTSTR) path, lpFindFileData); |
| 3934 | |
| 3935 | ut_free(lpFindFileData); |
| 3936 | |
| 3937 | if (dir == INVALID_HANDLE_VALUE) { |
| 3938 | |
| 3939 | if (error_is_fatal) { |
| 3940 | os_file_handle_error(dirname, "opendir" ); |
| 3941 | } |
| 3942 | |
| 3943 | return(NULL); |
| 3944 | } |
| 3945 | |
| 3946 | return(dir); |
| 3947 | } |
| 3948 | |
| 3949 | /** Closes a directory stream. |
| 3950 | @param[in] dir directory stream |
| 3951 | @return 0 if success, -1 if failure */ |
| 3952 | int |
| 3953 | os_file_closedir( |
| 3954 | os_file_dir_t dir) |
| 3955 | { |
| 3956 | BOOL ret; |
| 3957 | |
| 3958 | ret = FindClose(dir); |
| 3959 | |
| 3960 | if (!ret) { |
| 3961 | os_file_handle_error_no_exit(NULL, "closedir" , false); |
| 3962 | |
| 3963 | return(-1); |
| 3964 | } |
| 3965 | |
| 3966 | return(0); |
| 3967 | } |
| 3968 | |
| 3969 | /** This function returns information of the next file in the directory. We |
| 3970 | jump over the '.' and '..' entries in the directory. |
| 3971 | @param[in] dirname directory name or path |
| 3972 | @param[in] dir directory stream |
| 3973 | @param[out] info buffer where the info is returned |
| 3974 | @return 0 if ok, -1 if error, 1 if at the end of the directory */ |
| 3975 | int |
| 3976 | os_file_readdir_next_file( |
| 3977 | const char* dirname, |
| 3978 | os_file_dir_t dir, |
| 3979 | os_file_stat_t* info) |
| 3980 | { |
| 3981 | BOOL ret; |
| 3982 | int status; |
| 3983 | WIN32_FIND_DATA find_data; |
| 3984 | |
| 3985 | next_file: |
| 3986 | |
| 3987 | ret = FindNextFile(dir, &find_data); |
| 3988 | |
| 3989 | if (ret > 0) { |
| 3990 | |
| 3991 | const char* name; |
| 3992 | |
| 3993 | name = static_cast<const char*>(find_data.cFileName); |
| 3994 | |
| 3995 | ut_a(strlen(name) < OS_FILE_MAX_PATH); |
| 3996 | |
| 3997 | if (strcmp(name, "." ) == 0 || strcmp(name, ".." ) == 0) { |
| 3998 | |
| 3999 | goto next_file; |
| 4000 | } |
| 4001 | |
| 4002 | strcpy(info->name, name); |
| 4003 | |
| 4004 | info->size = find_data.nFileSizeHigh; |
| 4005 | info->size <<= 32; |
| 4006 | info->size |= find_data.nFileSizeLow; |
| 4007 | |
| 4008 | if (find_data.dwFileAttributes |
| 4009 | & FILE_ATTRIBUTE_REPARSE_POINT) { |
| 4010 | |
| 4011 | /* TODO: test Windows symlinks */ |
| 4012 | /* TODO: MySQL has apparently its own symlink |
| 4013 | implementation in Windows, dbname.sym can |
| 4014 | redirect a database directory: |
| 4015 | REFMAN "windows-symbolic-links.html" */ |
| 4016 | |
| 4017 | info->type = OS_FILE_TYPE_LINK; |
| 4018 | |
| 4019 | } else if (find_data.dwFileAttributes |
| 4020 | & FILE_ATTRIBUTE_DIRECTORY) { |
| 4021 | |
| 4022 | info->type = OS_FILE_TYPE_DIR; |
| 4023 | |
| 4024 | } else { |
| 4025 | |
| 4026 | /* It is probably safest to assume that all other |
| 4027 | file types are normal. Better to check them rather |
| 4028 | than blindly skip them. */ |
| 4029 | |
| 4030 | info->type = OS_FILE_TYPE_FILE; |
| 4031 | } |
| 4032 | |
| 4033 | status = 0; |
| 4034 | |
| 4035 | } else if (GetLastError() == ERROR_NO_MORE_FILES) { |
| 4036 | |
| 4037 | status = 1; |
| 4038 | |
| 4039 | } else { |
| 4040 | |
| 4041 | os_file_handle_error_no_exit(NULL, "readdir_next_file" , false); |
| 4042 | |
| 4043 | status = -1; |
| 4044 | } |
| 4045 | |
| 4046 | return(status); |
| 4047 | } |
| 4048 | |
| 4049 | /** NOTE! Use the corresponding macro os_file_create(), not directly |
| 4050 | this function! |
| 4051 | Opens an existing file or creates a new. |
| 4052 | @param[in] name name of the file or path as a null-terminated |
| 4053 | string |
| 4054 | @param[in] create_mode create mode |
| 4055 | @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O |
| 4056 | is desired, OS_FILE_NORMAL, if any normal file; |
| 4057 | NOTE that it also depends on type, os_aio_.. |
| 4058 | and srv_.. variables whether we really use async |
| 4059 | I/O or unbuffered I/O: look in the function |
| 4060 | source code for the exact rules |
| 4061 | @param[in] type OS_DATA_FILE or OS_LOG_FILE |
| 4062 | @param[in] success true if succeeded |
| 4063 | @return handle to the file, not defined if error, error number |
| 4064 | can be retrieved with os_file_get_last_error */ |
| 4065 | pfs_os_file_t |
| 4066 | os_file_create_func( |
| 4067 | const char* name, |
| 4068 | ulint create_mode, |
| 4069 | ulint purpose, |
| 4070 | ulint type, |
| 4071 | bool read_only, |
| 4072 | bool* success) |
| 4073 | { |
| 4074 | os_file_t file; |
| 4075 | bool retry; |
| 4076 | bool on_error_no_exit; |
| 4077 | bool on_error_silent; |
| 4078 | |
| 4079 | *success = false; |
| 4080 | |
| 4081 | DBUG_EXECUTE_IF( |
| 4082 | "ib_create_table_fail_disk_full" , |
| 4083 | *success = false; |
| 4084 | SetLastError(ERROR_DISK_FULL); |
| 4085 | return(OS_FILE_CLOSED); |
| 4086 | ); |
| 4087 | |
| 4088 | DWORD create_flag; |
| 4089 | DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL |
| 4090 | ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE |
| 4091 | : FILE_SHARE_READ; |
| 4092 | |
| 4093 | if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { |
| 4094 | WAIT_ALLOW_WRITES(); |
| 4095 | } |
| 4096 | |
| 4097 | on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT |
| 4098 | ? true : false; |
| 4099 | |
| 4100 | on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT |
| 4101 | ? true : false; |
| 4102 | |
| 4103 | create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT); |
| 4104 | |
| 4105 | if (create_mode == OS_FILE_OPEN_RAW) { |
| 4106 | |
| 4107 | ut_a(!read_only); |
| 4108 | |
| 4109 | create_flag = OPEN_EXISTING; |
| 4110 | |
| 4111 | /* On Windows Physical devices require admin privileges and |
| 4112 | have to have the write-share mode set. See the remarks |
| 4113 | section for the CreateFile() function documentation in MSDN. */ |
| 4114 | |
| 4115 | share_mode |= FILE_SHARE_WRITE; |
| 4116 | |
| 4117 | } else if (create_mode == OS_FILE_OPEN |
| 4118 | || create_mode == OS_FILE_OPEN_RETRY) { |
| 4119 | |
| 4120 | create_flag = OPEN_EXISTING; |
| 4121 | |
| 4122 | } else if (read_only) { |
| 4123 | |
| 4124 | create_flag = OPEN_EXISTING; |
| 4125 | |
| 4126 | } else if (create_mode == OS_FILE_CREATE) { |
| 4127 | |
| 4128 | create_flag = CREATE_NEW; |
| 4129 | |
| 4130 | } else if (create_mode == OS_FILE_OVERWRITE) { |
| 4131 | |
| 4132 | create_flag = CREATE_ALWAYS; |
| 4133 | |
| 4134 | } else { |
| 4135 | ib::error() |
| 4136 | << "Unknown file create mode (" << create_mode << ") " |
| 4137 | << " for file '" << name << "'" ; |
| 4138 | |
| 4139 | return(OS_FILE_CLOSED); |
| 4140 | } |
| 4141 | |
| 4142 | DWORD attributes = 0; |
| 4143 | |
| 4144 | if (purpose == OS_FILE_AIO) { |
| 4145 | |
| 4146 | #ifdef WIN_ASYNC_IO |
| 4147 | /* If specified, use asynchronous (overlapped) io and no |
| 4148 | buffering of writes in the OS */ |
| 4149 | |
| 4150 | if (srv_use_native_aio) { |
| 4151 | attributes |= FILE_FLAG_OVERLAPPED; |
| 4152 | } |
| 4153 | #endif /* WIN_ASYNC_IO */ |
| 4154 | |
| 4155 | } else if (purpose == OS_FILE_NORMAL) { |
| 4156 | |
| 4157 | /* Use default setting. */ |
| 4158 | |
| 4159 | } else { |
| 4160 | |
| 4161 | ib::error() |
| 4162 | << "Unknown purpose flag (" << purpose << ") " |
| 4163 | << "while opening file '" << name << "'" ; |
| 4164 | |
| 4165 | return(OS_FILE_CLOSED); |
| 4166 | } |
| 4167 | |
| 4168 | if (type == OS_LOG_FILE) { |
| 4169 | /* There is not reason to use buffered write to logs.*/ |
| 4170 | attributes |= FILE_FLAG_NO_BUFFERING; |
| 4171 | } |
| 4172 | |
| 4173 | switch (srv_file_flush_method) |
| 4174 | { |
| 4175 | case SRV_O_DSYNC: |
| 4176 | if (type == OS_LOG_FILE) { |
| 4177 | /* Map O_SYNC to FILE_WRITE_THROUGH */ |
| 4178 | attributes |= FILE_FLAG_WRITE_THROUGH; |
| 4179 | } |
| 4180 | break; |
| 4181 | |
| 4182 | case SRV_O_DIRECT_NO_FSYNC: |
| 4183 | case SRV_O_DIRECT: |
| 4184 | if (type == OS_DATA_FILE) { |
| 4185 | attributes |= FILE_FLAG_NO_BUFFERING; |
| 4186 | } |
| 4187 | break; |
| 4188 | |
| 4189 | case SRV_ALL_O_DIRECT_FSYNC: |
| 4190 | /*Traditional Windows behavior, no buffering for any files.*/ |
| 4191 | attributes |= FILE_FLAG_NO_BUFFERING; |
| 4192 | break; |
| 4193 | |
| 4194 | case SRV_FSYNC: |
| 4195 | case SRV_LITTLESYNC: |
| 4196 | break; |
| 4197 | |
| 4198 | case SRV_NOSYNC: |
| 4199 | /* Let Windows cache manager handle all writes.*/ |
| 4200 | attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING); |
| 4201 | break; |
| 4202 | |
| 4203 | default: |
| 4204 | ut_a(false); /* unknown flush mode.*/ |
| 4205 | } |
| 4206 | |
| 4207 | |
| 4208 | // TODO: Create a bug, this looks wrong. The flush log |
| 4209 | // parameter is dynamic. |
| 4210 | if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { |
| 4211 | /* Do not use unbuffered i/o for the log files because |
| 4212 | value 2 denotes that we do not flush the log at every |
| 4213 | commit, but only once per second */ |
| 4214 | attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING); |
| 4215 | } |
| 4216 | |
| 4217 | |
| 4218 | DWORD access = GENERIC_READ; |
| 4219 | |
| 4220 | if (!read_only) { |
| 4221 | access |= GENERIC_WRITE; |
| 4222 | } |
| 4223 | |
| 4224 | do { |
| 4225 | /* Use default security attributes and no template file. */ |
| 4226 | file = CreateFile( |
| 4227 | (LPCTSTR) name, access, share_mode, NULL, |
| 4228 | create_flag, attributes, NULL); |
| 4229 | |
| 4230 | if (file == INVALID_HANDLE_VALUE) { |
| 4231 | const char* operation; |
| 4232 | |
| 4233 | operation = (create_mode == OS_FILE_CREATE |
| 4234 | && !read_only) |
| 4235 | ? "create" : "open" ; |
| 4236 | |
| 4237 | *success = false; |
| 4238 | |
| 4239 | if (on_error_no_exit) { |
| 4240 | retry = os_file_handle_error_no_exit( |
| 4241 | name, operation, on_error_silent); |
| 4242 | } else { |
| 4243 | retry = os_file_handle_error(name, operation); |
| 4244 | } |
| 4245 | } else { |
| 4246 | |
| 4247 | retry = false; |
| 4248 | |
| 4249 | *success = true; |
| 4250 | |
| 4251 | if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) { |
| 4252 | /* Bind the file handle to completion port. Completion port |
| 4253 | might not be created yet, in some stages of backup, but |
| 4254 | must always be there for the server.*/ |
| 4255 | HANDLE port =(type == OS_LOG_FILE)? |
| 4256 | log_completion_port : data_completion_port; |
| 4257 | ut_a(port || srv_operation != SRV_OPERATION_NORMAL); |
| 4258 | if (port) { |
| 4259 | ut_a(CreateIoCompletionPort(file, port, 0, 0)); |
| 4260 | } |
| 4261 | } |
| 4262 | } |
| 4263 | } while (retry); |
| 4264 | |
| 4265 | return(file); |
| 4266 | } |
| 4267 | |
| 4268 | /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(), |
| 4269 | not directly this function! |
| 4270 | A simple function to open or create a file. |
| 4271 | @param[in] name name of the file or path as a null-terminated |
| 4272 | string |
| 4273 | @param[in] create_mode create mode |
| 4274 | @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or |
| 4275 | OS_FILE_READ_ALLOW_DELETE; the last option is |
| 4276 | used by a backup program reading the file |
| 4277 | @param[out] success true if succeeded |
| 4278 | @return own: handle to the file, not defined if error, error number |
| 4279 | can be retrieved with os_file_get_last_error */ |
| 4280 | pfs_os_file_t |
| 4281 | os_file_create_simple_no_error_handling_func( |
| 4282 | const char* name, |
| 4283 | ulint create_mode, |
| 4284 | ulint access_type, |
| 4285 | bool read_only, |
| 4286 | bool* success) |
| 4287 | { |
| 4288 | os_file_t file; |
| 4289 | |
| 4290 | *success = false; |
| 4291 | |
| 4292 | DWORD access; |
| 4293 | DWORD create_flag; |
| 4294 | DWORD attributes = 0; |
| 4295 | DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL |
| 4296 | ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE |
| 4297 | : FILE_SHARE_READ; |
| 4298 | |
| 4299 | ut_a(name); |
| 4300 | |
| 4301 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
| 4302 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
| 4303 | |
| 4304 | if (create_mode == OS_FILE_OPEN) { |
| 4305 | |
| 4306 | create_flag = OPEN_EXISTING; |
| 4307 | |
| 4308 | } else if (read_only) { |
| 4309 | |
| 4310 | create_flag = OPEN_EXISTING; |
| 4311 | |
| 4312 | } else if (create_mode == OS_FILE_CREATE) { |
| 4313 | |
| 4314 | create_flag = CREATE_NEW; |
| 4315 | |
| 4316 | } else { |
| 4317 | |
| 4318 | ib::error() |
| 4319 | << "Unknown file create mode (" << create_mode << ") " |
| 4320 | << " for file '" << name << "'" ; |
| 4321 | |
| 4322 | return(OS_FILE_CLOSED); |
| 4323 | } |
| 4324 | |
| 4325 | if (access_type == OS_FILE_READ_ONLY) { |
| 4326 | |
| 4327 | access = GENERIC_READ; |
| 4328 | |
| 4329 | } else if (read_only) { |
| 4330 | |
| 4331 | access = GENERIC_READ; |
| 4332 | |
| 4333 | } else if (access_type == OS_FILE_READ_WRITE) { |
| 4334 | |
| 4335 | access = GENERIC_READ | GENERIC_WRITE; |
| 4336 | |
| 4337 | } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { |
| 4338 | |
| 4339 | ut_a(!read_only); |
| 4340 | |
| 4341 | access = GENERIC_READ; |
| 4342 | |
| 4343 | /*!< A backup program has to give mysqld the maximum |
| 4344 | freedom to do what it likes with the file */ |
| 4345 | |
| 4346 | share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE |
| 4347 | | FILE_SHARE_READ; |
| 4348 | } else { |
| 4349 | |
| 4350 | ib::error() |
| 4351 | << "Unknown file access type (" << access_type << ") " |
| 4352 | << "for file '" << name << "'" ; |
| 4353 | |
| 4354 | return(OS_FILE_CLOSED); |
| 4355 | } |
| 4356 | |
| 4357 | file = CreateFile((LPCTSTR) name, |
| 4358 | access, |
| 4359 | share_mode, |
| 4360 | NULL, // Security attributes |
| 4361 | create_flag, |
| 4362 | attributes, |
| 4363 | NULL); // No template file |
| 4364 | |
| 4365 | *success = (file != INVALID_HANDLE_VALUE); |
| 4366 | |
| 4367 | return(file); |
| 4368 | } |
| 4369 | |
| 4370 | /** Deletes a file if it exists. The file has to be closed before calling this. |
| 4371 | @param[in] name file path as a null-terminated string |
| 4372 | @param[out] exist indicate if file pre-exist |
| 4373 | @return true if success */ |
| 4374 | bool |
| 4375 | os_file_delete_if_exists_func( |
| 4376 | const char* name, |
| 4377 | bool* exist) |
| 4378 | { |
| 4379 | ulint count = 0; |
| 4380 | |
| 4381 | if (exist != NULL) { |
| 4382 | *exist = true; |
| 4383 | } |
| 4384 | |
| 4385 | for (;;) { |
| 4386 | /* In Windows, deleting an .ibd file may fail if |
| 4387 | the file is being accessed by an external program, |
| 4388 | such as a backup tool. */ |
| 4389 | |
| 4390 | bool ret = DeleteFile((LPCTSTR) name); |
| 4391 | |
| 4392 | if (ret) { |
| 4393 | return(true); |
| 4394 | } |
| 4395 | |
| 4396 | DWORD lasterr = GetLastError(); |
| 4397 | |
| 4398 | if (lasterr == ERROR_FILE_NOT_FOUND |
| 4399 | || lasterr == ERROR_PATH_NOT_FOUND) { |
| 4400 | |
| 4401 | /* the file does not exist, this not an error */ |
| 4402 | if (exist != NULL) { |
| 4403 | *exist = false; |
| 4404 | } |
| 4405 | |
| 4406 | return(true); |
| 4407 | } |
| 4408 | |
| 4409 | ++count; |
| 4410 | |
| 4411 | if (count > 100 && 0 == (count % 10)) { |
| 4412 | |
| 4413 | /* Print error information */ |
| 4414 | os_file_get_last_error(true); |
| 4415 | |
| 4416 | ib::warn() << "Delete of file '" << name << "' failed." ; |
| 4417 | } |
| 4418 | |
| 4419 | /* Sleep for a second */ |
| 4420 | os_thread_sleep(1000000); |
| 4421 | |
| 4422 | if (count > 2000) { |
| 4423 | |
| 4424 | return(false); |
| 4425 | } |
| 4426 | } |
| 4427 | } |
| 4428 | |
| 4429 | /** Deletes a file. The file has to be closed before calling this. |
| 4430 | @param[in] name File path as NUL terminated string |
| 4431 | @return true if success */ |
| 4432 | bool |
| 4433 | os_file_delete_func( |
| 4434 | const char* name) |
| 4435 | { |
| 4436 | ulint count = 0; |
| 4437 | |
| 4438 | for (;;) { |
| 4439 | /* In Windows, deleting an .ibd file may fail if |
| 4440 | the file is being accessed by an external program, |
| 4441 | such as a backup tool. */ |
| 4442 | |
| 4443 | BOOL ret = DeleteFile((LPCTSTR) name); |
| 4444 | |
| 4445 | if (ret) { |
| 4446 | return(true); |
| 4447 | } |
| 4448 | |
| 4449 | if (GetLastError() == ERROR_FILE_NOT_FOUND) { |
| 4450 | /* If the file does not exist, we classify this as |
| 4451 | a 'mild' error and return */ |
| 4452 | |
| 4453 | return(false); |
| 4454 | } |
| 4455 | |
| 4456 | ++count; |
| 4457 | |
| 4458 | if (count > 100 && 0 == (count % 10)) { |
| 4459 | |
| 4460 | /* print error information */ |
| 4461 | os_file_get_last_error(true); |
| 4462 | |
| 4463 | ib::warn() |
| 4464 | << "Cannot delete file '" << name << "'. Is " |
| 4465 | << "another program accessing it?" ; |
| 4466 | } |
| 4467 | |
| 4468 | /* sleep for a second */ |
| 4469 | os_thread_sleep(1000000); |
| 4470 | |
| 4471 | if (count > 2000) { |
| 4472 | |
| 4473 | return(false); |
| 4474 | } |
| 4475 | } |
| 4476 | |
| 4477 | ut_error; |
| 4478 | return(false); |
| 4479 | } |
| 4480 | |
| 4481 | /** NOTE! Use the corresponding macro os_file_rename(), not directly this |
| 4482 | function! |
| 4483 | Renames a file (can also move it to another directory). It is safest that the |
| 4484 | file is closed before calling this function. |
| 4485 | @param[in] oldpath old file path as a null-terminated string |
| 4486 | @param[in] newpath new file path |
| 4487 | @return true if success */ |
| 4488 | bool |
| 4489 | os_file_rename_func( |
| 4490 | const char* oldpath, |
| 4491 | const char* newpath) |
| 4492 | { |
| 4493 | #ifdef UNIV_DEBUG |
| 4494 | os_file_type_t type; |
| 4495 | bool exists; |
| 4496 | |
| 4497 | /* New path must not exist. */ |
| 4498 | ut_ad(os_file_status(newpath, &exists, &type)); |
| 4499 | ut_ad(!exists); |
| 4500 | |
| 4501 | /* Old path must exist. */ |
| 4502 | ut_ad(os_file_status(oldpath, &exists, &type)); |
| 4503 | ut_ad(exists); |
| 4504 | #endif /* UNIV_DEBUG */ |
| 4505 | |
| 4506 | if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) { |
| 4507 | return(true); |
| 4508 | } |
| 4509 | |
| 4510 | os_file_handle_error_no_exit(oldpath, "rename" , false); |
| 4511 | |
| 4512 | return(false); |
| 4513 | } |
| 4514 | |
| 4515 | /** NOTE! Use the corresponding macro os_file_close(), not directly |
| 4516 | this function! |
| 4517 | Closes a file handle. In case of error, error number can be retrieved with |
| 4518 | os_file_get_last_error. |
| 4519 | @param[in,own] file Handle to a file |
| 4520 | @return true if success */ |
| 4521 | bool |
| 4522 | os_file_close_func( |
| 4523 | os_file_t file) |
| 4524 | { |
| 4525 | ut_a(file); |
| 4526 | |
| 4527 | if (CloseHandle(file)) { |
| 4528 | return(true); |
| 4529 | } |
| 4530 | |
| 4531 | os_file_handle_error(NULL, "close" ); |
| 4532 | |
| 4533 | return(false); |
| 4534 | } |
| 4535 | |
| 4536 | /** Gets a file size. |
| 4537 | @param[in] file Handle to a file |
| 4538 | @return file size, or (os_offset_t) -1 on failure */ |
| 4539 | os_offset_t |
| 4540 | os_file_get_size( |
| 4541 | os_file_t file) |
| 4542 | { |
| 4543 | DWORD high; |
| 4544 | DWORD low = GetFileSize(file, &high); |
| 4545 | |
| 4546 | if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) { |
| 4547 | return((os_offset_t) -1); |
| 4548 | } |
| 4549 | |
| 4550 | return(os_offset_t(low | (os_offset_t(high) << 32))); |
| 4551 | } |
| 4552 | |
| 4553 | /** Gets a file size. |
| 4554 | @param[in] filename Full path to the filename to check |
| 4555 | @return file size if OK, else set m_total_size to ~0 and m_alloc_size to |
| 4556 | errno */ |
| 4557 | os_file_size_t |
| 4558 | os_file_get_size( |
| 4559 | const char* filename) |
| 4560 | { |
| 4561 | struct __stat64 s; |
| 4562 | os_file_size_t file_size; |
| 4563 | |
| 4564 | int ret = _stat64(filename, &s); |
| 4565 | |
| 4566 | if (ret == 0) { |
| 4567 | |
| 4568 | file_size.m_total_size = s.st_size; |
| 4569 | |
| 4570 | DWORD low_size; |
| 4571 | DWORD high_size; |
| 4572 | |
| 4573 | low_size = GetCompressedFileSize(filename, &high_size); |
| 4574 | |
| 4575 | if (low_size != INVALID_FILE_SIZE) { |
| 4576 | |
| 4577 | file_size.m_alloc_size = high_size; |
| 4578 | file_size.m_alloc_size <<= 32; |
| 4579 | file_size.m_alloc_size |= low_size; |
| 4580 | |
| 4581 | } else { |
| 4582 | ib::error() |
| 4583 | << "GetCompressedFileSize(" |
| 4584 | << filename << ", ..) failed." ; |
| 4585 | |
| 4586 | file_size.m_alloc_size = (os_offset_t) -1; |
| 4587 | } |
| 4588 | } else { |
| 4589 | file_size.m_total_size = ~0; |
| 4590 | file_size.m_alloc_size = (os_offset_t) ret; |
| 4591 | } |
| 4592 | |
| 4593 | return(file_size); |
| 4594 | } |
| 4595 | |
| 4596 | /** This function returns information about the specified file |
| 4597 | @param[in] path pathname of the file |
| 4598 | @param[out] stat_info information of a file in a directory |
| 4599 | @param[in,out] statinfo information of a file in a directory |
| 4600 | @param[in] check_rw_perm for testing whether the file can be opened |
| 4601 | in RW mode |
| 4602 | @param[in] read_only true if the file is opened in read-only mode |
| 4603 | @return DB_SUCCESS if all OK */ |
| 4604 | static |
| 4605 | dberr_t |
| 4606 | os_file_get_status_win32( |
| 4607 | const char* path, |
| 4608 | os_file_stat_t* stat_info, |
| 4609 | struct _stat64* statinfo, |
| 4610 | bool check_rw_perm, |
| 4611 | bool read_only) |
| 4612 | { |
| 4613 | int ret = _stat64(path, statinfo); |
| 4614 | |
| 4615 | if (ret && (errno == ENOENT || errno == ENOTDIR)) { |
| 4616 | /* file does not exist */ |
| 4617 | |
| 4618 | return(DB_NOT_FOUND); |
| 4619 | |
| 4620 | } else if (ret) { |
| 4621 | /* file exists, but stat call failed */ |
| 4622 | |
| 4623 | os_file_handle_error_no_exit(path, "STAT" , false); |
| 4624 | |
| 4625 | return(DB_FAIL); |
| 4626 | |
| 4627 | } else if (_S_IFDIR & statinfo->st_mode) { |
| 4628 | |
| 4629 | stat_info->type = OS_FILE_TYPE_DIR; |
| 4630 | |
| 4631 | } else if (_S_IFREG & statinfo->st_mode) { |
| 4632 | |
| 4633 | DWORD access = GENERIC_READ; |
| 4634 | |
| 4635 | if (!read_only) { |
| 4636 | access |= GENERIC_WRITE; |
| 4637 | } |
| 4638 | |
| 4639 | stat_info->type = OS_FILE_TYPE_FILE; |
| 4640 | |
| 4641 | /* Check if we can open it in read-only mode. */ |
| 4642 | |
| 4643 | if (check_rw_perm) { |
| 4644 | HANDLE fh; |
| 4645 | |
| 4646 | fh = CreateFile( |
| 4647 | (LPCTSTR) path, // File to open |
| 4648 | access, |
| 4649 | FILE_SHARE_READ | FILE_SHARE_WRITE |
| 4650 | | FILE_SHARE_DELETE, // Full sharing |
| 4651 | NULL, // Default security |
| 4652 | OPEN_EXISTING, // Existing file only |
| 4653 | FILE_ATTRIBUTE_NORMAL, // Normal file |
| 4654 | NULL); // No attr. template |
| 4655 | |
| 4656 | if (fh == INVALID_HANDLE_VALUE) { |
| 4657 | stat_info->rw_perm = false; |
| 4658 | } else { |
| 4659 | stat_info->rw_perm = true; |
| 4660 | CloseHandle(fh); |
| 4661 | } |
| 4662 | } |
| 4663 | |
| 4664 | char volname[MAX_PATH]; |
| 4665 | BOOL result = GetVolumePathName(path, volname, MAX_PATH); |
| 4666 | |
| 4667 | if (!result) { |
| 4668 | |
| 4669 | ib::error() |
| 4670 | << "os_file_get_status_win32: " |
| 4671 | << "Failed to get the volume path name for: " |
| 4672 | << path |
| 4673 | << "- OS error number " << GetLastError(); |
| 4674 | |
| 4675 | return(DB_FAIL); |
| 4676 | } |
| 4677 | |
| 4678 | DWORD sectorsPerCluster; |
| 4679 | DWORD bytesPerSector; |
| 4680 | DWORD numberOfFreeClusters; |
| 4681 | DWORD totalNumberOfClusters; |
| 4682 | |
| 4683 | result = GetDiskFreeSpace( |
| 4684 | (LPCSTR) volname, |
| 4685 | §orsPerCluster, |
| 4686 | &bytesPerSector, |
| 4687 | &numberOfFreeClusters, |
| 4688 | &totalNumberOfClusters); |
| 4689 | |
| 4690 | if (!result) { |
| 4691 | |
| 4692 | ib::error() |
| 4693 | << "GetDiskFreeSpace(" << volname << ",...) " |
| 4694 | << "failed " |
| 4695 | << "- OS error number " << GetLastError(); |
| 4696 | |
| 4697 | return(DB_FAIL); |
| 4698 | } |
| 4699 | |
| 4700 | stat_info->block_size = bytesPerSector * sectorsPerCluster; |
| 4701 | } else { |
| 4702 | stat_info->type = OS_FILE_TYPE_UNKNOWN; |
| 4703 | } |
| 4704 | |
| 4705 | return(DB_SUCCESS); |
| 4706 | } |
| 4707 | |
| 4708 | /** |
| 4709 | Sets a sparse flag on Windows file. |
| 4710 | @param[in] file file handle |
| 4711 | @return true on success, false on error |
| 4712 | */ |
| 4713 | #include <versionhelpers.h> |
| 4714 | bool os_file_set_sparse_win32(os_file_t file, bool is_sparse) |
| 4715 | { |
| 4716 | if (!is_sparse && !IsWindows8OrGreater()) { |
| 4717 | /* Cannot unset sparse flag on older Windows. |
| 4718 | Until Windows8 it is documented to produce unpredictable results, |
| 4719 | if there are unallocated ranges in file.*/ |
| 4720 | return false; |
| 4721 | } |
| 4722 | DWORD temp; |
| 4723 | FILE_SET_SPARSE_BUFFER sparse_buffer; |
| 4724 | sparse_buffer.SetSparse = is_sparse; |
| 4725 | return os_win32_device_io_control(file, |
| 4726 | FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp); |
| 4727 | } |
| 4728 | |
| 4729 | |
| 4730 | /** |
| 4731 | Change file size on Windows. |
| 4732 | |
| 4733 | If file is extended, the bytes between old and new EOF |
| 4734 | are zeros. |
| 4735 | |
| 4736 | If file is sparse, "virtual" block is added at the end of |
| 4737 | allocated area. |
| 4738 | |
| 4739 | If file is normal, file system allocates storage. |
| 4740 | |
| 4741 | @param[in] pathname file path |
| 4742 | @param[in] file file handle |
| 4743 | @param[in] size size to preserve in bytes |
| 4744 | @return true if success */ |
| 4745 | bool |
| 4746 | os_file_change_size_win32( |
| 4747 | const char* pathname, |
| 4748 | os_file_t file, |
| 4749 | os_offset_t size) |
| 4750 | { |
| 4751 | LARGE_INTEGER length; |
| 4752 | |
| 4753 | length.QuadPart = size; |
| 4754 | |
| 4755 | BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN); |
| 4756 | |
| 4757 | if (!success) { |
| 4758 | os_file_handle_error_no_exit( |
| 4759 | pathname, "SetFilePointerEx" , false); |
| 4760 | } else { |
| 4761 | success = SetEndOfFile(file); |
| 4762 | if (!success) { |
| 4763 | os_file_handle_error_no_exit( |
| 4764 | pathname, "SetEndOfFile" , false); |
| 4765 | } |
| 4766 | } |
| 4767 | return(success); |
| 4768 | } |
| 4769 | |
| 4770 | /** Truncates a file at its current position. |
| 4771 | @param[in] file Handle to be truncated |
| 4772 | @return true if success */ |
| 4773 | bool |
| 4774 | os_file_set_eof( |
| 4775 | FILE* file) |
| 4776 | { |
| 4777 | HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); |
| 4778 | |
| 4779 | return(SetEndOfFile(h)); |
| 4780 | } |
| 4781 | |
| 4782 | /** This function can be called if one wants to post a batch of reads and |
| 4783 | prefers an i/o-handler thread to handle them all at once later. You must |
| 4784 | call os_aio_simulated_wake_handler_threads later to ensure the threads |
| 4785 | are not left sleeping! */ |
| 4786 | void |
| 4787 | os_aio_simulated_put_read_threads_to_sleep() |
| 4788 | { |
| 4789 | AIO::simulated_put_read_threads_to_sleep(); |
| 4790 | } |
| 4791 | |
| 4792 | /** This function can be called if one wants to post a batch of reads and |
| 4793 | prefers an i/o-handler thread to handle them all at once later. You must |
| 4794 | call os_aio_simulated_wake_handler_threads later to ensure the threads |
| 4795 | are not left sleeping! */ |
| 4796 | void |
| 4797 | AIO::simulated_put_read_threads_to_sleep() |
| 4798 | { |
| 4799 | /* The idea of putting background IO threads to sleep is only for |
| 4800 | Windows when using simulated AIO. Windows XP seems to schedule |
| 4801 | background threads too eagerly to allow for coalescing during |
| 4802 | readahead requests. */ |
| 4803 | |
| 4804 | if (srv_use_native_aio) { |
| 4805 | /* We do not use simulated AIO: do nothing */ |
| 4806 | |
| 4807 | return; |
| 4808 | } |
| 4809 | |
| 4810 | os_aio_recommend_sleep_for_read_threads = true; |
| 4811 | |
| 4812 | for (ulint i = 0; i < os_aio_n_segments; i++) { |
| 4813 | AIO* array; |
| 4814 | |
| 4815 | get_array_and_local_segment(&array, i); |
| 4816 | |
| 4817 | if (array == s_reads) { |
| 4818 | |
| 4819 | os_event_reset(os_aio_segment_wait_events[i]); |
| 4820 | } |
| 4821 | } |
| 4822 | } |
| 4823 | |
| 4824 | #endif /* !_WIN32*/ |
| 4825 | |
| 4826 | /** Does a syncronous read or write depending upon the type specified |
| 4827 | In case of partial reads/writes the function tries |
| 4828 | NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data. |
| 4829 | @param[in] type, IO flags |
| 4830 | @param[in] file handle to an open file |
| 4831 | @param[out] buf buffer where to read |
| 4832 | @param[in] offset file offset from the start where to read |
| 4833 | @param[in] n number of bytes to read, starting from offset |
| 4834 | @param[out] err DB_SUCCESS or error code |
| 4835 | @return number of bytes read/written, -1 if error */ |
| 4836 | static MY_ATTRIBUTE((warn_unused_result)) |
| 4837 | ssize_t |
| 4838 | os_file_io( |
| 4839 | const IORequest&in_type, |
| 4840 | os_file_t file, |
| 4841 | void* buf, |
| 4842 | ulint n, |
| 4843 | os_offset_t offset, |
| 4844 | dberr_t* err) |
| 4845 | { |
| 4846 | ssize_t original_n = ssize_t(n); |
| 4847 | IORequest type = in_type; |
| 4848 | ssize_t bytes_returned = 0; |
| 4849 | |
| 4850 | SyncFileIO sync_file_io(file, buf, n, offset); |
| 4851 | |
| 4852 | for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) { |
| 4853 | |
| 4854 | ssize_t n_bytes = sync_file_io.execute(type); |
| 4855 | |
| 4856 | /* Check for a hard error. Not much we can do now. */ |
| 4857 | if (n_bytes < 0) { |
| 4858 | |
| 4859 | break; |
| 4860 | |
| 4861 | } else if (n_bytes + bytes_returned == ssize_t(n)) { |
| 4862 | |
| 4863 | bytes_returned += n_bytes; |
| 4864 | |
| 4865 | if (offset > 0 |
| 4866 | && !type.is_log() |
| 4867 | && type.is_write() |
| 4868 | && type.punch_hole()) { |
| 4869 | *err = type.punch_hole(file, offset, n); |
| 4870 | |
| 4871 | } else { |
| 4872 | *err = DB_SUCCESS; |
| 4873 | } |
| 4874 | |
| 4875 | return(original_n); |
| 4876 | } |
| 4877 | |
| 4878 | /* Handle partial read/write. */ |
| 4879 | |
| 4880 | ut_ad(ulint(n_bytes + bytes_returned) < n); |
| 4881 | |
| 4882 | bytes_returned += n_bytes; |
| 4883 | |
| 4884 | if (!type.is_partial_io_warning_disabled()) { |
| 4885 | |
| 4886 | const char* op = type.is_read() |
| 4887 | ? "read" : "written" ; |
| 4888 | |
| 4889 | ib::warn() |
| 4890 | << n |
| 4891 | << " bytes should have been " << op << ". Only " |
| 4892 | << bytes_returned |
| 4893 | << " bytes " << op << ". Retrying" |
| 4894 | << " for the remaining bytes." ; |
| 4895 | } |
| 4896 | |
| 4897 | /* Advance the offset and buffer by n_bytes */ |
| 4898 | sync_file_io.advance(n_bytes); |
| 4899 | } |
| 4900 | |
| 4901 | *err = DB_IO_ERROR; |
| 4902 | |
| 4903 | if (!type.is_partial_io_warning_disabled()) { |
| 4904 | ib::warn() |
| 4905 | << "Retry attempts for " |
| 4906 | << (type.is_read() ? "reading" : "writing" ) |
| 4907 | << " partial data failed." ; |
| 4908 | } |
| 4909 | |
| 4910 | return(bytes_returned); |
| 4911 | } |
| 4912 | |
| 4913 | /** Does a synchronous write operation in Posix. |
| 4914 | @param[in] type IO context |
| 4915 | @param[in] file handle to an open file |
| 4916 | @param[out] buf buffer from which to write |
| 4917 | @param[in] n number of bytes to read, starting from offset |
| 4918 | @param[in] offset file offset from the start where to read |
| 4919 | @param[out] err DB_SUCCESS or error code |
| 4920 | @return number of bytes written, -1 if error */ |
| 4921 | static MY_ATTRIBUTE((warn_unused_result)) |
| 4922 | ssize_t |
| 4923 | os_file_pwrite( |
| 4924 | const IORequest& type, |
| 4925 | os_file_t file, |
| 4926 | const byte* buf, |
| 4927 | ulint n, |
| 4928 | os_offset_t offset, |
| 4929 | dberr_t* err) |
| 4930 | { |
| 4931 | ut_ad(type.validate()); |
| 4932 | ut_ad(type.is_write()); |
| 4933 | |
| 4934 | ++os_n_file_writes; |
| 4935 | |
| 4936 | const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); |
| 4937 | MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); |
| 4938 | ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf), |
| 4939 | n, offset, err); |
| 4940 | MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); |
| 4941 | |
| 4942 | return(n_bytes); |
| 4943 | } |
| 4944 | |
| 4945 | /** NOTE! Use the corresponding macro os_file_write(), not directly |
| 4946 | Requests a synchronous write operation. |
| 4947 | @param[in] type IO flags |
| 4948 | @param[in] file handle to an open file |
| 4949 | @param[out] buf buffer from which to write |
| 4950 | @param[in] offset file offset from the start where to read |
| 4951 | @param[in] n number of bytes to read, starting from offset |
| 4952 | @return DB_SUCCESS if request was successful, false if fail */ |
| 4953 | dberr_t |
| 4954 | os_file_write_func( |
| 4955 | const IORequest& type, |
| 4956 | const char* name, |
| 4957 | os_file_t file, |
| 4958 | const void* buf, |
| 4959 | os_offset_t offset, |
| 4960 | ulint n) |
| 4961 | { |
| 4962 | dberr_t err; |
| 4963 | |
| 4964 | ut_ad(type.validate()); |
| 4965 | ut_ad(n > 0); |
| 4966 | |
| 4967 | WAIT_ALLOW_WRITES(); |
| 4968 | |
| 4969 | ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err); |
| 4970 | |
| 4971 | if ((ulint) n_bytes != n && !os_has_said_disk_full) { |
| 4972 | |
| 4973 | ib::error() |
| 4974 | << "Write to file " << name << " failed at offset " |
| 4975 | << offset << ", " << n |
| 4976 | << " bytes should have been written," |
| 4977 | " only " << n_bytes << " were written." |
| 4978 | " Operating system error number " << errno << "." |
| 4979 | " Check that your OS and file system" |
| 4980 | " support files of this size." |
| 4981 | " Check also that the disk is not full" |
| 4982 | " or a disk quota exceeded." ; |
| 4983 | |
| 4984 | if (strerror(errno) != NULL) { |
| 4985 | |
| 4986 | ib::error() |
| 4987 | << "Error number " << errno |
| 4988 | << " means '" << strerror(errno) << "'" ; |
| 4989 | } |
| 4990 | |
| 4991 | ib::info() << OPERATING_SYSTEM_ERROR_MSG; |
| 4992 | |
| 4993 | os_has_said_disk_full = true; |
| 4994 | } |
| 4995 | |
| 4996 | return(err); |
| 4997 | } |
| 4998 | |
| 4999 | /** Does a synchronous read operation in Posix. |
| 5000 | @param[in] type IO flags |
| 5001 | @param[in] file handle to an open file |
| 5002 | @param[out] buf buffer where to read |
| 5003 | @param[in] offset file offset from the start where to read |
| 5004 | @param[in] n number of bytes to read, starting from offset |
| 5005 | @param[out] err DB_SUCCESS or error code |
| 5006 | @return number of bytes read, -1 if error */ |
| 5007 | static MY_ATTRIBUTE((warn_unused_result)) |
| 5008 | ssize_t |
| 5009 | os_file_pread( |
| 5010 | const IORequest& type, |
| 5011 | os_file_t file, |
| 5012 | void* buf, |
| 5013 | ulint n, |
| 5014 | os_offset_t offset, |
| 5015 | dberr_t* err) |
| 5016 | { |
| 5017 | ut_ad(type.is_read()); |
| 5018 | |
| 5019 | ++os_n_file_reads; |
| 5020 | |
| 5021 | const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); |
| 5022 | MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); |
| 5023 | ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err); |
| 5024 | MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); |
| 5025 | |
| 5026 | return(n_bytes); |
| 5027 | } |
| 5028 | |
| 5029 | /** Requests a synchronous positioned read operation. |
| 5030 | @return DB_SUCCESS if request was successful, false if fail |
| 5031 | @param[in] type IO flags |
| 5032 | @param[in] file handle to an open file |
| 5033 | @param[out] buf buffer where to read |
| 5034 | @param[in] offset file offset from the start where to read |
| 5035 | @param[in] n number of bytes to read, starting from offset |
| 5036 | @param[out] o number of bytes actually read |
| 5037 | @param[in] exit_on_err if true then exit on error |
| 5038 | @return DB_SUCCESS or error code */ |
| 5039 | static MY_ATTRIBUTE((warn_unused_result)) |
| 5040 | dberr_t |
| 5041 | ( |
| 5042 | const IORequest& type, |
| 5043 | os_file_t file, |
| 5044 | void* buf, |
| 5045 | os_offset_t offset, |
| 5046 | ulint n, |
| 5047 | ulint* o, |
| 5048 | bool exit_on_err) |
| 5049 | { |
| 5050 | dberr_t err; |
| 5051 | |
| 5052 | os_bytes_read_since_printout += n; |
| 5053 | |
| 5054 | ut_ad(type.validate()); |
| 5055 | ut_ad(n > 0); |
| 5056 | |
| 5057 | ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err); |
| 5058 | |
| 5059 | if (o) { |
| 5060 | *o = n_bytes; |
| 5061 | } |
| 5062 | |
| 5063 | if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) { |
| 5064 | return err; |
| 5065 | } |
| 5066 | |
| 5067 | ib::error() << "Tried to read " << n << " bytes at offset " |
| 5068 | << offset << ", but was only able to read " << n_bytes; |
| 5069 | |
| 5070 | if (!os_file_handle_error_cond_exit( |
| 5071 | NULL, "read" , exit_on_err, false)) { |
| 5072 | ib::fatal() |
| 5073 | << "Cannot read from file. OS error number " |
| 5074 | << errno << "." ; |
| 5075 | } |
| 5076 | |
| 5077 | if (err == DB_SUCCESS) { |
| 5078 | err = DB_IO_ERROR; |
| 5079 | } |
| 5080 | |
| 5081 | return err; |
| 5082 | } |
| 5083 | |
| 5084 | /** Retrieves the last error number if an error occurs in a file io function. |
| 5085 | The number should be retrieved before any other OS calls (because they may |
| 5086 | overwrite the error number). If the number is not known to this program, |
| 5087 | the OS error number + 100 is returned. |
| 5088 | @param[in] report_all_errors true if we want an error printed |
| 5089 | for all errors |
| 5090 | @return error number, or OS error number + 100 */ |
| 5091 | ulint |
| 5092 | os_file_get_last_error( |
| 5093 | bool report_all_errors) |
| 5094 | { |
| 5095 | return(os_file_get_last_error_low(report_all_errors, false)); |
| 5096 | } |
| 5097 | |
| 5098 | /** Handle errors for file operations. |
| 5099 | @param[in] name name of a file or NULL |
| 5100 | @param[in] operation operation |
| 5101 | @param[in] should_abort whether to abort on an unknown error |
| 5102 | @param[in] on_error_silent whether to suppress reports of non-fatal errors |
| 5103 | @return true if we should retry the operation */ |
| 5104 | static MY_ATTRIBUTE((warn_unused_result)) |
| 5105 | bool |
| 5106 | os_file_handle_error_cond_exit( |
| 5107 | const char* name, |
| 5108 | const char* operation, |
| 5109 | bool should_abort, |
| 5110 | bool on_error_silent) |
| 5111 | { |
| 5112 | ulint err; |
| 5113 | |
| 5114 | err = os_file_get_last_error_low(false, on_error_silent); |
| 5115 | |
| 5116 | switch (err) { |
| 5117 | case OS_FILE_DISK_FULL: |
| 5118 | /* We only print a warning about disk full once */ |
| 5119 | |
| 5120 | if (os_has_said_disk_full) { |
| 5121 | |
| 5122 | return(false); |
| 5123 | } |
| 5124 | |
| 5125 | /* Disk full error is reported irrespective of the |
| 5126 | on_error_silent setting. */ |
| 5127 | |
| 5128 | if (name) { |
| 5129 | |
| 5130 | ib::error() |
| 5131 | << "Encountered a problem with file '" |
| 5132 | << name << "'" ; |
| 5133 | } |
| 5134 | |
| 5135 | ib::error() |
| 5136 | << "Disk is full. Try to clean the disk to free space." ; |
| 5137 | |
| 5138 | os_has_said_disk_full = true; |
| 5139 | |
| 5140 | return(false); |
| 5141 | |
| 5142 | case OS_FILE_AIO_RESOURCES_RESERVED: |
| 5143 | case OS_FILE_AIO_INTERRUPTED: |
| 5144 | |
| 5145 | return(true); |
| 5146 | |
| 5147 | case OS_FILE_PATH_ERROR: |
| 5148 | case OS_FILE_ALREADY_EXISTS: |
| 5149 | case OS_FILE_ACCESS_VIOLATION: |
| 5150 | |
| 5151 | return(false); |
| 5152 | |
| 5153 | case OS_FILE_SHARING_VIOLATION: |
| 5154 | |
| 5155 | os_thread_sleep(10000000); /* 10 sec */ |
| 5156 | return(true); |
| 5157 | |
| 5158 | case OS_FILE_OPERATION_ABORTED: |
| 5159 | case OS_FILE_INSUFFICIENT_RESOURCE: |
| 5160 | |
| 5161 | os_thread_sleep(100000); /* 100 ms */ |
| 5162 | return(true); |
| 5163 | |
| 5164 | default: |
| 5165 | |
| 5166 | /* If it is an operation that can crash on error then it |
| 5167 | is better to ignore on_error_silent and print an error message |
| 5168 | to the log. */ |
| 5169 | |
| 5170 | if (should_abort || !on_error_silent) { |
| 5171 | ib::error() << "File " |
| 5172 | << (name != NULL ? name : "(unknown)" ) |
| 5173 | << ": '" << operation << "'" |
| 5174 | " returned OS error " << err << "." |
| 5175 | << (should_abort |
| 5176 | ? " Cannot continue operation" : "" ); |
| 5177 | } |
| 5178 | |
| 5179 | if (should_abort) { |
| 5180 | abort(); |
| 5181 | } |
| 5182 | } |
| 5183 | |
| 5184 | return(false); |
| 5185 | } |
| 5186 | |
| 5187 | #ifndef _WIN32 |
| 5188 | /** Tries to disable OS caching on an opened file descriptor. |
| 5189 | @param[in] fd file descriptor to alter |
| 5190 | @param[in] file_name file name, used in the diagnostic message |
| 5191 | @param[in] name "open" or "create"; used in the diagnostic |
| 5192 | message */ |
| 5193 | void |
| 5194 | os_file_set_nocache( |
| 5195 | int fd MY_ATTRIBUTE((unused)), |
| 5196 | const char* file_name MY_ATTRIBUTE((unused)), |
| 5197 | const char* operation_name MY_ATTRIBUTE((unused))) |
| 5198 | { |
| 5199 | /* some versions of Solaris may not have DIRECTIO_ON */ |
| 5200 | #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) |
| 5201 | if (directio(fd, DIRECTIO_ON) == -1) { |
| 5202 | int errno_save = errno; |
| 5203 | |
| 5204 | ib::error() |
| 5205 | << "Failed to set DIRECTIO_ON on file " |
| 5206 | << file_name << "; " << operation_name << ": " |
| 5207 | << strerror(errno_save) << "," |
| 5208 | " continuing anyway." ; |
| 5209 | } |
| 5210 | #elif defined(O_DIRECT) |
| 5211 | if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { |
| 5212 | int errno_save = errno; |
| 5213 | static bool warning_message_printed = false; |
| 5214 | if (errno_save == EINVAL) { |
| 5215 | if (!warning_message_printed) { |
| 5216 | warning_message_printed = true; |
| 5217 | # ifdef UNIV_LINUX |
| 5218 | ib::warn() |
| 5219 | << "Failed to set O_DIRECT on file" |
| 5220 | << file_name << "; " << operation_name |
| 5221 | << ": " << strerror(errno_save) << ", " |
| 5222 | "continuing anyway. O_DIRECT is " |
| 5223 | "known to result in 'Invalid argument' " |
| 5224 | "on Linux on tmpfs, " |
| 5225 | "see MySQL Bug#26662." ; |
| 5226 | # else /* UNIV_LINUX */ |
| 5227 | goto short_warning; |
| 5228 | # endif /* UNIV_LINUX */ |
| 5229 | } |
| 5230 | } else { |
| 5231 | # ifndef UNIV_LINUX |
| 5232 | short_warning: |
| 5233 | # endif |
| 5234 | ib::warn() |
| 5235 | << "Failed to set O_DIRECT on file " |
| 5236 | << file_name << "; " << operation_name |
| 5237 | << " : " << strerror(errno_save) |
| 5238 | << ", continuing anyway." ; |
| 5239 | } |
| 5240 | } |
| 5241 | #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ |
| 5242 | } |
| 5243 | |
| 5244 | #endif /* _WIN32 */ |
| 5245 | |
| 5246 | /** Extend a file. |
| 5247 | |
| 5248 | On Windows, extending a file allocates blocks for the file, |
| 5249 | unless the file is sparse. |
| 5250 | |
| 5251 | On Unix, we will extend the file with ftruncate(), if |
| 5252 | file needs to be sparse. Otherwise posix_fallocate() is used |
| 5253 | when available, and if not, binary zeroes are added to the end |
| 5254 | of file. |
| 5255 | |
| 5256 | @param[in] name file name |
| 5257 | @param[in] file file handle |
| 5258 | @param[in] size desired file size |
| 5259 | @param[in] sparse whether to create a sparse file (no preallocating) |
| 5260 | @return whether the operation succeeded */ |
| 5261 | bool |
| 5262 | os_file_set_size( |
| 5263 | const char* name, |
| 5264 | os_file_t file, |
| 5265 | os_offset_t size, |
| 5266 | bool is_sparse) |
| 5267 | { |
| 5268 | #ifdef _WIN32 |
| 5269 | /* On Windows, changing file size works well and as expected for both |
| 5270 | sparse and normal files. |
| 5271 | |
| 5272 | However, 10.2 up until 10.2.9 made every file sparse in innodb, |
| 5273 | causing NTFS fragmentation issues(MDEV-13941). We try to undo |
| 5274 | the damage, and unsparse the file.*/ |
| 5275 | |
| 5276 | if (!is_sparse && os_is_sparse_file_supported(file)) { |
| 5277 | if (!os_file_set_sparse_win32(file, false)) |
| 5278 | /* Unsparsing file failed. Fallback to writing binary |
| 5279 | zeros, to avoid even higher fragmentation.*/ |
| 5280 | goto fallback; |
| 5281 | } |
| 5282 | |
| 5283 | return os_file_change_size_win32(name, file, size); |
| 5284 | |
| 5285 | fallback: |
| 5286 | #else |
| 5287 | if (is_sparse) { |
| 5288 | bool success = !ftruncate(file, size); |
| 5289 | if (!success) { |
| 5290 | ib::error() << "ftruncate of file " << name << " to " |
| 5291 | << size << " bytes failed with error " |
| 5292 | << errno; |
| 5293 | } |
| 5294 | return(success); |
| 5295 | } |
| 5296 | |
| 5297 | # ifdef HAVE_POSIX_FALLOCATE |
| 5298 | int err; |
| 5299 | do { |
| 5300 | os_offset_t current_size = os_file_get_size(file); |
| 5301 | err = current_size >= size |
| 5302 | ? 0 : posix_fallocate(file, current_size, |
| 5303 | size - current_size); |
| 5304 | } while (err == EINTR |
| 5305 | && srv_shutdown_state == SRV_SHUTDOWN_NONE); |
| 5306 | |
| 5307 | switch (err) { |
| 5308 | case 0: |
| 5309 | return true; |
| 5310 | default: |
| 5311 | ib::error() << "preallocating " |
| 5312 | << size << " bytes for file " << name |
| 5313 | << " failed with error " << err; |
| 5314 | /* fall through */ |
| 5315 | case EINTR: |
| 5316 | errno = err; |
| 5317 | return false; |
| 5318 | case EINVAL: |
| 5319 | /* fall back to the code below */ |
| 5320 | break; |
| 5321 | } |
| 5322 | # endif /* HAVE_POSIX_ALLOCATE */ |
| 5323 | #endif /* _WIN32*/ |
| 5324 | |
| 5325 | /* Write up to 1 megabyte at a time. */ |
| 5326 | ulint buf_size = ut_min(ulint(64), |
| 5327 | ulint(size >> srv_page_size_shift)) |
| 5328 | << srv_page_size_shift; |
| 5329 | |
| 5330 | /* Align the buffer for possible raw i/o */ |
| 5331 | byte* buf2; |
| 5332 | |
| 5333 | buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size)); |
| 5334 | |
| 5335 | byte* buf = static_cast<byte*>(ut_align(buf2, srv_page_size)); |
| 5336 | |
| 5337 | /* Write buffer full of zeros */ |
| 5338 | memset(buf, 0, buf_size); |
| 5339 | |
| 5340 | os_offset_t current_size = os_file_get_size(file); |
| 5341 | |
| 5342 | while (current_size < size |
| 5343 | && srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
| 5344 | ulint n_bytes; |
| 5345 | |
| 5346 | if (size - current_size < (os_offset_t) buf_size) { |
| 5347 | n_bytes = (ulint) (size - current_size); |
| 5348 | } else { |
| 5349 | n_bytes = buf_size; |
| 5350 | } |
| 5351 | |
| 5352 | dberr_t err; |
| 5353 | IORequest request(IORequest::WRITE); |
| 5354 | |
| 5355 | err = os_file_write( |
| 5356 | request, name, file, buf, current_size, n_bytes); |
| 5357 | |
| 5358 | if (err != DB_SUCCESS) { |
| 5359 | break; |
| 5360 | } |
| 5361 | |
| 5362 | current_size += n_bytes; |
| 5363 | } |
| 5364 | |
| 5365 | ut_free(buf2); |
| 5366 | |
| 5367 | return(current_size >= size && os_file_flush(file)); |
| 5368 | } |
| 5369 | |
| 5370 | /** Truncates a file to a specified size in bytes. |
| 5371 | Do nothing if the size to preserve is greater or equal to the current |
| 5372 | size of the file. |
| 5373 | @param[in] pathname file path |
| 5374 | @param[in] file file to be truncated |
| 5375 | @param[in] size size to preserve in bytes |
| 5376 | @return true if success */ |
| 5377 | bool |
| 5378 | os_file_truncate( |
| 5379 | const char* pathname, |
| 5380 | os_file_t file, |
| 5381 | os_offset_t size) |
| 5382 | { |
| 5383 | /* Do nothing if the size preserved is larger than or equal to the |
| 5384 | current size of file */ |
| 5385 | os_offset_t size_bytes = os_file_get_size(file); |
| 5386 | |
| 5387 | if (size >= size_bytes) { |
| 5388 | return(true); |
| 5389 | } |
| 5390 | |
| 5391 | #ifdef _WIN32 |
| 5392 | return(os_file_change_size_win32(pathname, file, size)); |
| 5393 | #else /* _WIN32 */ |
| 5394 | return(os_file_truncate_posix(pathname, file, size)); |
| 5395 | #endif /* _WIN32 */ |
| 5396 | } |
| 5397 | |
| 5398 | /** NOTE! Use the corresponding macro os_file_read(), not directly this |
| 5399 | function! |
| 5400 | Requests a synchronous positioned read operation. |
| 5401 | @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure |
| 5402 | @param[in] type IO flags |
| 5403 | @param[in] file handle to an open file |
| 5404 | @param[out] buf buffer where to read |
| 5405 | @param[in] offset file offset from the start where to read |
| 5406 | @param[in] n number of bytes to read, starting from offset |
| 5407 | @return DB_SUCCESS or error code */ |
| 5408 | dberr_t |
| 5409 | os_file_read_func( |
| 5410 | const IORequest& type, |
| 5411 | os_file_t file, |
| 5412 | void* buf, |
| 5413 | os_offset_t offset, |
| 5414 | ulint n) |
| 5415 | { |
| 5416 | return(os_file_read_page(type, file, buf, offset, n, NULL, true)); |
| 5417 | } |
| 5418 | |
| 5419 | /** NOTE! Use the corresponding macro os_file_read_no_error_handling(), |
| 5420 | not directly this function! |
| 5421 | Requests a synchronous positioned read operation. |
| 5422 | @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure |
| 5423 | @param[in] type IO flags |
| 5424 | @param[in] file handle to an open file |
| 5425 | @param[out] buf buffer where to read |
| 5426 | @param[in] offset file offset from the start where to read |
| 5427 | @param[in] n number of bytes to read, starting from offset |
| 5428 | @param[out] o number of bytes actually read |
| 5429 | @return DB_SUCCESS or error code */ |
| 5430 | dberr_t |
| 5431 | os_file_read_no_error_handling_func( |
| 5432 | const IORequest& type, |
| 5433 | os_file_t file, |
| 5434 | void* buf, |
| 5435 | os_offset_t offset, |
| 5436 | ulint n, |
| 5437 | ulint* o) |
| 5438 | { |
| 5439 | return(os_file_read_page(type, file, buf, offset, n, o, false)); |
| 5440 | } |
| 5441 | |
| 5442 | /** Check the existence and type of the given file. |
| 5443 | @param[in] path path name of file |
| 5444 | @param[out] exists true if the file exists |
| 5445 | @param[out] type Type of the file, if it exists |
| 5446 | @return true if call succeeded */ |
| 5447 | bool |
| 5448 | os_file_status( |
| 5449 | const char* path, |
| 5450 | bool* exists, |
| 5451 | os_file_type_t* type) |
| 5452 | { |
| 5453 | #ifdef _WIN32 |
| 5454 | return(os_file_status_win32(path, exists, type)); |
| 5455 | #else |
| 5456 | return(os_file_status_posix(path, exists, type)); |
| 5457 | #endif /* _WIN32 */ |
| 5458 | } |
| 5459 | |
| 5460 | /** Free storage space associated with a section of the file. |
| 5461 | @param[in] fh Open file handle |
| 5462 | @param[in] off Starting offset (SEEK_SET) |
| 5463 | @param[in] len Size of the hole |
| 5464 | @return DB_SUCCESS or error code */ |
| 5465 | dberr_t |
| 5466 | os_file_punch_hole( |
| 5467 | os_file_t fh, |
| 5468 | os_offset_t off, |
| 5469 | os_offset_t len) |
| 5470 | { |
| 5471 | dberr_t err; |
| 5472 | |
| 5473 | #ifdef _WIN32 |
| 5474 | err = os_file_punch_hole_win32(fh, off, len); |
| 5475 | #else |
| 5476 | err = os_file_punch_hole_posix(fh, off, len); |
| 5477 | #endif /* _WIN32 */ |
| 5478 | |
| 5479 | return (err); |
| 5480 | } |
| 5481 | |
| 5482 | /** Free storage space associated with a section of the file. |
| 5483 | @param[in] fh Open file handle |
| 5484 | @param[in] off Starting offset (SEEK_SET) |
| 5485 | @param[in] len Size of the hole |
| 5486 | @return DB_SUCCESS or error code */ |
| 5487 | dberr_t |
| 5488 | IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) |
| 5489 | { |
| 5490 | /* In this debugging mode, we act as if punch hole is supported, |
| 5491 | and then skip any calls to actually punch a hole here. |
| 5492 | In this way, Transparent Page Compression is still being tested. */ |
| 5493 | DBUG_EXECUTE_IF("ignore_punch_hole" , |
| 5494 | return(DB_SUCCESS); |
| 5495 | ); |
| 5496 | |
| 5497 | ulint trim_len = get_trim_length(len); |
| 5498 | |
| 5499 | if (trim_len == 0) { |
| 5500 | return(DB_SUCCESS); |
| 5501 | } |
| 5502 | |
| 5503 | off += len; |
| 5504 | |
| 5505 | /* Check does file system support punching holes for this |
| 5506 | tablespace. */ |
| 5507 | if (!should_punch_hole()) { |
| 5508 | return DB_IO_NO_PUNCH_HOLE; |
| 5509 | } |
| 5510 | |
| 5511 | dberr_t err = os_file_punch_hole(fh, off, trim_len); |
| 5512 | |
| 5513 | if (err == DB_SUCCESS) { |
| 5514 | srv_stats.page_compressed_trim_op.inc(); |
| 5515 | } else { |
| 5516 | /* If punch hole is not supported, |
| 5517 | set space so that it is not used. */ |
| 5518 | if (err == DB_IO_NO_PUNCH_HOLE) { |
| 5519 | space_no_punch_hole(); |
| 5520 | err = DB_SUCCESS; |
| 5521 | } |
| 5522 | } |
| 5523 | |
| 5524 | return (err); |
| 5525 | } |
| 5526 | |
| 5527 | /** Check if the file system supports sparse files. |
| 5528 | |
| 5529 | Warning: On POSIX systems we try and punch a hole from offset 0 to |
| 5530 | the system configured page size. This should only be called on an empty |
| 5531 | file. |
| 5532 | @param[in] fh File handle for the file - if opened |
| 5533 | @return true if the file system supports sparse files */ |
| 5534 | bool |
| 5535 | os_is_sparse_file_supported(os_file_t fh) |
| 5536 | { |
| 5537 | /* In this debugging mode, we act as if punch hole is supported, |
| 5538 | then we skip any calls to actually punch a hole. In this way, |
| 5539 | Transparent Page Compression is still being tested. */ |
| 5540 | DBUG_EXECUTE_IF("ignore_punch_hole" , |
| 5541 | return(true); |
| 5542 | ); |
| 5543 | |
| 5544 | #ifdef _WIN32 |
| 5545 | FILE_ATTRIBUTE_TAG_INFO info; |
| 5546 | if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, |
| 5547 | &info, (DWORD)sizeof(info))) { |
| 5548 | if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { |
| 5549 | return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; |
| 5550 | } |
| 5551 | } |
| 5552 | return false; |
| 5553 | #else |
| 5554 | dberr_t err; |
| 5555 | |
| 5556 | /* We don't know the FS block size, use the sector size. The FS |
| 5557 | will do the magic. */ |
| 5558 | err = os_file_punch_hole_posix(fh, 0, srv_page_size); |
| 5559 | |
| 5560 | return(err == DB_SUCCESS); |
| 5561 | #endif /* _WIN32 */ |
| 5562 | } |
| 5563 | |
| 5564 | /** This function returns information about the specified file |
| 5565 | @param[in] path pathname of the file |
| 5566 | @param[out] stat_info information of a file in a directory |
| 5567 | @param[in] check_rw_perm for testing whether the file can be opened |
| 5568 | in RW mode |
| 5569 | @param[in] read_only true if file is opened in read-only mode |
| 5570 | @return DB_SUCCESS if all OK */ |
| 5571 | dberr_t |
| 5572 | os_file_get_status( |
| 5573 | const char* path, |
| 5574 | os_file_stat_t* stat_info, |
| 5575 | bool check_rw_perm, |
| 5576 | bool read_only) |
| 5577 | { |
| 5578 | dberr_t ret; |
| 5579 | |
| 5580 | #ifdef _WIN32 |
| 5581 | struct _stat64 info; |
| 5582 | |
| 5583 | ret = os_file_get_status_win32( |
| 5584 | path, stat_info, &info, check_rw_perm, read_only); |
| 5585 | |
| 5586 | #else |
| 5587 | struct stat info; |
| 5588 | |
| 5589 | ret = os_file_get_status_posix( |
| 5590 | path, stat_info, &info, check_rw_perm, read_only); |
| 5591 | |
| 5592 | #endif /* _WIN32 */ |
| 5593 | |
| 5594 | if (ret == DB_SUCCESS) { |
| 5595 | stat_info->ctime = info.st_ctime; |
| 5596 | stat_info->atime = info.st_atime; |
| 5597 | stat_info->mtime = info.st_mtime; |
| 5598 | stat_info->size = info.st_size; |
| 5599 | } |
| 5600 | |
| 5601 | return(ret); |
| 5602 | } |
| 5603 | |
| 5604 | /** |
| 5605 | Waits for an AIO operation to complete. This function is used to wait the |
| 5606 | for completed requests. The aio array of pending requests is divided |
| 5607 | into segments. The thread specifies which segment or slot it wants to wait |
| 5608 | for. NOTE: this function will also take care of freeing the aio slot, |
| 5609 | therefore no other thread is allowed to do the freeing! |
| 5610 | @param[in] segment The number of the segment in the aio arrays to |
| 5611 | wait for; segment 0 is the ibuf I/O thread, |
| 5612 | segment 1 the log I/O thread, then follow the |
| 5613 | non-ibuf read threads, and as the last are the |
| 5614 | non-ibuf write threads; if this is |
| 5615 | ULINT_UNDEFINED, then it means that sync AIO |
| 5616 | is used, and this parameter is ignored |
| 5617 | @param[out] m1 the messages passed with the AIO request; note |
| 5618 | that also in the case where the AIO operation |
| 5619 | failed, these output parameters are valid and |
| 5620 | can be used to restart the operation, |
| 5621 | for example |
| 5622 | @param[out] m2 callback message |
| 5623 | @param[out] type OS_FILE_WRITE or ..._READ |
| 5624 | @return DB_SUCCESS or error code */ |
| 5625 | dberr_t |
| 5626 | os_aio_handler( |
| 5627 | ulint segment, |
| 5628 | fil_node_t** m1, |
| 5629 | void** m2, |
| 5630 | IORequest* request) |
| 5631 | { |
| 5632 | dberr_t err; |
| 5633 | |
| 5634 | if (srv_use_native_aio) { |
| 5635 | srv_set_io_thread_op_info(segment, "native aio handle" ); |
| 5636 | |
| 5637 | #ifdef WIN_ASYNC_IO |
| 5638 | |
| 5639 | err = os_aio_windows_handler(segment, 0, m1, m2, request); |
| 5640 | |
| 5641 | #elif defined(LINUX_NATIVE_AIO) |
| 5642 | |
| 5643 | err = os_aio_linux_handler(segment, m1, m2, request); |
| 5644 | |
| 5645 | #else |
| 5646 | ut_error; |
| 5647 | |
| 5648 | err = DB_ERROR; /* Eliminate compiler warning */ |
| 5649 | |
| 5650 | #endif /* WIN_ASYNC_IO */ |
| 5651 | |
| 5652 | } else { |
| 5653 | srv_set_io_thread_op_info(segment, "simulated aio handle" ); |
| 5654 | |
| 5655 | err = os_aio_simulated_handler(segment, m1, m2, request); |
| 5656 | } |
| 5657 | |
| 5658 | return(err); |
| 5659 | } |
| 5660 | |
| 5661 | #ifdef WIN_ASYNC_IO |
| 5662 | static HANDLE new_completion_port() |
| 5663 | { |
| 5664 | HANDLE h = CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0); |
| 5665 | ut_a(h); |
| 5666 | return h; |
| 5667 | } |
| 5668 | #endif |
| 5669 | |
| 5670 | /** Constructor |
| 5671 | @param[in] id The latch ID |
| 5672 | @param[in] n Number of AIO slots |
| 5673 | @param[in] segments Number of segments */ |
| 5674 | AIO::AIO( |
| 5675 | latch_id_t id, |
| 5676 | ulint n, |
| 5677 | ulint segments) |
| 5678 | : |
| 5679 | m_slots(n), |
| 5680 | m_n_segments(segments), |
| 5681 | m_n_reserved() |
| 5682 | # ifdef LINUX_NATIVE_AIO |
| 5683 | ,m_aio_ctx(), |
| 5684 | m_events(m_slots.size()) |
| 5685 | # endif /* LINUX_NATIVE_AIO */ |
| 5686 | #ifdef WIN_ASYNC_IO |
| 5687 | ,m_completion_port(new_completion_port()) |
| 5688 | #endif |
| 5689 | { |
| 5690 | ut_a(n > 0); |
| 5691 | ut_a(m_n_segments > 0); |
| 5692 | |
| 5693 | mutex_create(id, &m_mutex); |
| 5694 | |
| 5695 | m_not_full = os_event_create("aio_not_full" ); |
| 5696 | m_is_empty = os_event_create("aio_is_empty" ); |
| 5697 | |
| 5698 | memset(&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size()); |
| 5699 | #ifdef LINUX_NATIVE_AIO |
| 5700 | memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size()); |
| 5701 | #endif /* LINUX_NATIVE_AIO */ |
| 5702 | |
| 5703 | os_event_set(m_is_empty); |
| 5704 | } |
| 5705 | |
| 5706 | /** Initialise the slots */ |
| 5707 | dberr_t |
| 5708 | AIO::init_slots() |
| 5709 | { |
| 5710 | for (ulint i = 0; i < m_slots.size(); ++i) { |
| 5711 | Slot& slot = m_slots[i]; |
| 5712 | |
| 5713 | slot.pos = static_cast<uint16_t>(i); |
| 5714 | |
| 5715 | slot.is_reserved = false; |
| 5716 | |
| 5717 | #ifdef WIN_ASYNC_IO |
| 5718 | |
| 5719 | slot.array = this; |
| 5720 | |
| 5721 | #elif defined(LINUX_NATIVE_AIO) |
| 5722 | |
| 5723 | slot.ret = 0; |
| 5724 | |
| 5725 | slot.n_bytes = 0; |
| 5726 | |
| 5727 | memset(&slot.control, 0x0, sizeof(slot.control)); |
| 5728 | |
| 5729 | #endif /* WIN_ASYNC_IO */ |
| 5730 | } |
| 5731 | |
| 5732 | return(DB_SUCCESS); |
| 5733 | } |
| 5734 | |
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: allocate one io_context
per segment of this array and set each of them up.

If a context cannot be created, native AIO is disabled globally
(srv_use_native_aio is cleared), the contexts that were already created
for this array are destroyed, and DB_SUCCESS is still returned so that the
caller continues with simulated AIO.
@return DB_SUCCESS, or DB_OUT_OF_MEMORY if the context array could not
be allocated */
dberr_t
AIO::init_linux_native_aio()
{
	/* Initialize the io_context array. One io_context
	per segment in the array. */

	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	io_context**	ctx = m_aio_ctx;
	ulint		max_events = slots_per_segment();

	for (ulint i = 0; i < m_n_segments; ++i, ++ctx) {

		if (!linux_create_io_ctx(max_events, ctx)) {
			/* If something bad happened during aio setup
			we disable linux native aio.
			AIO arrays that were fully initialised before
			this one keep their contexts; that is a small
			leak, but it is ok compared to a crash or a not
			working server.
			This frequently happens when running the test suite
			with many threads on a system with low fs.aio-max-nr!
			*/

			ib::warn()
				<< "Warning: Linux Native AIO disabled "
				<< "because _linux_create_io_ctx() "
				<< "failed. To get rid of this warning you can "
				<< "try increasing system "
				<< "fs.aio-max-nr to 1048576 or larger or "
				<< "setting innodb_use_native_aio = 0 in my.cnf";

			/* Give back the kernel contexts that were set up
			for the preceding segments of this array. Once the
			pointer array is freed they would be unreachable,
			and they count against fs.aio-max-nr. This is
			best-effort cleanup, so the return value is not
			checked. */
			for (ulint j = 0; j < i; ++j) {
				io_destroy(m_aio_ctx[j]);
			}

			ut_free(m_aio_ctx);
			m_aio_ctx = 0;
			srv_use_native_aio = FALSE;
			return(DB_SUCCESS);
		}
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
| 5784 | |
| 5785 | /** Initialise the array */ |
| 5786 | dberr_t |
| 5787 | AIO::init() |
| 5788 | { |
| 5789 | ut_a(!m_slots.empty()); |
| 5790 | |
| 5791 | |
| 5792 | if (srv_use_native_aio) { |
| 5793 | #ifdef LINUX_NATIVE_AIO |
| 5794 | dberr_t err = init_linux_native_aio(); |
| 5795 | |
| 5796 | if (err != DB_SUCCESS) { |
| 5797 | return(err); |
| 5798 | } |
| 5799 | |
| 5800 | #endif /* LINUX_NATIVE_AIO */ |
| 5801 | } |
| 5802 | |
| 5803 | return(init_slots()); |
| 5804 | } |
| 5805 | |
| 5806 | /** Creates an aio wait array. Note that we return NULL in case of failure. |
| 5807 | We don't care about freeing memory here because we assume that a |
| 5808 | failure will result in server refusing to start up. |
| 5809 | @param[in] id Latch ID |
| 5810 | @param[in] n maximum number of pending AIO operations |
| 5811 | allowed; n must be divisible by m_n_segments |
| 5812 | @param[in] n_segments number of segments in the AIO array |
| 5813 | @return own: AIO array, NULL on failure */ |
| 5814 | AIO* |
| 5815 | AIO::create( |
| 5816 | latch_id_t id, |
| 5817 | ulint n, |
| 5818 | ulint n_segments) |
| 5819 | { |
| 5820 | if ((n % n_segments)) { |
| 5821 | |
| 5822 | ib::error() |
| 5823 | << "Maximum number of AIO operations must be " |
| 5824 | << "divisible by number of segments" ; |
| 5825 | |
| 5826 | return(NULL); |
| 5827 | } |
| 5828 | |
| 5829 | AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments)); |
| 5830 | |
| 5831 | if (array != NULL && array->init() != DB_SUCCESS) { |
| 5832 | |
| 5833 | UT_DELETE(array); |
| 5834 | |
| 5835 | array = NULL; |
| 5836 | } |
| 5837 | |
| 5838 | return(array); |
| 5839 | } |
| 5840 | |
| 5841 | /** AIO destructor */ |
| 5842 | AIO::~AIO() |
| 5843 | { |
| 5844 | mutex_destroy(&m_mutex); |
| 5845 | |
| 5846 | os_event_destroy(m_not_full); |
| 5847 | os_event_destroy(m_is_empty); |
| 5848 | |
| 5849 | #if defined(LINUX_NATIVE_AIO) |
| 5850 | if (srv_use_native_aio) { |
| 5851 | m_events.clear(); |
| 5852 | ut_free(m_aio_ctx); |
| 5853 | } |
| 5854 | #endif /* LINUX_NATIVE_AIO */ |
| 5855 | #if defined(WIN_ASYNC_IO) |
| 5856 | CloseHandle(m_completion_port); |
| 5857 | #endif |
| 5858 | |
| 5859 | m_slots.clear(); |
| 5860 | } |
| 5861 | |
| 5862 | /** Initializes the asynchronous io system. Creates one array each for ibuf |
| 5863 | and log i/o. Also creates one array each for read and write where each |
| 5864 | array is divided logically into n_readers and n_writers |
| 5865 | respectively. The caller must create an i/o handler thread for each |
| 5866 | segment in these arrays. This function also creates the sync array. |
| 5867 | No i/o handler thread needs to be created for that |
| 5868 | @param[in] n_per_seg maximum number of pending aio |
| 5869 | operations allowed per segment |
| 5870 | @param[in] n_readers number of reader threads |
| 5871 | @param[in] n_writers number of writer threads |
| 5872 | @param[in] n_slots_sync number of slots in the sync aio array |
| 5873 | @return true if the AIO sub-system was started successfully */ |
| 5874 | bool |
| 5875 | AIO::start( |
| 5876 | ulint n_per_seg, |
| 5877 | ulint n_readers, |
| 5878 | ulint n_writers, |
| 5879 | ulint n_slots_sync) |
| 5880 | { |
| 5881 | #if defined(LINUX_NATIVE_AIO) |
| 5882 | /* Check if native aio is supported on this system and tmpfs */ |
| 5883 | if (srv_use_native_aio && !is_linux_native_aio_supported()) { |
| 5884 | |
| 5885 | ib::warn() << "Linux Native AIO disabled." ; |
| 5886 | |
| 5887 | srv_use_native_aio = FALSE; |
| 5888 | } |
| 5889 | #endif /* LINUX_NATIVE_AIO */ |
| 5890 | |
| 5891 | srv_reset_io_thread_op_info(); |
| 5892 | |
| 5893 | s_reads = create( |
| 5894 | LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers); |
| 5895 | |
| 5896 | if (s_reads == NULL) { |
| 5897 | return(false); |
| 5898 | } |
| 5899 | |
| 5900 | ulint start = srv_read_only_mode ? 0 : 2; |
| 5901 | ulint n_segs = n_readers + start; |
| 5902 | |
| 5903 | /* 0 is the ibuf segment and 1 is the redo log segment. */ |
| 5904 | for (ulint i = start; i < n_segs; ++i) { |
| 5905 | ut_a(i < SRV_MAX_N_IO_THREADS); |
| 5906 | srv_io_thread_function[i] = "read thread" ; |
| 5907 | } |
| 5908 | |
| 5909 | ulint n_segments = n_readers; |
| 5910 | |
| 5911 | if (!srv_read_only_mode) { |
| 5912 | |
| 5913 | s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1); |
| 5914 | |
| 5915 | if (s_ibuf == NULL) { |
| 5916 | return(false); |
| 5917 | } |
| 5918 | |
| 5919 | ++n_segments; |
| 5920 | |
| 5921 | srv_io_thread_function[0] = "insert buffer thread" ; |
| 5922 | |
| 5923 | s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1); |
| 5924 | |
| 5925 | if (s_log == NULL) { |
| 5926 | return(false); |
| 5927 | } |
| 5928 | |
| 5929 | ++n_segments; |
| 5930 | |
| 5931 | srv_io_thread_function[1] = "log thread" ; |
| 5932 | |
| 5933 | } else { |
| 5934 | s_ibuf = s_log = NULL; |
| 5935 | } |
| 5936 | |
| 5937 | s_writes = create( |
| 5938 | LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers); |
| 5939 | |
| 5940 | if (s_writes == NULL) { |
| 5941 | return(false); |
| 5942 | } |
| 5943 | |
| 5944 | #ifdef WIN_ASYNC_IO |
| 5945 | data_completion_port = s_writes->m_completion_port; |
| 5946 | log_completion_port = |
| 5947 | s_log ? s_log->m_completion_port : data_completion_port; |
| 5948 | #endif |
| 5949 | |
| 5950 | n_segments += n_writers; |
| 5951 | |
| 5952 | for (ulint i = start + n_readers; i < n_segments; ++i) { |
| 5953 | ut_a(i < SRV_MAX_N_IO_THREADS); |
| 5954 | srv_io_thread_function[i] = "write thread" ; |
| 5955 | } |
| 5956 | |
| 5957 | ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4)); |
| 5958 | |
| 5959 | s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1); |
| 5960 | |
| 5961 | if (s_sync == NULL) { |
| 5962 | |
| 5963 | return(false); |
| 5964 | } |
| 5965 | |
| 5966 | os_aio_n_segments = n_segments; |
| 5967 | |
| 5968 | os_aio_validate(); |
| 5969 | |
| 5970 | os_last_printout = ut_time(); |
| 5971 | |
| 5972 | if (srv_use_native_aio) { |
| 5973 | return(true); |
| 5974 | } |
| 5975 | |
| 5976 | os_aio_segment_wait_events = static_cast<os_event_t*>( |
| 5977 | ut_zalloc_nokey( |
| 5978 | n_segments * sizeof *os_aio_segment_wait_events)); |
| 5979 | |
| 5980 | if (os_aio_segment_wait_events == NULL) { |
| 5981 | |
| 5982 | return(false); |
| 5983 | } |
| 5984 | |
| 5985 | for (ulint i = 0; i < n_segments; ++i) { |
| 5986 | os_aio_segment_wait_events[i] = os_event_create(0); |
| 5987 | } |
| 5988 | |
| 5989 | return(true); |
| 5990 | } |
| 5991 | |
| 5992 | /** Free the AIO arrays */ |
| 5993 | void |
| 5994 | AIO::shutdown() |
| 5995 | { |
| 5996 | UT_DELETE(s_ibuf); |
| 5997 | s_ibuf = NULL; |
| 5998 | |
| 5999 | UT_DELETE(s_log); |
| 6000 | s_log = NULL; |
| 6001 | |
| 6002 | UT_DELETE(s_writes); |
| 6003 | s_writes = NULL; |
| 6004 | |
| 6005 | UT_DELETE(s_sync); |
| 6006 | s_sync = NULL; |
| 6007 | |
| 6008 | UT_DELETE(s_reads); |
| 6009 | s_reads = NULL; |
| 6010 | } |
| 6011 | |
| 6012 | /** Initializes the asynchronous io system. Creates one array each for ibuf |
| 6013 | and log i/o. Also creates one array each for read and write where each |
| 6014 | array is divided logically into n_readers and n_writers |
| 6015 | respectively. The caller must create an i/o handler thread for each |
| 6016 | segment in these arrays. This function also creates the sync array. |
| 6017 | No i/o handler thread needs to be created for that |
| 6018 | @param[in] n_readers number of reader threads |
| 6019 | @param[in] n_writers number of writer threads |
| 6020 | @param[in] n_slots_sync number of slots in the sync aio array */ |
| 6021 | bool |
| 6022 | os_aio_init( |
| 6023 | ulint n_readers, |
| 6024 | ulint n_writers, |
| 6025 | ulint n_slots_sync) |
| 6026 | { |
| 6027 | /* Maximum number of pending aio operations allowed per segment */ |
| 6028 | ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD; |
| 6029 | |
| 6030 | return(AIO::start(limit, n_readers, n_writers, n_slots_sync)); |
| 6031 | } |
| 6032 | |
| 6033 | /** Frees the asynchronous io system. */ |
| 6034 | void |
| 6035 | os_aio_free() |
| 6036 | { |
| 6037 | AIO::shutdown(); |
| 6038 | |
| 6039 | ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio); |
| 6040 | ut_ad(srv_use_native_aio || os_aio_segment_wait_events |
| 6041 | || !srv_was_started); |
| 6042 | |
| 6043 | if (!srv_use_native_aio && os_aio_segment_wait_events) { |
| 6044 | for (ulint i = 0; i < os_aio_n_segments; i++) { |
| 6045 | os_event_destroy(os_aio_segment_wait_events[i]); |
| 6046 | } |
| 6047 | |
| 6048 | ut_free(os_aio_segment_wait_events); |
| 6049 | os_aio_segment_wait_events = 0; |
| 6050 | } |
| 6051 | os_aio_n_segments = 0; |
| 6052 | } |
| 6053 | |
| 6054 | /** Wakes up all async i/o threads so that they know to exit themselves in |
| 6055 | shutdown. */ |
| 6056 | void |
| 6057 | os_aio_wake_all_threads_at_shutdown() |
| 6058 | { |
| 6059 | #ifdef WIN_ASYNC_IO |
| 6060 | AIO::wake_at_shutdown(); |
| 6061 | #elif defined(LINUX_NATIVE_AIO) |
| 6062 | /* When using native AIO interface the io helper threads |
| 6063 | wait on io_getevents with a timeout value of 500ms. At |
| 6064 | each wake up these threads check the server status. |
| 6065 | No need to do anything to wake them up. */ |
| 6066 | #endif /* !WIN_ASYNC_AIO */ |
| 6067 | |
| 6068 | if (srv_use_native_aio) { |
| 6069 | return; |
| 6070 | } |
| 6071 | |
| 6072 | /* This loop wakes up all simulated ai/o threads */ |
| 6073 | |
| 6074 | for (ulint i = 0; i < os_aio_n_segments; ++i) { |
| 6075 | |
| 6076 | os_event_set(os_aio_segment_wait_events[i]); |
| 6077 | } |
| 6078 | } |
| 6079 | |
/** Waits until there are no pending writes in AIO::s_writes. There can
be other, synchronous, pending writes.
This is a free-function wrapper that forwards to
AIO::wait_until_no_pending_writes(). */
void
os_aio_wait_until_no_pending_writes()
{
	AIO::wait_until_no_pending_writes();
}
| 6087 | |
| 6088 | /** Calculates segment number for a slot. |
| 6089 | @param[in] array AIO wait array |
| 6090 | @param[in] slot slot in this array |
| 6091 | @return segment number (which is the number used by, for example, |
| 6092 | I/O-handler threads) */ |
| 6093 | ulint |
| 6094 | AIO::get_segment_no_from_slot( |
| 6095 | const AIO* array, |
| 6096 | const Slot* slot) |
| 6097 | { |
| 6098 | ulint segment; |
| 6099 | ulint seg_len; |
| 6100 | |
| 6101 | if (array == s_ibuf) { |
| 6102 | ut_ad(!srv_read_only_mode); |
| 6103 | |
| 6104 | segment = IO_IBUF_SEGMENT; |
| 6105 | |
| 6106 | } else if (array == s_log) { |
| 6107 | ut_ad(!srv_read_only_mode); |
| 6108 | |
| 6109 | segment = IO_LOG_SEGMENT; |
| 6110 | |
| 6111 | } else if (array == s_reads) { |
| 6112 | seg_len = s_reads->slots_per_segment(); |
| 6113 | |
| 6114 | segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; |
| 6115 | } else { |
| 6116 | ut_a(array == s_writes); |
| 6117 | |
| 6118 | seg_len = s_writes->slots_per_segment(); |
| 6119 | |
| 6120 | segment = s_reads->m_n_segments |
| 6121 | + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; |
| 6122 | } |
| 6123 | |
| 6124 | return(segment); |
| 6125 | } |
| 6126 | |
| 6127 | /** Requests for a slot in the aio array. If no slot is available, waits until |
| 6128 | not_full-event becomes signaled. |
| 6129 | |
| 6130 | @param[in] type IO context |
| 6131 | @param[in,out] m1 message to be passed along with the AIO |
| 6132 | operation |
| 6133 | @param[in,out] m2 message to be passed along with the AIO |
| 6134 | operation |
| 6135 | @param[in] file file handle |
| 6136 | @param[in] name name of the file or path as a NUL-terminated |
| 6137 | string |
| 6138 | @param[in,out] buf buffer where to read or from which to write |
| 6139 | @param[in] offset file offset, where to read from or start writing |
| 6140 | @param[in] len length of the block to read or write |
| 6141 | @return pointer to slot */ |
| 6142 | Slot* |
| 6143 | AIO::reserve_slot( |
| 6144 | const IORequest& type, |
| 6145 | fil_node_t* m1, |
| 6146 | void* m2, |
| 6147 | pfs_os_file_t file, |
| 6148 | const char* name, |
| 6149 | void* buf, |
| 6150 | os_offset_t offset, |
| 6151 | ulint len) |
| 6152 | { |
| 6153 | #ifdef WIN_ASYNC_IO |
| 6154 | ut_a((len & 0xFFFFFFFFUL) == len); |
| 6155 | #endif /* WIN_ASYNC_IO */ |
| 6156 | |
| 6157 | /* No need of a mutex. Only reading constant fields */ |
| 6158 | ulint slots_per_seg; |
| 6159 | |
| 6160 | ut_ad(type.validate()); |
| 6161 | |
| 6162 | slots_per_seg = slots_per_segment(); |
| 6163 | |
| 6164 | /* We attempt to keep adjacent blocks in the same local |
| 6165 | segment. This can help in merging IO requests when we are |
| 6166 | doing simulated AIO */ |
| 6167 | ulint local_seg; |
| 6168 | |
| 6169 | local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments; |
| 6170 | |
| 6171 | for (;;) { |
| 6172 | |
| 6173 | acquire(); |
| 6174 | |
| 6175 | if (m_n_reserved != m_slots.size()) { |
| 6176 | break; |
| 6177 | } |
| 6178 | |
| 6179 | release(); |
| 6180 | |
| 6181 | if (!srv_use_native_aio) { |
| 6182 | /* If the handler threads are suspended, |
| 6183 | wake them so that we get more slots */ |
| 6184 | |
| 6185 | os_aio_simulated_wake_handler_threads(); |
| 6186 | } |
| 6187 | |
| 6188 | os_event_wait(m_not_full); |
| 6189 | } |
| 6190 | |
| 6191 | ulint counter = 0; |
| 6192 | Slot* slot = NULL; |
| 6193 | |
| 6194 | /* We start our search for an available slot from our preferred |
| 6195 | local segment and do a full scan of the array. We are |
| 6196 | guaranteed to find a slot in full scan. */ |
| 6197 | for (ulint i = local_seg * slots_per_seg; |
| 6198 | counter < m_slots.size(); |
| 6199 | ++i, ++counter) { |
| 6200 | |
| 6201 | i %= m_slots.size(); |
| 6202 | |
| 6203 | slot = at(i); |
| 6204 | |
| 6205 | if (slot->is_reserved == false) { |
| 6206 | break; |
| 6207 | } |
| 6208 | } |
| 6209 | |
| 6210 | /* We MUST always be able to get hold of a reserved slot. */ |
| 6211 | ut_a(counter < m_slots.size()); |
| 6212 | |
| 6213 | ut_a(slot->is_reserved == false); |
| 6214 | |
| 6215 | ++m_n_reserved; |
| 6216 | |
| 6217 | if (m_n_reserved == 1) { |
| 6218 | os_event_reset(m_is_empty); |
| 6219 | } |
| 6220 | |
| 6221 | if (m_n_reserved == m_slots.size()) { |
| 6222 | os_event_reset(m_not_full); |
| 6223 | } |
| 6224 | |
| 6225 | slot->is_reserved = true; |
| 6226 | slot->reservation_time = ut_time(); |
| 6227 | slot->m1 = m1; |
| 6228 | slot->m2 = m2; |
| 6229 | slot->file = file; |
| 6230 | slot->name = name; |
| 6231 | #ifdef _WIN32 |
| 6232 | slot->len = static_cast<DWORD>(len); |
| 6233 | #else |
| 6234 | slot->len = static_cast<ulint>(len); |
| 6235 | #endif /* _WIN32 */ |
| 6236 | slot->type = type; |
| 6237 | slot->buf = static_cast<byte*>(buf); |
| 6238 | slot->ptr = slot->buf; |
| 6239 | slot->offset = offset; |
| 6240 | slot->err = DB_SUCCESS; |
| 6241 | slot->original_len = static_cast<uint32>(len); |
| 6242 | slot->io_already_done = false; |
| 6243 | slot->buf = static_cast<byte*>(buf); |
| 6244 | |
| 6245 | #ifdef WIN_ASYNC_IO |
| 6246 | { |
| 6247 | OVERLAPPED* control; |
| 6248 | |
| 6249 | control = &slot->control; |
| 6250 | control->Offset = (DWORD) offset & 0xFFFFFFFF; |
| 6251 | control->OffsetHigh = (DWORD) (offset >> 32); |
| 6252 | } |
| 6253 | #elif defined(LINUX_NATIVE_AIO) |
| 6254 | |
| 6255 | /* If we are not using native AIO skip this part. */ |
| 6256 | if (srv_use_native_aio) { |
| 6257 | |
| 6258 | off_t aio_offset; |
| 6259 | |
| 6260 | /* Check if we are dealing with 64 bit arch. |
| 6261 | If not then make sure that offset fits in 32 bits. */ |
| 6262 | aio_offset = (off_t) offset; |
| 6263 | |
| 6264 | ut_a(sizeof(aio_offset) >= sizeof(offset) |
| 6265 | || ((os_offset_t) aio_offset) == offset); |
| 6266 | |
| 6267 | struct iocb* iocb = &slot->control; |
| 6268 | |
| 6269 | if (type.is_read()) { |
| 6270 | |
| 6271 | io_prep_pread( |
| 6272 | iocb, file, slot->ptr, slot->len, aio_offset); |
| 6273 | } else { |
| 6274 | ut_ad(type.is_write()); |
| 6275 | |
| 6276 | io_prep_pwrite( |
| 6277 | iocb, file, slot->ptr, slot->len, aio_offset); |
| 6278 | } |
| 6279 | |
| 6280 | iocb->data = slot; |
| 6281 | |
| 6282 | slot->n_bytes = 0; |
| 6283 | slot->ret = 0; |
| 6284 | } |
| 6285 | #endif /* LINUX_NATIVE_AIO */ |
| 6286 | |
| 6287 | release(); |
| 6288 | |
| 6289 | return(slot); |
| 6290 | } |
| 6291 | |
| 6292 | /** Wakes up a simulated aio i/o-handler thread if it has something to do. |
| 6293 | @param[in] global_segment The number of the segment in the AIO arrays */ |
| 6294 | void |
| 6295 | AIO::wake_simulated_handler_thread(ulint global_segment) |
| 6296 | { |
| 6297 | ut_ad(!srv_use_native_aio); |
| 6298 | |
| 6299 | AIO* array; |
| 6300 | ulint segment = get_array_and_local_segment(&array, global_segment); |
| 6301 | |
| 6302 | array->wake_simulated_handler_thread(global_segment, segment); |
| 6303 | } |
| 6304 | |
| 6305 | /** Wakes up a simulated AIO I/O-handler thread if it has something to do |
| 6306 | for a local segment in the AIO array. |
| 6307 | @param[in] global_segment The number of the segment in the AIO arrays |
| 6308 | @param[in] segment The local segment in the AIO array */ |
| 6309 | void |
| 6310 | AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment) |
| 6311 | { |
| 6312 | ut_ad(!srv_use_native_aio); |
| 6313 | |
| 6314 | ulint n = slots_per_segment(); |
| 6315 | ulint offset = segment * n; |
| 6316 | |
| 6317 | /* Look through n slots after the segment * n'th slot */ |
| 6318 | |
| 6319 | acquire(); |
| 6320 | |
| 6321 | const Slot* slot = at(offset); |
| 6322 | |
| 6323 | for (ulint i = 0; i < n; ++i, ++slot) { |
| 6324 | |
| 6325 | if (slot->is_reserved) { |
| 6326 | |
| 6327 | /* Found an i/o request */ |
| 6328 | |
| 6329 | release(); |
| 6330 | |
| 6331 | os_event_t event; |
| 6332 | |
| 6333 | event = os_aio_segment_wait_events[global_segment]; |
| 6334 | |
| 6335 | os_event_set(event); |
| 6336 | |
| 6337 | return; |
| 6338 | } |
| 6339 | } |
| 6340 | |
| 6341 | release(); |
| 6342 | } |
| 6343 | |
| 6344 | /** Wakes up simulated aio i/o-handler threads if they have something to do. */ |
| 6345 | void |
| 6346 | os_aio_simulated_wake_handler_threads() |
| 6347 | { |
| 6348 | if (srv_use_native_aio) { |
| 6349 | /* We do not use simulated aio: do nothing */ |
| 6350 | |
| 6351 | return; |
| 6352 | } |
| 6353 | |
| 6354 | os_aio_recommend_sleep_for_read_threads = false; |
| 6355 | |
| 6356 | for (ulint i = 0; i < os_aio_n_segments; i++) { |
| 6357 | AIO::wake_simulated_handler_thread(i); |
| 6358 | } |
| 6359 | } |
| 6360 | |
| 6361 | /** Select the IO slot array |
| 6362 | @param[in,out] type Type of IO, READ or WRITE |
| 6363 | @param[in] read_only true if running in read-only mode |
| 6364 | @param[in] mode IO mode |
| 6365 | @return slot array or NULL if invalid mode specified */ |
| 6366 | AIO* |
| 6367 | AIO::select_slot_array(IORequest& type, bool read_only, ulint mode) |
| 6368 | { |
| 6369 | AIO* array; |
| 6370 | |
| 6371 | ut_ad(type.validate()); |
| 6372 | |
| 6373 | switch (mode) { |
| 6374 | case OS_AIO_NORMAL: |
| 6375 | |
| 6376 | array = type.is_read() ? AIO::s_reads : AIO::s_writes; |
| 6377 | break; |
| 6378 | |
| 6379 | case OS_AIO_IBUF: |
| 6380 | ut_ad(type.is_read()); |
| 6381 | |
| 6382 | /* Reduce probability of deadlock bugs in connection with ibuf: |
| 6383 | do not let the ibuf i/o handler sleep */ |
| 6384 | |
| 6385 | type.clear_do_not_wake(); |
| 6386 | |
| 6387 | array = read_only ? AIO::s_reads : AIO::s_ibuf; |
| 6388 | break; |
| 6389 | |
| 6390 | case OS_AIO_LOG: |
| 6391 | |
| 6392 | array = read_only ? AIO::s_reads : AIO::s_log; |
| 6393 | break; |
| 6394 | |
| 6395 | case OS_AIO_SYNC: |
| 6396 | |
| 6397 | array = AIO::s_sync; |
| 6398 | #if defined(LINUX_NATIVE_AIO) |
| 6399 | /* In Linux native AIO we don't use sync IO array. */ |
| 6400 | ut_a(!srv_use_native_aio); |
| 6401 | #endif /* LINUX_NATIVE_AIO */ |
| 6402 | break; |
| 6403 | |
| 6404 | default: |
| 6405 | ut_error; |
| 6406 | array = NULL; /* Eliminate compiler warning */ |
| 6407 | } |
| 6408 | |
| 6409 | return(array); |
| 6410 | } |
| 6411 | |
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete on the completion port of the array
that the given segment belongs to. Completions that belong to a different
array are forwarded to that array's completion port. If the operation
failed or transferred fewer bytes than requested and
os_file_handle_error() asks for a retry, the operation is retried
synchronously.
NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment	The number of the segment in the aio arrays to
			wait for; segment 0 is the ibuf I/O thread,
			segment 1 the log I/O thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads; must not be
			ULINT_UNDEFINED (asserted)
@param[in]	pos	not referenced in the function body
@param[out]	m1	the messages passed with the AIO request; note
			that also in the case where the AIO operation
			failed, these output parameters are valid and
			can be used to restart the operation,
			for example; set to NULL when the shutdown key
			was received
@param[out]	m2	callback message; set to NULL when the shutdown
			key was received
@param[out]	type	IO request stored in the completed slot
			(OS_FILE_WRITE or ..._READ); not written when the
			shutdown key was received
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot* slot= 0;
	dberr_t		err;

	BOOL		ret;
	ULONG_PTR	key;

	ut_a(segment != ULINT_UNDEFINED);

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());
	AIO *my_array;
	AIO::get_array_and_local_segment(&my_array, segment);

	HANDLE port = my_array->m_completion_port;
	ut_ad(port);
	/* Dequeue completions until one arrives that this thread has to
	process itself: a failed I/O, or a completion whose slot belongs
	to an array using this very port. */
	for (;;) {
		DWORD len;
		/* NOTE(review): the dequeued OVERLAPPED pointer is stored
		directly into a Slot pointer; this presumably relies on
		'control' being the first member of Slot - confirm. */
		ret = GetQueuedCompletionStatus(port, &len, &key,
			(OVERLAPPED **)&slot, INFINITE);

		/* If shutdown key was received, repost the shutdown message and exit */
		if (ret && key == IOCP_SHUTDOWN_KEY) {
			PostQueuedCompletionStatus(port, 0, key, NULL);
			*m1 = NULL;
			*m2 = NULL;
			return (DB_SUCCESS);
		}

		ut_a(slot);

		if (!ret) {
			/* IO failed */
			break;
		}

		/* Record the number of bytes transferred. */
		slot->n_bytes= len;
		ut_a(slot->array);
		HANDLE slot_port = slot->array->m_completion_port;
		if (slot_port != port) {
			/* there are no redirections between data and log */
			ut_ad(port == data_completion_port);
			ut_ad(slot_port != log_completion_port);

			/*
			Redirect completions to the dedicated completion port
			and threads.

			"Write array" threads receive write,read and ibuf
			notifications, read and ibuf completions are redirected.

			Forwarding IO completion this way costs a context switch,
			and this seems tolerable since asynchronous reads are by
			far less frequent.
			*/
			ut_a(PostQueuedCompletionStatus(slot_port,
				len, key, &slot->control));
		}
		else {
			break;
		}
	}

	ut_a(slot->is_reserved);

	/* Hand the request messages and the request type back
	to the caller. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	bool retry = false;

	/* Success requires both a successful completion and a
	full-length transfer. */
	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (os_file_handle_error(slot->name, "Windows aio")) {

		/* err is assigned by the synchronous retry below. */
		retry = true;

	} else {

		err = DB_IO_ERROR;
	}


	if (retry) {
		/* Retry failed read/write operation synchronously. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		PSI_file_locker_state	state;
		struct PSI_file_locker* locker = NULL;

		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	/* Give the slot back to the array that owns it. */
	slot->array->release_with_mutex(slot);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		&& !buf_page_cleaner_is_active
		&& os_aio_all_slots_free()) {
		/* Last IO, wakeup other io threads */
		AIO::wake_at_shutdown();
	}
	return(err);
}
#endif /* WIN_ASYNC_IO */
| 6577 | |
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation. With mode OS_AIO_SYNC the request
is instead performed synchronously through os_file_read_func() or
os_file_write_func() and no slot is reserved. Otherwise a slot is reserved
in the array selected by the mode; with native AIO the request is
dispatched to the operating system, and with simulated AIO the handler
thread of the slot's segment is woken up if type.is_wake() holds. If a
native dispatch fails, the slot is released, and the request is started
over when os_file_handle_error() returns true.
@param[in,out]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer where to read, or from which to write
@param[in]	offset		file offset where to read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC

@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2)
{
#ifdef WIN_ASYNC_IO
	/* Stays TRUE unless ReadFile()/WriteFile() below reports
	a failure. */
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	/* Debug injection: force the synchronous path. */
	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);

	if (mode == OS_AIO_SYNC) {
		/* Synchronous request: no slot, no handler thread. */
		if (type.is_read()) {
			return(os_file_read_func(type, file, buf, offset, n));
		}

		ut_ad(type.is_write());

		return(os_file_write_func(type, name, file, buf, offset, n));
	}

	/* A failed dispatch comes back here when os_file_handle_error()
	asks for a retry. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);

	if (type.is_read()) {


		if (srv_use_native_aio) {

			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			segment that the slot belongs to. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file, slot->ptr, slot->len,
				NULL, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			segment that the slot belongs to. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* Neither a read nor a write request. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (ret || (GetLastError() == ERROR_IO_PENDING)) {
		/* aio completed or was queued successfully! */
		return(DB_SUCCESS);
	}

	goto err_exit;

#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* The dispatch failed: give the slot back before deciding
	whether to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
| 6718 | |
| 6719 | /** Simulated AIO handler for reaping IO requests */ |
| 6720 | class SimulatedAIOHandler { |
| 6721 | |
| 6722 | public: |
| 6723 | |
| 6724 | /** Constructor |
| 6725 | @param[in,out] array The AIO array |
| 6726 | @param[in] segment Local segment in the array */ |
| 6727 | SimulatedAIOHandler(AIO* array, ulint segment) |
| 6728 | : |
| 6729 | m_oldest(), |
| 6730 | m_n_elems(), |
| 6731 | m_lowest_offset(IB_UINT64_MAX), |
| 6732 | m_array(array), |
| 6733 | m_n_slots(), |
| 6734 | m_segment(segment), |
| 6735 | m_ptr(), |
| 6736 | m_buf() |
| 6737 | { |
| 6738 | ut_ad(m_segment < 100); |
| 6739 | |
| 6740 | m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE); |
| 6741 | } |
| 6742 | |
| 6743 | /** Destructor */ |
| 6744 | ~SimulatedAIOHandler() |
| 6745 | { |
| 6746 | if (m_ptr != NULL) { |
| 6747 | ut_free(m_ptr); |
| 6748 | } |
| 6749 | } |
| 6750 | |
| 6751 | /** Reset the state of the handler |
| 6752 | @param[in] n_slots Number of pending AIO operations supported */ |
| 6753 | void init(ulint n_slots) |
| 6754 | { |
| 6755 | m_oldest = 0; |
| 6756 | m_n_elems = 0; |
| 6757 | m_n_slots = n_slots; |
| 6758 | m_lowest_offset = IB_UINT64_MAX; |
| 6759 | |
| 6760 | if (m_ptr != NULL) { |
| 6761 | ut_free(m_ptr); |
| 6762 | m_ptr = m_buf = NULL; |
| 6763 | } |
| 6764 | |
| 6765 | m_slots[0] = NULL; |
| 6766 | } |
| 6767 | |
| 6768 | /** Check if there is a slot for which the i/o has already been done |
| 6769 | @param[out] n_reserved Number of reserved slots |
| 6770 | @return the first completed slot that is found. */ |
| 6771 | Slot* check_completed(ulint* n_reserved) |
| 6772 | { |
| 6773 | ulint offset = m_segment * m_n_slots; |
| 6774 | |
| 6775 | *n_reserved = 0; |
| 6776 | |
| 6777 | Slot* slot; |
| 6778 | |
| 6779 | slot = m_array->at(offset); |
| 6780 | |
| 6781 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
| 6782 | |
| 6783 | if (slot->is_reserved) { |
| 6784 | |
| 6785 | if (slot->io_already_done) { |
| 6786 | |
| 6787 | ut_a(slot->is_reserved); |
| 6788 | |
| 6789 | return(slot); |
| 6790 | } |
| 6791 | |
| 6792 | ++*n_reserved; |
| 6793 | } |
| 6794 | } |
| 6795 | |
| 6796 | return(NULL); |
| 6797 | } |
| 6798 | |
| 6799 | /** If there are at least 2 seconds old requests, then pick the |
| 6800 | oldest one to prevent starvation. If several requests have the |
| 6801 | same age, then pick the one at the lowest offset. |
| 6802 | @return true if request was selected */ |
| 6803 | bool select() |
| 6804 | { |
| 6805 | if (!select_oldest()) { |
| 6806 | |
| 6807 | return(select_lowest_offset()); |
| 6808 | } |
| 6809 | |
| 6810 | return(true); |
| 6811 | } |
| 6812 | |
| 6813 | /** Check if there are several consecutive blocks |
| 6814 | to read or write. Merge them if found. */ |
| 6815 | void merge() |
| 6816 | { |
| 6817 | /* if m_n_elems != 0, then we have assigned |
| 6818 | something valid to consecutive_ios[0] */ |
| 6819 | ut_ad(m_n_elems != 0); |
| 6820 | ut_ad(first_slot() != NULL); |
| 6821 | |
| 6822 | Slot* slot = first_slot(); |
| 6823 | |
| 6824 | while (!merge_adjacent(slot)) { |
| 6825 | /* No op */ |
| 6826 | } |
| 6827 | } |
| 6828 | |
| 6829 | /** We have now collected n_consecutive I/O requests |
| 6830 | in the array; allocate a single buffer which can hold |
| 6831 | all data, and perform the I/O |
| 6832 | @return the length of the buffer */ |
| 6833 | ulint allocate_buffer() |
| 6834 | MY_ATTRIBUTE((warn_unused_result)) |
| 6835 | { |
| 6836 | ulint len; |
| 6837 | Slot* slot = first_slot(); |
| 6838 | |
| 6839 | ut_ad(m_ptr == NULL); |
| 6840 | |
| 6841 | if (slot->type.is_read() && m_n_elems > 1) { |
| 6842 | |
| 6843 | len = 0; |
| 6844 | |
| 6845 | for (ulint i = 0; i < m_n_elems; ++i) { |
| 6846 | len += m_slots[i]->len; |
| 6847 | } |
| 6848 | |
| 6849 | m_ptr = static_cast<byte*>( |
| 6850 | ut_malloc_nokey(len + srv_page_size)); |
| 6851 | |
| 6852 | m_buf = static_cast<byte*>( |
| 6853 | ut_align(m_ptr, srv_page_size)); |
| 6854 | |
| 6855 | } else { |
| 6856 | len = first_slot()->len; |
| 6857 | m_buf = first_slot()->buf; |
| 6858 | } |
| 6859 | |
| 6860 | return(len); |
| 6861 | } |
| 6862 | |
| 6863 | /** We have to compress the individual pages and punch |
| 6864 | holes in them on a page by page basis when writing to |
| 6865 | tables that can be compresed at the IO level. |
| 6866 | @param[in] len Value returned by allocate_buffer */ |
| 6867 | void copy_to_buffer(ulint len) |
| 6868 | { |
| 6869 | Slot* slot = first_slot(); |
| 6870 | |
| 6871 | if (len > slot->len && slot->type.is_write()) { |
| 6872 | |
| 6873 | byte* ptr = m_buf; |
| 6874 | |
| 6875 | ut_ad(ptr != slot->buf); |
| 6876 | |
| 6877 | /* Copy the buffers to the combined buffer */ |
| 6878 | for (ulint i = 0; i < m_n_elems; ++i) { |
| 6879 | |
| 6880 | slot = m_slots[i]; |
| 6881 | |
| 6882 | memmove(ptr, slot->buf, slot->len); |
| 6883 | |
| 6884 | ptr += slot->len; |
| 6885 | } |
| 6886 | } |
| 6887 | } |
| 6888 | |
| 6889 | /** Do the I/O with ordinary, synchronous i/o functions: |
| 6890 | @param[in] len Length of buffer for IO */ |
| 6891 | void io() |
| 6892 | { |
| 6893 | if (first_slot()->type.is_write()) { |
| 6894 | |
| 6895 | for (ulint i = 0; i < m_n_elems; ++i) { |
| 6896 | write(m_slots[i]); |
| 6897 | } |
| 6898 | |
| 6899 | } else { |
| 6900 | |
| 6901 | for (ulint i = 0; i < m_n_elems; ++i) { |
| 6902 | read(m_slots[i]); |
| 6903 | } |
| 6904 | } |
| 6905 | } |
| 6906 | |
| 6907 | /** Mark the i/os done in slots */ |
| 6908 | void done() |
| 6909 | { |
| 6910 | for (ulint i = 0; i < m_n_elems; ++i) { |
| 6911 | m_slots[i]->io_already_done = true; |
| 6912 | } |
| 6913 | } |
| 6914 | |
| 6915 | /** @return the first slot in the consecutive array */ |
| 6916 | Slot* first_slot() |
| 6917 | MY_ATTRIBUTE((warn_unused_result)) |
| 6918 | { |
| 6919 | ut_a(m_n_elems > 0); |
| 6920 | |
| 6921 | return(m_slots[0]); |
| 6922 | } |
| 6923 | |
| 6924 | /** Wait for I/O requests |
| 6925 | @param[in] global_segment The global segment |
| 6926 | @param[in,out] event Wait on event if no active requests |
| 6927 | @return the number of slots */ |
| 6928 | ulint check_pending( |
| 6929 | ulint global_segment, |
| 6930 | os_event_t event) |
| 6931 | MY_ATTRIBUTE((warn_unused_result)); |
| 6932 | private: |
| 6933 | |
| 6934 | /** Do the file read |
| 6935 | @param[in,out] slot Slot that has the IO context */ |
| 6936 | void read(Slot* slot) |
| 6937 | { |
| 6938 | dberr_t err = os_file_read( |
| 6939 | slot->type, |
| 6940 | slot->file, |
| 6941 | slot->ptr, |
| 6942 | slot->offset, |
| 6943 | slot->len); |
| 6944 | |
| 6945 | ut_a(err == DB_SUCCESS); |
| 6946 | } |
| 6947 | |
| 6948 | /** Do the file read |
| 6949 | @param[in,out] slot Slot that has the IO context */ |
| 6950 | void write(Slot* slot) |
| 6951 | { |
| 6952 | dberr_t err = os_file_write( |
| 6953 | slot->type, |
| 6954 | slot->name, |
| 6955 | slot->file, |
| 6956 | slot->ptr, |
| 6957 | slot->offset, |
| 6958 | slot->len); |
| 6959 | |
| 6960 | ut_a(err == DB_SUCCESS); |
| 6961 | } |
| 6962 | |
| 6963 | /** @return true if the slots are adjacent and can be merged */ |
| 6964 | bool adjacent(const Slot* s1, const Slot* s2) const |
| 6965 | { |
| 6966 | return(s1 != s2 |
| 6967 | && s1->file == s2->file |
| 6968 | && s2->offset == s1->offset + s1->len |
| 6969 | && s1->type == s2->type); |
| 6970 | } |
| 6971 | |
| 6972 | /** @return true if merge limit reached or no adjacent slots found. */ |
| 6973 | bool merge_adjacent(Slot*& current) |
| 6974 | { |
| 6975 | Slot* slot; |
| 6976 | ulint offset = m_segment * m_n_slots; |
| 6977 | |
| 6978 | slot = m_array->at(offset); |
| 6979 | |
| 6980 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
| 6981 | |
| 6982 | if (slot->is_reserved && adjacent(current, slot)) { |
| 6983 | |
| 6984 | current = slot; |
| 6985 | |
| 6986 | /* Found a consecutive i/o request */ |
| 6987 | |
| 6988 | m_slots[m_n_elems] = slot; |
| 6989 | |
| 6990 | ++m_n_elems; |
| 6991 | |
| 6992 | return(m_n_elems >= m_slots.capacity()); |
| 6993 | } |
| 6994 | } |
| 6995 | |
| 6996 | return(true); |
| 6997 | } |
| 6998 | |
| 6999 | /** There were no old requests. Look for an I/O request at the lowest |
| 7000 | offset in the array (we ignore the high 32 bits of the offset in these |
| 7001 | heuristics) */ |
| 7002 | bool select_lowest_offset() |
| 7003 | { |
| 7004 | ut_ad(m_n_elems == 0); |
| 7005 | |
| 7006 | ulint offset = m_segment * m_n_slots; |
| 7007 | |
| 7008 | m_lowest_offset = IB_UINT64_MAX; |
| 7009 | |
| 7010 | for (ulint i = 0; i < m_n_slots; ++i) { |
| 7011 | Slot* slot; |
| 7012 | |
| 7013 | slot = m_array->at(i + offset); |
| 7014 | |
| 7015 | if (slot->is_reserved |
| 7016 | && slot->offset < m_lowest_offset) { |
| 7017 | |
| 7018 | /* Found an i/o request */ |
| 7019 | m_slots[0] = slot; |
| 7020 | |
| 7021 | m_n_elems = 1; |
| 7022 | |
| 7023 | m_lowest_offset = slot->offset; |
| 7024 | } |
| 7025 | } |
| 7026 | |
| 7027 | return(m_n_elems > 0); |
| 7028 | } |
| 7029 | |
| 7030 | /** Select the slot if it is older than the current oldest slot. |
| 7031 | @param[in] slot The slot to check */ |
| 7032 | void select_if_older(Slot* slot) |
| 7033 | { |
| 7034 | ulint age; |
| 7035 | |
| 7036 | age = (ulint) difftime(ut_time(), slot->reservation_time); |
| 7037 | |
| 7038 | if ((age >= 2 && age > m_oldest) |
| 7039 | || (age >= 2 |
| 7040 | && age == m_oldest |
| 7041 | && slot->offset < m_lowest_offset)) { |
| 7042 | |
| 7043 | /* Found an i/o request */ |
| 7044 | m_slots[0] = slot; |
| 7045 | |
| 7046 | m_n_elems = 1; |
| 7047 | |
| 7048 | m_oldest = age; |
| 7049 | |
| 7050 | m_lowest_offset = slot->offset; |
| 7051 | } |
| 7052 | } |
| 7053 | |
| 7054 | /** Select th oldest slot in the array |
| 7055 | @return true if oldest slot found */ |
| 7056 | bool select_oldest() |
| 7057 | { |
| 7058 | ut_ad(m_n_elems == 0); |
| 7059 | |
| 7060 | Slot* slot; |
| 7061 | ulint offset = m_n_slots * m_segment; |
| 7062 | |
| 7063 | slot = m_array->at(offset); |
| 7064 | |
| 7065 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
| 7066 | |
| 7067 | if (slot->is_reserved) { |
| 7068 | select_if_older(slot); |
| 7069 | } |
| 7070 | } |
| 7071 | |
| 7072 | return(m_n_elems > 0); |
| 7073 | } |
| 7074 | |
| 7075 | typedef std::vector<Slot*> slots_t; |
| 7076 | |
| 7077 | private: |
| 7078 | ulint m_oldest; |
| 7079 | ulint m_n_elems; |
| 7080 | os_offset_t m_lowest_offset; |
| 7081 | |
| 7082 | AIO* m_array; |
| 7083 | ulint m_n_slots; |
| 7084 | ulint m_segment; |
| 7085 | |
| 7086 | slots_t m_slots; |
| 7087 | |
| 7088 | byte* m_ptr; |
| 7089 | byte* m_buf; |
| 7090 | }; |
| 7091 | |
| 7092 | /** Wait for I/O requests |
| 7093 | @return the number of slots */ |
| 7094 | ulint |
| 7095 | SimulatedAIOHandler::check_pending( |
| 7096 | ulint global_segment, |
| 7097 | os_event_t event) |
| 7098 | { |
| 7099 | /* NOTE! We only access constant fields in os_aio_array. |
| 7100 | Therefore we do not have to acquire the protecting mutex yet */ |
| 7101 | |
| 7102 | ut_ad(os_aio_validate_skip()); |
| 7103 | |
| 7104 | ut_ad(m_segment < m_array->get_n_segments()); |
| 7105 | |
| 7106 | /* Look through n slots after the segment * n'th slot */ |
| 7107 | |
| 7108 | if (AIO::is_read(m_array) |
| 7109 | && os_aio_recommend_sleep_for_read_threads) { |
| 7110 | |
| 7111 | /* Give other threads chance to add several |
| 7112 | I/Os to the array at once. */ |
| 7113 | |
| 7114 | srv_set_io_thread_op_info( |
| 7115 | global_segment, "waiting for i/o request" ); |
| 7116 | |
| 7117 | os_event_wait(event); |
| 7118 | |
| 7119 | return(0); |
| 7120 | } |
| 7121 | |
| 7122 | return(m_array->slots_per_segment()); |
| 7123 | } |
| 7124 | |
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[out]	type	IO context of the returned request; not written on
			the shutdown path, where m1 and m2 are set to NULL
@return DB_SUCCESS (the only value returned by the code below) */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Map the global segment number to its array and to the local
	segment number inside that array. */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	/* Loop until there is either a slot whose I/O is already done
	(slot != NULL) or a pending request has been selected
	(slot == NULL), or until shutdown is detected. */
	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		/* check_pending() returns 0 after it has waited on the
		event; in that case start over. */
		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* A request whose I/O was done by an earlier batch;
			leave the loop with the array still acquired. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A pending request was selected; leave the loop
			with the array still acquired. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset before the array is
		released. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* slot != NULL: a slot that has already completed its IO was found
	and only needs to be reported. slot == NULL: a request was selected
	and its I/O still has to be performed. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		array->acquire();

		/* Flag every slot of the batch as done. */
		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	/* Copy the request's messages and IO context to the caller before
	the slot is given back. */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* First free the slot, then release the array itself. */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
| 7275 | |
| 7276 | /** Get the total number of pending IOs |
| 7277 | @return the total number of pending IOs */ |
| 7278 | ulint |
| 7279 | AIO::total_pending_io_count() |
| 7280 | { |
| 7281 | ulint count = s_reads->pending_io_count(); |
| 7282 | |
| 7283 | if (s_writes != NULL) { |
| 7284 | count += s_writes->pending_io_count(); |
| 7285 | } |
| 7286 | |
| 7287 | if (s_ibuf != NULL) { |
| 7288 | count += s_ibuf->pending_io_count(); |
| 7289 | } |
| 7290 | |
| 7291 | if (s_log != NULL) { |
| 7292 | count += s_log->pending_io_count(); |
| 7293 | } |
| 7294 | |
| 7295 | if (s_sync != NULL) { |
| 7296 | count += s_sync->pending_io_count(); |
| 7297 | } |
| 7298 | |
| 7299 | return(count); |
| 7300 | } |
| 7301 | |
| 7302 | /** Validates the consistency the aio system. |
| 7303 | @return true if ok */ |
| 7304 | static |
| 7305 | bool |
| 7306 | os_aio_validate() |
| 7307 | { |
| 7308 | /* The methods countds and validates, we ignore the count. */ |
| 7309 | AIO::total_pending_io_count(); |
| 7310 | |
| 7311 | return(true); |
| 7312 | } |
| 7313 | |
| 7314 | /** Prints pending IO requests per segment of an aio array. |
| 7315 | We probably don't need per segment statistics but they can help us |
| 7316 | during development phase to see if the IO requests are being |
| 7317 | distributed as expected. |
| 7318 | @param[in,out] file File where to print |
| 7319 | @param[in] segments Pending IO array */ |
| 7320 | void |
| 7321 | AIO::print_segment_info( |
| 7322 | FILE* file, |
| 7323 | const ulint* segments) |
| 7324 | { |
| 7325 | ut_ad(m_n_segments > 0); |
| 7326 | |
| 7327 | if (m_n_segments > 1) { |
| 7328 | |
| 7329 | fprintf(file, " [" ); |
| 7330 | |
| 7331 | for (ulint i = 0; i < m_n_segments; ++i, ++segments) { |
| 7332 | |
| 7333 | if (i != 0) { |
| 7334 | fprintf(file, ", " ); |
| 7335 | } |
| 7336 | |
| 7337 | fprintf(file, ULINTPF, *segments); |
| 7338 | } |
| 7339 | |
| 7340 | fprintf(file, "] " ); |
| 7341 | } |
| 7342 | } |
| 7343 | |
| 7344 | /** Prints info about the aio array. |
| 7345 | @param[in,out] file Where to print */ |
| 7346 | void |
| 7347 | AIO::print(FILE* file) |
| 7348 | { |
| 7349 | ulint count = 0; |
| 7350 | ulint n_res_seg[SRV_MAX_N_IO_THREADS]; |
| 7351 | |
| 7352 | mutex_enter(&m_mutex); |
| 7353 | |
| 7354 | ut_a(!m_slots.empty()); |
| 7355 | ut_a(m_n_segments > 0); |
| 7356 | |
| 7357 | memset(n_res_seg, 0x0, sizeof(n_res_seg)); |
| 7358 | |
| 7359 | for (ulint i = 0; i < m_slots.size(); ++i) { |
| 7360 | Slot& slot = m_slots[i]; |
| 7361 | ulint segment = (i * m_n_segments) / m_slots.size(); |
| 7362 | |
| 7363 | if (slot.is_reserved) { |
| 7364 | |
| 7365 | ++count; |
| 7366 | |
| 7367 | ++n_res_seg[segment]; |
| 7368 | |
| 7369 | ut_a(slot.len > 0); |
| 7370 | } |
| 7371 | } |
| 7372 | |
| 7373 | ut_a(m_n_reserved == count); |
| 7374 | |
| 7375 | print_segment_info(file, n_res_seg); |
| 7376 | |
| 7377 | mutex_exit(&m_mutex); |
| 7378 | } |
| 7379 | |
| 7380 | /** Print all the AIO segments |
| 7381 | @param[in,out] file Where to print */ |
| 7382 | void |
| 7383 | AIO::print_all(FILE* file) |
| 7384 | { |
| 7385 | s_reads->print(file); |
| 7386 | |
| 7387 | if (s_writes != NULL) { |
| 7388 | fputs(", aio writes:" , file); |
| 7389 | s_writes->print(file); |
| 7390 | } |
| 7391 | |
| 7392 | if (s_ibuf != NULL) { |
| 7393 | fputs(",\n ibuf aio reads:" , file); |
| 7394 | s_ibuf->print(file); |
| 7395 | } |
| 7396 | |
| 7397 | if (s_log != NULL) { |
| 7398 | fputs(", log i/o's:" , file); |
| 7399 | s_log->print(file); |
| 7400 | } |
| 7401 | |
| 7402 | if (s_sync != NULL) { |
| 7403 | fputs(", sync i/o's:" , file); |
| 7404 | s_sync->print(file); |
| 7405 | } |
| 7406 | } |
| 7407 | |
| 7408 | /** Prints info of the aio arrays. |
| 7409 | @param[in,out] file file where to print */ |
| 7410 | void |
| 7411 | os_aio_print(FILE* file) |
| 7412 | { |
| 7413 | time_t current_time; |
| 7414 | double time_elapsed; |
| 7415 | double avg_bytes_read; |
| 7416 | |
| 7417 | for (ulint i = 0; i < srv_n_file_io_threads; ++i) { |
| 7418 | fprintf(file, "I/O thread " ULINTPF " state: %s (%s)" , |
| 7419 | i, |
| 7420 | srv_io_thread_op_info[i], |
| 7421 | srv_io_thread_function[i]); |
| 7422 | |
| 7423 | #ifndef _WIN32 |
| 7424 | if (!srv_use_native_aio |
| 7425 | && os_event_is_set(os_aio_segment_wait_events[i])) { |
| 7426 | fprintf(file, " ev set" ); |
| 7427 | } |
| 7428 | #endif /* _WIN32 */ |
| 7429 | |
| 7430 | fprintf(file, "\n" ); |
| 7431 | } |
| 7432 | |
| 7433 | fputs("Pending normal aio reads:" , file); |
| 7434 | |
| 7435 | AIO::print_all(file); |
| 7436 | |
| 7437 | putc('\n', file); |
| 7438 | current_time = ut_time(); |
| 7439 | time_elapsed = 0.001 + difftime(current_time, os_last_printout); |
| 7440 | |
| 7441 | fprintf(file, |
| 7442 | "Pending flushes (fsync) log: " ULINTPF |
| 7443 | "; buffer pool: " ULINTPF "\n" |
| 7444 | ULINTPF " OS file reads, " |
| 7445 | ULINTPF " OS file writes, " |
| 7446 | ULINTPF " OS fsyncs\n" , |
| 7447 | fil_n_pending_log_flushes, |
| 7448 | fil_n_pending_tablespace_flushes, |
| 7449 | os_n_file_reads, |
| 7450 | os_n_file_writes, |
| 7451 | os_n_fsyncs); |
| 7452 | |
| 7453 | const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS)); |
| 7454 | const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); |
| 7455 | |
| 7456 | if (n_reads != 0 || n_writes != 0) { |
| 7457 | fprintf(file, |
| 7458 | ULINTPF " pending reads, " ULINTPF " pending writes\n" , |
| 7459 | n_reads, n_writes); |
| 7460 | } |
| 7461 | |
| 7462 | if (os_n_file_reads == os_n_file_reads_old) { |
| 7463 | avg_bytes_read = 0.0; |
| 7464 | } else { |
| 7465 | avg_bytes_read = (double) os_bytes_read_since_printout |
| 7466 | / (os_n_file_reads - os_n_file_reads_old); |
| 7467 | } |
| 7468 | |
| 7469 | fprintf(file, |
| 7470 | "%.2f reads/s, " ULINTPF " avg bytes/read," |
| 7471 | " %.2f writes/s, %.2f fsyncs/s\n" , |
| 7472 | (os_n_file_reads - os_n_file_reads_old) |
| 7473 | / time_elapsed, |
| 7474 | (ulint) avg_bytes_read, |
| 7475 | (os_n_file_writes - os_n_file_writes_old) |
| 7476 | / time_elapsed, |
| 7477 | (os_n_fsyncs - os_n_fsyncs_old) |
| 7478 | / time_elapsed); |
| 7479 | |
| 7480 | os_n_file_reads_old = os_n_file_reads; |
| 7481 | os_n_file_writes_old = os_n_file_writes; |
| 7482 | os_n_fsyncs_old = os_n_fsyncs; |
| 7483 | os_bytes_read_since_printout = 0; |
| 7484 | |
| 7485 | os_last_printout = current_time; |
| 7486 | } |
| 7487 | |
| 7488 | /** Refreshes the statistics used to print per-second averages. */ |
| 7489 | void |
| 7490 | os_aio_refresh_stats() |
| 7491 | { |
| 7492 | os_n_fsyncs_old = os_n_fsyncs; |
| 7493 | |
| 7494 | os_bytes_read_since_printout = 0; |
| 7495 | |
| 7496 | os_n_file_reads_old = os_n_file_reads; |
| 7497 | |
| 7498 | os_n_file_writes_old = os_n_file_writes; |
| 7499 | |
| 7500 | os_n_fsyncs_old = os_n_fsyncs; |
| 7501 | |
| 7502 | os_bytes_read_since_printout = 0; |
| 7503 | |
| 7504 | os_last_printout = ut_time(); |
| 7505 | } |
| 7506 | |
| 7507 | /** Checks that all slots in the system have been freed, that is, there are |
| 7508 | no pending io operations. |
| 7509 | @return true if all free */ |
| 7510 | bool |
| 7511 | os_aio_all_slots_free() |
| 7512 | { |
| 7513 | return(AIO::total_pending_io_count() == 0); |
| 7514 | } |
| 7515 | |
| 7516 | #ifdef UNIV_DEBUG |
| 7517 | /** Prints all pending IO for the array |
| 7518 | @param[in] file file where to print |
| 7519 | @param[in] array array to process */ |
| 7520 | void |
| 7521 | AIO::to_file(FILE* file) const |
| 7522 | { |
| 7523 | acquire(); |
| 7524 | |
| 7525 | fprintf(file, " " ULINTPF "\n" , m_n_reserved); |
| 7526 | |
| 7527 | for (ulint i = 0; i < m_slots.size(); ++i) { |
| 7528 | |
| 7529 | const Slot& slot = m_slots[i]; |
| 7530 | |
| 7531 | if (slot.is_reserved) { |
| 7532 | |
| 7533 | fprintf(file, |
| 7534 | "%s IO for %s (offset=" UINT64PF |
| 7535 | ", size=%lu)\n" , |
| 7536 | slot.type.is_read() ? "read" : "write" , |
| 7537 | slot.name, slot.offset, (unsigned long)(slot.len)); |
| 7538 | } |
| 7539 | } |
| 7540 | |
| 7541 | release(); |
| 7542 | } |
| 7543 | |
| 7544 | /** Print pending IOs for all arrays */ |
| 7545 | void |
| 7546 | AIO::print_to_file(FILE* file) |
| 7547 | { |
| 7548 | fprintf(file, "Pending normal aio reads:" ); |
| 7549 | |
| 7550 | s_reads->to_file(file); |
| 7551 | |
| 7552 | if (s_writes != NULL) { |
| 7553 | fprintf(file, "Pending normal aio writes:" ); |
| 7554 | s_writes->to_file(file); |
| 7555 | } |
| 7556 | |
| 7557 | if (s_ibuf != NULL) { |
| 7558 | fprintf(file, "Pending ibuf aio reads:" ); |
| 7559 | s_ibuf->to_file(file); |
| 7560 | } |
| 7561 | |
| 7562 | if (s_log != NULL) { |
| 7563 | fprintf(file, "Pending log i/o's:" ); |
| 7564 | s_log->to_file(file); |
| 7565 | } |
| 7566 | |
| 7567 | if (s_sync != NULL) { |
| 7568 | fprintf(file, "Pending sync i/o's:" ); |
| 7569 | s_sync->to_file(file); |
| 7570 | } |
| 7571 | } |
| 7572 | |
| 7573 | /** Prints all pending IO |
| 7574 | @param[in] file File where to print */ |
| 7575 | void |
| 7576 | os_aio_print_pending_io( |
| 7577 | FILE* file) |
| 7578 | { |
| 7579 | AIO::print_to_file(file); |
| 7580 | } |
| 7581 | |
| 7582 | #endif /* UNIV_DEBUG */ |
| 7583 | |
| 7584 | /** |
| 7585 | Set the file create umask |
| 7586 | @param[in] umask The umask to use for file creation. */ |
| 7587 | void |
| 7588 | os_file_set_umask(ulint umask) |
| 7589 | { |
| 7590 | os_innodb_umask = umask; |
| 7591 | } |
| 7592 | |
| 7593 | #else |
| 7594 | #include "univ.i" |
| 7595 | #endif /* !UNIV_INNOCHECKSUM */ |
| 7596 | |
| 7597 | /** Normalizes a directory path for the current OS: |
| 7598 | On Windows, we convert '/' to '\', else we convert '\' to '/'. |
| 7599 | @param[in,out] str A null-terminated directory and file path */ |
| 7600 | void |
| 7601 | os_normalize_path( |
| 7602 | char* str) |
| 7603 | { |
| 7604 | if (str != NULL) { |
| 7605 | for (; *str; str++) { |
| 7606 | if (*str == OS_PATH_SEPARATOR_ALT) { |
| 7607 | *str = OS_PATH_SEPARATOR; |
| 7608 | } |
| 7609 | } |
| 7610 | } |
| 7611 | } |
| 7612 | |