1 | /*********************************************************************** |
2 | |
3 | Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2009, Percona Inc. |
5 | Copyright (c) 2013, 2018, MariaDB Corporation. |
6 | |
7 | Portions of this file contain modifications contributed and copyrighted |
8 | by Percona Inc.. Those modifications are |
9 | gratefully acknowledged and are described briefly in the InnoDB |
10 | documentation. The contributions by Percona Inc. are incorporated with |
11 | their permission, and subject to the conditions contained in the file |
12 | COPYING.Percona. |
13 | |
14 | This program is free software; you can redistribute it and/or modify it |
15 | under the terms of the GNU General Public License as published by the |
16 | Free Software Foundation; version 2 of the License. |
17 | |
18 | This program is distributed in the hope that it will be useful, but |
19 | WITHOUT ANY WARRANTY; without even the implied warranty of |
20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General |
21 | Public License for more details. |
22 | |
23 | You should have received a copy of the GNU General Public License along with |
24 | this program; if not, write to the Free Software Foundation, Inc., |
25 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
26 | |
27 | ***********************************************************************/ |
28 | |
29 | /**************************************************//** |
30 | @file os/os0file.cc |
31 | The interface to the operating system file i/o primitives |
32 | |
33 | Created 10/21/1995 Heikki Tuuri |
34 | *******************************************************/ |
35 | |
36 | #ifndef UNIV_INNOCHECKSUM |
37 | |
38 | #include "ha_prototypes.h" |
39 | #include "sql_const.h" |
40 | |
41 | #include "os0file.h" |
42 | |
43 | #ifdef UNIV_LINUX |
44 | #include <sys/types.h> |
45 | #include <sys/stat.h> |
46 | #endif |
47 | |
48 | #include "srv0srv.h" |
49 | #include "srv0start.h" |
50 | #include "fil0fil.h" |
51 | #include "fil0crypt.h" |
52 | #include "fsp0fsp.h" |
53 | #include "fil0pagecompress.h" |
54 | #include "srv0srv.h" |
55 | #ifdef HAVE_LINUX_UNISTD_H |
56 | #include "unistd.h" |
57 | #endif |
58 | #include "os0event.h" |
59 | #include "os0thread.h" |
60 | |
61 | #include <vector> |
62 | |
63 | #ifdef LINUX_NATIVE_AIO |
64 | #include <libaio.h> |
65 | #endif /* LINUX_NATIVE_AIO */ |
66 | |
67 | #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE |
68 | # include <fcntl.h> |
69 | # include <linux/falloc.h> |
70 | #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ |
71 | |
72 | #if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) |
73 | # include <sys/ioctl.h> |
74 | # ifndef DFS_IOCTL_ATOMIC_WRITE_SET |
75 | # define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) |
76 | # endif |
77 | #endif |
78 | |
79 | #if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) |
80 | #include <sys/statvfs.h> |
81 | #endif |
82 | |
83 | #if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H) |
84 | #include <linux/falloc.h> |
85 | #endif |
86 | |
87 | #ifdef _WIN32 |
88 | #include <winioctl.h> |
89 | #endif |
90 | |
/** Insert buffer segment id: global segment number 0 is served by the
ibuf i/o handler thread */
static const ulint IO_IBUF_SEGMENT = 0;

/** Log segment id: global segment number 1 is served by the log i/o
handler thread */
static const ulint IO_LOG_SEGMENT = 1;

/** Number of retries for partial I/O's */
static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
99 | |
100 | /* This specifies the file permissions InnoDB uses when it creates files in |
101 | Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to |
102 | my_umask */ |
103 | |
104 | #ifndef _WIN32 |
105 | /** Umask for creating files */ |
106 | static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; |
107 | #else |
108 | /** Umask for creating files */ |
109 | static ulint os_innodb_umask = 0; |
110 | static HANDLE data_completion_port; |
111 | static HANDLE log_completion_port; |
112 | |
113 | static DWORD fls_sync_io = FLS_OUT_OF_INDEXES; |
114 | #define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1 |
115 | #endif /* _WIN32 */ |
116 | |
/** In simulated aio, merge at most this many consecutive i/os */
static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;

/** Flag indicating if the page_cleaner is in active state.
Defined outside this file. */
extern bool buf_page_cleaner_is_active;

/* WAIT_ALLOW_WRITES() waits on srv_allow_writes_event when the server is
built WITH_INNODB_DISALLOW_WRITES; in all other builds it expands to an
empty statement. */
#ifdef WITH_INNODB_DISALLOW_WRITES
#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
#else
#define WAIT_ALLOW_WRITES() do { } while (0)
#endif /* WITH_INNODB_DISALLOW_WRITES */
128 | |
129 | /********************************************************************** |
130 | |
131 | InnoDB AIO Implementation: |
132 | ========================= |
133 | |
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO by special IO-threads servicing the IO-requests.
136 | |
137 | Simulated AIO: |
138 | ============== |
139 | |
140 | On platforms where we 'simulate' AIO, the following is a rough explanation |
141 | of the high level design. |
142 | There are four io-threads (for ibuf, log, read, write). |
143 | All synchronous IO requests are serviced by the calling thread using |
144 | os_file_write/os_file_read. The Asynchronous requests are queued up |
145 | in an array (there are four such arrays) by the calling thread. |
146 | Later these requests are picked up by the IO-thread and are serviced |
147 | synchronously. |
148 | |
149 | Windows native AIO: |
150 | ================== |
151 | |
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO, no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
157 | There are innodb_file_io_threads helper threads. These threads work |
158 | on the four arrays mentioned above in Simulated AIO. No thread is |
159 | required for the sync array. |
160 | If a synchronous IO request is made, it is first queued in the sync |
161 | array. Then the calling thread itself waits on the request, thus |
162 | making the call synchronous. |
163 | If an AIO request is made the calling thread not only queues it in the |
164 | array but also submits the requests. The helper thread then collects |
165 | the completed IO request and calls completion routine on it. |
166 | |
167 | Linux native AIO: |
168 | ================= |
169 | |
170 | If we have libaio installed on the system and innodb_use_native_aio |
171 | is set to true we follow the code path of native AIO, otherwise we |
172 | do simulated AIO. |
173 | There are innodb_file_io_threads helper threads. These threads work |
174 | on the four arrays mentioned above in Simulated AIO. |
175 | If a synchronous IO request is made, it is handled by calling |
176 | os_file_write/os_file_read. |
177 | If an AIO request is made the calling thread not only queues it in the |
178 | array but also submits the requests. The helper thread then collects |
179 | the completed IO request and calls completion routine on it. |
180 | |
181 | **********************************************************************/ |
182 | |
183 | |
#ifdef UNIV_PFS_IO
/* Keys to register InnoDB I/O with performance schema; one key each for
data files, log files and temporary files. Only defined when the build
enables UNIV_PFS_IO. */
mysql_pfs_key_t innodb_data_file_key;
mysql_pfs_key_t innodb_log_file_key;
mysql_pfs_key_t innodb_temp_file_key;
#endif /* UNIV_PFS_IO */

/* Forward declaration: Slot holds an AIO pointer (Windows async I/O builds)
before the AIO class itself is defined. */
class AIO;
192 | |
/** The asynchronous I/O context: one request slot of an AIO array.
The platform specific members (control block, byte counters) are selected
at compile time by WIN_ASYNC_IO / LINUX_NATIVE_AIO. */
struct Slot {

#ifdef WIN_ASYNC_IO
	/** Windows control block for the aio request
	must be at the very start of Slot, so we can
	cast Slot* to OVERLAPPED*
	*/
	OVERLAPPED control;
#endif

	/** index of the slot in the aio array */
	uint16_t pos;

	/** true if this slot is reserved */
	bool is_reserved;

	/** time when reserved */
	time_t reservation_time;

	/** buffer used in i/o */
	byte* buf;

	/** Buffer pointer used for actual IO. We advance this
	when partial IO is required and not buf */
	byte* ptr;

	/** The I/O request descriptor (OS_FILE_READ or OS_FILE_WRITE) */
	IORequest type;

	/** file offset in bytes */
	os_offset_t offset;

	/** file where to read or write */
	pfs_os_file_t file;

	/** file name or path */
	const char* name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool io_already_done;

	/** file block size */
	ulint file_block_size;

	/** The file node for which the IO is requested. */
	fil_node_t* m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void* m2;

	/** AIO completion status */
	dberr_t err;

#ifdef WIN_ASYNC_IO

	/** bytes written/read */
	DWORD n_bytes;

	/** length of the block to read or write */
	DWORD len;

	/** aio array containing this slot */
	AIO *array;
#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb control;

	/** AIO return code */
	int ret;

	/** bytes written/read. */
	ssize_t n_bytes;

	/** length of the block to read or write */
	ulint len;
#else
	/** length of the block to read or write */
	ulint len;

	/** bytes written/read. */
	ulint n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed */
	uint32 original_len;

};
284 | |
285 | /** The asynchronous i/o array structure */ |
286 | class AIO { |
287 | public: |
288 | /** Constructor |
289 | @param[in] id Latch ID |
290 | @param[in] n_slots Number of slots to configure |
291 | @param[in] segments Number of segments to configure */ |
292 | AIO(latch_id_t id, ulint n_slots, ulint segments); |
293 | |
294 | /** Destructor */ |
295 | ~AIO(); |
296 | |
297 | /** Initialize the instance |
298 | @return DB_SUCCESS or error code */ |
299 | dberr_t init(); |
300 | |
301 | /** Requests for a slot in the aio array. If no slot is available, waits |
302 | until not_full-event becomes signaled. |
303 | |
304 | @param[in] type IO context |
305 | @param[in,out] m1 message to be passed along with the AIO |
306 | operation |
307 | @param[in,out] m2 message to be passed along with the AIO |
308 | operation |
309 | @param[in] file file handle |
310 | @param[in] name name of the file or path as a null-terminated |
311 | string |
312 | @param[in,out] buf buffer where to read or from which to write |
313 | @param[in] offset file offset, where to read from or start writing |
314 | @param[in] len length of the block to read or write |
315 | @return pointer to slot */ |
316 | Slot* reserve_slot( |
317 | const IORequest& type, |
318 | fil_node_t* m1, |
319 | void* m2, |
320 | pfs_os_file_t file, |
321 | const char* name, |
322 | void* buf, |
323 | os_offset_t offset, |
324 | ulint len) |
325 | MY_ATTRIBUTE((warn_unused_result)); |
326 | |
327 | /** @return number of reserved slots */ |
328 | ulint pending_io_count() const; |
329 | |
330 | /** Returns a pointer to the nth slot in the aio array. |
331 | @param[in] index Index of the slot in the array |
332 | @return pointer to slot */ |
333 | const Slot* at(ulint i) const |
334 | MY_ATTRIBUTE((warn_unused_result)) |
335 | { |
336 | ut_a(i < m_slots.size()); |
337 | |
338 | return(&m_slots[i]); |
339 | } |
340 | |
341 | /** Non const version */ |
342 | Slot* at(ulint i) |
343 | MY_ATTRIBUTE((warn_unused_result)) |
344 | { |
345 | ut_a(i < m_slots.size()); |
346 | |
347 | return(&m_slots[i]); |
348 | } |
349 | |
350 | /** Frees a slot in the AIO array, assumes caller owns the mutex. |
351 | @param[in,out] slot Slot to release */ |
352 | void release(Slot* slot); |
353 | |
354 | /** Frees a slot in the AIO array, assumes caller doesn't own the mutex. |
355 | @param[in,out] slot Slot to release */ |
356 | void release_with_mutex(Slot* slot); |
357 | |
358 | /** Prints info about the aio array. |
359 | @param[in,out] file Where to print */ |
360 | void print(FILE* file); |
361 | |
362 | /** @return the number of slots per segment */ |
363 | ulint slots_per_segment() const |
364 | MY_ATTRIBUTE((warn_unused_result)) |
365 | { |
366 | return(m_slots.size() / m_n_segments); |
367 | } |
368 | |
369 | /** @return accessor for n_segments */ |
370 | ulint get_n_segments() const |
371 | MY_ATTRIBUTE((warn_unused_result)) |
372 | { |
373 | return(m_n_segments); |
374 | } |
375 | |
376 | #ifdef UNIV_DEBUG |
377 | /** @return true if the thread owns the mutex */ |
378 | bool is_mutex_owned() const |
379 | MY_ATTRIBUTE((warn_unused_result)) |
380 | { |
381 | return(mutex_own(&m_mutex)); |
382 | } |
383 | #endif /* UNIV_DEBUG */ |
384 | |
385 | /** Acquire the mutex */ |
386 | void acquire() const |
387 | { |
388 | mutex_enter(&m_mutex); |
389 | } |
390 | |
391 | /** Release the mutex */ |
392 | void release() const |
393 | { |
394 | mutex_exit(&m_mutex); |
395 | } |
396 | |
397 | /** Write out the state to the file/stream |
398 | @param[in, out] file File to write to */ |
399 | void to_file(FILE* file) const; |
400 | |
401 | #ifdef LINUX_NATIVE_AIO |
402 | /** Dispatch an AIO request to the kernel. |
403 | @param[in,out] slot an already reserved slot |
404 | @return true on success. */ |
405 | bool linux_dispatch(Slot* slot) |
406 | MY_ATTRIBUTE((warn_unused_result)); |
407 | |
408 | /** Accessor for an AIO event |
409 | @param[in] index Index into the array |
410 | @return the event at the index */ |
411 | io_event* io_events(ulint index) |
412 | MY_ATTRIBUTE((warn_unused_result)) |
413 | { |
414 | ut_a(index < m_events.size()); |
415 | |
416 | return(&m_events[index]); |
417 | } |
418 | |
419 | /** Accessor for the AIO context |
420 | @param[in] segment Segment for which to get the context |
421 | @return the AIO context for the segment */ |
422 | io_context* io_ctx(ulint segment) |
423 | MY_ATTRIBUTE((warn_unused_result)) |
424 | { |
425 | ut_ad(segment < get_n_segments()); |
426 | |
427 | return(m_aio_ctx[segment]); |
428 | } |
429 | |
430 | /** Creates an io_context for native linux AIO. |
431 | @param[in] max_events number of events |
432 | @param[out] io_ctx io_ctx to initialize. |
433 | @return true on success. */ |
434 | static bool linux_create_io_ctx(unsigned max_events, io_context_t* io_ctx) |
435 | MY_ATTRIBUTE((warn_unused_result)); |
436 | |
437 | /** Checks if the system supports native linux aio. On some kernel |
438 | versions where native aio is supported it won't work on tmpfs. In such |
439 | cases we can't use native aio as it is not possible to mix simulated |
440 | and native aio. |
441 | @return true if supported, false otherwise. */ |
442 | static bool is_linux_native_aio_supported() |
443 | MY_ATTRIBUTE((warn_unused_result)); |
444 | #endif /* LINUX_NATIVE_AIO */ |
445 | |
446 | #ifdef WIN_ASYNC_IO |
447 | HANDLE m_completion_port; |
448 | /** Wake up all AIO threads in Windows native aio */ |
449 | static void wake_at_shutdown() { |
450 | AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf }; |
451 | for (size_t i = 0; i < array_elements(all_arrays); i++) { |
452 | AIO *a = all_arrays[i]; |
453 | if (a) { |
454 | PostQueuedCompletionStatus(a->m_completion_port, 0, |
455 | IOCP_SHUTDOWN_KEY, 0); |
456 | } |
457 | } |
458 | } |
459 | #endif /* WIN_ASYNC_IO */ |
460 | |
461 | #ifdef _WIN32 |
462 | /** This function can be called if one wants to post a batch of reads |
463 | and prefers an I/O - handler thread to handle them all at once later.You |
464 | must call os_aio_simulated_wake_handler_threads later to ensure the |
465 | threads are not left sleeping! */ |
466 | static void simulated_put_read_threads_to_sleep(); |
467 | #endif /* _WIN32 */ |
468 | |
469 | /** Create an instance using new(std::nothrow) |
470 | @param[in] id Latch ID |
471 | @param[in] n_slots The number of AIO request slots |
472 | @param[in] segments The number of segments |
473 | @return a new AIO instance */ |
474 | static AIO* create( |
475 | latch_id_t id, |
476 | ulint n_slots, |
477 | ulint segments) |
478 | MY_ATTRIBUTE((warn_unused_result)); |
479 | |
480 | /** Initializes the asynchronous io system. Creates one array each |
481 | for ibuf and log I/O. Also creates one array each for read and write |
482 | where each array is divided logically into n_readers and n_writers |
483 | respectively. The caller must create an i/o handler thread for each |
484 | segment in these arrays. This function also creates the sync array. |
485 | No I/O handler thread needs to be created for that |
486 | @param[in] n_per_seg maximum number of pending aio |
487 | operations allowed per segment |
488 | @param[in] n_readers number of reader threads |
489 | @param[in] n_writers number of writer threads |
490 | @param[in] n_slots_sync number of slots in the sync aio array |
491 | @return true if AIO sub-system was started successfully */ |
492 | static bool start( |
493 | ulint n_per_seg, |
494 | ulint n_readers, |
495 | ulint n_writers, |
496 | ulint n_slots_sync) |
497 | MY_ATTRIBUTE((warn_unused_result)); |
498 | |
499 | /** Free the AIO arrays */ |
500 | static void shutdown(); |
501 | |
502 | /** Print all the AIO segments |
503 | @param[in,out] file Where to print */ |
504 | static void print_all(FILE* file); |
505 | |
506 | /** Calculates local segment number and aio array from global |
507 | segment number. |
508 | @param[out] array AIO wait array |
509 | @param[in] segment global segment number |
510 | @return local segment number within the aio array */ |
511 | static ulint get_array_and_local_segment( |
512 | AIO** array, |
513 | ulint segment) |
514 | MY_ATTRIBUTE((warn_unused_result)); |
515 | |
516 | /** Select the IO slot array |
517 | @param[in,out] type Type of IO, READ or WRITE |
518 | @param[in] read_only true if running in read-only mode |
519 | @param[in] mode IO mode |
520 | @return slot array or NULL if invalid mode specified */ |
521 | static AIO* select_slot_array( |
522 | IORequest& type, |
523 | bool read_only, |
524 | ulint mode) |
525 | MY_ATTRIBUTE((warn_unused_result)); |
526 | |
527 | /** Calculates segment number for a slot. |
528 | @param[in] array AIO wait array |
529 | @param[in] slot slot in this array |
530 | @return segment number (which is the number used by, for example, |
531 | I/O handler threads) */ |
532 | static ulint get_segment_no_from_slot( |
533 | const AIO* array, |
534 | const Slot* slot) |
535 | MY_ATTRIBUTE((warn_unused_result)); |
536 | |
537 | /** Wakes up a simulated AIO I/O-handler thread if it has something |
538 | to do. |
539 | @param[in] global_segment the number of the segment in the |
540 | AIO arrays */ |
541 | static void wake_simulated_handler_thread(ulint global_segment); |
542 | |
543 | /** Check if it is a read request |
544 | @param[in] aio The AIO instance to check |
545 | @return true if the AIO instance is for reading. */ |
546 | static bool is_read(const AIO* aio) |
547 | MY_ATTRIBUTE((warn_unused_result)) |
548 | { |
549 | return(s_reads == aio); |
550 | } |
551 | |
552 | /** Wait on an event until no pending writes */ |
553 | static void wait_until_no_pending_writes() |
554 | { |
555 | os_event_wait(AIO::s_writes->m_is_empty); |
556 | } |
557 | |
558 | /** Print to file |
559 | @param[in] file File to write to */ |
560 | static void print_to_file(FILE* file); |
561 | |
562 | /** Check for pending IO. Gets the count and also validates the |
563 | data structures. |
564 | @return count of pending IO requests */ |
565 | static ulint total_pending_io_count(); |
566 | |
567 | private: |
568 | /** Initialise the slots |
569 | @return DB_SUCCESS or error code */ |
570 | dberr_t init_slots() |
571 | MY_ATTRIBUTE((warn_unused_result)); |
572 | |
573 | /** Wakes up a simulated AIO I/O-handler thread if it has something |
574 | to do for a local segment in the AIO array. |
575 | @param[in] global_segment the number of the segment in the |
576 | AIO arrays |
577 | @param[in] segment the local segment in the AIO array */ |
578 | void wake_simulated_handler_thread(ulint global_segment, ulint segment); |
579 | |
580 | /** Prints pending IO requests per segment of an aio array. |
581 | We probably don't need per segment statistics but they can help us |
582 | during development phase to see if the IO requests are being |
583 | distributed as expected. |
584 | @param[in,out] file file where to print |
585 | @param[in] segments pending IO array */ |
586 | void print_segment_info( |
587 | FILE* file, |
588 | const ulint* segments); |
589 | |
590 | #ifdef LINUX_NATIVE_AIO |
591 | /** Initialise the Linux native AIO data structures |
592 | @return DB_SUCCESS or error code */ |
593 | dberr_t init_linux_native_aio() |
594 | MY_ATTRIBUTE((warn_unused_result)); |
595 | #endif /* LINUX_NATIVE_AIO */ |
596 | |
597 | private: |
598 | typedef std::vector<Slot> Slots; |
599 | |
600 | /** the mutex protecting the aio array */ |
601 | mutable SysMutex m_mutex; |
602 | |
603 | /** Pointer to the slots in the array. |
604 | Number of elements must be divisible by n_threads. */ |
605 | Slots m_slots; |
606 | |
607 | /** Number of segments in the aio array of pending aio requests. |
608 | A thread can wait separately for any one of the segments. */ |
609 | ulint m_n_segments; |
610 | |
611 | /** The event which is set to the signaled state when |
612 | there is space in the aio outside the ibuf segment; |
613 | os_event_set() and os_event_reset() are protected by AIO::m_mutex */ |
614 | os_event_t m_not_full; |
615 | |
616 | /** The event which is set to the signaled state when |
617 | there are no pending i/os in this array; |
618 | os_event_set() and os_event_reset() are protected by AIO::m_mutex */ |
619 | os_event_t m_is_empty; |
620 | |
621 | /** Number of reserved slots in the AIO array outside |
622 | the ibuf segment */ |
623 | ulint m_n_reserved; |
624 | |
625 | |
626 | #if defined(LINUX_NATIVE_AIO) |
627 | typedef std::vector<io_event> IOEvents; |
628 | |
629 | /** completion queue for IO. There is one such queue per |
630 | segment. Each thread will work on one ctx exclusively. */ |
631 | io_context_t* m_aio_ctx; |
632 | |
633 | /** The array to collect completed IOs. There is one such |
634 | event for each possible pending IO. The size of the array |
635 | is equal to m_slots.size(). */ |
636 | IOEvents m_events; |
637 | #endif /* LINUX_NATIV_AIO */ |
638 | |
639 | /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as |
640 | sync AIO. These are NULL when the module has not yet been |
641 | initialized. */ |
642 | |
643 | /** Insert buffer */ |
644 | static AIO* s_ibuf; |
645 | |
646 | /** Redo log */ |
647 | static AIO* s_log; |
648 | |
649 | /** Reads */ |
650 | static AIO* s_reads; |
651 | |
652 | /** Writes */ |
653 | static AIO* s_writes; |
654 | |
655 | /** Synchronous I/O */ |
656 | static AIO* s_sync; |
657 | }; |
658 | |
/** Static declarations: definitions of the AIO array pointers. As objects
with static storage duration they start out as NULL. */
AIO* AIO::s_reads;
AIO* AIO::s_writes;
AIO* AIO::s_ibuf;
AIO* AIO::s_log;
AIO* AIO::s_sync;

#if defined(LINUX_NATIVE_AIO)
/** timeout for each io_getevents() call = 500ms.
(500000000 corresponds to 500ms when counted in nanoseconds) */
static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;

/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;

/** number of attempts before giving up on io_setup(). */
static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
#endif /* LINUX_NATIVE_AIO */

/** Array of events used in simulated AIO */
static os_event_t* os_aio_segment_wait_events;

/** Number of asynchronous I/O segments. Set by os_aio_init().
ULINT_UNDEFINED until then. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;

/** If the following is true, read i/o handler threads try to
wait until a batch of new read requests have been posted */
static bool os_aio_recommend_sleep_for_read_threads;

/* I/O statistics counters.
NOTE(review): the *_old variables and os_last_printout presumably keep the
counter values and the time of the previous statistics printout - confirm
against the printing code. */
ulint os_n_file_reads;
static ulint os_bytes_read_since_printout;
ulint os_n_file_writes;
ulint os_n_fsyncs;
static ulint os_n_file_reads_old;
static ulint os_n_file_writes_old;
static ulint os_n_fsyncs_old;

static time_t os_last_printout;
/* NOTE(review): presumably set once a disk-full condition has been
reported, to avoid repeating the message - confirm */
bool os_has_said_disk_full;

/** Default Zip compression level (defined outside this file) */
extern uint page_zip_level;
700 | |
/** Validates the consistency of the aio system.
Forward declaration; the definition follows later in this file.
@return true if ok */
static
bool
os_aio_validate();

/** Handle errors for file operations.
Forward declaration; os_file_handle_error() and
os_file_handle_error_no_exit() are thin wrappers around this function.
@param[in]	name		name of a file or NULL
@param[in]	operation	operation
@param[in]	should_abort	whether to abort on an unknown error
@param[in]	on_error_silent	whether to suppress reports of non-fatal errors
@return true if we should retry the operation */
static MY_ATTRIBUTE((warn_unused_result))
bool
os_file_handle_error_cond_exit(
const char* name,
const char* operation,
bool should_abort,
bool on_error_silent);
720 | |
721 | /** Does error handling when a file operation fails. |
722 | @param[in] name name of a file or NULL |
723 | @param[in] operation operation name that failed |
724 | @return true if we should retry the operation */ |
725 | static |
726 | bool |
727 | os_file_handle_error( |
728 | const char* name, |
729 | const char* operation) |
730 | { |
731 | /* Exit in case of unknown error */ |
732 | return(os_file_handle_error_cond_exit(name, operation, true, false)); |
733 | } |
734 | |
735 | /** Does error handling when a file operation fails. |
736 | @param[in] name name of a file or NULL |
737 | @param[in] operation operation name that failed |
738 | @param[in] on_error_silent if true then don't print any message to the log. |
739 | @return true if we should retry the operation */ |
740 | static |
741 | bool |
742 | os_file_handle_error_no_exit( |
743 | const char* name, |
744 | const char* operation, |
745 | bool on_error_silent) |
746 | { |
747 | /* Don't exit in case of unknown error */ |
748 | return(os_file_handle_error_cond_exit( |
749 | name, operation, false, on_error_silent)); |
750 | } |
751 | |
/** Does simulated AIO. This function should be called by an i/o-handler
thread.
Forward declaration; the definition follows later in this file.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
ulint global_segment,
fil_node_t** m1,
void** m2,
IORequest* type);

#ifdef _WIN32
/* Forward declaration. The returned handle is used as the OVERLAPPED event
by os_win32_device_io_control(). */
static HANDLE win_get_syncio_event();
#endif
777 | |
#ifdef _WIN32
/**
Synchronous wrapper around the Windows DeviceIoControl() function.

The call is issued with an OVERLAPPED structure whose event comes from
win_get_syncio_event(), and the function then blocks until the request
has completed, so it also works for handles opened for async access
(i.e with FILE_FLAG_OVERLAPPED).

Takes the same parameters as DeviceIoControl(), except the
last parameter (OVERLAPPED).
@return the result of GetOverlappedResult(), or FALSE if the request
could not be started */
static
BOOL
os_win32_device_io_control(
	HANDLE handle,
	DWORD code,
	LPVOID inbuf,
	DWORD inbuf_size,
	LPVOID outbuf,
	DWORD outbuf_size,
	LPDWORD bytes_returned
)
{
	OVERLAPPED ov = { 0 };
	ov.hEvent = win_get_syncio_event();

	BOOL started = DeviceIoControl(handle, code, inbuf, inbuf_size,
		outbuf, outbuf_size, NULL, &ov);

	if (!started && GetLastError() != ERROR_IO_PENDING) {
		/* The request failed outright; nothing to wait for */
		return started;
	}

	/* Wait for async io to complete */
	return GetOverlappedResult(handle, &ov, bytes_returned, TRUE);
}

#endif
814 | |
815 | /***********************************************************************//** |
816 | Try to get number of bytes per sector from file system. |
817 | @return file block size */ |
818 | UNIV_INTERN |
819 | ulint |
820 | os_file_get_block_size( |
821 | /*===================*/ |
822 | os_file_t file, /*!< in: handle to a file */ |
823 | const char* name) /*!< in: file name */ |
824 | { |
825 | ulint fblock_size = 512; |
826 | |
827 | #if defined(UNIV_LINUX) |
828 | struct stat local_stat; |
829 | int err; |
830 | |
831 | err = fstat((int)file, &local_stat); |
832 | |
833 | if (err != 0) { |
834 | os_file_handle_error_no_exit(name, "fstat()" , FALSE); |
835 | } else { |
836 | fblock_size = local_stat.st_blksize; |
837 | } |
838 | #endif /* UNIV_LINUX */ |
839 | #ifdef _WIN32 |
840 | |
841 | fblock_size = 0; |
842 | BOOL result = false; |
843 | size_t len = 0; |
844 | // Open volume for this file, find out it "physical bytes per sector" |
845 | |
846 | HANDLE volume_handle = INVALID_HANDLE_VALUE; |
847 | char volume[MAX_PATH + 4]="\\\\.\\" ; // Special prefix required for volume names. |
848 | if (!GetVolumePathName(name , volume + 4, MAX_PATH)) { |
849 | os_file_handle_error_no_exit(name, |
850 | "GetVolumePathName()" , FALSE); |
851 | goto end; |
852 | } |
853 | |
854 | len = strlen(volume); |
855 | if (volume[len - 1] == '\\') { |
856 | // Trim trailing backslash from volume name. |
857 | volume[len - 1] = 0; |
858 | } |
859 | |
860 | volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES, |
861 | FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, |
862 | 0, OPEN_EXISTING, 0, 0); |
863 | |
864 | if (volume_handle == INVALID_HANDLE_VALUE) { |
865 | if (GetLastError() != ERROR_ACCESS_DENIED) { |
866 | os_file_handle_error_no_exit(volume, |
867 | "CreateFile()" , FALSE); |
868 | } |
869 | goto end; |
870 | } |
871 | |
872 | DWORD tmp; |
873 | STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment; |
874 | |
875 | STORAGE_PROPERTY_QUERY storage_query; |
876 | memset(&storage_query, 0, sizeof(storage_query)); |
877 | storage_query.PropertyId = StorageAccessAlignmentProperty; |
878 | storage_query.QueryType = PropertyStandardQuery; |
879 | |
880 | result = os_win32_device_io_control(volume_handle, |
881 | IOCTL_STORAGE_QUERY_PROPERTY, |
882 | &storage_query, |
883 | sizeof(storage_query), |
884 | &disk_alignment, |
885 | sizeof(disk_alignment), |
886 | &tmp); |
887 | |
888 | if (!result) { |
889 | DWORD err = GetLastError(); |
890 | if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) { |
891 | os_file_handle_error_no_exit(volume, |
892 | "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)" , FALSE); |
893 | } |
894 | goto end; |
895 | } |
896 | |
897 | fblock_size = disk_alignment.BytesPerPhysicalSector; |
898 | |
899 | end: |
900 | if (volume_handle != INVALID_HANDLE_VALUE) { |
901 | CloseHandle(volume_handle); |
902 | } |
903 | #endif /* _WIN32 */ |
904 | |
905 | /* Currently we support file block size up to 4Kb */ |
906 | if (fblock_size > 4096 || fblock_size < 512) { |
907 | if (fblock_size < 512) { |
908 | fblock_size = 512; |
909 | } else { |
910 | fblock_size = 4096; |
911 | } |
912 | } |
913 | |
914 | return fblock_size; |
915 | } |
916 | |
917 | #ifdef WIN_ASYNC_IO |
918 | /** This function is only used in Windows asynchronous i/o. |
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
921 | into segments. The thread specifies which segment or slot it wants to wait |
922 | for. NOTE: this function will also take care of freeing the aio slot, |
923 | therefore no other thread is allowed to do the freeing! |
924 | @param[in] segment The number of the segment in the aio arrays to |
925 | wait for; segment 0 is the ibuf I/O thread, |
926 | segment 1 the log I/O thread, then follow the |
927 | non-ibuf read threads, and as the last are the |
928 | non-ibuf write threads; if this is |
929 | ULINT_UNDEFINED, then it means that sync AIO |
930 | is used, and this parameter is ignored |
931 | @param[in] pos this parameter is used only in sync AIO: |
932 | wait for the aio slot at this position |
933 | @param[out] m1 the messages passed with the AIO request; note |
934 | that also in the case where the AIO operation |
935 | failed, these output parameters are valid and |
936 | can be used to restart the operation, |
937 | for example |
938 | @param[out] m2 callback message |
939 | @param[out] type OS_FILE_WRITE or ..._READ |
940 | @return DB_SUCCESS or error code */ |
941 | static |
942 | dberr_t |
943 | os_aio_windows_handler( |
944 | ulint segment, |
945 | ulint pos, |
946 | fil_node_t** m1, |
947 | void** m2, |
948 | IORequest* type); |
949 | #endif /* WIN_ASYNC_IO */ |
950 | |
/** Generic AIO Handler methods. Currently handles IO post processing.
The only member declared here is static, so it is called as
AIOHandler::post_io_processing() without an AIOHandler object. */
class AIOHandler {
public:
	/** Do any post processing after a read/write
	@param[in]	slot	The IO slot whose request was read/written
	@return DB_SUCCESS or error code. */
	static dberr_t post_io_processing(Slot* slot);
};
958 | |
959 | /** Helper class for doing synchronous file IO. Currently, the objective |
960 | is to hide the OS specific code, so that the higher level functions aren't |
961 | peppered with #ifdef. Makes the code flow difficult to follow. */ |
962 | class SyncFileIO { |
963 | public: |
964 | /** Constructor |
965 | @param[in] fh File handle |
966 | @param[in,out] buf Buffer to read/write |
967 | @param[in] n Number of bytes to read/write |
968 | @param[in] offset Offset where to read or write */ |
969 | SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset) |
970 | : |
971 | m_fh(fh), |
972 | m_buf(buf), |
973 | m_n(static_cast<ssize_t>(n)), |
974 | m_offset(offset) |
975 | { |
976 | ut_ad(m_n > 0); |
977 | } |
978 | |
979 | /** Destructor */ |
980 | ~SyncFileIO() |
981 | { |
982 | /* No op */ |
983 | } |
984 | |
985 | /** Do the read/write |
986 | @param[in] request The IO context and type |
987 | @return the number of bytes read/written or negative value on error */ |
988 | ssize_t execute(const IORequest& request); |
989 | |
990 | /** Do the read/write |
991 | @param[in,out] slot The IO slot, it has the IO context |
992 | @return the number of bytes read/written or negative value on error */ |
993 | static ssize_t execute(Slot* slot); |
994 | |
995 | /** Move the read/write offset up to where the partial IO succeeded. |
996 | @param[in] n_bytes The number of bytes to advance */ |
997 | void advance(ssize_t n_bytes) |
998 | { |
999 | m_offset += n_bytes; |
1000 | |
1001 | ut_ad(m_n >= n_bytes); |
1002 | |
1003 | m_n -= n_bytes; |
1004 | |
1005 | m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes; |
1006 | } |
1007 | |
1008 | private: |
1009 | /** Open file handle */ |
1010 | os_file_t m_fh; |
1011 | |
1012 | /** Buffer to read/write */ |
1013 | void* m_buf; |
1014 | |
1015 | /** Number of bytes to read/write */ |
1016 | ssize_t m_n; |
1017 | |
1018 | /** Offset from where to read/write */ |
1019 | os_offset_t m_offset; |
1020 | }; |
1021 | |
1022 | /** Do any post processing after a read/write |
1023 | @return DB_SUCCESS or error code. */ |
1024 | dberr_t |
1025 | AIOHandler::post_io_processing(Slot* slot) |
1026 | { |
1027 | ut_ad(slot->is_reserved); |
1028 | |
1029 | /* Total bytes read so far */ |
1030 | ulint n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes; |
1031 | |
1032 | return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL); |
1033 | } |
1034 | |
1035 | /** Count the number of free slots |
1036 | @return number of reserved slots */ |
1037 | ulint |
1038 | AIO::pending_io_count() const |
1039 | { |
1040 | acquire(); |
1041 | |
1042 | #ifdef UNIV_DEBUG |
1043 | ut_a(m_n_segments > 0); |
1044 | ut_a(!m_slots.empty()); |
1045 | |
1046 | ulint count = 0; |
1047 | |
1048 | for (ulint i = 0; i < m_slots.size(); ++i) { |
1049 | |
1050 | const Slot& slot = m_slots[i]; |
1051 | |
1052 | if (slot.is_reserved) { |
1053 | ++count; |
1054 | ut_a(slot.len > 0); |
1055 | } |
1056 | } |
1057 | |
1058 | ut_a(m_n_reserved == count); |
1059 | #endif /* UNIV_DEBUG */ |
1060 | |
1061 | ulint reserved = m_n_reserved; |
1062 | |
1063 | release(); |
1064 | |
1065 | return(reserved); |
1066 | } |
1067 | |
1068 | #ifdef UNIV_DEBUG |
1069 | /** Validates the consistency the aio system some of the time. |
1070 | @return true if ok or the check was skipped */ |
1071 | static |
1072 | bool |
1073 | os_aio_validate_skip() |
1074 | { |
1075 | /** Try os_aio_validate() every this many times */ |
1076 | # define OS_AIO_VALIDATE_SKIP 13 |
1077 | |
1078 | /** The os_aio_validate() call skip counter. |
1079 | Use a signed type because of the race condition below. */ |
1080 | static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; |
1081 | |
1082 | /* There is a race condition below, but it does not matter, |
1083 | because this call is only for heuristic purposes. We want to |
1084 | reduce the call frequency of the costly os_aio_validate() |
1085 | check in debug builds. */ |
1086 | --os_aio_validate_count; |
1087 | |
1088 | if (os_aio_validate_count > 0) { |
1089 | return(true); |
1090 | } |
1091 | |
1092 | os_aio_validate_count = OS_AIO_VALIDATE_SKIP; |
1093 | return(os_aio_validate()); |
1094 | } |
1095 | #endif /* UNIV_DEBUG */ |
1096 | |
1097 | #undef USE_FILE_LOCK |
1098 | #ifndef _WIN32 |
1099 | /* On Windows, mandatory locking is used */ |
1100 | # define USE_FILE_LOCK |
1101 | #endif |
1102 | #ifdef USE_FILE_LOCK |
1103 | /** Obtain an exclusive lock on a file. |
1104 | @param[in] fd file descriptor |
1105 | @param[in] name file name |
1106 | @return 0 on success */ |
1107 | static |
1108 | int |
1109 | os_file_lock( |
1110 | int fd, |
1111 | const char* name) |
1112 | { |
1113 | struct flock lk; |
1114 | |
1115 | lk.l_type = F_WRLCK; |
1116 | lk.l_whence = SEEK_SET; |
1117 | lk.l_start = lk.l_len = 0; |
1118 | |
1119 | if (fcntl(fd, F_SETLK, &lk) == -1) { |
1120 | |
1121 | ib::error() |
1122 | << "Unable to lock " << name |
1123 | << " error: " << errno; |
1124 | |
1125 | if (errno == EAGAIN || errno == EACCES) { |
1126 | |
1127 | ib::info() |
1128 | << "Check that you do not already have" |
1129 | " another mysqld process using the" |
1130 | " same InnoDB data or log files." ; |
1131 | } |
1132 | |
1133 | return(-1); |
1134 | } |
1135 | |
1136 | return(0); |
1137 | } |
1138 | #endif /* USE_FILE_LOCK */ |
1139 | |
1140 | /** Calculates local segment number and aio array from global segment number. |
1141 | @param[out] array aio wait array |
1142 | @param[in] segment global segment number |
1143 | @return local segment number within the aio array */ |
1144 | ulint |
1145 | AIO::get_array_and_local_segment( |
1146 | AIO** array, |
1147 | ulint segment) |
1148 | { |
1149 | ulint local_segment; |
1150 | ulint = (srv_read_only_mode) ? 0 : 2; |
1151 | |
1152 | ut_a(segment < os_aio_n_segments); |
1153 | |
1154 | if (!srv_read_only_mode && segment < n_extra_segs) { |
1155 | |
1156 | /* We don't support ibuf/log IO during read only mode. */ |
1157 | |
1158 | if (segment == IO_IBUF_SEGMENT) { |
1159 | |
1160 | *array = s_ibuf; |
1161 | |
1162 | } else if (segment == IO_LOG_SEGMENT) { |
1163 | |
1164 | *array = s_log; |
1165 | |
1166 | } else { |
1167 | *array = NULL; |
1168 | } |
1169 | |
1170 | local_segment = 0; |
1171 | |
1172 | } else if (segment < s_reads->m_n_segments + n_extra_segs) { |
1173 | |
1174 | *array = s_reads; |
1175 | local_segment = segment - n_extra_segs; |
1176 | |
1177 | } else { |
1178 | *array = s_writes; |
1179 | |
1180 | local_segment = segment |
1181 | - (s_reads->m_n_segments + n_extra_segs); |
1182 | } |
1183 | |
1184 | return(local_segment); |
1185 | } |
1186 | |
1187 | /** Frees a slot in the aio array. Assumes caller owns the mutex. |
1188 | @param[in,out] slot Slot to release */ |
1189 | void |
1190 | AIO::release(Slot* slot) |
1191 | { |
1192 | ut_ad(is_mutex_owned()); |
1193 | |
1194 | ut_ad(slot->is_reserved); |
1195 | |
1196 | slot->is_reserved = false; |
1197 | |
1198 | --m_n_reserved; |
1199 | |
1200 | if (m_n_reserved == m_slots.size() - 1) { |
1201 | os_event_set(m_not_full); |
1202 | } |
1203 | |
1204 | if (m_n_reserved == 0) { |
1205 | os_event_set(m_is_empty); |
1206 | } |
1207 | |
1208 | #if defined(LINUX_NATIVE_AIO) |
1209 | |
1210 | if (srv_use_native_aio) { |
1211 | memset(&slot->control, 0x0, sizeof(slot->control)); |
1212 | slot->ret = 0; |
1213 | slot->n_bytes = 0; |
1214 | } else { |
1215 | /* These fields should not be used if we are not |
1216 | using native AIO. */ |
1217 | ut_ad(slot->n_bytes == 0); |
1218 | ut_ad(slot->ret == 0); |
1219 | } |
1220 | |
1221 | #endif /* WIN_ASYNC_IO */ |
1222 | } |
1223 | |
/** Frees a slot in the AIO array. Assumes caller doesn't own the mutex:
this wrapper brackets release(Slot*) with acquire() and release().
@param[in,out]	slot	Slot to release */
void
AIO::release_with_mutex(Slot* slot)
{
	/* release(Slot*) asserts is_mutex_owned(), so acquire first. */
	acquire();

	release(slot);

	/* The overload without arguments undoes the acquire() above. */
	release();
}
1235 | |
1236 | /** Create a temporary file. This function is like tmpfile(3), but |
1237 | the temporary file is created in the in the mysql server configuration |
1238 | parameter (--tmpdir). |
1239 | @return temporary file handle, or NULL on error */ |
1240 | FILE* |
1241 | os_file_create_tmpfile() |
1242 | { |
1243 | FILE* file = NULL; |
1244 | WAIT_ALLOW_WRITES(); |
1245 | os_file_t fd = innobase_mysql_tmpfile(NULL); |
1246 | |
1247 | if (fd != OS_FILE_CLOSED) { |
1248 | #ifdef _WIN32 |
1249 | int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0); |
1250 | if (crt_fd != -1) { |
1251 | file = fdopen(crt_fd, "w+b" ); |
1252 | if (!file) { |
1253 | close(crt_fd); |
1254 | } |
1255 | } |
1256 | #else |
1257 | file = fdopen(fd, "w+b" ); |
1258 | if (!file) { |
1259 | close(fd); |
1260 | } |
1261 | #endif |
1262 | } |
1263 | |
1264 | if (file == NULL) { |
1265 | |
1266 | ib::error() |
1267 | << "Unable to create temporary file; errno: " |
1268 | << errno; |
1269 | } |
1270 | |
1271 | return(file); |
1272 | } |
1273 | |
1274 | /** Rewind file to its start, read at most size - 1 bytes from it to str, and |
1275 | NUL-terminate str. All errors are silently ignored. This function is |
1276 | mostly meant to be used with temporary files. |
1277 | @param[in,out] file File to read from |
1278 | @param[in,out] str Buffer where to read |
1279 | @param[in] size Size of buffer */ |
1280 | void |
1281 | os_file_read_string( |
1282 | FILE* file, |
1283 | char* str, |
1284 | ulint size) |
1285 | { |
1286 | if (size != 0) { |
1287 | rewind(file); |
1288 | |
1289 | size_t flen = fread(str, 1, size - 1, file); |
1290 | |
1291 | str[flen] = '\0'; |
1292 | } |
1293 | } |
1294 | |
1295 | /** This function returns a new path name after replacing the basename |
1296 | in an old path with a new basename. The old_path is a full path |
1297 | name including the extension. The tablename is in the normal |
1298 | form "databasename/tablename". The new base name is found after |
1299 | the forward slash. Both input strings are null terminated. |
1300 | |
1301 | This function allocates memory to be returned. It is the callers |
1302 | responsibility to free the return value after it is no longer needed. |
1303 | |
1304 | @param[in] old_path Pathname |
1305 | @param[in] tablename Contains new base name |
1306 | @return own: new full pathname */ |
1307 | char* |
1308 | os_file_make_new_pathname( |
1309 | const char* old_path, |
1310 | const char* tablename) |
1311 | { |
1312 | ulint dir_len; |
1313 | char* last_slash; |
1314 | char* base_name; |
1315 | char* new_path; |
1316 | ulint new_path_len; |
1317 | |
1318 | /* Split the tablename into its database and table name components. |
1319 | They are separated by a '/'. */ |
1320 | last_slash = strrchr((char*) tablename, '/'); |
1321 | base_name = last_slash ? last_slash + 1 : (char*) tablename; |
1322 | |
1323 | /* Find the offset of the last slash. We will strip off the |
1324 | old basename.ibd which starts after that slash. */ |
1325 | last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR); |
1326 | dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path); |
1327 | |
1328 | /* allocate a new path and move the old directory path to it. */ |
1329 | new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd" ; |
1330 | new_path = static_cast<char*>(ut_malloc_nokey(new_path_len)); |
1331 | memcpy(new_path, old_path, dir_len); |
1332 | |
1333 | snprintf(new_path + dir_len, new_path_len - dir_len, |
1334 | "%c%s.ibd" , OS_PATH_SEPARATOR, base_name); |
1335 | |
1336 | return(new_path); |
1337 | } |
1338 | |
1339 | /** This function reduces a null-terminated full remote path name into |
1340 | the path that is sent by MySQL for DATA DIRECTORY clause. It replaces |
1341 | the 'databasename/tablename.ibd' found at the end of the path with just |
1342 | 'tablename'. |
1343 | |
1344 | Since the result is always smaller than the path sent in, no new memory |
1345 | is allocated. The caller should allocate memory for the path sent in. |
1346 | This function manipulates that path in place. |
1347 | |
1348 | If the path format is not as expected, just return. The result is used |
1349 | to inform a SHOW CREATE TABLE command. |
1350 | @param[in,out] data_dir_path Full path/data_dir_path */ |
1351 | void |
1352 | os_file_make_data_dir_path( |
1353 | char* data_dir_path) |
1354 | { |
1355 | /* Replace the period before the extension with a null byte. */ |
1356 | char* ptr = strrchr((char*) data_dir_path, '.'); |
1357 | |
1358 | if (ptr == NULL) { |
1359 | return; |
1360 | } |
1361 | |
1362 | ptr[0] = '\0'; |
1363 | |
1364 | /* The tablename starts after the last slash. */ |
1365 | ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR); |
1366 | |
1367 | if (ptr == NULL) { |
1368 | return; |
1369 | } |
1370 | |
1371 | ptr[0] = '\0'; |
1372 | |
1373 | char* tablename = ptr + 1; |
1374 | |
1375 | /* The databasename starts after the next to last slash. */ |
1376 | ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR); |
1377 | |
1378 | if (ptr == NULL) { |
1379 | return; |
1380 | } |
1381 | |
1382 | ulint tablename_len = ut_strlen(tablename); |
1383 | |
1384 | ut_memmove(++ptr, tablename, tablename_len); |
1385 | |
1386 | ptr[tablename_len] = '\0'; |
1387 | } |
1388 | |
1389 | /** Check if the path refers to the root of a drive using a pointer |
1390 | to the last directory separator that the caller has fixed. |
1391 | @param[in] path path name |
1392 | @param[in] path last directory separator in the path |
1393 | @return true if this path is a drive root, false if not */ |
1394 | UNIV_INLINE |
1395 | bool |
1396 | os_file_is_root( |
1397 | const char* path, |
1398 | const char* last_slash) |
1399 | { |
1400 | return( |
1401 | #ifdef _WIN32 |
1402 | (last_slash == path + 2 && path[1] == ':') || |
1403 | #endif /* _WIN32 */ |
1404 | last_slash == path); |
1405 | } |
1406 | |
1407 | /** Return the parent directory component of a null-terminated path. |
1408 | Return a new buffer containing the string up to, but not including, |
1409 | the final component of the path. |
1410 | The path returned will not contain a trailing separator. |
1411 | Do not return a root path, return NULL instead. |
1412 | The final component trimmed off may be a filename or a directory name. |
1413 | If the final component is the only component of the path, return NULL. |
1414 | It is the caller's responsibility to free the returned string after it |
1415 | is no longer needed. |
1416 | @param[in] path Path name |
1417 | @return own: parent directory of the path */ |
1418 | static |
1419 | char* |
1420 | os_file_get_parent_dir( |
1421 | const char* path) |
1422 | { |
1423 | bool has_trailing_slash = false; |
1424 | |
1425 | /* Find the offset of the last slash */ |
1426 | const char* last_slash = strrchr(path, OS_PATH_SEPARATOR); |
1427 | |
1428 | if (!last_slash) { |
1429 | /* No slash in the path, return NULL */ |
1430 | return(NULL); |
1431 | } |
1432 | |
1433 | /* Ok, there is a slash. Is there anything after it? */ |
1434 | if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) { |
1435 | has_trailing_slash = true; |
1436 | } |
1437 | |
1438 | /* Reduce repetative slashes. */ |
1439 | while (last_slash > path |
1440 | && last_slash[-1] == OS_PATH_SEPARATOR) { |
1441 | last_slash--; |
1442 | } |
1443 | |
1444 | /* Check for the root of a drive. */ |
1445 | if (os_file_is_root(path, last_slash)) { |
1446 | return(NULL); |
1447 | } |
1448 | |
1449 | /* If a trailing slash prevented the first strrchr() from trimming |
1450 | the last component of the path, trim that component now. */ |
1451 | if (has_trailing_slash) { |
1452 | /* Back up to the previous slash. */ |
1453 | last_slash--; |
1454 | while (last_slash > path |
1455 | && last_slash[0] != OS_PATH_SEPARATOR) { |
1456 | last_slash--; |
1457 | } |
1458 | |
1459 | /* Reduce repetative slashes. */ |
1460 | while (last_slash > path |
1461 | && last_slash[-1] == OS_PATH_SEPARATOR) { |
1462 | last_slash--; |
1463 | } |
1464 | } |
1465 | |
1466 | /* Check for the root of a drive. */ |
1467 | if (os_file_is_root(path, last_slash)) { |
1468 | return(NULL); |
1469 | } |
1470 | |
1471 | /* Non-trivial directory component */ |
1472 | |
1473 | return(mem_strdupl(path, ulint(last_slash - path))); |
1474 | } |
1475 | #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR |
1476 | |
1477 | /* Test the function os_file_get_parent_dir. */ |
1478 | void |
1479 | test_os_file_get_parent_dir( |
1480 | const char* child_dir, |
1481 | const char* expected_dir) |
1482 | { |
1483 | char* child = mem_strdup(child_dir); |
1484 | char* expected = expected_dir == NULL ? NULL |
1485 | : mem_strdup(expected_dir); |
1486 | |
1487 | /* os_file_get_parent_dir() assumes that separators are |
1488 | converted to OS_PATH_SEPARATOR. */ |
1489 | os_normalize_path(child); |
1490 | os_normalize_path(expected); |
1491 | |
1492 | char* parent = os_file_get_parent_dir(child); |
1493 | |
1494 | bool unexpected = (expected == NULL |
1495 | ? (parent != NULL) |
1496 | : (0 != strcmp(parent, expected))); |
1497 | if (unexpected) { |
1498 | ib::fatal() << "os_file_get_parent_dir('" << child |
1499 | << "') returned '" << parent |
1500 | << "', instead of '" << expected << "'." ; |
1501 | } |
1502 | ut_free(parent); |
1503 | ut_free(child); |
1504 | ut_free(expected); |
1505 | } |
1506 | |
1507 | /* Test the function os_file_get_parent_dir. */ |
1508 | void |
1509 | unit_test_os_file_get_parent_dir() |
1510 | { |
1511 | test_os_file_get_parent_dir("/usr/lib/a" , "/usr/lib" ); |
1512 | test_os_file_get_parent_dir("/usr/" , NULL); |
1513 | test_os_file_get_parent_dir("//usr//" , NULL); |
1514 | test_os_file_get_parent_dir("usr" , NULL); |
1515 | test_os_file_get_parent_dir("usr//" , NULL); |
1516 | test_os_file_get_parent_dir("/" , NULL); |
1517 | test_os_file_get_parent_dir("//" , NULL); |
1518 | test_os_file_get_parent_dir("." , NULL); |
1519 | test_os_file_get_parent_dir(".." , NULL); |
1520 | # ifdef _WIN32 |
1521 | test_os_file_get_parent_dir("D:" , NULL); |
1522 | test_os_file_get_parent_dir("D:/" , NULL); |
1523 | test_os_file_get_parent_dir("D:\\" , NULL); |
1524 | test_os_file_get_parent_dir("D:/data" , NULL); |
1525 | test_os_file_get_parent_dir("D:/data/" , NULL); |
1526 | test_os_file_get_parent_dir("D:\\data\\" , NULL); |
1527 | test_os_file_get_parent_dir("D:///data/////" , NULL); |
1528 | test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\" , NULL); |
1529 | test_os_file_get_parent_dir("D:/data//a" , "D:/data" ); |
1530 | test_os_file_get_parent_dir("D:\\data\\\\a" , "D:\\data" ); |
1531 | test_os_file_get_parent_dir("D:///data//a///b/" , "D:///data//a" ); |
1532 | test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\" , "D:\\\\\\data\\\\a" ); |
1533 | #endif /* _WIN32 */ |
1534 | } |
1535 | #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */ |
1536 | |
1537 | |
1538 | /** Creates all missing subdirectories along the given path. |
1539 | @param[in] path Path name |
1540 | @return DB_SUCCESS if OK, otherwise error code. */ |
1541 | dberr_t |
1542 | os_file_create_subdirs_if_needed( |
1543 | const char* path) |
1544 | { |
1545 | if (srv_read_only_mode) { |
1546 | |
1547 | ib::error() |
1548 | << "read only mode set. Can't create " |
1549 | << "subdirectories '" << path << "'" ; |
1550 | |
1551 | return(DB_READ_ONLY); |
1552 | |
1553 | } |
1554 | |
1555 | char* subdir = os_file_get_parent_dir(path); |
1556 | |
1557 | if (subdir == NULL) { |
1558 | /* subdir is root or cwd, nothing to do */ |
1559 | return(DB_SUCCESS); |
1560 | } |
1561 | |
1562 | /* Test if subdir exists */ |
1563 | os_file_type_t type; |
1564 | bool subdir_exists; |
1565 | bool success = os_file_status(subdir, &subdir_exists, &type); |
1566 | |
1567 | if (success && !subdir_exists) { |
1568 | |
1569 | /* Subdir does not exist, create it */ |
1570 | dberr_t err = os_file_create_subdirs_if_needed(subdir); |
1571 | |
1572 | if (err != DB_SUCCESS) { |
1573 | |
1574 | ut_free(subdir); |
1575 | |
1576 | return(err); |
1577 | } |
1578 | |
1579 | success = os_file_create_directory(subdir, false); |
1580 | } |
1581 | |
1582 | ut_free(subdir); |
1583 | |
1584 | return(success ? DB_SUCCESS : DB_ERROR); |
1585 | } |
1586 | |
1587 | #ifndef _WIN32 |
1588 | |
1589 | /** Do the read/write |
1590 | @param[in] request The IO context and type |
1591 | @return the number of bytes read/written or negative value on error */ |
1592 | ssize_t |
1593 | SyncFileIO::execute(const IORequest& request) |
1594 | { |
1595 | ssize_t n_bytes; |
1596 | |
1597 | if (request.is_read()) { |
1598 | n_bytes = pread(m_fh, m_buf, m_n, m_offset); |
1599 | } else { |
1600 | ut_ad(request.is_write()); |
1601 | n_bytes = pwrite(m_fh, m_buf, m_n, m_offset); |
1602 | } |
1603 | |
1604 | return(n_bytes); |
1605 | } |
1606 | /** Free storage space associated with a section of the file. |
1607 | @param[in] fh Open file handle |
1608 | @param[in] off Starting offset (SEEK_SET) |
1609 | @param[in] len Size of the hole |
1610 | @return DB_SUCCESS or error code */ |
1611 | static |
1612 | dberr_t |
1613 | os_file_punch_hole_posix( |
1614 | os_file_t fh, |
1615 | os_offset_t off, |
1616 | os_offset_t len) |
1617 | { |
1618 | |
1619 | #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE |
1620 | const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; |
1621 | |
1622 | int ret = fallocate(fh, mode, off, len); |
1623 | |
1624 | if (ret == 0) { |
1625 | return(DB_SUCCESS); |
1626 | } |
1627 | |
1628 | if (errno == ENOTSUP) { |
1629 | return(DB_IO_NO_PUNCH_HOLE); |
1630 | } |
1631 | |
1632 | ib::warn() |
1633 | << "fallocate(" |
1634 | <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, " |
1635 | << off << ", " << len << ") returned errno: " |
1636 | << errno; |
1637 | |
1638 | return(DB_IO_ERROR); |
1639 | |
1640 | #elif defined(UNIV_SOLARIS) |
1641 | |
1642 | // Use F_FREESP |
1643 | |
1644 | #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */ |
1645 | |
1646 | return(DB_IO_NO_PUNCH_HOLE); |
1647 | } |
1648 | |
1649 | #if defined(LINUX_NATIVE_AIO) |
1650 | |
1651 | /** Linux native AIO handler */ |
1652 | class LinuxAIOHandler { |
1653 | public: |
1654 | /** |
1655 | @param[in] global_segment The global segment*/ |
1656 | LinuxAIOHandler(ulint global_segment) |
1657 | : |
1658 | m_global_segment(global_segment) |
1659 | { |
1660 | /* Should never be doing Sync IO here. */ |
1661 | ut_a(m_global_segment != ULINT_UNDEFINED); |
1662 | |
1663 | /* Find the array and the local segment. */ |
1664 | |
1665 | m_segment = AIO::get_array_and_local_segment( |
1666 | &m_array, m_global_segment); |
1667 | |
1668 | m_n_slots = m_array->slots_per_segment(); |
1669 | } |
1670 | |
1671 | /** Destructor */ |
1672 | ~LinuxAIOHandler() |
1673 | { |
1674 | // No op |
1675 | } |
1676 | |
1677 | /** |
1678 | Process a Linux AIO request |
1679 | @param[out] m1 the messages passed with the |
1680 | @param[out] m2 AIO request; note that in case the |
1681 | AIO operation failed, these output |
1682 | parameters are valid and can be used to |
1683 | restart the operation. |
1684 | @param[out] request IO context |
1685 | @return DB_SUCCESS or error code */ |
1686 | dberr_t poll(fil_node_t** m1, void** m2, IORequest* request); |
1687 | |
1688 | private: |
1689 | /** Resubmit an IO request that was only partially successful |
1690 | @param[in,out] slot Request to resubmit |
1691 | @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */ |
1692 | dberr_t resubmit(Slot* slot); |
1693 | |
1694 | /** Check if the AIO succeeded |
1695 | @param[in,out] slot The slot to check |
1696 | @return DB_SUCCESS, DB_FAIL if the operation should be retried or |
1697 | DB_IO_ERROR on all other errors */ |
1698 | dberr_t check_state(Slot* slot); |
1699 | |
1700 | /** @return true if a shutdown was detected */ |
1701 | bool is_shutdown() const |
1702 | { |
1703 | return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS |
1704 | && !buf_page_cleaner_is_active); |
1705 | } |
1706 | |
1707 | /** If no slot was found then the m_array->m_mutex will be released. |
1708 | @param[out] n_pending The number of pending IOs |
1709 | @return NULL or a slot that has completed IO */ |
1710 | Slot* find_completed_slot(ulint* n_pending); |
1711 | |
1712 | /** This is called from within the IO-thread. If there are no completed |
1713 | IO requests in the slot array, the thread calls this function to |
1714 | collect more requests from the Linux kernel. |
1715 | The IO-thread waits on io_getevents(), which is a blocking call, with |
1716 | a timeout value. Unless the system is very heavy loaded, keeping the |
1717 | IO-thread very busy, the io-thread will spend most of its time waiting |
1718 | in this function. |
1719 | The IO-thread also exits in this function. It checks server status at |
1720 | each wakeup and that is why we use timed wait in io_getevents(). */ |
1721 | void collect(); |
1722 | |
1723 | private: |
1724 | /** Slot array */ |
1725 | AIO* m_array; |
1726 | |
1727 | /** Number of slots inthe local segment */ |
1728 | ulint m_n_slots; |
1729 | |
1730 | /** The local segment to check */ |
1731 | ulint m_segment; |
1732 | |
1733 | /** The global segment */ |
1734 | ulint m_global_segment; |
1735 | }; |
1736 | |
1737 | /** Resubmit an IO request that was only partially successful |
1738 | @param[in,out] slot Request to resubmit |
1739 | @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */ |
1740 | dberr_t |
1741 | LinuxAIOHandler::resubmit(Slot* slot) |
1742 | { |
1743 | #ifdef UNIV_DEBUG |
1744 | /* Bytes already read/written out */ |
1745 | ulint n_bytes = slot->ptr - slot->buf; |
1746 | |
1747 | ut_ad(m_array->is_mutex_owned()); |
1748 | |
1749 | ut_ad(n_bytes < slot->original_len); |
1750 | ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes); |
1751 | /* Partial read or write scenario */ |
1752 | ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes)); |
1753 | #endif /* UNIV_DEBUG */ |
1754 | |
1755 | slot->len -= slot->n_bytes; |
1756 | slot->ptr += slot->n_bytes; |
1757 | slot->offset += slot->n_bytes; |
1758 | |
1759 | /* Resetting the bytes read/written */ |
1760 | slot->n_bytes = 0; |
1761 | slot->io_already_done = false; |
1762 | |
1763 | struct iocb* iocb = &slot->control; |
1764 | |
1765 | if (slot->type.is_read()) { |
1766 | |
1767 | io_prep_pread( |
1768 | iocb, |
1769 | slot->file, |
1770 | slot->ptr, |
1771 | slot->len, |
1772 | static_cast<off_t>(slot->offset)); |
1773 | } else { |
1774 | |
1775 | ut_a(slot->type.is_write()); |
1776 | |
1777 | io_prep_pwrite( |
1778 | iocb, |
1779 | slot->file, |
1780 | slot->ptr, |
1781 | slot->len, |
1782 | static_cast<off_t>(slot->offset)); |
1783 | } |
1784 | |
1785 | iocb->data = slot; |
1786 | |
1787 | /* Resubmit an I/O request */ |
1788 | int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb); |
1789 | |
1790 | if (ret < -1) { |
1791 | errno = -ret; |
1792 | } |
1793 | |
1794 | return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS); |
1795 | } |
1796 | |
1797 | /** Check if the AIO succeeded |
1798 | @param[in,out] slot The slot to check |
1799 | @return DB_SUCCESS, DB_FAIL if the operation should be retried or |
1800 | DB_IO_ERROR on all other errors */ |
1801 | dberr_t |
1802 | LinuxAIOHandler::check_state(Slot* slot) |
1803 | { |
1804 | ut_ad(m_array->is_mutex_owned()); |
1805 | |
1806 | /* Note that it may be that there is more then one completed |
1807 | IO requests. We process them one at a time. We may have a case |
1808 | here to improve the performance slightly by dealing with all |
1809 | requests in one sweep. */ |
1810 | |
1811 | srv_set_io_thread_op_info( |
1812 | m_global_segment, "processing completed aio requests" ); |
1813 | |
1814 | ut_ad(slot->io_already_done); |
1815 | |
1816 | dberr_t err = DB_SUCCESS; |
1817 | |
1818 | if (slot->ret == 0) { |
1819 | |
1820 | err = AIOHandler::post_io_processing(slot); |
1821 | |
1822 | } else { |
1823 | errno = -slot->ret; |
1824 | |
1825 | /* os_file_handle_error does tell us if we should retry |
1826 | this IO. As it stands now, we don't do this retry when |
1827 | reaping requests from a different context than |
1828 | the dispatcher. This non-retry logic is the same for |
1829 | Windows and Linux native AIO. |
1830 | We should probably look into this to transparently |
1831 | re-submit the IO. */ |
1832 | os_file_handle_error(slot->name, "Linux aio" ); |
1833 | |
1834 | err = DB_IO_ERROR; |
1835 | } |
1836 | |
1837 | return(err); |
1838 | } |
1839 | |
1840 | /** If no slot was found then the m_array->m_mutex will be released. |
1841 | @param[out] n_pending The number of pending IOs |
1842 | @return NULL or a slot that has completed IO */ |
1843 | Slot* |
1844 | LinuxAIOHandler::find_completed_slot(ulint* n_pending) |
1845 | { |
1846 | ulint offset = m_n_slots * m_segment; |
1847 | |
1848 | *n_pending = 0; |
1849 | |
1850 | m_array->acquire(); |
1851 | |
1852 | Slot* slot = m_array->at(offset); |
1853 | |
1854 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
1855 | |
1856 | if (slot->is_reserved) { |
1857 | |
1858 | ++*n_pending; |
1859 | |
1860 | if (slot->io_already_done) { |
1861 | |
1862 | /* Something for us to work on. |
1863 | Note: We don't release the mutex. */ |
1864 | return(slot); |
1865 | } |
1866 | } |
1867 | } |
1868 | |
1869 | m_array->release(); |
1870 | |
1871 | return(NULL); |
1872 | } |
1873 | |
1874 | /** This function is only used in Linux native asynchronous i/o. This is |
1875 | called from within the io-thread. If there are no completed IO requests |
1876 | in the slot array, the thread calls this function to collect more |
1877 | requests from the kernel. |
1878 | The io-thread waits on io_getevents(), which is a blocking call, with |
1879 | a timeout value. Unless the system is very heavy loaded, keeping the |
1880 | io-thread very busy, the io-thread will spend most of its time waiting |
1881 | in this function. |
1882 | The io-thread also exits in this function. It checks server status at |
1883 | each wakeup and that is why we use timed wait in io_getevents(). */ |
1884 | void |
1885 | LinuxAIOHandler::collect() |
1886 | { |
1887 | ut_ad(m_n_slots > 0); |
1888 | ut_ad(m_array != NULL); |
1889 | ut_ad(m_segment < m_array->get_n_segments()); |
1890 | |
1891 | /* Which io_context we are going to use. */ |
1892 | io_context* io_ctx = m_array->io_ctx(m_segment); |
1893 | |
1894 | /* Starting point of the m_segment we will be working on. */ |
1895 | ulint start_pos = m_segment * m_n_slots; |
1896 | |
1897 | /* End point. */ |
1898 | ulint end_pos = start_pos + m_n_slots; |
1899 | |
1900 | for (;;) { |
1901 | struct io_event* events; |
1902 | |
1903 | /* Which part of event array we are going to work on. */ |
1904 | events = m_array->io_events(m_segment * m_n_slots); |
1905 | |
1906 | /* Initialize the events. */ |
1907 | memset(events, 0, sizeof(*events) * m_n_slots); |
1908 | |
1909 | /* The timeout value is arbitrary. We probably need |
1910 | to experiment with it a little. */ |
1911 | struct timespec timeout; |
1912 | |
1913 | timeout.tv_sec = 0; |
1914 | timeout.tv_nsec = OS_AIO_REAP_TIMEOUT; |
1915 | |
1916 | int ret; |
1917 | |
1918 | ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout); |
1919 | |
1920 | for (int i = 0; i < ret; ++i) { |
1921 | |
1922 | struct iocb* iocb; |
1923 | |
1924 | iocb = reinterpret_cast<struct iocb*>(events[i].obj); |
1925 | ut_a(iocb != NULL); |
1926 | |
1927 | Slot* slot = reinterpret_cast<Slot*>(iocb->data); |
1928 | |
1929 | /* Some sanity checks. */ |
1930 | ut_a(slot != NULL); |
1931 | ut_a(slot->is_reserved); |
1932 | |
1933 | /* We are not scribbling previous segment. */ |
1934 | ut_a(slot->pos >= start_pos); |
1935 | |
1936 | /* We have not overstepped to next segment. */ |
1937 | ut_a(slot->pos < end_pos); |
1938 | |
1939 | /* Deallocate unused blocks from file system. |
1940 | This is newer done to page 0 or to log files.*/ |
1941 | if (slot->offset > 0 |
1942 | && !slot->type.is_log() |
1943 | && slot->type.is_write() |
1944 | && slot->type.punch_hole()) { |
1945 | |
1946 | slot->err = slot->type.punch_hole( |
1947 | slot->file, |
1948 | slot->offset, slot->len); |
1949 | } else { |
1950 | slot->err = DB_SUCCESS; |
1951 | } |
1952 | |
1953 | /* Mark this request as completed. The error handling |
1954 | will be done in the calling function. */ |
1955 | m_array->acquire(); |
1956 | |
1957 | slot->ret = events[i].res2; |
1958 | slot->io_already_done = true; |
1959 | slot->n_bytes = events[i].res; |
1960 | |
1961 | m_array->release(); |
1962 | } |
1963 | |
1964 | if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS |
1965 | || !buf_page_cleaner_is_active |
1966 | || ret > 0) { |
1967 | |
1968 | break; |
1969 | } |
1970 | |
1971 | /* This error handling is for any error in collecting the |
1972 | IO requests. The errors, if any, for any particular IO |
1973 | request are simply passed on to the calling routine. */ |
1974 | |
1975 | switch (ret) { |
1976 | case -EAGAIN: |
1977 | /* Not enough resources! Try again. */ |
1978 | |
1979 | case -EINTR: |
1980 | /* Interrupted! The behaviour in case of an interrupt. |
1981 | If we have some completed IOs available then the |
1982 | return code will be the number of IOs. We get EINTR |
1983 | only if there are no completed IOs and we have been |
1984 | interrupted. */ |
1985 | |
1986 | case 0: |
1987 | /* No pending request! Go back and check again. */ |
1988 | |
1989 | continue; |
1990 | } |
1991 | |
1992 | /* All other errors should cause a trap for now. */ |
1993 | ib::fatal() |
1994 | << "Unexpected ret_code[" << ret |
1995 | << "] from io_getevents()!" ; |
1996 | |
1997 | break; |
1998 | } |
1999 | } |
2000 | |
2001 | /** Process a Linux AIO request |
2002 | @param[out] m1 the messages passed with the |
2003 | @param[out] m2 AIO request; note that in case the |
2004 | AIO operation failed, these output |
2005 | parameters are valid and can be used to |
2006 | restart the operation. |
2007 | @param[out] request IO context |
2008 | @return DB_SUCCESS or error code */ |
2009 | dberr_t |
2010 | LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request) |
2011 | { |
2012 | dberr_t err = DB_SUCCESS; |
2013 | Slot* slot; |
2014 | |
2015 | /* Loop until we have found a completed request. */ |
2016 | for (;;) { |
2017 | |
2018 | ulint n_pending; |
2019 | |
2020 | slot = find_completed_slot(&n_pending); |
2021 | |
2022 | if (slot != NULL) { |
2023 | |
2024 | ut_ad(m_array->is_mutex_owned()); |
2025 | |
2026 | err = check_state(slot); |
2027 | |
2028 | /* DB_FAIL is not a hard error, we should retry */ |
2029 | if (err != DB_FAIL) { |
2030 | break; |
2031 | } |
2032 | |
2033 | /* Partial IO, resubmit request for |
2034 | remaining bytes to read/write */ |
2035 | err = resubmit(slot); |
2036 | |
2037 | if (err != DB_SUCCESS) { |
2038 | break; |
2039 | } |
2040 | |
2041 | m_array->release(); |
2042 | |
2043 | } else if (is_shutdown() && n_pending == 0) { |
2044 | |
2045 | /* There is no completed request. If there is |
2046 | no pending request at all, and the system is |
2047 | being shut down, exit. */ |
2048 | |
2049 | *m1 = NULL; |
2050 | *m2 = NULL; |
2051 | |
2052 | return(DB_SUCCESS); |
2053 | |
2054 | } else { |
2055 | |
2056 | /* Wait for some request. Note that we return |
2057 | from wait if we have found a request. */ |
2058 | |
2059 | srv_set_io_thread_op_info( |
2060 | m_global_segment, |
2061 | "waiting for completed aio requests" ); |
2062 | |
2063 | collect(); |
2064 | } |
2065 | } |
2066 | |
2067 | if (err == DB_IO_PARTIAL_FAILED) { |
2068 | /* Aborting in case of submit failure */ |
2069 | ib::fatal() |
2070 | << "Native Linux AIO interface. " |
2071 | "io_submit() call failed when " |
2072 | "resubmitting a partial I/O " |
2073 | "request on the file " << slot->name |
2074 | << "." ; |
2075 | } |
2076 | |
2077 | *m1 = slot->m1; |
2078 | *m2 = slot->m2; |
2079 | |
2080 | *request = slot->type; |
2081 | |
2082 | m_array->release(slot); |
2083 | |
2084 | m_array->release(); |
2085 | |
2086 | return(err); |
2087 | } |
2088 | |
2089 | /** This function is only used in Linux native asynchronous i/o. |
2090 | Waits for an aio operation to complete. This function is used to wait for |
2091 | the completed requests. The aio array of pending requests is divided |
2092 | into segments. The thread specifies which segment or slot it wants to wait |
2093 | for. NOTE: this function will also take care of freeing the aio slot, |
2094 | therefore no other thread is allowed to do the freeing! |
2095 | |
2096 | @param[in] global_seg segment number in the aio array |
2097 | to wait for; segment 0 is the ibuf |
2098 | i/o thread, segment 1 is log i/o thread, |
2099 | then follow the non-ibuf read threads, |
2100 | and the last are the non-ibuf write |
2101 | threads. |
2102 | @param[out] m1 the messages passed with the |
2103 | @param[out] m2 AIO request; note that in case the |
2104 | AIO operation failed, these output |
2105 | parameters are valid and can be used to |
2106 | restart the operation. |
2107 | @param[out]xi request IO context |
2108 | @return DB_SUCCESS if the IO was successful */ |
2109 | static |
2110 | dberr_t |
2111 | os_aio_linux_handler( |
2112 | ulint global_segment, |
2113 | fil_node_t** m1, |
2114 | void** m2, |
2115 | IORequest* request) |
2116 | { |
2117 | return LinuxAIOHandler(global_segment).poll(m1, m2, request); |
2118 | } |
2119 | |
2120 | /** Dispatch an AIO request to the kernel. |
2121 | @param[in,out] slot an already reserved slot |
2122 | @return true on success. */ |
2123 | bool |
2124 | AIO::linux_dispatch(Slot* slot) |
2125 | { |
2126 | ut_a(slot->is_reserved); |
2127 | ut_ad(slot->type.validate()); |
2128 | |
2129 | /* Find out what we are going to work with. |
2130 | The iocb struct is directly in the slot. |
2131 | The io_context is one per segment. */ |
2132 | |
2133 | ulint io_ctx_index; |
2134 | struct iocb* iocb = &slot->control; |
2135 | |
2136 | io_ctx_index = (slot->pos * m_n_segments) / m_slots.size(); |
2137 | |
2138 | int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb); |
2139 | |
2140 | /* io_submit() returns number of successfully queued requests |
2141 | or -errno. */ |
2142 | |
2143 | if (ret != 1) { |
2144 | errno = -ret; |
2145 | } |
2146 | |
2147 | return(ret == 1); |
2148 | } |
2149 | |
2150 | /** Creates an io_context for native linux AIO. |
2151 | @param[in] max_events number of events |
2152 | @param[out] io_ctx io_ctx to initialize. |
2153 | @return true on success. */ |
2154 | bool |
2155 | AIO::linux_create_io_ctx( |
2156 | unsigned max_events, |
2157 | io_context_t* io_ctx) |
2158 | { |
2159 | ssize_t n_retries = 0; |
2160 | |
2161 | for (;;) { |
2162 | |
2163 | memset(io_ctx, 0x0, sizeof(*io_ctx)); |
2164 | |
2165 | /* Initialize the io_ctx. Tell it how many pending |
2166 | IO requests this context will handle. */ |
2167 | |
2168 | int ret = io_setup(max_events, io_ctx); |
2169 | |
2170 | if (ret == 0) { |
2171 | /* Success. Return now. */ |
2172 | return(true); |
2173 | } |
2174 | |
2175 | /* If we hit EAGAIN we'll make a few attempts before failing. */ |
2176 | |
2177 | switch (ret) { |
2178 | case -EAGAIN: |
2179 | if (n_retries == 0) { |
2180 | /* First time around. */ |
2181 | ib::warn() |
2182 | << "io_setup() failed with EAGAIN." |
2183 | " Will make " |
2184 | << OS_AIO_IO_SETUP_RETRY_ATTEMPTS |
2185 | << " attempts before giving up." ; |
2186 | } |
2187 | |
2188 | if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) { |
2189 | |
2190 | ++n_retries; |
2191 | |
2192 | ib::warn() |
2193 | << "io_setup() attempt " |
2194 | << n_retries << "." ; |
2195 | |
2196 | os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP); |
2197 | |
2198 | continue; |
2199 | } |
2200 | |
2201 | /* Have tried enough. Better call it a day. */ |
2202 | ib::error() |
2203 | << "io_setup() failed with EAGAIN after " |
2204 | << OS_AIO_IO_SETUP_RETRY_ATTEMPTS |
2205 | << " attempts." ; |
2206 | break; |
2207 | |
2208 | case -ENOSYS: |
2209 | ib::error() |
2210 | << "Linux Native AIO interface" |
2211 | " is not supported on this platform. Please" |
2212 | " check your OS documentation and install" |
2213 | " appropriate binary of InnoDB." ; |
2214 | |
2215 | break; |
2216 | |
2217 | default: |
2218 | ib::error() |
2219 | << "Linux Native AIO setup" |
2220 | << " returned following error[" |
2221 | << ret << "]" ; |
2222 | break; |
2223 | } |
2224 | |
2225 | ib::info() |
2226 | << "You can disable Linux Native AIO by" |
2227 | " setting innodb_use_native_aio = 0 in my.cnf" ; |
2228 | |
2229 | break; |
2230 | } |
2231 | |
2232 | return(false); |
2233 | } |
2234 | |
2235 | /** Checks if the system supports native linux aio. On some kernel |
2236 | versions where native aio is supported it won't work on tmpfs. In such |
2237 | cases we can't use native aio as it is not possible to mix simulated |
2238 | and native aio. |
2239 | @return: true if supported, false otherwise. */ |
2240 | bool |
2241 | AIO::is_linux_native_aio_supported() |
2242 | { |
2243 | int fd; |
2244 | io_context_t io_ctx; |
2245 | char name[1000]; |
2246 | |
2247 | if (!linux_create_io_ctx(1, &io_ctx)) { |
2248 | |
2249 | /* The platform does not support native aio. */ |
2250 | |
2251 | return(false); |
2252 | |
2253 | } else if (!srv_read_only_mode) { |
2254 | |
2255 | /* Now check if tmpdir supports native aio ops. */ |
2256 | fd = innobase_mysql_tmpfile(NULL); |
2257 | |
2258 | if (fd < 0) { |
2259 | ib::warn() |
2260 | << "Unable to create temp file to check" |
2261 | " native AIO support." ; |
2262 | |
2263 | return(false); |
2264 | } |
2265 | } else { |
2266 | |
2267 | os_normalize_path(srv_log_group_home_dir); |
2268 | |
2269 | ulint dirnamelen = strlen(srv_log_group_home_dir); |
2270 | |
2271 | ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile" ); |
2272 | |
2273 | memcpy(name, srv_log_group_home_dir, dirnamelen); |
2274 | |
2275 | /* Add a path separator if needed. */ |
2276 | if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) { |
2277 | |
2278 | name[dirnamelen++] = OS_PATH_SEPARATOR; |
2279 | } |
2280 | |
2281 | strcpy(name + dirnamelen, "ib_logfile0" ); |
2282 | |
2283 | fd = open(name, O_RDONLY | O_CLOEXEC); |
2284 | |
2285 | if (fd == -1) { |
2286 | |
2287 | ib::warn() |
2288 | << "Unable to open" |
2289 | << " \"" << name << "\" to check native" |
2290 | << " AIO read support." ; |
2291 | |
2292 | return(false); |
2293 | } |
2294 | } |
2295 | |
2296 | struct io_event io_event; |
2297 | |
2298 | memset(&io_event, 0x0, sizeof(io_event)); |
2299 | |
2300 | byte* buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2)); |
2301 | byte* ptr = static_cast<byte*>(ut_align(buf, srv_page_size)); |
2302 | |
2303 | struct iocb iocb; |
2304 | |
2305 | /* Suppress valgrind warning. */ |
2306 | memset(buf, 0x00, srv_page_size * 2); |
2307 | memset(&iocb, 0x0, sizeof(iocb)); |
2308 | |
2309 | struct iocb* p_iocb = &iocb; |
2310 | |
2311 | if (!srv_read_only_mode) { |
2312 | |
2313 | io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0); |
2314 | |
2315 | } else { |
2316 | ut_a(srv_page_size >= 512); |
2317 | io_prep_pread(p_iocb, fd, ptr, 512, 0); |
2318 | } |
2319 | |
2320 | int err = io_submit(io_ctx, 1, &p_iocb); |
2321 | |
2322 | if (err >= 1) { |
2323 | /* Now collect the submitted IO request. */ |
2324 | err = io_getevents(io_ctx, 1, 1, &io_event, NULL); |
2325 | } |
2326 | |
2327 | ut_free(buf); |
2328 | close(fd); |
2329 | |
2330 | switch (err) { |
2331 | case 1: |
2332 | return(true); |
2333 | |
2334 | case -EINVAL: |
2335 | case -ENOSYS: |
2336 | ib::error() |
2337 | << "Linux Native AIO not supported. You can either" |
2338 | " move " |
2339 | << (srv_read_only_mode ? name : "tmpdir" ) |
2340 | << " to a file system that supports native" |
2341 | " AIO or you can set innodb_use_native_aio to" |
2342 | " FALSE to avoid this message." ; |
2343 | |
2344 | /* fall through. */ |
2345 | default: |
2346 | ib::error() |
2347 | << "Linux Native AIO check on " |
2348 | << (srv_read_only_mode ? name : "tmpdir" ) |
2349 | << "returned error[" << -err << "]" ; |
2350 | } |
2351 | |
2352 | return(false); |
2353 | } |
2354 | |
2355 | #endif /* LINUX_NATIVE_AIO */ |
2356 | |
2357 | /** Retrieves the last error number if an error occurs in a file io function. |
2358 | The number should be retrieved before any other OS calls (because they may |
2359 | overwrite the error number). If the number is not known to this program, |
2360 | the OS error number + 100 is returned. |
2361 | @param[in] report_all_errors true if we want an error message |
2362 | printed of all errors |
2363 | @param[in] on_error_silent true then don't print any diagnostic |
2364 | to the log |
2365 | @return error number, or OS error number + 100 */ |
2366 | static |
2367 | ulint |
2368 | os_file_get_last_error_low( |
2369 | bool report_all_errors, |
2370 | bool on_error_silent) |
2371 | { |
2372 | int err = errno; |
2373 | |
2374 | if (err == 0) { |
2375 | return(0); |
2376 | } |
2377 | |
2378 | if (report_all_errors |
2379 | || (err != ENOSPC && err != EEXIST && !on_error_silent)) { |
2380 | |
2381 | ib::error() |
2382 | << "Operating system error number " |
2383 | << err |
2384 | << " in a file operation." ; |
2385 | |
2386 | if (err == ENOENT) { |
2387 | |
2388 | ib::error() |
2389 | << "The error means the system" |
2390 | " cannot find the path specified." ; |
2391 | |
2392 | if (srv_is_being_started) { |
2393 | |
2394 | ib::error() |
2395 | << "If you are installing InnoDB," |
2396 | " remember that you must create" |
2397 | " directories yourself, InnoDB" |
2398 | " does not create them." ; |
2399 | } |
2400 | } else if (err == EACCES) { |
2401 | |
2402 | ib::error() |
2403 | << "The error means mysqld does not have" |
2404 | " the access rights to the directory." ; |
2405 | |
2406 | } else { |
2407 | if (strerror(err) != NULL) { |
2408 | |
2409 | ib::error() |
2410 | << "Error number " << err << " means '" |
2411 | << strerror(err) << "'" ; |
2412 | } |
2413 | |
2414 | ib::info() << OPERATING_SYSTEM_ERROR_MSG; |
2415 | } |
2416 | } |
2417 | |
2418 | switch (err) { |
2419 | case ENOSPC: |
2420 | return(OS_FILE_DISK_FULL); |
2421 | case ENOENT: |
2422 | return(OS_FILE_NOT_FOUND); |
2423 | case EEXIST: |
2424 | return(OS_FILE_ALREADY_EXISTS); |
2425 | case EXDEV: |
2426 | case ENOTDIR: |
2427 | case EISDIR: |
2428 | return(OS_FILE_PATH_ERROR); |
2429 | case EAGAIN: |
2430 | if (srv_use_native_aio) { |
2431 | return(OS_FILE_AIO_RESOURCES_RESERVED); |
2432 | } |
2433 | break; |
2434 | case EINTR: |
2435 | if (srv_use_native_aio) { |
2436 | return(OS_FILE_AIO_INTERRUPTED); |
2437 | } |
2438 | break; |
2439 | case EACCES: |
2440 | return(OS_FILE_ACCESS_VIOLATION); |
2441 | } |
2442 | return(OS_FILE_ERROR_MAX + err); |
2443 | } |
2444 | |
2445 | /** Wrapper to fsync(2) that retries the call on some errors. |
2446 | Returns the value 0 if successful; otherwise the value -1 is returned and |
2447 | the global variable errno is set to indicate the error. |
2448 | @param[in] file open file handle |
2449 | @return 0 if success, -1 otherwise */ |
2450 | static |
2451 | int |
2452 | os_file_fsync_posix( |
2453 | os_file_t file) |
2454 | { |
2455 | ulint failures = 0; |
2456 | |
2457 | for (;;) { |
2458 | |
2459 | ++os_n_fsyncs; |
2460 | |
2461 | int ret = fsync(file); |
2462 | |
2463 | if (ret == 0) { |
2464 | return(ret); |
2465 | } |
2466 | |
2467 | switch(errno) { |
2468 | case ENOLCK: |
2469 | |
2470 | ++failures; |
2471 | ut_a(failures < 1000); |
2472 | |
2473 | if (!(failures % 100)) { |
2474 | |
2475 | ib::warn() |
2476 | << "fsync(): " |
2477 | << "No locks available; retrying" ; |
2478 | } |
2479 | |
2480 | /* 0.2 sec */ |
2481 | os_thread_sleep(200000); |
2482 | break; |
2483 | |
2484 | case EIO: |
2485 | |
2486 | ++failures; |
2487 | ut_a(failures < 1000); |
2488 | |
2489 | if (!(failures % 100)) { |
2490 | |
2491 | ib::warn() |
2492 | << "fsync(): " |
2493 | << "An error occurred during " |
2494 | << "synchronization," |
2495 | << " retrying" ; |
2496 | } |
2497 | |
2498 | /* 0.2 sec */ |
2499 | os_thread_sleep(200000); |
2500 | break; |
2501 | |
2502 | case EINTR: |
2503 | |
2504 | ++failures; |
2505 | ut_a(failures < 2000); |
2506 | break; |
2507 | |
2508 | default: |
2509 | ut_error; |
2510 | break; |
2511 | } |
2512 | } |
2513 | |
2514 | ut_error; |
2515 | |
2516 | return(-1); |
2517 | } |
2518 | |
2519 | /** Check the existence and type of the given file. |
2520 | @param[in] path path name of file |
2521 | @param[out] exists true if the file exists |
2522 | @param[out] type Type of the file, if it exists |
2523 | @return true if call succeeded */ |
2524 | static |
2525 | bool |
2526 | os_file_status_posix( |
2527 | const char* path, |
2528 | bool* exists, |
2529 | os_file_type_t* type) |
2530 | { |
2531 | struct stat statinfo; |
2532 | |
2533 | int ret = stat(path, &statinfo); |
2534 | |
2535 | *exists = !ret; |
2536 | |
2537 | if (!ret) { |
2538 | /* file exists, everything OK */ |
2539 | |
2540 | } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { |
2541 | /* file does not exist */ |
2542 | return(true); |
2543 | |
2544 | } else { |
2545 | /* file exists, but stat call failed */ |
2546 | os_file_handle_error_no_exit(path, "stat" , false); |
2547 | return(false); |
2548 | } |
2549 | |
2550 | if (S_ISDIR(statinfo.st_mode)) { |
2551 | *type = OS_FILE_TYPE_DIR; |
2552 | |
2553 | } else if (S_ISLNK(statinfo.st_mode)) { |
2554 | *type = OS_FILE_TYPE_LINK; |
2555 | |
2556 | } else if (S_ISREG(statinfo.st_mode)) { |
2557 | *type = OS_FILE_TYPE_FILE; |
2558 | } else { |
2559 | *type = OS_FILE_TYPE_UNKNOWN; |
2560 | } |
2561 | |
2562 | return(true); |
2563 | } |
2564 | |
2565 | /** NOTE! Use the corresponding macro os_file_flush(), not directly this |
2566 | function! |
2567 | Flushes the write buffers of a given file to the disk. |
2568 | @param[in] file handle to a file |
2569 | @return true if success */ |
2570 | bool |
2571 | os_file_flush_func( |
2572 | os_file_t file) |
2573 | { |
2574 | int ret; |
2575 | |
2576 | WAIT_ALLOW_WRITES(); |
2577 | ret = os_file_fsync_posix(file); |
2578 | |
2579 | if (ret == 0) { |
2580 | return(true); |
2581 | } |
2582 | |
2583 | /* Since Linux returns EINVAL if the 'file' is actually a raw device, |
2584 | we choose to ignore that error if we are using raw disks */ |
2585 | |
2586 | if (srv_start_raw_disk_in_use && errno == EINVAL) { |
2587 | |
2588 | return(true); |
2589 | } |
2590 | |
2591 | ib::error() << "The OS said file flush did not succeed" ; |
2592 | |
2593 | os_file_handle_error(NULL, "flush" ); |
2594 | |
2595 | /* It is a fatal error if a file flush does not succeed, because then |
2596 | the database can get corrupt on disk */ |
2597 | ut_error; |
2598 | |
2599 | return(false); |
2600 | } |
2601 | |
2602 | /** NOTE! Use the corresponding macro os_file_create_simple(), not directly |
2603 | this function! |
2604 | A simple function to open or create a file. |
2605 | @param[in] name name of the file or path as a null-terminated |
2606 | string |
2607 | @param[in] create_mode create mode |
2608 | @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE |
2609 | @param[in] read_only if true, read only checks are enforced |
2610 | @param[out] success true if succeed, false if error |
2611 | @return handle to the file, not defined if error, error number |
2612 | can be retrieved with os_file_get_last_error */ |
2613 | pfs_os_file_t |
2614 | os_file_create_simple_func( |
2615 | const char* name, |
2616 | ulint create_mode, |
2617 | ulint access_type, |
2618 | bool read_only, |
2619 | bool* success) |
2620 | { |
2621 | pfs_os_file_t file; |
2622 | |
2623 | *success = false; |
2624 | |
2625 | int create_flag; |
2626 | const char* mode_str = NULL; |
2627 | |
2628 | if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { |
2629 | WAIT_ALLOW_WRITES(); |
2630 | } |
2631 | |
2632 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
2633 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
2634 | |
2635 | if (create_mode == OS_FILE_OPEN) { |
2636 | mode_str = "OPEN" ; |
2637 | |
2638 | if (access_type == OS_FILE_READ_ONLY) { |
2639 | |
2640 | create_flag = O_RDONLY; |
2641 | |
2642 | } else if (read_only) { |
2643 | |
2644 | create_flag = O_RDONLY; |
2645 | |
2646 | } else { |
2647 | create_flag = O_RDWR; |
2648 | } |
2649 | |
2650 | } else if (read_only) { |
2651 | |
2652 | mode_str = "OPEN" ; |
2653 | create_flag = O_RDONLY; |
2654 | |
2655 | } else if (create_mode == OS_FILE_CREATE) { |
2656 | |
2657 | mode_str = "CREATE" ; |
2658 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
2659 | |
2660 | } else if (create_mode == OS_FILE_CREATE_PATH) { |
2661 | |
2662 | mode_str = "CREATE PATH" ; |
2663 | /* Create subdirs along the path if needed. */ |
2664 | |
2665 | *success = os_file_create_subdirs_if_needed(name); |
2666 | |
2667 | if (!*success) { |
2668 | |
2669 | ib::error() |
2670 | << "Unable to create subdirectories '" |
2671 | << name << "'" ; |
2672 | |
2673 | return(OS_FILE_CLOSED); |
2674 | } |
2675 | |
2676 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
2677 | create_mode = OS_FILE_CREATE; |
2678 | } else { |
2679 | |
2680 | ib::error() |
2681 | << "Unknown file create mode (" |
2682 | << create_mode |
2683 | << " for file '" << name << "'" ; |
2684 | |
2685 | return(OS_FILE_CLOSED); |
2686 | } |
2687 | |
2688 | bool retry; |
2689 | |
2690 | do { |
2691 | file = open(name, create_flag, os_innodb_umask); |
2692 | |
2693 | if (file == -1) { |
2694 | *success = false; |
2695 | retry = os_file_handle_error( |
2696 | name, |
2697 | create_mode == OS_FILE_OPEN |
2698 | ? "open" : "create" ); |
2699 | } else { |
2700 | *success = true; |
2701 | retry = false; |
2702 | } |
2703 | |
2704 | } while (retry); |
2705 | |
2706 | /* This function is always called for data files, we should disable |
2707 | OS caching (O_DIRECT) here as we do in os_file_create_func(), so |
2708 | we open the same file in the same mode, see man page of open(2). */ |
2709 | if (!srv_read_only_mode |
2710 | && *success |
2711 | && (srv_file_flush_method == SRV_O_DIRECT |
2712 | || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { |
2713 | |
2714 | os_file_set_nocache(file, name, mode_str); |
2715 | } |
2716 | |
2717 | #ifdef USE_FILE_LOCK |
2718 | if (!read_only |
2719 | && *success |
2720 | && (access_type == OS_FILE_READ_WRITE) |
2721 | && os_file_lock(file, name)) { |
2722 | |
2723 | *success = false; |
2724 | close(file); |
2725 | file = -1; |
2726 | } |
2727 | #endif /* USE_FILE_LOCK */ |
2728 | |
2729 | return(file); |
2730 | } |
2731 | |
2732 | /** This function attempts to create a directory named pathname. The new |
2733 | directory gets default permissions. On Unix the permissions are |
2734 | (0770 & ~umask). If the directory exists already, nothing is done and |
2735 | the call succeeds, unless the fail_if_exists arguments is true. |
2736 | If another error occurs, such as a permission error, this does not crash, |
2737 | but reports the error and returns false. |
2738 | @param[in] pathname directory name as null-terminated string |
2739 | @param[in] fail_if_exists if true, pre-existing directory is treated as |
2740 | an error. |
2741 | @return true if call succeeds, false on error */ |
2742 | bool |
2743 | os_file_create_directory( |
2744 | const char* pathname, |
2745 | bool fail_if_exists) |
2746 | { |
2747 | int rcode; |
2748 | |
2749 | WAIT_ALLOW_WRITES(); |
2750 | rcode = mkdir(pathname, 0770); |
2751 | |
2752 | if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { |
2753 | /* failure */ |
2754 | os_file_handle_error_no_exit(pathname, "mkdir" , false); |
2755 | |
2756 | return(false); |
2757 | } |
2758 | |
2759 | return(true); |
2760 | } |
2761 | |
2762 | /** |
2763 | The os_file_opendir() function opens a directory stream corresponding to the |
2764 | directory named by the dirname argument. The directory stream is positioned |
2765 | at the first entry. In both Unix and Windows we automatically skip the '.' |
2766 | and '..' items at the start of the directory listing. |
2767 | @param[in] dirname directory name; it must not contain a trailing |
2768 | '\' or '/' |
2769 | @param[in] is_fatal true if we should treat an error as a fatal |
2770 | error; if we try to open symlinks then we do |
2771 | not wish a fatal error if it happens not to be |
2772 | a directory |
2773 | @return directory stream, NULL if error */ |
2774 | os_file_dir_t |
2775 | os_file_opendir( |
2776 | const char* dirname, |
2777 | bool error_is_fatal) |
2778 | { |
2779 | os_file_dir_t dir; |
2780 | dir = opendir(dirname); |
2781 | |
2782 | if (dir == NULL && error_is_fatal) { |
2783 | os_file_handle_error(dirname, "opendir" ); |
2784 | } |
2785 | |
2786 | return(dir); |
2787 | } |
2788 | |
2789 | /** Closes a directory stream. |
2790 | @param[in] dir directory stream |
2791 | @return 0 if success, -1 if failure */ |
2792 | int |
2793 | os_file_closedir( |
2794 | os_file_dir_t dir) |
2795 | { |
2796 | int ret = closedir(dir); |
2797 | |
2798 | if (ret != 0) { |
2799 | os_file_handle_error_no_exit(NULL, "closedir" , false); |
2800 | } |
2801 | |
2802 | return(ret); |
2803 | } |
2804 | |
2805 | /** This function returns information of the next file in the directory. We jump |
2806 | over the '.' and '..' entries in the directory. |
2807 | @param[in] dirname directory name or path |
2808 | @param[in] dir directory stream |
2809 | @param[out] info buffer where the info is returned |
2810 | @return 0 if ok, -1 if error, 1 if at the end of the directory */ |
2811 | int |
2812 | os_file_readdir_next_file( |
2813 | const char* dirname, |
2814 | os_file_dir_t dir, |
2815 | os_file_stat_t* info) |
2816 | { |
2817 | struct dirent* ent; |
2818 | char* full_path; |
2819 | int ret; |
2820 | struct stat statinfo; |
2821 | |
2822 | next_file: |
2823 | |
2824 | ent = readdir(dir); |
2825 | |
2826 | if (ent == NULL) { |
2827 | |
2828 | return(1); |
2829 | } |
2830 | |
2831 | ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); |
2832 | |
2833 | if (strcmp(ent->d_name, "." ) == 0 || strcmp(ent->d_name, ".." ) == 0) { |
2834 | |
2835 | goto next_file; |
2836 | } |
2837 | |
2838 | strcpy(info->name, ent->d_name); |
2839 | |
2840 | full_path = static_cast<char*>( |
2841 | ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10)); |
2842 | |
2843 | sprintf(full_path, "%s/%s" , dirname, ent->d_name); |
2844 | |
2845 | ret = stat(full_path, &statinfo); |
2846 | |
2847 | if (ret) { |
2848 | |
2849 | if (errno == ENOENT) { |
2850 | /* readdir() returned a file that does not exist, |
2851 | it must have been deleted in the meantime. Do what |
2852 | would have happened if the file was deleted before |
2853 | readdir() - ignore and go to the next entry. |
2854 | If this is the last entry then info->name will still |
2855 | contain the name of the deleted file when this |
2856 | function returns, but this is not an issue since the |
2857 | caller shouldn't be looking at info when end of |
2858 | directory is returned. */ |
2859 | |
2860 | ut_free(full_path); |
2861 | |
2862 | goto next_file; |
2863 | } |
2864 | |
2865 | os_file_handle_error_no_exit(full_path, "stat" , false); |
2866 | |
2867 | ut_free(full_path); |
2868 | |
2869 | return(-1); |
2870 | } |
2871 | |
2872 | info->size = statinfo.st_size; |
2873 | |
2874 | if (S_ISDIR(statinfo.st_mode)) { |
2875 | info->type = OS_FILE_TYPE_DIR; |
2876 | } else if (S_ISLNK(statinfo.st_mode)) { |
2877 | info->type = OS_FILE_TYPE_LINK; |
2878 | } else if (S_ISREG(statinfo.st_mode)) { |
2879 | info->type = OS_FILE_TYPE_FILE; |
2880 | } else { |
2881 | info->type = OS_FILE_TYPE_UNKNOWN; |
2882 | } |
2883 | |
2884 | ut_free(full_path); |
2885 | |
2886 | return(0); |
2887 | } |
2888 | |
2889 | /** NOTE! Use the corresponding macro os_file_create(), not directly |
2890 | this function! |
2891 | Opens an existing file or creates a new. |
2892 | @param[in] name name of the file or path as a null-terminated |
2893 | string |
2894 | @param[in] create_mode create mode |
2895 | @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O |
2896 | is desired, OS_FILE_NORMAL, if any normal file; |
2897 | NOTE that it also depends on type, os_aio_.. |
2898 | and srv_.. variables whether we really use async |
2899 | I/O or unbuffered I/O: look in the function |
2900 | source code for the exact rules |
2901 | @param[in] type OS_DATA_FILE or OS_LOG_FILE |
@param[in]	read_only	true, if read only checks should be enforced
@param[out]	success		true if succeeded
2904 | @return handle to the file, not defined if error, error number |
2905 | can be retrieved with os_file_get_last_error */ |
2906 | pfs_os_file_t |
2907 | os_file_create_func( |
2908 | const char* name, |
2909 | ulint create_mode, |
2910 | ulint purpose, |
2911 | ulint type, |
2912 | bool read_only, |
2913 | bool* success) |
2914 | { |
2915 | bool on_error_no_exit; |
2916 | bool on_error_silent; |
2917 | |
2918 | *success = false; |
2919 | |
2920 | DBUG_EXECUTE_IF( |
2921 | "ib_create_table_fail_disk_full" , |
2922 | *success = false; |
2923 | errno = ENOSPC; |
2924 | return(OS_FILE_CLOSED); |
2925 | ); |
2926 | |
2927 | int create_flag; |
2928 | const char* mode_str = NULL; |
2929 | |
2930 | on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT |
2931 | ? true : false; |
2932 | on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT |
2933 | ? true : false; |
2934 | |
2935 | create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT |
2936 | | OS_FILE_ON_ERROR_SILENT)); |
2937 | |
2938 | if (create_mode == OS_FILE_OPEN |
2939 | || create_mode == OS_FILE_OPEN_RAW |
2940 | || create_mode == OS_FILE_OPEN_RETRY) { |
2941 | |
2942 | mode_str = "OPEN" ; |
2943 | |
2944 | create_flag = read_only ? O_RDONLY : O_RDWR; |
2945 | |
2946 | } else if (read_only) { |
2947 | |
2948 | mode_str = "OPEN" ; |
2949 | |
2950 | create_flag = O_RDONLY; |
2951 | |
2952 | } else if (create_mode == OS_FILE_CREATE) { |
2953 | |
2954 | mode_str = "CREATE" ; |
2955 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
2956 | |
2957 | } else if (create_mode == OS_FILE_OVERWRITE) { |
2958 | |
2959 | mode_str = "OVERWRITE" ; |
2960 | create_flag = O_RDWR | O_CREAT | O_TRUNC; |
2961 | |
2962 | } else { |
2963 | ib::error() |
2964 | << "Unknown file create mode (" << create_mode << ")" |
2965 | << " for file '" << name << "'" ; |
2966 | |
2967 | return(OS_FILE_CLOSED); |
2968 | } |
2969 | |
2970 | ut_a(type == OS_LOG_FILE |
2971 | || type == OS_DATA_FILE |
2972 | || type == OS_DATA_TEMP_FILE); |
2973 | |
2974 | ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); |
2975 | |
2976 | #ifdef O_SYNC |
2977 | /* We let O_SYNC only affect log files; note that we map O_DSYNC to |
2978 | O_SYNC because the datasync options seemed to corrupt files in 2001 |
2979 | in both Linux and Solaris */ |
2980 | |
2981 | if (!read_only |
2982 | && type == OS_LOG_FILE |
2983 | && srv_file_flush_method == SRV_O_DSYNC) { |
2984 | |
2985 | create_flag |= O_SYNC; |
2986 | } |
2987 | #endif /* O_SYNC */ |
2988 | |
2989 | os_file_t file; |
2990 | bool retry; |
2991 | |
2992 | do { |
2993 | file = open(name, create_flag, os_innodb_umask); |
2994 | |
2995 | if (file == -1) { |
2996 | const char* operation; |
2997 | |
2998 | operation = (create_mode == OS_FILE_CREATE |
2999 | && !read_only) ? "create" : "open" ; |
3000 | |
3001 | *success = false; |
3002 | |
3003 | if (on_error_no_exit) { |
3004 | retry = os_file_handle_error_no_exit( |
3005 | name, operation, on_error_silent); |
3006 | } else { |
3007 | retry = os_file_handle_error(name, operation); |
3008 | } |
3009 | } else { |
3010 | *success = true; |
3011 | retry = false; |
3012 | } |
3013 | |
3014 | } while (retry); |
3015 | |
3016 | /* We disable OS caching (O_DIRECT) only on data files */ |
3017 | if (!read_only |
3018 | && *success |
3019 | && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE) |
3020 | && (srv_file_flush_method == SRV_O_DIRECT |
3021 | || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { |
3022 | |
3023 | os_file_set_nocache(file, name, mode_str); |
3024 | } |
3025 | |
3026 | #ifdef USE_FILE_LOCK |
3027 | if (!read_only |
3028 | && *success |
3029 | && create_mode != OS_FILE_OPEN_RAW |
3030 | && os_file_lock(file, name)) { |
3031 | |
3032 | if (create_mode == OS_FILE_OPEN_RETRY) { |
3033 | |
3034 | ib::info() |
3035 | << "Retrying to lock the first data file" ; |
3036 | |
3037 | for (int i = 0; i < 100; i++) { |
3038 | os_thread_sleep(1000000); |
3039 | |
3040 | if (!os_file_lock(file, name)) { |
3041 | *success = true; |
3042 | return(file); |
3043 | } |
3044 | } |
3045 | |
3046 | ib::info() |
3047 | << "Unable to open the first data file" ; |
3048 | } |
3049 | |
3050 | *success = false; |
3051 | close(file); |
3052 | file = -1; |
3053 | } |
3054 | #endif /* USE_FILE_LOCK */ |
3055 | |
3056 | return(file); |
3057 | } |
3058 | |
3059 | /** NOTE! Use the corresponding macro |
3060 | os_file_create_simple_no_error_handling(), not directly this function! |
3061 | A simple function to open or create a file. |
3062 | @param[in] name name of the file or path as a null-terminated |
3063 | string |
3064 | @param[in] create_mode create mode |
3065 | @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or |
3066 | OS_FILE_READ_ALLOW_DELETE; the last option |
3067 | is used by a backup program reading the file |
3068 | @param[in] read_only if true read only mode checks are enforced |
3069 | @param[out] success true if succeeded |
3070 | @return own: handle to the file, not defined if error, error number |
3071 | can be retrieved with os_file_get_last_error */ |
3072 | pfs_os_file_t |
3073 | os_file_create_simple_no_error_handling_func( |
3074 | const char* name, |
3075 | ulint create_mode, |
3076 | ulint access_type, |
3077 | bool read_only, |
3078 | bool* success) |
3079 | { |
3080 | os_file_t file; |
3081 | int create_flag; |
3082 | |
3083 | if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) { |
3084 | WAIT_ALLOW_WRITES(); |
3085 | } |
3086 | |
3087 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
3088 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
3089 | |
3090 | *success = false; |
3091 | |
3092 | if (create_mode == OS_FILE_OPEN) { |
3093 | |
3094 | if (access_type == OS_FILE_READ_ONLY) { |
3095 | |
3096 | create_flag = O_RDONLY; |
3097 | |
3098 | } else if (read_only) { |
3099 | |
3100 | create_flag = O_RDONLY; |
3101 | |
3102 | } else { |
3103 | |
3104 | ut_a(access_type == OS_FILE_READ_WRITE |
3105 | || access_type == OS_FILE_READ_ALLOW_DELETE); |
3106 | |
3107 | create_flag = O_RDWR; |
3108 | } |
3109 | |
3110 | } else if (read_only) { |
3111 | |
3112 | create_flag = O_RDONLY; |
3113 | |
3114 | } else if (create_mode == OS_FILE_CREATE) { |
3115 | |
3116 | create_flag = O_RDWR | O_CREAT | O_EXCL; |
3117 | |
3118 | } else { |
3119 | |
3120 | ib::error() |
3121 | << "Unknown file create mode " |
3122 | << create_mode << " for file '" << name << "'" ; |
3123 | |
3124 | return(OS_FILE_CLOSED); |
3125 | } |
3126 | |
3127 | file = open(name, create_flag, os_innodb_umask); |
3128 | |
3129 | *success = (file != -1); |
3130 | |
3131 | #ifdef USE_FILE_LOCK |
3132 | if (!read_only |
3133 | && *success |
3134 | && access_type == OS_FILE_READ_WRITE |
3135 | && os_file_lock(file, name)) { |
3136 | |
3137 | *success = false; |
3138 | close(file); |
3139 | file = -1; |
3140 | |
3141 | } |
3142 | #endif /* USE_FILE_LOCK */ |
3143 | |
3144 | return(file); |
3145 | } |
3146 | |
3147 | /** Deletes a file if it exists. The file has to be closed before calling this. |
3148 | @param[in] name file path as a null-terminated string |
3149 | @param[out] exist indicate if file pre-exist |
3150 | @return true if success */ |
3151 | bool |
3152 | os_file_delete_if_exists_func( |
3153 | const char* name, |
3154 | bool* exist) |
3155 | { |
3156 | if (exist != NULL) { |
3157 | *exist = true; |
3158 | } |
3159 | |
3160 | int ret; |
3161 | WAIT_ALLOW_WRITES(); |
3162 | |
3163 | ret = unlink(name); |
3164 | |
3165 | if (ret != 0 && errno == ENOENT) { |
3166 | if (exist != NULL) { |
3167 | *exist = false; |
3168 | } |
3169 | } else if (ret != 0 && errno != ENOENT) { |
3170 | os_file_handle_error_no_exit(name, "delete" , false); |
3171 | |
3172 | return(false); |
3173 | } |
3174 | |
3175 | return(true); |
3176 | } |
3177 | |
3178 | /** Deletes a file. The file has to be closed before calling this. |
3179 | @param[in] name file path as a null-terminated string |
3180 | @return true if success */ |
3181 | bool |
3182 | os_file_delete_func( |
3183 | const char* name) |
3184 | { |
3185 | int ret; |
3186 | WAIT_ALLOW_WRITES(); |
3187 | |
3188 | ret = unlink(name); |
3189 | |
3190 | if (ret != 0) { |
3191 | os_file_handle_error_no_exit(name, "delete" , FALSE); |
3192 | |
3193 | return(false); |
3194 | } |
3195 | |
3196 | return(true); |
3197 | } |
3198 | |
3199 | /** NOTE! Use the corresponding macro os_file_rename(), not directly this |
3200 | function! |
3201 | Renames a file (can also move it to another directory). It is safest that the |
3202 | file is closed before calling this function. |
3203 | @param[in] oldpath old file path as a null-terminated string |
3204 | @param[in] newpath new file path |
3205 | @return true if success */ |
3206 | bool |
3207 | os_file_rename_func( |
3208 | const char* oldpath, |
3209 | const char* newpath) |
3210 | { |
3211 | #ifdef UNIV_DEBUG |
3212 | os_file_type_t type; |
3213 | bool exists; |
3214 | |
3215 | /* New path must not exist. */ |
3216 | ut_ad(os_file_status(newpath, &exists, &type)); |
3217 | ut_ad(!exists); |
3218 | |
3219 | /* Old path must exist. */ |
3220 | ut_ad(os_file_status(oldpath, &exists, &type)); |
3221 | ut_ad(exists); |
3222 | #endif /* UNIV_DEBUG */ |
3223 | |
3224 | int ret; |
3225 | WAIT_ALLOW_WRITES(); |
3226 | |
3227 | ret = rename(oldpath, newpath); |
3228 | |
3229 | if (ret != 0) { |
3230 | os_file_handle_error_no_exit(oldpath, "rename" , FALSE); |
3231 | |
3232 | return(false); |
3233 | } |
3234 | |
3235 | return(true); |
3236 | } |
3237 | |
3238 | /** NOTE! Use the corresponding macro os_file_close(), not directly this |
3239 | function! |
3240 | Closes a file handle. In case of error, error number can be retrieved with |
3241 | os_file_get_last_error. |
3242 | @param[in] file Handle to close |
3243 | @return true if success */ |
3244 | bool |
3245 | os_file_close_func( |
3246 | os_file_t file) |
3247 | { |
3248 | int ret = close(file); |
3249 | |
3250 | if (ret == -1) { |
3251 | os_file_handle_error(NULL, "close" ); |
3252 | |
3253 | return(false); |
3254 | } |
3255 | |
3256 | return(true); |
3257 | } |
3258 | |
3259 | /** Gets a file size. |
3260 | @param[in] file handle to an open file |
3261 | @return file size, or (os_offset_t) -1 on failure */ |
3262 | os_offset_t |
3263 | os_file_get_size(os_file_t file) |
3264 | { |
3265 | struct stat statbuf; |
3266 | return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size; |
3267 | } |
3268 | |
3269 | /** Gets a file size. |
3270 | @param[in] filename Full path to the filename to check |
3271 | @return file size if OK, else set m_total_size to ~0 and m_alloc_size to |
3272 | errno */ |
3273 | os_file_size_t |
3274 | os_file_get_size( |
3275 | const char* filename) |
3276 | { |
3277 | struct stat s; |
3278 | os_file_size_t file_size; |
3279 | |
3280 | int ret = stat(filename, &s); |
3281 | |
3282 | if (ret == 0) { |
3283 | file_size.m_total_size = s.st_size; |
3284 | /* st_blocks is in 512 byte sized blocks */ |
3285 | file_size.m_alloc_size = s.st_blocks * 512; |
3286 | } else { |
3287 | file_size.m_total_size = ~0U; |
3288 | file_size.m_alloc_size = (os_offset_t) errno; |
3289 | } |
3290 | |
3291 | return(file_size); |
3292 | } |
3293 | |
3294 | /** This function returns information about the specified file |
3295 | @param[in] path pathname of the file |
3296 | @param[out] stat_info information of a file in a directory |
3297 | @param[in,out] statinfo information of a file in a directory |
3298 | @param[in] check_rw_perm for testing whether the file can be opened |
3299 | in RW mode |
3300 | @param[in] read_only if true read only mode checks are enforced |
3301 | @return DB_SUCCESS if all OK */ |
3302 | static |
3303 | dberr_t |
3304 | os_file_get_status_posix( |
3305 | const char* path, |
3306 | os_file_stat_t* stat_info, |
3307 | struct stat* statinfo, |
3308 | bool check_rw_perm, |
3309 | bool read_only) |
3310 | { |
3311 | int ret = stat(path, statinfo); |
3312 | |
3313 | if (ret && (errno == ENOENT || errno == ENOTDIR)) { |
3314 | /* file does not exist */ |
3315 | |
3316 | return(DB_NOT_FOUND); |
3317 | |
3318 | } else if (ret) { |
3319 | /* file exists, but stat call failed */ |
3320 | |
3321 | os_file_handle_error_no_exit(path, "stat" , false); |
3322 | |
3323 | return(DB_FAIL); |
3324 | } |
3325 | |
3326 | switch (statinfo->st_mode & S_IFMT) { |
3327 | case S_IFDIR: |
3328 | stat_info->type = OS_FILE_TYPE_DIR; |
3329 | break; |
3330 | case S_IFLNK: |
3331 | stat_info->type = OS_FILE_TYPE_LINK; |
3332 | break; |
3333 | case S_IFBLK: |
3334 | /* Handle block device as regular file. */ |
3335 | case S_IFCHR: |
3336 | /* Handle character device as regular file. */ |
3337 | case S_IFREG: |
3338 | stat_info->type = OS_FILE_TYPE_FILE; |
3339 | break; |
3340 | default: |
3341 | stat_info->type = OS_FILE_TYPE_UNKNOWN; |
3342 | } |
3343 | |
3344 | stat_info->size = statinfo->st_size; |
3345 | stat_info->block_size = statinfo->st_blksize; |
3346 | stat_info->alloc_size = statinfo->st_blocks * 512; |
3347 | |
3348 | if (check_rw_perm |
3349 | && (stat_info->type == OS_FILE_TYPE_FILE |
3350 | || stat_info->type == OS_FILE_TYPE_BLOCK)) { |
3351 | |
3352 | stat_info->rw_perm = !access(path, read_only |
3353 | ? R_OK : R_OK | W_OK); |
3354 | } |
3355 | |
3356 | return(DB_SUCCESS); |
3357 | } |
3358 | |
3359 | /** Truncates a file to a specified size in bytes. |
3360 | Do nothing if the size to preserve is greater or equal to the current |
3361 | size of the file. |
3362 | @param[in] pathname file path |
3363 | @param[in] file file to be truncated |
3364 | @param[in] size size to preserve in bytes |
3365 | @return true if success */ |
3366 | static |
3367 | bool |
3368 | os_file_truncate_posix( |
3369 | const char* pathname, |
3370 | os_file_t file, |
3371 | os_offset_t size) |
3372 | { |
3373 | int res = ftruncate(file, size); |
3374 | |
3375 | if (res == -1) { |
3376 | |
3377 | bool retry; |
3378 | |
3379 | retry = os_file_handle_error_no_exit( |
3380 | pathname, "truncate" , false); |
3381 | |
3382 | if (retry) { |
3383 | ib::warn() |
3384 | << "Truncate failed for '" |
3385 | << pathname << "'" ; |
3386 | } |
3387 | } |
3388 | |
3389 | return(res == 0); |
3390 | } |
3391 | |
3392 | /** Truncates a file at its current position. |
3393 | @return true if success */ |
3394 | bool |
3395 | os_file_set_eof( |
3396 | FILE* file) /*!< in: file to be truncated */ |
3397 | { |
3398 | WAIT_ALLOW_WRITES(); |
3399 | return(!ftruncate(fileno(file), ftell(file))); |
3400 | } |
3401 | |
3402 | #else /* !_WIN32 */ |
3403 | |
3404 | #include <WinIoCtl.h> |
3405 | |
3406 | /* |
3407 | Windows : Handling synchronous IO on files opened asynchronously. |
3408 | |
3409 | If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to |
3410 | a completion port, then every IO on this file would normally be enqueued to the |
3411 | completion port. Sometimes however we would like to do a synchronous IO. This is |
possible if we initialize overlapped.hEvent with a valid event and set its
3413 | lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info) |
3414 | |
3415 | We'll create this special event once for each thread and store in thread local |
3416 | storage. |
3417 | */ |
3418 | |
3419 | |
3420 | static void __stdcall win_free_syncio_event(void *data) { |
3421 | if (data) { |
3422 | CloseHandle((HANDLE)data); |
3423 | } |
3424 | } |
3425 | |
3426 | |
3427 | /* |
3428 | Retrieve per-thread event for doing synchronous io on asyncronously opened files |
3429 | */ |
3430 | static HANDLE win_get_syncio_event() |
3431 | { |
3432 | HANDLE h; |
3433 | |
3434 | h = (HANDLE)FlsGetValue(fls_sync_io); |
3435 | if (h) { |
3436 | return h; |
3437 | } |
3438 | h = CreateEventA(NULL, FALSE, FALSE, NULL); |
3439 | ut_a(h); |
3440 | /* Set low-order bit to keeps I/O completion from being queued */ |
3441 | h = (HANDLE)((uintptr_t)h | 1); |
3442 | FlsSetValue(fls_sync_io, h); |
3443 | return h; |
3444 | } |
3445 | |
3446 | |
3447 | /** Do the read/write |
3448 | @param[in] request The IO context and type |
3449 | @return the number of bytes read/written or negative value on error */ |
3450 | ssize_t |
3451 | SyncFileIO::execute(const IORequest& request) |
3452 | { |
3453 | OVERLAPPED seek; |
3454 | |
3455 | memset(&seek, 0x0, sizeof(seek)); |
3456 | |
3457 | seek.hEvent = win_get_syncio_event(); |
3458 | seek.Offset = (DWORD) m_offset & 0xFFFFFFFF; |
3459 | seek.OffsetHigh = (DWORD) (m_offset >> 32); |
3460 | |
3461 | BOOL ret; |
3462 | DWORD n_bytes; |
3463 | |
3464 | if (request.is_read()) { |
3465 | ret = ReadFile(m_fh, m_buf, |
3466 | static_cast<DWORD>(m_n), NULL, &seek); |
3467 | |
3468 | } else { |
3469 | ut_ad(request.is_write()); |
3470 | ret = WriteFile(m_fh, m_buf, |
3471 | static_cast<DWORD>(m_n), NULL, &seek); |
3472 | } |
3473 | if (ret || (GetLastError() == ERROR_IO_PENDING)) { |
3474 | /* Wait for async io to complete */ |
3475 | ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE); |
3476 | } |
3477 | |
3478 | return(ret ? static_cast<ssize_t>(n_bytes) : -1); |
3479 | } |
3480 | |
3481 | /** Do the read/write |
3482 | @param[in,out] slot The IO slot, it has the IO context |
3483 | @return the number of bytes read/written or negative value on error */ |
3484 | ssize_t |
3485 | SyncFileIO::execute(Slot* slot) |
3486 | { |
3487 | BOOL ret; |
3488 | slot->control.hEvent = win_get_syncio_event(); |
3489 | if (slot->type.is_read()) { |
3490 | |
3491 | ret = ReadFile( |
3492 | slot->file, slot->ptr, slot->len, |
3493 | NULL, &slot->control); |
3494 | |
3495 | } else { |
3496 | ut_ad(slot->type.is_write()); |
3497 | |
3498 | ret = WriteFile( |
3499 | slot->file, slot->ptr, slot->len, |
3500 | NULL, &slot->control); |
3501 | |
3502 | } |
3503 | if (ret || (GetLastError() == ERROR_IO_PENDING)) { |
3504 | /* Wait for async io to complete */ |
3505 | ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE); |
3506 | } |
3507 | |
3508 | return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1); |
3509 | } |
3510 | |
/* Startup/shutdown */

/** Allocates the fiber-local storage index fls_sync_io on construction and
releases it on destruction. win_free_syncio_event() is registered as the
slot callback, so it is the routine that closes the stored event handles. */
struct WinIoInit
{
	WinIoInit() {
		fls_sync_io = FlsAlloc(win_free_syncio_event);
		/* Without a slot index the synchronous I/O path cannot
		work, so failure here is fatal. */
		ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
	}

	~WinIoInit() {
		FlsFree(fls_sync_io);
	}
};

/* Ensures proper initialization and shutdown: the static instance runs the
constructor and destructor above as part of static object lifetime. */
static WinIoInit win_io_init;
3527 | |
3528 | |
3529 | /** Free storage space associated with a section of the file. |
3530 | @param[in] fh Open file handle |
3531 | @param[in] page_size Tablespace page size |
3532 | @param[in] block_size File system block size |
3533 | @param[in] off Starting offset (SEEK_SET) |
3534 | @param[in] len Size of the hole |
3535 | @return 0 on success or errno */ |
3536 | static |
3537 | dberr_t |
3538 | os_file_punch_hole_win32( |
3539 | os_file_t fh, |
3540 | os_offset_t off, |
3541 | os_offset_t len) |
3542 | { |
3543 | FILE_ZERO_DATA_INFORMATION punch; |
3544 | |
3545 | punch.FileOffset.QuadPart = off; |
3546 | punch.BeyondFinalZero.QuadPart = off + len; |
3547 | |
3548 | /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL, |
3549 | therefore we pass a dummy parameter. */ |
3550 | DWORD temp; |
3551 | BOOL success = os_win32_device_io_control( |
3552 | fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch), |
3553 | NULL, 0, &temp); |
3554 | |
3555 | return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE); |
3556 | } |
3557 | |
3558 | /** Check the existence and type of the given file. |
3559 | @param[in] path path name of file |
3560 | @param[out] exists true if the file exists |
3561 | @param[out] type Type of the file, if it exists |
3562 | @return true if call succeeded */ |
3563 | static |
3564 | bool |
3565 | os_file_status_win32( |
3566 | const char* path, |
3567 | bool* exists, |
3568 | os_file_type_t* type) |
3569 | { |
3570 | int ret; |
3571 | struct _stat64 statinfo; |
3572 | |
3573 | ret = _stat64(path, &statinfo); |
3574 | |
3575 | *exists = !ret; |
3576 | |
3577 | if (!ret) { |
3578 | /* file exists, everything OK */ |
3579 | |
3580 | } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) { |
3581 | /* file does not exist */ |
3582 | return(true); |
3583 | |
3584 | } else { |
3585 | /* file exists, but stat call failed */ |
3586 | os_file_handle_error_no_exit(path, "stat" , false); |
3587 | return(false); |
3588 | } |
3589 | |
3590 | if (_S_IFDIR & statinfo.st_mode) { |
3591 | *type = OS_FILE_TYPE_DIR; |
3592 | |
3593 | } else if (_S_IFREG & statinfo.st_mode) { |
3594 | *type = OS_FILE_TYPE_FILE; |
3595 | |
3596 | } else { |
3597 | *type = OS_FILE_TYPE_UNKNOWN; |
3598 | } |
3599 | |
3600 | return(true); |
3601 | } |
3602 | |
3603 | /** NOTE! Use the corresponding macro os_file_flush(), not directly this |
3604 | function! |
3605 | Flushes the write buffers of a given file to the disk. |
3606 | @param[in] file handle to a file |
3607 | @return true if success */ |
3608 | bool |
3609 | os_file_flush_func( |
3610 | os_file_t file) |
3611 | { |
3612 | ++os_n_fsyncs; |
3613 | |
3614 | BOOL ret = FlushFileBuffers(file); |
3615 | |
3616 | if (ret) { |
3617 | return(true); |
3618 | } |
3619 | |
3620 | /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is |
3621 | actually a raw device, we choose to ignore that error if we are using |
3622 | raw disks */ |
3623 | |
3624 | if (srv_start_raw_disk_in_use && GetLastError() |
3625 | == ERROR_INVALID_FUNCTION) { |
3626 | return(true); |
3627 | } |
3628 | |
3629 | os_file_handle_error(NULL, "flush" ); |
3630 | |
3631 | /* It is a fatal error if a file flush does not succeed, because then |
3632 | the database can get corrupt on disk */ |
3633 | ut_error; |
3634 | |
3635 | return(false); |
3636 | } |
3637 | |
3638 | /** Retrieves the last error number if an error occurs in a file io function. |
3639 | The number should be retrieved before any other OS calls (because they may |
3640 | overwrite the error number). If the number is not known to this program, |
3641 | the OS error number + 100 is returned. |
3642 | @param[in] report_all_errors true if we want an error message printed |
3643 | of all errors |
3644 | @param[in] on_error_silent true then don't print any diagnostic |
3645 | to the log |
3646 | @return error number, or OS error number + 100 */ |
3647 | static |
3648 | ulint |
3649 | os_file_get_last_error_low( |
3650 | bool report_all_errors, |
3651 | bool on_error_silent) |
3652 | { |
3653 | ulint err = (ulint) GetLastError(); |
3654 | |
3655 | if (err == ERROR_SUCCESS) { |
3656 | return(0); |
3657 | } |
3658 | |
3659 | if (report_all_errors |
3660 | || (!on_error_silent |
3661 | && err != ERROR_DISK_FULL |
3662 | && err != ERROR_FILE_EXISTS)) { |
3663 | |
3664 | ib::error() |
3665 | << "Operating system error number " << err |
3666 | << " in a file operation." ; |
3667 | |
3668 | if (err == ERROR_PATH_NOT_FOUND) { |
3669 | ib::error() |
3670 | << "The error means the system" |
3671 | " cannot find the path specified." ; |
3672 | |
3673 | if (srv_is_being_started) { |
3674 | ib::error() |
3675 | << "If you are installing InnoDB," |
3676 | " remember that you must create" |
3677 | " directories yourself, InnoDB" |
3678 | " does not create them." ; |
3679 | } |
3680 | |
3681 | } else if (err == ERROR_ACCESS_DENIED) { |
3682 | |
3683 | ib::error() |
3684 | << "The error means mysqld does not have" |
3685 | " the access rights to" |
3686 | " the directory. It may also be" |
3687 | " you have created a subdirectory" |
3688 | " of the same name as a data file." ; |
3689 | |
3690 | } else if (err == ERROR_SHARING_VIOLATION |
3691 | || err == ERROR_LOCK_VIOLATION) { |
3692 | |
3693 | ib::error() |
3694 | << "The error means that another program" |
3695 | " is using InnoDB's files." |
3696 | " This might be a backup or antivirus" |
3697 | " software or another instance" |
3698 | " of MySQL." |
3699 | " Please close it to get rid of this error." ; |
3700 | |
3701 | } else if (err == ERROR_WORKING_SET_QUOTA |
3702 | || err == ERROR_NO_SYSTEM_RESOURCES) { |
3703 | |
3704 | ib::error() |
3705 | << "The error means that there are no" |
3706 | " sufficient system resources or quota to" |
3707 | " complete the operation." ; |
3708 | |
3709 | } else if (err == ERROR_OPERATION_ABORTED) { |
3710 | |
3711 | ib::error() |
3712 | << "The error means that the I/O" |
3713 | " operation has been aborted" |
3714 | " because of either a thread exit" |
3715 | " or an application request." |
3716 | " Retry attempt is made." ; |
3717 | } else { |
3718 | |
3719 | ib::info() << OPERATING_SYSTEM_ERROR_MSG; |
3720 | } |
3721 | } |
3722 | |
3723 | if (err == ERROR_FILE_NOT_FOUND) { |
3724 | return(OS_FILE_NOT_FOUND); |
3725 | } else if (err == ERROR_DISK_FULL) { |
3726 | return(OS_FILE_DISK_FULL); |
3727 | } else if (err == ERROR_FILE_EXISTS) { |
3728 | return(OS_FILE_ALREADY_EXISTS); |
3729 | } else if (err == ERROR_SHARING_VIOLATION |
3730 | || err == ERROR_LOCK_VIOLATION) { |
3731 | return(OS_FILE_SHARING_VIOLATION); |
3732 | } else if (err == ERROR_WORKING_SET_QUOTA |
3733 | || err == ERROR_NO_SYSTEM_RESOURCES) { |
3734 | return(OS_FILE_INSUFFICIENT_RESOURCE); |
3735 | } else if (err == ERROR_OPERATION_ABORTED) { |
3736 | return(OS_FILE_OPERATION_ABORTED); |
3737 | } else if (err == ERROR_ACCESS_DENIED) { |
3738 | return(OS_FILE_ACCESS_VIOLATION); |
3739 | } |
3740 | |
3741 | return(OS_FILE_ERROR_MAX + err); |
3742 | } |
3743 | |
3744 | |
3745 | /** NOTE! Use the corresponding macro os_file_create_simple(), not directly |
3746 | this function! |
3747 | A simple function to open or create a file. |
3748 | @param[in] name name of the file or path as a null-terminated |
3749 | string |
3750 | @param[in] create_mode create mode |
3751 | @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE |
3752 | @param[in] read_only if true read only mode checks are enforced |
3753 | @param[out] success true if succeed, false if error |
3754 | @return handle to the file, not defined if error, error number |
3755 | can be retrieved with os_file_get_last_error */ |
3756 | pfs_os_file_t |
3757 | os_file_create_simple_func( |
3758 | const char* name, |
3759 | ulint create_mode, |
3760 | ulint access_type, |
3761 | bool read_only, |
3762 | bool* success) |
3763 | { |
3764 | os_file_t file; |
3765 | |
3766 | *success = false; |
3767 | |
3768 | DWORD access; |
3769 | DWORD create_flag; |
3770 | DWORD attributes = 0; |
3771 | |
3772 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
3773 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
3774 | ut_ad(srv_operation == SRV_OPERATION_NORMAL); |
3775 | |
3776 | if (create_mode == OS_FILE_OPEN) { |
3777 | |
3778 | create_flag = OPEN_EXISTING; |
3779 | |
3780 | } else if (read_only) { |
3781 | |
3782 | create_flag = OPEN_EXISTING; |
3783 | |
3784 | } else if (create_mode == OS_FILE_CREATE) { |
3785 | |
3786 | create_flag = CREATE_NEW; |
3787 | |
3788 | } else if (create_mode == OS_FILE_CREATE_PATH) { |
3789 | |
3790 | /* Create subdirs along the path if needed. */ |
3791 | *success = os_file_create_subdirs_if_needed(name); |
3792 | |
3793 | if (!*success) { |
3794 | |
3795 | ib::error() |
3796 | << "Unable to create subdirectories '" |
3797 | << name << "'" ; |
3798 | |
3799 | return(OS_FILE_CLOSED); |
3800 | } |
3801 | |
3802 | create_flag = CREATE_NEW; |
3803 | create_mode = OS_FILE_CREATE; |
3804 | |
3805 | } else { |
3806 | |
3807 | ib::error() |
3808 | << "Unknown file create mode (" |
3809 | << create_mode << ") for file '" |
3810 | << name << "'" ; |
3811 | |
3812 | return(OS_FILE_CLOSED); |
3813 | } |
3814 | |
3815 | if (access_type == OS_FILE_READ_ONLY) { |
3816 | |
3817 | access = GENERIC_READ; |
3818 | |
3819 | } else if (read_only) { |
3820 | |
3821 | ib::info() |
3822 | << "Read only mode set. Unable to" |
3823 | " open file '" << name << "' in RW mode, " |
3824 | << "trying RO mode" ; |
3825 | |
3826 | access = GENERIC_READ; |
3827 | |
3828 | } else if (access_type == OS_FILE_READ_WRITE) { |
3829 | |
3830 | access = GENERIC_READ | GENERIC_WRITE; |
3831 | |
3832 | } else { |
3833 | |
3834 | ib::error() |
3835 | << "Unknown file access type (" << access_type << ") " |
3836 | "for file '" << name << "'" ; |
3837 | |
3838 | return(OS_FILE_CLOSED); |
3839 | } |
3840 | |
3841 | bool retry; |
3842 | |
3843 | do { |
3844 | /* Use default security attributes and no template file. */ |
3845 | |
3846 | file = CreateFile( |
3847 | (LPCTSTR) name, access, FILE_SHARE_READ, NULL, |
3848 | create_flag, attributes, NULL); |
3849 | |
3850 | if (file == INVALID_HANDLE_VALUE) { |
3851 | |
3852 | *success = false; |
3853 | |
3854 | retry = os_file_handle_error( |
3855 | name, create_mode == OS_FILE_OPEN ? |
3856 | "open" : "create" ); |
3857 | |
3858 | } else { |
3859 | |
3860 | retry = false; |
3861 | |
3862 | *success = true; |
3863 | } |
3864 | |
3865 | } while (retry); |
3866 | |
3867 | return(file); |
3868 | } |
3869 | |
3870 | /** This function attempts to create a directory named pathname. The new |
3871 | directory gets default permissions. On Unix the permissions are |
3872 | (0770 & ~umask). If the directory exists already, nothing is done and |
3873 | the call succeeds, unless the fail_if_exists arguments is true. |
3874 | If another error occurs, such as a permission error, this does not crash, |
3875 | but reports the error and returns false. |
3876 | @param[in] pathname directory name as null-terminated string |
3877 | @param[in] fail_if_exists if true, pre-existing directory is treated |
3878 | as an error. |
3879 | @return true if call succeeds, false on error */ |
3880 | bool |
3881 | os_file_create_directory( |
3882 | const char* pathname, |
3883 | bool fail_if_exists) |
3884 | { |
3885 | BOOL rcode; |
3886 | |
3887 | rcode = CreateDirectory((LPCTSTR) pathname, NULL); |
3888 | if (!(rcode != 0 |
3889 | || (GetLastError() == ERROR_ALREADY_EXISTS |
3890 | && !fail_if_exists))) { |
3891 | |
3892 | os_file_handle_error_no_exit( |
3893 | pathname, "CreateDirectory" , false); |
3894 | |
3895 | return(false); |
3896 | } |
3897 | |
3898 | return(true); |
3899 | } |
3900 | |
3901 | /** The os_file_opendir() function opens a directory stream corresponding to the |
3902 | directory named by the dirname argument. The directory stream is positioned |
3903 | at the first entry. In both Unix and Windows we automatically skip the '.' |
3904 | and '..' items at the start of the directory listing. |
3905 | @param[in] dirname directory name; it must not contain a trailing |
3906 | '\' or '/' |
3907 | @param[in] is_fatal true if we should treat an error as a fatal |
3908 | error; if we try to open symlinks then we do |
3909 | not wish a fatal error if it happens not to |
3910 | be a directory |
3911 | @return directory stream, NULL if error */ |
3912 | os_file_dir_t |
3913 | os_file_opendir( |
3914 | const char* dirname, |
3915 | bool error_is_fatal) |
3916 | { |
3917 | os_file_dir_t dir; |
3918 | LPWIN32_FIND_DATA lpFindFileData; |
3919 | char path[OS_FILE_MAX_PATH + 3]; |
3920 | |
3921 | ut_a(strlen(dirname) < OS_FILE_MAX_PATH); |
3922 | |
3923 | strcpy(path, dirname); |
3924 | strcpy(path + strlen(path), "\\*" ); |
3925 | |
3926 | /* Note that in Windows opening the 'directory stream' also retrieves |
3927 | the first entry in the directory. Since it is '.', that is no problem, |
3928 | as we will skip over the '.' and '..' entries anyway. */ |
3929 | |
3930 | lpFindFileData = static_cast<LPWIN32_FIND_DATA>( |
3931 | ut_malloc_nokey(sizeof(WIN32_FIND_DATA))); |
3932 | |
3933 | dir = FindFirstFile((LPCTSTR) path, lpFindFileData); |
3934 | |
3935 | ut_free(lpFindFileData); |
3936 | |
3937 | if (dir == INVALID_HANDLE_VALUE) { |
3938 | |
3939 | if (error_is_fatal) { |
3940 | os_file_handle_error(dirname, "opendir" ); |
3941 | } |
3942 | |
3943 | return(NULL); |
3944 | } |
3945 | |
3946 | return(dir); |
3947 | } |
3948 | |
3949 | /** Closes a directory stream. |
3950 | @param[in] dir directory stream |
3951 | @return 0 if success, -1 if failure */ |
3952 | int |
3953 | os_file_closedir( |
3954 | os_file_dir_t dir) |
3955 | { |
3956 | BOOL ret; |
3957 | |
3958 | ret = FindClose(dir); |
3959 | |
3960 | if (!ret) { |
3961 | os_file_handle_error_no_exit(NULL, "closedir" , false); |
3962 | |
3963 | return(-1); |
3964 | } |
3965 | |
3966 | return(0); |
3967 | } |
3968 | |
3969 | /** This function returns information of the next file in the directory. We |
3970 | jump over the '.' and '..' entries in the directory. |
3971 | @param[in] dirname directory name or path |
3972 | @param[in] dir directory stream |
3973 | @param[out] info buffer where the info is returned |
3974 | @return 0 if ok, -1 if error, 1 if at the end of the directory */ |
3975 | int |
3976 | os_file_readdir_next_file( |
3977 | const char* dirname, |
3978 | os_file_dir_t dir, |
3979 | os_file_stat_t* info) |
3980 | { |
3981 | BOOL ret; |
3982 | int status; |
3983 | WIN32_FIND_DATA find_data; |
3984 | |
3985 | next_file: |
3986 | |
3987 | ret = FindNextFile(dir, &find_data); |
3988 | |
3989 | if (ret > 0) { |
3990 | |
3991 | const char* name; |
3992 | |
3993 | name = static_cast<const char*>(find_data.cFileName); |
3994 | |
3995 | ut_a(strlen(name) < OS_FILE_MAX_PATH); |
3996 | |
3997 | if (strcmp(name, "." ) == 0 || strcmp(name, ".." ) == 0) { |
3998 | |
3999 | goto next_file; |
4000 | } |
4001 | |
4002 | strcpy(info->name, name); |
4003 | |
4004 | info->size = find_data.nFileSizeHigh; |
4005 | info->size <<= 32; |
4006 | info->size |= find_data.nFileSizeLow; |
4007 | |
4008 | if (find_data.dwFileAttributes |
4009 | & FILE_ATTRIBUTE_REPARSE_POINT) { |
4010 | |
4011 | /* TODO: test Windows symlinks */ |
4012 | /* TODO: MySQL has apparently its own symlink |
4013 | implementation in Windows, dbname.sym can |
4014 | redirect a database directory: |
4015 | REFMAN "windows-symbolic-links.html" */ |
4016 | |
4017 | info->type = OS_FILE_TYPE_LINK; |
4018 | |
4019 | } else if (find_data.dwFileAttributes |
4020 | & FILE_ATTRIBUTE_DIRECTORY) { |
4021 | |
4022 | info->type = OS_FILE_TYPE_DIR; |
4023 | |
4024 | } else { |
4025 | |
4026 | /* It is probably safest to assume that all other |
4027 | file types are normal. Better to check them rather |
4028 | than blindly skip them. */ |
4029 | |
4030 | info->type = OS_FILE_TYPE_FILE; |
4031 | } |
4032 | |
4033 | status = 0; |
4034 | |
4035 | } else if (GetLastError() == ERROR_NO_MORE_FILES) { |
4036 | |
4037 | status = 1; |
4038 | |
4039 | } else { |
4040 | |
4041 | os_file_handle_error_no_exit(NULL, "readdir_next_file" , false); |
4042 | |
4043 | status = -1; |
4044 | } |
4045 | |
4046 | return(status); |
4047 | } |
4048 | |
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new one, translating the InnoDB create
mode, purpose, file type and the configured flush method into the
corresponding CreateFile() arguments.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode, optionally combined with the
				OS_FILE_ON_ERROR_NO_EXIT and
				OS_FILE_ON_ERROR_SILENT flag bits
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
@param[in]	read_only	if true, only GENERIC_READ access is requested
				and any mode other than the OPEN variants is
				turned into OPEN_EXISTING; OS_FILE_OPEN_RAW
				asserts that this is false
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	os_file_t	file;
	bool		retry;
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	/* Debug-only fault injection: simulate a full disk. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		SetLastError(ERROR_DISK_FULL);
		return(OS_FILE_CLOSED);
	);

	DWORD		create_flag;
	/* Outside normal server operation the file is shared for reading,
	writing and deletion; in normal operation only for reading. */
	DWORD		share_mode = srv_operation != SRV_OPERATION_NORMAL
		? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
		: FILE_SHARE_READ;

	/* NOTE(review): this comparison is made before the ON_ERROR flag
	bits are masked out of create_mode below, so a flagged
	OS_FILE_OPEN also reaches WAIT_ALLOW_WRITES() - confirm that this
	is intended. */
	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
		WAIT_ALLOW_WRITES();
	}

	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;

	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	/* From here on create_mode holds the bare mode only. */
	create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);

	if (create_mode == OS_FILE_OPEN_RAW) {

		ut_a(!read_only);

		create_flag = OPEN_EXISTING;

		/* On Windows Physical devices require admin privileges and
		have to have the write-share mode set. See the remarks
		section for the CreateFile() function documentation in MSDN. */

		share_mode |= FILE_SHARE_WRITE;

	} else if (create_mode == OS_FILE_OPEN
		   || create_mode == OS_FILE_OPEN_RETRY) {

		create_flag = OPEN_EXISTING;

	} else if (read_only) {

		/* Never create or overwrite anything in read-only mode. */
		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		create_flag = CREATE_ALWAYS;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ") "
			<< " for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	DWORD		attributes = 0;

	if (purpose == OS_FILE_AIO) {

#ifdef WIN_ASYNC_IO
		/* If specified, use asynchronous (overlapped) io and no
		buffering of writes in the OS */

		if (srv_use_native_aio) {
			attributes |= FILE_FLAG_OVERLAPPED;
		}
#endif /* WIN_ASYNC_IO */

	} else if (purpose == OS_FILE_NORMAL) {

		/* Use default setting. */

	} else {

		ib::error()
			<< "Unknown purpose flag (" << purpose << ") "
			<< "while opening file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	if (type == OS_LOG_FILE) {
		/* There is no reason to use buffered writes to logs. */
		attributes |= FILE_FLAG_NO_BUFFERING;
	}

	/* Adjust the buffering / write-through flags according to the
	configured flush method. */
	switch (srv_file_flush_method)
	{
	case SRV_O_DSYNC:
		if (type == OS_LOG_FILE) {
			/* Map O_SYNC to FILE_WRITE_THROUGH */
			attributes |= FILE_FLAG_WRITE_THROUGH;
		}
		break;

	case SRV_O_DIRECT_NO_FSYNC:
	case SRV_O_DIRECT:
		if (type == OS_DATA_FILE) {
			attributes |= FILE_FLAG_NO_BUFFERING;
		}
		break;

	case SRV_ALL_O_DIRECT_FSYNC:
		/* Traditional Windows behavior, no buffering for any files. */
		attributes |= FILE_FLAG_NO_BUFFERING;
		break;

	case SRV_FSYNC:
	case SRV_LITTLESYNC:
		break;

	case SRV_NOSYNC:
		/* Let Windows cache manager handle all writes. */
		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
		break;

	default:
		ut_a(false); /* unknown flush mode. */
	}


	// TODO: Create a bug, this looks wrong. The flush log
	// parameter is dynamic.
	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
		/* Do not use unbuffered i/o for the log files because
		value 2 denotes that we do not flush the log at every
		commit, but only once per second */
		attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
	}


	DWORD	access = GENERIC_READ;

	if (!read_only) {
		access |= GENERIC_WRITE;
	}

	/* Repeat the open for as long as the error handler asks for a
	retry. */
	do {
		/* Use default security attributes and no template file. */
		file = CreateFile(
			(LPCTSTR) name, access, share_mode, NULL,
			create_flag, attributes, NULL);

		if (file == INVALID_HANDLE_VALUE) {
			const char*	operation;

			operation = (create_mode == OS_FILE_CREATE
				     && !read_only)
				? "create" : "open";

			*success = false;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {

			retry = false;

			*success = true;

			if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) {
				/* Bind the file handle to completion port. Completion port
				might not be created yet, in some stages of backup, but
				must always be there for the server.*/
				HANDLE port =(type == OS_LOG_FILE)?
					log_completion_port : data_completion_port;
				ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
				if (port) {
					ut_a(CreateIoCompletionPort(file, port, 0, 0));
				}
			}
		}
	} while (retry);

	return(file);
}
4267 | |
4268 | /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(), |
4269 | not directly this function! |
4270 | A simple function to open or create a file. |
4271 | @param[in] name name of the file or path as a null-terminated |
4272 | string |
4273 | @param[in] create_mode create mode |
4274 | @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or |
4275 | OS_FILE_READ_ALLOW_DELETE; the last option is |
4276 | used by a backup program reading the file |
4277 | @param[out] success true if succeeded |
4278 | @return own: handle to the file, not defined if error, error number |
4279 | can be retrieved with os_file_get_last_error */ |
4280 | pfs_os_file_t |
4281 | os_file_create_simple_no_error_handling_func( |
4282 | const char* name, |
4283 | ulint create_mode, |
4284 | ulint access_type, |
4285 | bool read_only, |
4286 | bool* success) |
4287 | { |
4288 | os_file_t file; |
4289 | |
4290 | *success = false; |
4291 | |
4292 | DWORD access; |
4293 | DWORD create_flag; |
4294 | DWORD attributes = 0; |
4295 | DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL |
4296 | ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE |
4297 | : FILE_SHARE_READ; |
4298 | |
4299 | ut_a(name); |
4300 | |
4301 | ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); |
4302 | ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); |
4303 | |
4304 | if (create_mode == OS_FILE_OPEN) { |
4305 | |
4306 | create_flag = OPEN_EXISTING; |
4307 | |
4308 | } else if (read_only) { |
4309 | |
4310 | create_flag = OPEN_EXISTING; |
4311 | |
4312 | } else if (create_mode == OS_FILE_CREATE) { |
4313 | |
4314 | create_flag = CREATE_NEW; |
4315 | |
4316 | } else { |
4317 | |
4318 | ib::error() |
4319 | << "Unknown file create mode (" << create_mode << ") " |
4320 | << " for file '" << name << "'" ; |
4321 | |
4322 | return(OS_FILE_CLOSED); |
4323 | } |
4324 | |
4325 | if (access_type == OS_FILE_READ_ONLY) { |
4326 | |
4327 | access = GENERIC_READ; |
4328 | |
4329 | } else if (read_only) { |
4330 | |
4331 | access = GENERIC_READ; |
4332 | |
4333 | } else if (access_type == OS_FILE_READ_WRITE) { |
4334 | |
4335 | access = GENERIC_READ | GENERIC_WRITE; |
4336 | |
4337 | } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { |
4338 | |
4339 | ut_a(!read_only); |
4340 | |
4341 | access = GENERIC_READ; |
4342 | |
4343 | /*!< A backup program has to give mysqld the maximum |
4344 | freedom to do what it likes with the file */ |
4345 | |
4346 | share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE |
4347 | | FILE_SHARE_READ; |
4348 | } else { |
4349 | |
4350 | ib::error() |
4351 | << "Unknown file access type (" << access_type << ") " |
4352 | << "for file '" << name << "'" ; |
4353 | |
4354 | return(OS_FILE_CLOSED); |
4355 | } |
4356 | |
4357 | file = CreateFile((LPCTSTR) name, |
4358 | access, |
4359 | share_mode, |
4360 | NULL, // Security attributes |
4361 | create_flag, |
4362 | attributes, |
4363 | NULL); // No template file |
4364 | |
4365 | *success = (file != INVALID_HANDLE_VALUE); |
4366 | |
4367 | return(file); |
4368 | } |
4369 | |
4370 | /** Deletes a file if it exists. The file has to be closed before calling this. |
4371 | @param[in] name file path as a null-terminated string |
4372 | @param[out] exist indicate if file pre-exist |
4373 | @return true if success */ |
4374 | bool |
4375 | os_file_delete_if_exists_func( |
4376 | const char* name, |
4377 | bool* exist) |
4378 | { |
4379 | ulint count = 0; |
4380 | |
4381 | if (exist != NULL) { |
4382 | *exist = true; |
4383 | } |
4384 | |
4385 | for (;;) { |
4386 | /* In Windows, deleting an .ibd file may fail if |
4387 | the file is being accessed by an external program, |
4388 | such as a backup tool. */ |
4389 | |
4390 | bool ret = DeleteFile((LPCTSTR) name); |
4391 | |
4392 | if (ret) { |
4393 | return(true); |
4394 | } |
4395 | |
4396 | DWORD lasterr = GetLastError(); |
4397 | |
4398 | if (lasterr == ERROR_FILE_NOT_FOUND |
4399 | || lasterr == ERROR_PATH_NOT_FOUND) { |
4400 | |
4401 | /* the file does not exist, this not an error */ |
4402 | if (exist != NULL) { |
4403 | *exist = false; |
4404 | } |
4405 | |
4406 | return(true); |
4407 | } |
4408 | |
4409 | ++count; |
4410 | |
4411 | if (count > 100 && 0 == (count % 10)) { |
4412 | |
4413 | /* Print error information */ |
4414 | os_file_get_last_error(true); |
4415 | |
4416 | ib::warn() << "Delete of file '" << name << "' failed." ; |
4417 | } |
4418 | |
4419 | /* Sleep for a second */ |
4420 | os_thread_sleep(1000000); |
4421 | |
4422 | if (count > 2000) { |
4423 | |
4424 | return(false); |
4425 | } |
4426 | } |
4427 | } |
4428 | |
4429 | /** Deletes a file. The file has to be closed before calling this. |
4430 | @param[in] name File path as NUL terminated string |
4431 | @return true if success */ |
4432 | bool |
4433 | os_file_delete_func( |
4434 | const char* name) |
4435 | { |
4436 | ulint count = 0; |
4437 | |
4438 | for (;;) { |
4439 | /* In Windows, deleting an .ibd file may fail if |
4440 | the file is being accessed by an external program, |
4441 | such as a backup tool. */ |
4442 | |
4443 | BOOL ret = DeleteFile((LPCTSTR) name); |
4444 | |
4445 | if (ret) { |
4446 | return(true); |
4447 | } |
4448 | |
4449 | if (GetLastError() == ERROR_FILE_NOT_FOUND) { |
4450 | /* If the file does not exist, we classify this as |
4451 | a 'mild' error and return */ |
4452 | |
4453 | return(false); |
4454 | } |
4455 | |
4456 | ++count; |
4457 | |
4458 | if (count > 100 && 0 == (count % 10)) { |
4459 | |
4460 | /* print error information */ |
4461 | os_file_get_last_error(true); |
4462 | |
4463 | ib::warn() |
4464 | << "Cannot delete file '" << name << "'. Is " |
4465 | << "another program accessing it?" ; |
4466 | } |
4467 | |
4468 | /* sleep for a second */ |
4469 | os_thread_sleep(1000000); |
4470 | |
4471 | if (count > 2000) { |
4472 | |
4473 | return(false); |
4474 | } |
4475 | } |
4476 | |
4477 | ut_error; |
4478 | return(false); |
4479 | } |
4480 | |
4481 | /** NOTE! Use the corresponding macro os_file_rename(), not directly this |
4482 | function! |
4483 | Renames a file (can also move it to another directory). It is safest that the |
4484 | file is closed before calling this function. |
4485 | @param[in] oldpath old file path as a null-terminated string |
4486 | @param[in] newpath new file path |
4487 | @return true if success */ |
4488 | bool |
4489 | os_file_rename_func( |
4490 | const char* oldpath, |
4491 | const char* newpath) |
4492 | { |
4493 | #ifdef UNIV_DEBUG |
4494 | os_file_type_t type; |
4495 | bool exists; |
4496 | |
4497 | /* New path must not exist. */ |
4498 | ut_ad(os_file_status(newpath, &exists, &type)); |
4499 | ut_ad(!exists); |
4500 | |
4501 | /* Old path must exist. */ |
4502 | ut_ad(os_file_status(oldpath, &exists, &type)); |
4503 | ut_ad(exists); |
4504 | #endif /* UNIV_DEBUG */ |
4505 | |
4506 | if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) { |
4507 | return(true); |
4508 | } |
4509 | |
4510 | os_file_handle_error_no_exit(oldpath, "rename" , false); |
4511 | |
4512 | return(false); |
4513 | } |
4514 | |
4515 | /** NOTE! Use the corresponding macro os_file_close(), not directly |
4516 | this function! |
4517 | Closes a file handle. In case of error, error number can be retrieved with |
4518 | os_file_get_last_error. |
4519 | @param[in,own] file Handle to a file |
4520 | @return true if success */ |
4521 | bool |
4522 | os_file_close_func( |
4523 | os_file_t file) |
4524 | { |
4525 | ut_a(file); |
4526 | |
4527 | if (CloseHandle(file)) { |
4528 | return(true); |
4529 | } |
4530 | |
4531 | os_file_handle_error(NULL, "close" ); |
4532 | |
4533 | return(false); |
4534 | } |
4535 | |
4536 | /** Gets a file size. |
4537 | @param[in] file Handle to a file |
4538 | @return file size, or (os_offset_t) -1 on failure */ |
4539 | os_offset_t |
4540 | os_file_get_size( |
4541 | os_file_t file) |
4542 | { |
4543 | DWORD high; |
4544 | DWORD low = GetFileSize(file, &high); |
4545 | |
4546 | if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) { |
4547 | return((os_offset_t) -1); |
4548 | } |
4549 | |
4550 | return(os_offset_t(low | (os_offset_t(high) << 32))); |
4551 | } |
4552 | |
4553 | /** Gets a file size. |
4554 | @param[in] filename Full path to the filename to check |
4555 | @return file size if OK, else set m_total_size to ~0 and m_alloc_size to |
4556 | errno */ |
4557 | os_file_size_t |
4558 | os_file_get_size( |
4559 | const char* filename) |
4560 | { |
4561 | struct __stat64 s; |
4562 | os_file_size_t file_size; |
4563 | |
4564 | int ret = _stat64(filename, &s); |
4565 | |
4566 | if (ret == 0) { |
4567 | |
4568 | file_size.m_total_size = s.st_size; |
4569 | |
4570 | DWORD low_size; |
4571 | DWORD high_size; |
4572 | |
4573 | low_size = GetCompressedFileSize(filename, &high_size); |
4574 | |
4575 | if (low_size != INVALID_FILE_SIZE) { |
4576 | |
4577 | file_size.m_alloc_size = high_size; |
4578 | file_size.m_alloc_size <<= 32; |
4579 | file_size.m_alloc_size |= low_size; |
4580 | |
4581 | } else { |
4582 | ib::error() |
4583 | << "GetCompressedFileSize(" |
4584 | << filename << ", ..) failed." ; |
4585 | |
4586 | file_size.m_alloc_size = (os_offset_t) -1; |
4587 | } |
4588 | } else { |
4589 | file_size.m_total_size = ~0; |
4590 | file_size.m_alloc_size = (os_offset_t) ret; |
4591 | } |
4592 | |
4593 | return(file_size); |
4594 | } |
4595 | |
4596 | /** This function returns information about the specified file |
4597 | @param[in] path pathname of the file |
4598 | @param[out] stat_info information of a file in a directory |
4599 | @param[in,out] statinfo information of a file in a directory |
4600 | @param[in] check_rw_perm for testing whether the file can be opened |
4601 | in RW mode |
4602 | @param[in] read_only true if the file is opened in read-only mode |
4603 | @return DB_SUCCESS if all OK */ |
4604 | static |
4605 | dberr_t |
4606 | os_file_get_status_win32( |
4607 | const char* path, |
4608 | os_file_stat_t* stat_info, |
4609 | struct _stat64* statinfo, |
4610 | bool check_rw_perm, |
4611 | bool read_only) |
4612 | { |
4613 | int ret = _stat64(path, statinfo); |
4614 | |
4615 | if (ret && (errno == ENOENT || errno == ENOTDIR)) { |
4616 | /* file does not exist */ |
4617 | |
4618 | return(DB_NOT_FOUND); |
4619 | |
4620 | } else if (ret) { |
4621 | /* file exists, but stat call failed */ |
4622 | |
4623 | os_file_handle_error_no_exit(path, "STAT" , false); |
4624 | |
4625 | return(DB_FAIL); |
4626 | |
4627 | } else if (_S_IFDIR & statinfo->st_mode) { |
4628 | |
4629 | stat_info->type = OS_FILE_TYPE_DIR; |
4630 | |
4631 | } else if (_S_IFREG & statinfo->st_mode) { |
4632 | |
4633 | DWORD access = GENERIC_READ; |
4634 | |
4635 | if (!read_only) { |
4636 | access |= GENERIC_WRITE; |
4637 | } |
4638 | |
4639 | stat_info->type = OS_FILE_TYPE_FILE; |
4640 | |
4641 | /* Check if we can open it in read-only mode. */ |
4642 | |
4643 | if (check_rw_perm) { |
4644 | HANDLE fh; |
4645 | |
4646 | fh = CreateFile( |
4647 | (LPCTSTR) path, // File to open |
4648 | access, |
4649 | FILE_SHARE_READ | FILE_SHARE_WRITE |
4650 | | FILE_SHARE_DELETE, // Full sharing |
4651 | NULL, // Default security |
4652 | OPEN_EXISTING, // Existing file only |
4653 | FILE_ATTRIBUTE_NORMAL, // Normal file |
4654 | NULL); // No attr. template |
4655 | |
4656 | if (fh == INVALID_HANDLE_VALUE) { |
4657 | stat_info->rw_perm = false; |
4658 | } else { |
4659 | stat_info->rw_perm = true; |
4660 | CloseHandle(fh); |
4661 | } |
4662 | } |
4663 | |
4664 | char volname[MAX_PATH]; |
4665 | BOOL result = GetVolumePathName(path, volname, MAX_PATH); |
4666 | |
4667 | if (!result) { |
4668 | |
4669 | ib::error() |
4670 | << "os_file_get_status_win32: " |
4671 | << "Failed to get the volume path name for: " |
4672 | << path |
4673 | << "- OS error number " << GetLastError(); |
4674 | |
4675 | return(DB_FAIL); |
4676 | } |
4677 | |
4678 | DWORD sectorsPerCluster; |
4679 | DWORD bytesPerSector; |
4680 | DWORD numberOfFreeClusters; |
4681 | DWORD totalNumberOfClusters; |
4682 | |
4683 | result = GetDiskFreeSpace( |
4684 | (LPCSTR) volname, |
4685 | §orsPerCluster, |
4686 | &bytesPerSector, |
4687 | &numberOfFreeClusters, |
4688 | &totalNumberOfClusters); |
4689 | |
4690 | if (!result) { |
4691 | |
4692 | ib::error() |
4693 | << "GetDiskFreeSpace(" << volname << ",...) " |
4694 | << "failed " |
4695 | << "- OS error number " << GetLastError(); |
4696 | |
4697 | return(DB_FAIL); |
4698 | } |
4699 | |
4700 | stat_info->block_size = bytesPerSector * sectorsPerCluster; |
4701 | } else { |
4702 | stat_info->type = OS_FILE_TYPE_UNKNOWN; |
4703 | } |
4704 | |
4705 | return(DB_SUCCESS); |
4706 | } |
4707 | |
4708 | /** |
4709 | Sets a sparse flag on Windows file. |
4710 | @param[in] file file handle |
4711 | @return true on success, false on error |
4712 | */ |
4713 | #include <versionhelpers.h> |
4714 | bool os_file_set_sparse_win32(os_file_t file, bool is_sparse) |
4715 | { |
4716 | if (!is_sparse && !IsWindows8OrGreater()) { |
4717 | /* Cannot unset sparse flag on older Windows. |
4718 | Until Windows8 it is documented to produce unpredictable results, |
4719 | if there are unallocated ranges in file.*/ |
4720 | return false; |
4721 | } |
4722 | DWORD temp; |
4723 | FILE_SET_SPARSE_BUFFER sparse_buffer; |
4724 | sparse_buffer.SetSparse = is_sparse; |
4725 | return os_win32_device_io_control(file, |
4726 | FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp); |
4727 | } |
4728 | |
4729 | |
4730 | /** |
4731 | Change file size on Windows. |
4732 | |
4733 | If file is extended, the bytes between old and new EOF |
4734 | are zeros. |
4735 | |
4736 | If file is sparse, "virtual" block is added at the end of |
4737 | allocated area. |
4738 | |
4739 | If file is normal, file system allocates storage. |
4740 | |
4741 | @param[in] pathname file path |
4742 | @param[in] file file handle |
4743 | @param[in] size size to preserve in bytes |
4744 | @return true if success */ |
4745 | bool |
4746 | os_file_change_size_win32( |
4747 | const char* pathname, |
4748 | os_file_t file, |
4749 | os_offset_t size) |
4750 | { |
4751 | LARGE_INTEGER length; |
4752 | |
4753 | length.QuadPart = size; |
4754 | |
4755 | BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN); |
4756 | |
4757 | if (!success) { |
4758 | os_file_handle_error_no_exit( |
4759 | pathname, "SetFilePointerEx" , false); |
4760 | } else { |
4761 | success = SetEndOfFile(file); |
4762 | if (!success) { |
4763 | os_file_handle_error_no_exit( |
4764 | pathname, "SetEndOfFile" , false); |
4765 | } |
4766 | } |
4767 | return(success); |
4768 | } |
4769 | |
4770 | /** Truncates a file at its current position. |
4771 | @param[in] file Handle to be truncated |
4772 | @return true if success */ |
4773 | bool |
4774 | os_file_set_eof( |
4775 | FILE* file) |
4776 | { |
4777 | HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); |
4778 | |
4779 | return(SetEndOfFile(h)); |
4780 | } |
4781 | |
/** This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping!
This is a free-function entry point that only forwards to
AIO::simulated_put_read_threads_to_sleep(). */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	AIO::simulated_put_read_threads_to_sleep();
}
4791 | |
4792 | /** This function can be called if one wants to post a batch of reads and |
4793 | prefers an i/o-handler thread to handle them all at once later. You must |
4794 | call os_aio_simulated_wake_handler_threads later to ensure the threads |
4795 | are not left sleeping! */ |
4796 | void |
4797 | AIO::simulated_put_read_threads_to_sleep() |
4798 | { |
4799 | /* The idea of putting background IO threads to sleep is only for |
4800 | Windows when using simulated AIO. Windows XP seems to schedule |
4801 | background threads too eagerly to allow for coalescing during |
4802 | readahead requests. */ |
4803 | |
4804 | if (srv_use_native_aio) { |
4805 | /* We do not use simulated AIO: do nothing */ |
4806 | |
4807 | return; |
4808 | } |
4809 | |
4810 | os_aio_recommend_sleep_for_read_threads = true; |
4811 | |
4812 | for (ulint i = 0; i < os_aio_n_segments; i++) { |
4813 | AIO* array; |
4814 | |
4815 | get_array_and_local_segment(&array, i); |
4816 | |
4817 | if (array == s_reads) { |
4818 | |
4819 | os_event_reset(os_aio_segment_wait_events[i]); |
4820 | } |
4821 | } |
4822 | } |
4823 | |
4824 | #endif /* !_WIN32*/ |
4825 | |
4826 | /** Does a syncronous read or write depending upon the type specified |
4827 | In case of partial reads/writes the function tries |
4828 | NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data. |
4829 | @param[in] type, IO flags |
4830 | @param[in] file handle to an open file |
4831 | @param[out] buf buffer where to read |
4832 | @param[in] offset file offset from the start where to read |
4833 | @param[in] n number of bytes to read, starting from offset |
4834 | @param[out] err DB_SUCCESS or error code |
4835 | @return number of bytes read/written, -1 if error */ |
4836 | static MY_ATTRIBUTE((warn_unused_result)) |
4837 | ssize_t |
4838 | os_file_io( |
4839 | const IORequest&in_type, |
4840 | os_file_t file, |
4841 | void* buf, |
4842 | ulint n, |
4843 | os_offset_t offset, |
4844 | dberr_t* err) |
4845 | { |
4846 | ssize_t original_n = ssize_t(n); |
4847 | IORequest type = in_type; |
4848 | ssize_t bytes_returned = 0; |
4849 | |
4850 | SyncFileIO sync_file_io(file, buf, n, offset); |
4851 | |
4852 | for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) { |
4853 | |
4854 | ssize_t n_bytes = sync_file_io.execute(type); |
4855 | |
4856 | /* Check for a hard error. Not much we can do now. */ |
4857 | if (n_bytes < 0) { |
4858 | |
4859 | break; |
4860 | |
4861 | } else if (n_bytes + bytes_returned == ssize_t(n)) { |
4862 | |
4863 | bytes_returned += n_bytes; |
4864 | |
4865 | if (offset > 0 |
4866 | && !type.is_log() |
4867 | && type.is_write() |
4868 | && type.punch_hole()) { |
4869 | *err = type.punch_hole(file, offset, n); |
4870 | |
4871 | } else { |
4872 | *err = DB_SUCCESS; |
4873 | } |
4874 | |
4875 | return(original_n); |
4876 | } |
4877 | |
4878 | /* Handle partial read/write. */ |
4879 | |
4880 | ut_ad(ulint(n_bytes + bytes_returned) < n); |
4881 | |
4882 | bytes_returned += n_bytes; |
4883 | |
4884 | if (!type.is_partial_io_warning_disabled()) { |
4885 | |
4886 | const char* op = type.is_read() |
4887 | ? "read" : "written" ; |
4888 | |
4889 | ib::warn() |
4890 | << n |
4891 | << " bytes should have been " << op << ". Only " |
4892 | << bytes_returned |
4893 | << " bytes " << op << ". Retrying" |
4894 | << " for the remaining bytes." ; |
4895 | } |
4896 | |
4897 | /* Advance the offset and buffer by n_bytes */ |
4898 | sync_file_io.advance(n_bytes); |
4899 | } |
4900 | |
4901 | *err = DB_IO_ERROR; |
4902 | |
4903 | if (!type.is_partial_io_warning_disabled()) { |
4904 | ib::warn() |
4905 | << "Retry attempts for " |
4906 | << (type.is_read() ? "reading" : "writing" ) |
4907 | << " partial data failed." ; |
4908 | } |
4909 | |
4910 | return(bytes_returned); |
4911 | } |
4912 | |
4913 | /** Does a synchronous write operation in Posix. |
4914 | @param[in] type IO context |
4915 | @param[in] file handle to an open file |
4916 | @param[out] buf buffer from which to write |
4917 | @param[in] n number of bytes to read, starting from offset |
4918 | @param[in] offset file offset from the start where to read |
4919 | @param[out] err DB_SUCCESS or error code |
4920 | @return number of bytes written, -1 if error */ |
4921 | static MY_ATTRIBUTE((warn_unused_result)) |
4922 | ssize_t |
4923 | os_file_pwrite( |
4924 | const IORequest& type, |
4925 | os_file_t file, |
4926 | const byte* buf, |
4927 | ulint n, |
4928 | os_offset_t offset, |
4929 | dberr_t* err) |
4930 | { |
4931 | ut_ad(type.validate()); |
4932 | ut_ad(type.is_write()); |
4933 | |
4934 | ++os_n_file_writes; |
4935 | |
4936 | const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); |
4937 | MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); |
4938 | ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf), |
4939 | n, offset, err); |
4940 | MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); |
4941 | |
4942 | return(n_bytes); |
4943 | } |
4944 | |
4945 | /** NOTE! Use the corresponding macro os_file_write(), not directly |
4946 | Requests a synchronous write operation. |
4947 | @param[in] type IO flags |
4948 | @param[in] file handle to an open file |
4949 | @param[out] buf buffer from which to write |
4950 | @param[in] offset file offset from the start where to read |
4951 | @param[in] n number of bytes to read, starting from offset |
4952 | @return DB_SUCCESS if request was successful, false if fail */ |
4953 | dberr_t |
4954 | os_file_write_func( |
4955 | const IORequest& type, |
4956 | const char* name, |
4957 | os_file_t file, |
4958 | const void* buf, |
4959 | os_offset_t offset, |
4960 | ulint n) |
4961 | { |
4962 | dberr_t err; |
4963 | |
4964 | ut_ad(type.validate()); |
4965 | ut_ad(n > 0); |
4966 | |
4967 | WAIT_ALLOW_WRITES(); |
4968 | |
4969 | ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err); |
4970 | |
4971 | if ((ulint) n_bytes != n && !os_has_said_disk_full) { |
4972 | |
4973 | ib::error() |
4974 | << "Write to file " << name << " failed at offset " |
4975 | << offset << ", " << n |
4976 | << " bytes should have been written," |
4977 | " only " << n_bytes << " were written." |
4978 | " Operating system error number " << errno << "." |
4979 | " Check that your OS and file system" |
4980 | " support files of this size." |
4981 | " Check also that the disk is not full" |
4982 | " or a disk quota exceeded." ; |
4983 | |
4984 | if (strerror(errno) != NULL) { |
4985 | |
4986 | ib::error() |
4987 | << "Error number " << errno |
4988 | << " means '" << strerror(errno) << "'" ; |
4989 | } |
4990 | |
4991 | ib::info() << OPERATING_SYSTEM_ERROR_MSG; |
4992 | |
4993 | os_has_said_disk_full = true; |
4994 | } |
4995 | |
4996 | return(err); |
4997 | } |
4998 | |
4999 | /** Does a synchronous read operation in Posix. |
5000 | @param[in] type IO flags |
5001 | @param[in] file handle to an open file |
5002 | @param[out] buf buffer where to read |
5003 | @param[in] offset file offset from the start where to read |
5004 | @param[in] n number of bytes to read, starting from offset |
5005 | @param[out] err DB_SUCCESS or error code |
5006 | @return number of bytes read, -1 if error */ |
5007 | static MY_ATTRIBUTE((warn_unused_result)) |
5008 | ssize_t |
5009 | os_file_pread( |
5010 | const IORequest& type, |
5011 | os_file_t file, |
5012 | void* buf, |
5013 | ulint n, |
5014 | os_offset_t offset, |
5015 | dberr_t* err) |
5016 | { |
5017 | ut_ad(type.is_read()); |
5018 | |
5019 | ++os_n_file_reads; |
5020 | |
5021 | const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); |
5022 | MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); |
5023 | ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err); |
5024 | MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); |
5025 | |
5026 | return(n_bytes); |
5027 | } |
5028 | |
5029 | /** Requests a synchronous positioned read operation. |
5030 | @return DB_SUCCESS if request was successful, false if fail |
5031 | @param[in] type IO flags |
5032 | @param[in] file handle to an open file |
5033 | @param[out] buf buffer where to read |
5034 | @param[in] offset file offset from the start where to read |
5035 | @param[in] n number of bytes to read, starting from offset |
5036 | @param[out] o number of bytes actually read |
5037 | @param[in] exit_on_err if true then exit on error |
5038 | @return DB_SUCCESS or error code */ |
5039 | static MY_ATTRIBUTE((warn_unused_result)) |
5040 | dberr_t |
5041 | ( |
5042 | const IORequest& type, |
5043 | os_file_t file, |
5044 | void* buf, |
5045 | os_offset_t offset, |
5046 | ulint n, |
5047 | ulint* o, |
5048 | bool exit_on_err) |
5049 | { |
5050 | dberr_t err; |
5051 | |
5052 | os_bytes_read_since_printout += n; |
5053 | |
5054 | ut_ad(type.validate()); |
5055 | ut_ad(n > 0); |
5056 | |
5057 | ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err); |
5058 | |
5059 | if (o) { |
5060 | *o = n_bytes; |
5061 | } |
5062 | |
5063 | if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) { |
5064 | return err; |
5065 | } |
5066 | |
5067 | ib::error() << "Tried to read " << n << " bytes at offset " |
5068 | << offset << ", but was only able to read " << n_bytes; |
5069 | |
5070 | if (!os_file_handle_error_cond_exit( |
5071 | NULL, "read" , exit_on_err, false)) { |
5072 | ib::fatal() |
5073 | << "Cannot read from file. OS error number " |
5074 | << errno << "." ; |
5075 | } |
5076 | |
5077 | if (err == DB_SUCCESS) { |
5078 | err = DB_IO_ERROR; |
5079 | } |
5080 | |
5081 | return err; |
5082 | } |
5083 | |
5084 | /** Retrieves the last error number if an error occurs in a file io function. |
5085 | The number should be retrieved before any other OS calls (because they may |
5086 | overwrite the error number). If the number is not known to this program, |
5087 | the OS error number + 100 is returned. |
5088 | @param[in] report_all_errors true if we want an error printed |
5089 | for all errors |
5090 | @return error number, or OS error number + 100 */ |
5091 | ulint |
5092 | os_file_get_last_error( |
5093 | bool report_all_errors) |
5094 | { |
5095 | return(os_file_get_last_error_low(report_all_errors, false)); |
5096 | } |
5097 | |
5098 | /** Handle errors for file operations. |
5099 | @param[in] name name of a file or NULL |
5100 | @param[in] operation operation |
5101 | @param[in] should_abort whether to abort on an unknown error |
5102 | @param[in] on_error_silent whether to suppress reports of non-fatal errors |
5103 | @return true if we should retry the operation */ |
5104 | static MY_ATTRIBUTE((warn_unused_result)) |
5105 | bool |
5106 | os_file_handle_error_cond_exit( |
5107 | const char* name, |
5108 | const char* operation, |
5109 | bool should_abort, |
5110 | bool on_error_silent) |
5111 | { |
5112 | ulint err; |
5113 | |
5114 | err = os_file_get_last_error_low(false, on_error_silent); |
5115 | |
5116 | switch (err) { |
5117 | case OS_FILE_DISK_FULL: |
5118 | /* We only print a warning about disk full once */ |
5119 | |
5120 | if (os_has_said_disk_full) { |
5121 | |
5122 | return(false); |
5123 | } |
5124 | |
5125 | /* Disk full error is reported irrespective of the |
5126 | on_error_silent setting. */ |
5127 | |
5128 | if (name) { |
5129 | |
5130 | ib::error() |
5131 | << "Encountered a problem with file '" |
5132 | << name << "'" ; |
5133 | } |
5134 | |
5135 | ib::error() |
5136 | << "Disk is full. Try to clean the disk to free space." ; |
5137 | |
5138 | os_has_said_disk_full = true; |
5139 | |
5140 | return(false); |
5141 | |
5142 | case OS_FILE_AIO_RESOURCES_RESERVED: |
5143 | case OS_FILE_AIO_INTERRUPTED: |
5144 | |
5145 | return(true); |
5146 | |
5147 | case OS_FILE_PATH_ERROR: |
5148 | case OS_FILE_ALREADY_EXISTS: |
5149 | case OS_FILE_ACCESS_VIOLATION: |
5150 | |
5151 | return(false); |
5152 | |
5153 | case OS_FILE_SHARING_VIOLATION: |
5154 | |
5155 | os_thread_sleep(10000000); /* 10 sec */ |
5156 | return(true); |
5157 | |
5158 | case OS_FILE_OPERATION_ABORTED: |
5159 | case OS_FILE_INSUFFICIENT_RESOURCE: |
5160 | |
5161 | os_thread_sleep(100000); /* 100 ms */ |
5162 | return(true); |
5163 | |
5164 | default: |
5165 | |
5166 | /* If it is an operation that can crash on error then it |
5167 | is better to ignore on_error_silent and print an error message |
5168 | to the log. */ |
5169 | |
5170 | if (should_abort || !on_error_silent) { |
5171 | ib::error() << "File " |
5172 | << (name != NULL ? name : "(unknown)" ) |
5173 | << ": '" << operation << "'" |
5174 | " returned OS error " << err << "." |
5175 | << (should_abort |
5176 | ? " Cannot continue operation" : "" ); |
5177 | } |
5178 | |
5179 | if (should_abort) { |
5180 | abort(); |
5181 | } |
5182 | } |
5183 | |
5184 | return(false); |
5185 | } |
5186 | |
5187 | #ifndef _WIN32 |
5188 | /** Tries to disable OS caching on an opened file descriptor. |
5189 | @param[in] fd file descriptor to alter |
5190 | @param[in] file_name file name, used in the diagnostic message |
5191 | @param[in] name "open" or "create"; used in the diagnostic |
5192 | message */ |
5193 | void |
5194 | os_file_set_nocache( |
5195 | int fd MY_ATTRIBUTE((unused)), |
5196 | const char* file_name MY_ATTRIBUTE((unused)), |
5197 | const char* operation_name MY_ATTRIBUTE((unused))) |
5198 | { |
5199 | /* some versions of Solaris may not have DIRECTIO_ON */ |
5200 | #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) |
5201 | if (directio(fd, DIRECTIO_ON) == -1) { |
5202 | int errno_save = errno; |
5203 | |
5204 | ib::error() |
5205 | << "Failed to set DIRECTIO_ON on file " |
5206 | << file_name << "; " << operation_name << ": " |
5207 | << strerror(errno_save) << "," |
5208 | " continuing anyway." ; |
5209 | } |
5210 | #elif defined(O_DIRECT) |
5211 | if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { |
5212 | int errno_save = errno; |
5213 | static bool warning_message_printed = false; |
5214 | if (errno_save == EINVAL) { |
5215 | if (!warning_message_printed) { |
5216 | warning_message_printed = true; |
5217 | # ifdef UNIV_LINUX |
5218 | ib::warn() |
5219 | << "Failed to set O_DIRECT on file" |
5220 | << file_name << "; " << operation_name |
5221 | << ": " << strerror(errno_save) << ", " |
5222 | "continuing anyway. O_DIRECT is " |
5223 | "known to result in 'Invalid argument' " |
5224 | "on Linux on tmpfs, " |
5225 | "see MySQL Bug#26662." ; |
5226 | # else /* UNIV_LINUX */ |
5227 | goto short_warning; |
5228 | # endif /* UNIV_LINUX */ |
5229 | } |
5230 | } else { |
5231 | # ifndef UNIV_LINUX |
5232 | short_warning: |
5233 | # endif |
5234 | ib::warn() |
5235 | << "Failed to set O_DIRECT on file " |
5236 | << file_name << "; " << operation_name |
5237 | << " : " << strerror(errno_save) |
5238 | << ", continuing anyway." ; |
5239 | } |
5240 | } |
5241 | #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ |
5242 | } |
5243 | |
5244 | #endif /* _WIN32 */ |
5245 | |
5246 | /** Extend a file. |
5247 | |
5248 | On Windows, extending a file allocates blocks for the file, |
5249 | unless the file is sparse. |
5250 | |
5251 | On Unix, we will extend the file with ftruncate(), if |
5252 | file needs to be sparse. Otherwise posix_fallocate() is used |
5253 | when available, and if not, binary zeroes are added to the end |
5254 | of file. |
5255 | |
5256 | @param[in] name file name |
5257 | @param[in] file file handle |
5258 | @param[in] size desired file size |
5259 | @param[in] sparse whether to create a sparse file (no preallocating) |
5260 | @return whether the operation succeeded */ |
5261 | bool |
5262 | os_file_set_size( |
5263 | const char* name, |
5264 | os_file_t file, |
5265 | os_offset_t size, |
5266 | bool is_sparse) |
5267 | { |
5268 | #ifdef _WIN32 |
5269 | /* On Windows, changing file size works well and as expected for both |
5270 | sparse and normal files. |
5271 | |
5272 | However, 10.2 up until 10.2.9 made every file sparse in innodb, |
5273 | causing NTFS fragmentation issues(MDEV-13941). We try to undo |
5274 | the damage, and unsparse the file.*/ |
5275 | |
5276 | if (!is_sparse && os_is_sparse_file_supported(file)) { |
5277 | if (!os_file_set_sparse_win32(file, false)) |
5278 | /* Unsparsing file failed. Fallback to writing binary |
5279 | zeros, to avoid even higher fragmentation.*/ |
5280 | goto fallback; |
5281 | } |
5282 | |
5283 | return os_file_change_size_win32(name, file, size); |
5284 | |
5285 | fallback: |
5286 | #else |
5287 | if (is_sparse) { |
5288 | bool success = !ftruncate(file, size); |
5289 | if (!success) { |
5290 | ib::error() << "ftruncate of file " << name << " to " |
5291 | << size << " bytes failed with error " |
5292 | << errno; |
5293 | } |
5294 | return(success); |
5295 | } |
5296 | |
5297 | # ifdef HAVE_POSIX_FALLOCATE |
5298 | int err; |
5299 | do { |
5300 | os_offset_t current_size = os_file_get_size(file); |
5301 | err = current_size >= size |
5302 | ? 0 : posix_fallocate(file, current_size, |
5303 | size - current_size); |
5304 | } while (err == EINTR |
5305 | && srv_shutdown_state == SRV_SHUTDOWN_NONE); |
5306 | |
5307 | switch (err) { |
5308 | case 0: |
5309 | return true; |
5310 | default: |
5311 | ib::error() << "preallocating " |
5312 | << size << " bytes for file " << name |
5313 | << " failed with error " << err; |
5314 | /* fall through */ |
5315 | case EINTR: |
5316 | errno = err; |
5317 | return false; |
5318 | case EINVAL: |
5319 | /* fall back to the code below */ |
5320 | break; |
5321 | } |
5322 | # endif /* HAVE_POSIX_ALLOCATE */ |
5323 | #endif /* _WIN32*/ |
5324 | |
5325 | /* Write up to 1 megabyte at a time. */ |
5326 | ulint buf_size = ut_min(ulint(64), |
5327 | ulint(size >> srv_page_size_shift)) |
5328 | << srv_page_size_shift; |
5329 | |
5330 | /* Align the buffer for possible raw i/o */ |
5331 | byte* buf2; |
5332 | |
5333 | buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size)); |
5334 | |
5335 | byte* buf = static_cast<byte*>(ut_align(buf2, srv_page_size)); |
5336 | |
5337 | /* Write buffer full of zeros */ |
5338 | memset(buf, 0, buf_size); |
5339 | |
5340 | os_offset_t current_size = os_file_get_size(file); |
5341 | |
5342 | while (current_size < size |
5343 | && srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
5344 | ulint n_bytes; |
5345 | |
5346 | if (size - current_size < (os_offset_t) buf_size) { |
5347 | n_bytes = (ulint) (size - current_size); |
5348 | } else { |
5349 | n_bytes = buf_size; |
5350 | } |
5351 | |
5352 | dberr_t err; |
5353 | IORequest request(IORequest::WRITE); |
5354 | |
5355 | err = os_file_write( |
5356 | request, name, file, buf, current_size, n_bytes); |
5357 | |
5358 | if (err != DB_SUCCESS) { |
5359 | break; |
5360 | } |
5361 | |
5362 | current_size += n_bytes; |
5363 | } |
5364 | |
5365 | ut_free(buf2); |
5366 | |
5367 | return(current_size >= size && os_file_flush(file)); |
5368 | } |
5369 | |
5370 | /** Truncates a file to a specified size in bytes. |
5371 | Do nothing if the size to preserve is greater or equal to the current |
5372 | size of the file. |
5373 | @param[in] pathname file path |
5374 | @param[in] file file to be truncated |
5375 | @param[in] size size to preserve in bytes |
5376 | @return true if success */ |
5377 | bool |
5378 | os_file_truncate( |
5379 | const char* pathname, |
5380 | os_file_t file, |
5381 | os_offset_t size) |
5382 | { |
5383 | /* Do nothing if the size preserved is larger than or equal to the |
5384 | current size of file */ |
5385 | os_offset_t size_bytes = os_file_get_size(file); |
5386 | |
5387 | if (size >= size_bytes) { |
5388 | return(true); |
5389 | } |
5390 | |
5391 | #ifdef _WIN32 |
5392 | return(os_file_change_size_win32(pathname, file, size)); |
5393 | #else /* _WIN32 */ |
5394 | return(os_file_truncate_posix(pathname, file, size)); |
5395 | #endif /* _WIN32 */ |
5396 | } |
5397 | |
5398 | /** NOTE! Use the corresponding macro os_file_read(), not directly this |
5399 | function! |
5400 | Requests a synchronous positioned read operation. |
5401 | @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure |
5402 | @param[in] type IO flags |
5403 | @param[in] file handle to an open file |
5404 | @param[out] buf buffer where to read |
5405 | @param[in] offset file offset from the start where to read |
5406 | @param[in] n number of bytes to read, starting from offset |
5407 | @return DB_SUCCESS or error code */ |
5408 | dberr_t |
5409 | os_file_read_func( |
5410 | const IORequest& type, |
5411 | os_file_t file, |
5412 | void* buf, |
5413 | os_offset_t offset, |
5414 | ulint n) |
5415 | { |
5416 | return(os_file_read_page(type, file, buf, offset, n, NULL, true)); |
5417 | } |
5418 | |
5419 | /** NOTE! Use the corresponding macro os_file_read_no_error_handling(), |
5420 | not directly this function! |
5421 | Requests a synchronous positioned read operation. |
5422 | @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure |
5423 | @param[in] type IO flags |
5424 | @param[in] file handle to an open file |
5425 | @param[out] buf buffer where to read |
5426 | @param[in] offset file offset from the start where to read |
5427 | @param[in] n number of bytes to read, starting from offset |
5428 | @param[out] o number of bytes actually read |
5429 | @return DB_SUCCESS or error code */ |
5430 | dberr_t |
5431 | os_file_read_no_error_handling_func( |
5432 | const IORequest& type, |
5433 | os_file_t file, |
5434 | void* buf, |
5435 | os_offset_t offset, |
5436 | ulint n, |
5437 | ulint* o) |
5438 | { |
5439 | return(os_file_read_page(type, file, buf, offset, n, o, false)); |
5440 | } |
5441 | |
5442 | /** Check the existence and type of the given file. |
5443 | @param[in] path path name of file |
5444 | @param[out] exists true if the file exists |
5445 | @param[out] type Type of the file, if it exists |
5446 | @return true if call succeeded */ |
5447 | bool |
5448 | os_file_status( |
5449 | const char* path, |
5450 | bool* exists, |
5451 | os_file_type_t* type) |
5452 | { |
5453 | #ifdef _WIN32 |
5454 | return(os_file_status_win32(path, exists, type)); |
5455 | #else |
5456 | return(os_file_status_posix(path, exists, type)); |
5457 | #endif /* _WIN32 */ |
5458 | } |
5459 | |
5460 | /** Free storage space associated with a section of the file. |
5461 | @param[in] fh Open file handle |
5462 | @param[in] off Starting offset (SEEK_SET) |
5463 | @param[in] len Size of the hole |
5464 | @return DB_SUCCESS or error code */ |
5465 | dberr_t |
5466 | os_file_punch_hole( |
5467 | os_file_t fh, |
5468 | os_offset_t off, |
5469 | os_offset_t len) |
5470 | { |
5471 | dberr_t err; |
5472 | |
5473 | #ifdef _WIN32 |
5474 | err = os_file_punch_hole_win32(fh, off, len); |
5475 | #else |
5476 | err = os_file_punch_hole_posix(fh, off, len); |
5477 | #endif /* _WIN32 */ |
5478 | |
5479 | return (err); |
5480 | } |
5481 | |
5482 | /** Free storage space associated with a section of the file. |
5483 | @param[in] fh Open file handle |
5484 | @param[in] off Starting offset (SEEK_SET) |
5485 | @param[in] len Size of the hole |
5486 | @return DB_SUCCESS or error code */ |
5487 | dberr_t |
5488 | IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) |
5489 | { |
5490 | /* In this debugging mode, we act as if punch hole is supported, |
5491 | and then skip any calls to actually punch a hole here. |
5492 | In this way, Transparent Page Compression is still being tested. */ |
5493 | DBUG_EXECUTE_IF("ignore_punch_hole" , |
5494 | return(DB_SUCCESS); |
5495 | ); |
5496 | |
5497 | ulint trim_len = get_trim_length(len); |
5498 | |
5499 | if (trim_len == 0) { |
5500 | return(DB_SUCCESS); |
5501 | } |
5502 | |
5503 | off += len; |
5504 | |
5505 | /* Check does file system support punching holes for this |
5506 | tablespace. */ |
5507 | if (!should_punch_hole()) { |
5508 | return DB_IO_NO_PUNCH_HOLE; |
5509 | } |
5510 | |
5511 | dberr_t err = os_file_punch_hole(fh, off, trim_len); |
5512 | |
5513 | if (err == DB_SUCCESS) { |
5514 | srv_stats.page_compressed_trim_op.inc(); |
5515 | } else { |
5516 | /* If punch hole is not supported, |
5517 | set space so that it is not used. */ |
5518 | if (err == DB_IO_NO_PUNCH_HOLE) { |
5519 | space_no_punch_hole(); |
5520 | err = DB_SUCCESS; |
5521 | } |
5522 | } |
5523 | |
5524 | return (err); |
5525 | } |
5526 | |
5527 | /** Check if the file system supports sparse files. |
5528 | |
5529 | Warning: On POSIX systems we try and punch a hole from offset 0 to |
5530 | the system configured page size. This should only be called on an empty |
5531 | file. |
5532 | @param[in] fh File handle for the file - if opened |
5533 | @return true if the file system supports sparse files */ |
5534 | bool |
5535 | os_is_sparse_file_supported(os_file_t fh) |
5536 | { |
5537 | /* In this debugging mode, we act as if punch hole is supported, |
5538 | then we skip any calls to actually punch a hole. In this way, |
5539 | Transparent Page Compression is still being tested. */ |
5540 | DBUG_EXECUTE_IF("ignore_punch_hole" , |
5541 | return(true); |
5542 | ); |
5543 | |
5544 | #ifdef _WIN32 |
5545 | FILE_ATTRIBUTE_TAG_INFO info; |
5546 | if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo, |
5547 | &info, (DWORD)sizeof(info))) { |
5548 | if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) { |
5549 | return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0; |
5550 | } |
5551 | } |
5552 | return false; |
5553 | #else |
5554 | dberr_t err; |
5555 | |
5556 | /* We don't know the FS block size, use the sector size. The FS |
5557 | will do the magic. */ |
5558 | err = os_file_punch_hole_posix(fh, 0, srv_page_size); |
5559 | |
5560 | return(err == DB_SUCCESS); |
5561 | #endif /* _WIN32 */ |
5562 | } |
5563 | |
5564 | /** This function returns information about the specified file |
5565 | @param[in] path pathname of the file |
5566 | @param[out] stat_info information of a file in a directory |
5567 | @param[in] check_rw_perm for testing whether the file can be opened |
5568 | in RW mode |
5569 | @param[in] read_only true if file is opened in read-only mode |
5570 | @return DB_SUCCESS if all OK */ |
5571 | dberr_t |
5572 | os_file_get_status( |
5573 | const char* path, |
5574 | os_file_stat_t* stat_info, |
5575 | bool check_rw_perm, |
5576 | bool read_only) |
5577 | { |
5578 | dberr_t ret; |
5579 | |
5580 | #ifdef _WIN32 |
5581 | struct _stat64 info; |
5582 | |
5583 | ret = os_file_get_status_win32( |
5584 | path, stat_info, &info, check_rw_perm, read_only); |
5585 | |
5586 | #else |
5587 | struct stat info; |
5588 | |
5589 | ret = os_file_get_status_posix( |
5590 | path, stat_info, &info, check_rw_perm, read_only); |
5591 | |
5592 | #endif /* _WIN32 */ |
5593 | |
5594 | if (ret == DB_SUCCESS) { |
5595 | stat_info->ctime = info.st_ctime; |
5596 | stat_info->atime = info.st_atime; |
5597 | stat_info->mtime = info.st_mtime; |
5598 | stat_info->size = info.st_size; |
5599 | } |
5600 | |
5601 | return(ret); |
5602 | } |
5603 | |
5604 | /** |
5605 | Waits for an AIO operation to complete. This function is used to wait the |
5606 | for completed requests. The aio array of pending requests is divided |
5607 | into segments. The thread specifies which segment or slot it wants to wait |
5608 | for. NOTE: this function will also take care of freeing the aio slot, |
5609 | therefore no other thread is allowed to do the freeing! |
5610 | @param[in] segment The number of the segment in the aio arrays to |
5611 | wait for; segment 0 is the ibuf I/O thread, |
5612 | segment 1 the log I/O thread, then follow the |
5613 | non-ibuf read threads, and as the last are the |
5614 | non-ibuf write threads; if this is |
5615 | ULINT_UNDEFINED, then it means that sync AIO |
5616 | is used, and this parameter is ignored |
5617 | @param[out] m1 the messages passed with the AIO request; note |
5618 | that also in the case where the AIO operation |
5619 | failed, these output parameters are valid and |
5620 | can be used to restart the operation, |
5621 | for example |
5622 | @param[out] m2 callback message |
5623 | @param[out] type OS_FILE_WRITE or ..._READ |
5624 | @return DB_SUCCESS or error code */ |
5625 | dberr_t |
5626 | os_aio_handler( |
5627 | ulint segment, |
5628 | fil_node_t** m1, |
5629 | void** m2, |
5630 | IORequest* request) |
5631 | { |
5632 | dberr_t err; |
5633 | |
5634 | if (srv_use_native_aio) { |
5635 | srv_set_io_thread_op_info(segment, "native aio handle" ); |
5636 | |
5637 | #ifdef WIN_ASYNC_IO |
5638 | |
5639 | err = os_aio_windows_handler(segment, 0, m1, m2, request); |
5640 | |
5641 | #elif defined(LINUX_NATIVE_AIO) |
5642 | |
5643 | err = os_aio_linux_handler(segment, m1, m2, request); |
5644 | |
5645 | #else |
5646 | ut_error; |
5647 | |
5648 | err = DB_ERROR; /* Eliminate compiler warning */ |
5649 | |
5650 | #endif /* WIN_ASYNC_IO */ |
5651 | |
5652 | } else { |
5653 | srv_set_io_thread_op_info(segment, "simulated aio handle" ); |
5654 | |
5655 | err = os_aio_simulated_handler(segment, m1, m2, request); |
5656 | } |
5657 | |
5658 | return(err); |
5659 | } |
5660 | |
5661 | #ifdef WIN_ASYNC_IO |
5662 | static HANDLE new_completion_port() |
5663 | { |
5664 | HANDLE h = CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0); |
5665 | ut_a(h); |
5666 | return h; |
5667 | } |
5668 | #endif |
5669 | |
5670 | /** Constructor |
5671 | @param[in] id The latch ID |
5672 | @param[in] n Number of AIO slots |
5673 | @param[in] segments Number of segments */ |
5674 | AIO::AIO( |
5675 | latch_id_t id, |
5676 | ulint n, |
5677 | ulint segments) |
5678 | : |
5679 | m_slots(n), |
5680 | m_n_segments(segments), |
5681 | m_n_reserved() |
5682 | # ifdef LINUX_NATIVE_AIO |
5683 | ,m_aio_ctx(), |
5684 | m_events(m_slots.size()) |
5685 | # endif /* LINUX_NATIVE_AIO */ |
5686 | #ifdef WIN_ASYNC_IO |
5687 | ,m_completion_port(new_completion_port()) |
5688 | #endif |
5689 | { |
5690 | ut_a(n > 0); |
5691 | ut_a(m_n_segments > 0); |
5692 | |
5693 | mutex_create(id, &m_mutex); |
5694 | |
5695 | m_not_full = os_event_create("aio_not_full" ); |
5696 | m_is_empty = os_event_create("aio_is_empty" ); |
5697 | |
5698 | memset(&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size()); |
5699 | #ifdef LINUX_NATIVE_AIO |
5700 | memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size()); |
5701 | #endif /* LINUX_NATIVE_AIO */ |
5702 | |
5703 | os_event_set(m_is_empty); |
5704 | } |
5705 | |
5706 | /** Initialise the slots */ |
5707 | dberr_t |
5708 | AIO::init_slots() |
5709 | { |
5710 | for (ulint i = 0; i < m_slots.size(); ++i) { |
5711 | Slot& slot = m_slots[i]; |
5712 | |
5713 | slot.pos = static_cast<uint16_t>(i); |
5714 | |
5715 | slot.is_reserved = false; |
5716 | |
5717 | #ifdef WIN_ASYNC_IO |
5718 | |
5719 | slot.array = this; |
5720 | |
5721 | #elif defined(LINUX_NATIVE_AIO) |
5722 | |
5723 | slot.ret = 0; |
5724 | |
5725 | slot.n_bytes = 0; |
5726 | |
5727 | memset(&slot.control, 0x0, sizeof(slot.control)); |
5728 | |
5729 | #endif /* WIN_ASYNC_IO */ |
5730 | } |
5731 | |
5732 | return(DB_SUCCESS); |
5733 | } |
5734 | |
5735 | #ifdef LINUX_NATIVE_AIO |
5736 | /** Initialise the Linux Native AIO interface */ |
5737 | dberr_t |
5738 | AIO::init_linux_native_aio() |
5739 | { |
5740 | /* Initialize the io_context array. One io_context |
5741 | per segment in the array. */ |
5742 | |
5743 | ut_a(m_aio_ctx == NULL); |
5744 | |
5745 | m_aio_ctx = static_cast<io_context**>( |
5746 | ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx))); |
5747 | |
5748 | if (m_aio_ctx == NULL) { |
5749 | return(DB_OUT_OF_MEMORY); |
5750 | } |
5751 | |
5752 | io_context** ctx = m_aio_ctx; |
5753 | ulint max_events = slots_per_segment(); |
5754 | |
5755 | for (ulint i = 0; i < m_n_segments; ++i, ++ctx) { |
5756 | |
5757 | if (!linux_create_io_ctx(max_events, ctx)) { |
5758 | /* If something bad happened during aio setup |
5759 | we disable linux native aio. |
5760 | The disadvantage will be a small memory leak |
5761 | at shutdown but that's ok compared to a crash |
5762 | or a not working server. |
5763 | This frequently happens when running the test suite |
5764 | with many threads on a system with low fs.aio-max-nr! |
5765 | */ |
5766 | |
5767 | ib::warn() |
5768 | << "Warning: Linux Native AIO disabled " |
5769 | << "because _linux_create_io_ctx() " |
5770 | << "failed. To get rid of this warning you can " |
5771 | << "try increasing system " |
5772 | << "fs.aio-max-nr to 1048576 or larger or " |
5773 | << "setting innodb_use_native_aio = 0 in my.cnf" ; |
5774 | ut_free(m_aio_ctx); |
5775 | m_aio_ctx = 0; |
5776 | srv_use_native_aio = FALSE; |
5777 | return(DB_SUCCESS); |
5778 | } |
5779 | } |
5780 | |
5781 | return(DB_SUCCESS); |
5782 | } |
5783 | #endif /* LINUX_NATIVE_AIO */ |
5784 | |
5785 | /** Initialise the array */ |
5786 | dberr_t |
5787 | AIO::init() |
5788 | { |
5789 | ut_a(!m_slots.empty()); |
5790 | |
5791 | |
5792 | if (srv_use_native_aio) { |
5793 | #ifdef LINUX_NATIVE_AIO |
5794 | dberr_t err = init_linux_native_aio(); |
5795 | |
5796 | if (err != DB_SUCCESS) { |
5797 | return(err); |
5798 | } |
5799 | |
5800 | #endif /* LINUX_NATIVE_AIO */ |
5801 | } |
5802 | |
5803 | return(init_slots()); |
5804 | } |
5805 | |
5806 | /** Creates an aio wait array. Note that we return NULL in case of failure. |
5807 | We don't care about freeing memory here because we assume that a |
5808 | failure will result in server refusing to start up. |
5809 | @param[in] id Latch ID |
5810 | @param[in] n maximum number of pending AIO operations |
5811 | allowed; n must be divisible by m_n_segments |
5812 | @param[in] n_segments number of segments in the AIO array |
5813 | @return own: AIO array, NULL on failure */ |
5814 | AIO* |
5815 | AIO::create( |
5816 | latch_id_t id, |
5817 | ulint n, |
5818 | ulint n_segments) |
5819 | { |
5820 | if ((n % n_segments)) { |
5821 | |
5822 | ib::error() |
5823 | << "Maximum number of AIO operations must be " |
5824 | << "divisible by number of segments" ; |
5825 | |
5826 | return(NULL); |
5827 | } |
5828 | |
5829 | AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments)); |
5830 | |
5831 | if (array != NULL && array->init() != DB_SUCCESS) { |
5832 | |
5833 | UT_DELETE(array); |
5834 | |
5835 | array = NULL; |
5836 | } |
5837 | |
5838 | return(array); |
5839 | } |
5840 | |
5841 | /** AIO destructor */ |
5842 | AIO::~AIO() |
5843 | { |
5844 | mutex_destroy(&m_mutex); |
5845 | |
5846 | os_event_destroy(m_not_full); |
5847 | os_event_destroy(m_is_empty); |
5848 | |
5849 | #if defined(LINUX_NATIVE_AIO) |
5850 | if (srv_use_native_aio) { |
5851 | m_events.clear(); |
5852 | ut_free(m_aio_ctx); |
5853 | } |
5854 | #endif /* LINUX_NATIVE_AIO */ |
5855 | #if defined(WIN_ASYNC_IO) |
5856 | CloseHandle(m_completion_port); |
5857 | #endif |
5858 | |
5859 | m_slots.clear(); |
5860 | } |
5861 | |
/** Initializes the asynchronous io system. Creates one array each for ibuf
and log i/o (unless srv_read_only_mode is set). Also creates one array each
for read and write where each array is divided logically into n_readers and
n_writers respectively. The caller must create an i/o handler thread for
each segment in these arrays. This function also creates the sync array.
No i/o handler thread needs to be created for that.
On failure the arrays created so far are not freed here.
@param[in]	n_per_seg	maximum number of pending aio
				operations allowed per segment
@param[in]	n_readers	number of reader threads
@param[in]	n_writers	number of writer threads
@param[in]	n_slots_sync	number of slots in the sync aio array
@return true if the AIO sub-system was started successfully */
bool
AIO::start(
	ulint		n_per_seg,
	ulint		n_readers,
	ulint		n_writers,
	ulint		n_slots_sync)
{
#if defined(LINUX_NATIVE_AIO)
	/* Check if native aio is supported on this system and tmpfs */
	if (srv_use_native_aio && !is_linux_native_aio_supported()) {

		ib::warn() << "Linux Native AIO disabled.";

		srv_use_native_aio = FALSE;
	}
#endif /* LINUX_NATIVE_AIO */

	srv_reset_io_thread_op_info();

	s_reads = create(
		LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);

	if (s_reads == NULL) {
		return(false);
	}

	/* In read-only mode there are no ibuf and log segments, so the
	read segments start at index 0 instead of 2. */
	ulint	start = srv_read_only_mode ? 0 : 2;
	ulint	n_segs = n_readers + start;

	/* 0 is the ibuf segment and 1 is the redo log segment. */
	for (ulint i = start; i < n_segs; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "read thread";
	}

	/* Running total of segments over all arrays that need an i/o
	handler thread. */
	ulint	n_segments = n_readers;

	if (!srv_read_only_mode) {

		s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);

		if (s_ibuf == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[0] = "insert buffer thread";

		s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);

		if (s_log == NULL) {
			return(false);
		}

		++n_segments;

		srv_io_thread_function[1] = "log thread";

	} else {
		s_ibuf = s_log = NULL;
	}

	s_writes = create(
		LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);

	if (s_writes == NULL) {
		return(false);
	}

#ifdef WIN_ASYNC_IO
	/* Without a log array (read-only mode) the log completion port
	is the same as the data completion port. */
	data_completion_port = s_writes->m_completion_port;
	log_completion_port =
		s_log ? s_log->m_completion_port : data_completion_port;
#endif

	n_segments += n_writers;

	/* The write segments follow the read segments. */
	for (ulint i = start + n_readers; i < n_segments; ++i) {
		ut_a(i < SRV_MAX_N_IO_THREADS);
		srv_io_thread_function[i] = "write thread";
	}

	ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));

	/* The sync array is not counted in n_segments. */
	s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);

	if (s_sync == NULL) {

		return(false);
	}

	os_aio_n_segments = n_segments;

	os_aio_validate();

	os_last_printout = ut_time();

	/* The per-segment wait events below are created only when
	native AIO is not in use. */
	if (srv_use_native_aio) {
		return(true);
	}

	os_aio_segment_wait_events = static_cast<os_event_t*>(
		ut_zalloc_nokey(
			n_segments * sizeof *os_aio_segment_wait_events));

	if (os_aio_segment_wait_events == NULL) {

		return(false);
	}

	for (ulint i = 0; i < n_segments; ++i) {
		os_aio_segment_wait_events[i] = os_event_create(0);
	}

	return(true);
}
5991 | |
5992 | /** Free the AIO arrays */ |
5993 | void |
5994 | AIO::shutdown() |
5995 | { |
5996 | UT_DELETE(s_ibuf); |
5997 | s_ibuf = NULL; |
5998 | |
5999 | UT_DELETE(s_log); |
6000 | s_log = NULL; |
6001 | |
6002 | UT_DELETE(s_writes); |
6003 | s_writes = NULL; |
6004 | |
6005 | UT_DELETE(s_sync); |
6006 | s_sync = NULL; |
6007 | |
6008 | UT_DELETE(s_reads); |
6009 | s_reads = NULL; |
6010 | } |
6011 | |
6012 | /** Initializes the asynchronous io system. Creates one array each for ibuf |
6013 | and log i/o. Also creates one array each for read and write where each |
6014 | array is divided logically into n_readers and n_writers |
6015 | respectively. The caller must create an i/o handler thread for each |
6016 | segment in these arrays. This function also creates the sync array. |
6017 | No i/o handler thread needs to be created for that |
6018 | @param[in] n_readers number of reader threads |
6019 | @param[in] n_writers number of writer threads |
6020 | @param[in] n_slots_sync number of slots in the sync aio array */ |
6021 | bool |
6022 | os_aio_init( |
6023 | ulint n_readers, |
6024 | ulint n_writers, |
6025 | ulint n_slots_sync) |
6026 | { |
6027 | /* Maximum number of pending aio operations allowed per segment */ |
6028 | ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD; |
6029 | |
6030 | return(AIO::start(limit, n_readers, n_writers, n_slots_sync)); |
6031 | } |
6032 | |
6033 | /** Frees the asynchronous io system. */ |
6034 | void |
6035 | os_aio_free() |
6036 | { |
6037 | AIO::shutdown(); |
6038 | |
6039 | ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio); |
6040 | ut_ad(srv_use_native_aio || os_aio_segment_wait_events |
6041 | || !srv_was_started); |
6042 | |
6043 | if (!srv_use_native_aio && os_aio_segment_wait_events) { |
6044 | for (ulint i = 0; i < os_aio_n_segments; i++) { |
6045 | os_event_destroy(os_aio_segment_wait_events[i]); |
6046 | } |
6047 | |
6048 | ut_free(os_aio_segment_wait_events); |
6049 | os_aio_segment_wait_events = 0; |
6050 | } |
6051 | os_aio_n_segments = 0; |
6052 | } |
6053 | |
6054 | /** Wakes up all async i/o threads so that they know to exit themselves in |
6055 | shutdown. */ |
6056 | void |
6057 | os_aio_wake_all_threads_at_shutdown() |
6058 | { |
6059 | #ifdef WIN_ASYNC_IO |
6060 | AIO::wake_at_shutdown(); |
6061 | #elif defined(LINUX_NATIVE_AIO) |
6062 | /* When using native AIO interface the io helper threads |
6063 | wait on io_getevents with a timeout value of 500ms. At |
6064 | each wake up these threads check the server status. |
6065 | No need to do anything to wake them up. */ |
6066 | #endif /* !WIN_ASYNC_AIO */ |
6067 | |
6068 | if (srv_use_native_aio) { |
6069 | return; |
6070 | } |
6071 | |
6072 | /* This loop wakes up all simulated ai/o threads */ |
6073 | |
6074 | for (ulint i = 0; i < os_aio_n_segments; ++i) { |
6075 | |
6076 | os_event_set(os_aio_segment_wait_events[i]); |
6077 | } |
6078 | } |
6079 | |
6080 | /** Waits until there are no pending writes in AIO::s_writes. There can |
6081 | be other, synchronous, pending writes. */ |
6082 | void |
6083 | os_aio_wait_until_no_pending_writes() |
6084 | { |
6085 | AIO::wait_until_no_pending_writes(); |
6086 | } |
6087 | |
6088 | /** Calculates segment number for a slot. |
6089 | @param[in] array AIO wait array |
6090 | @param[in] slot slot in this array |
6091 | @return segment number (which is the number used by, for example, |
6092 | I/O-handler threads) */ |
6093 | ulint |
6094 | AIO::get_segment_no_from_slot( |
6095 | const AIO* array, |
6096 | const Slot* slot) |
6097 | { |
6098 | ulint segment; |
6099 | ulint seg_len; |
6100 | |
6101 | if (array == s_ibuf) { |
6102 | ut_ad(!srv_read_only_mode); |
6103 | |
6104 | segment = IO_IBUF_SEGMENT; |
6105 | |
6106 | } else if (array == s_log) { |
6107 | ut_ad(!srv_read_only_mode); |
6108 | |
6109 | segment = IO_LOG_SEGMENT; |
6110 | |
6111 | } else if (array == s_reads) { |
6112 | seg_len = s_reads->slots_per_segment(); |
6113 | |
6114 | segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; |
6115 | } else { |
6116 | ut_a(array == s_writes); |
6117 | |
6118 | seg_len = s_writes->slots_per_segment(); |
6119 | |
6120 | segment = s_reads->m_n_segments |
6121 | + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; |
6122 | } |
6123 | |
6124 | return(segment); |
6125 | } |
6126 | |
6127 | /** Requests for a slot in the aio array. If no slot is available, waits until |
6128 | not_full-event becomes signaled. |
6129 | |
6130 | @param[in] type IO context |
6131 | @param[in,out] m1 message to be passed along with the AIO |
6132 | operation |
6133 | @param[in,out] m2 message to be passed along with the AIO |
6134 | operation |
6135 | @param[in] file file handle |
6136 | @param[in] name name of the file or path as a NUL-terminated |
6137 | string |
6138 | @param[in,out] buf buffer where to read or from which to write |
6139 | @param[in] offset file offset, where to read from or start writing |
6140 | @param[in] len length of the block to read or write |
6141 | @return pointer to slot */ |
6142 | Slot* |
6143 | AIO::reserve_slot( |
6144 | const IORequest& type, |
6145 | fil_node_t* m1, |
6146 | void* m2, |
6147 | pfs_os_file_t file, |
6148 | const char* name, |
6149 | void* buf, |
6150 | os_offset_t offset, |
6151 | ulint len) |
6152 | { |
6153 | #ifdef WIN_ASYNC_IO |
6154 | ut_a((len & 0xFFFFFFFFUL) == len); |
6155 | #endif /* WIN_ASYNC_IO */ |
6156 | |
6157 | /* No need of a mutex. Only reading constant fields */ |
6158 | ulint slots_per_seg; |
6159 | |
6160 | ut_ad(type.validate()); |
6161 | |
6162 | slots_per_seg = slots_per_segment(); |
6163 | |
6164 | /* We attempt to keep adjacent blocks in the same local |
6165 | segment. This can help in merging IO requests when we are |
6166 | doing simulated AIO */ |
6167 | ulint local_seg; |
6168 | |
6169 | local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments; |
6170 | |
6171 | for (;;) { |
6172 | |
6173 | acquire(); |
6174 | |
6175 | if (m_n_reserved != m_slots.size()) { |
6176 | break; |
6177 | } |
6178 | |
6179 | release(); |
6180 | |
6181 | if (!srv_use_native_aio) { |
6182 | /* If the handler threads are suspended, |
6183 | wake them so that we get more slots */ |
6184 | |
6185 | os_aio_simulated_wake_handler_threads(); |
6186 | } |
6187 | |
6188 | os_event_wait(m_not_full); |
6189 | } |
6190 | |
6191 | ulint counter = 0; |
6192 | Slot* slot = NULL; |
6193 | |
6194 | /* We start our search for an available slot from our preferred |
6195 | local segment and do a full scan of the array. We are |
6196 | guaranteed to find a slot in full scan. */ |
6197 | for (ulint i = local_seg * slots_per_seg; |
6198 | counter < m_slots.size(); |
6199 | ++i, ++counter) { |
6200 | |
6201 | i %= m_slots.size(); |
6202 | |
6203 | slot = at(i); |
6204 | |
6205 | if (slot->is_reserved == false) { |
6206 | break; |
6207 | } |
6208 | } |
6209 | |
6210 | /* We MUST always be able to get hold of a reserved slot. */ |
6211 | ut_a(counter < m_slots.size()); |
6212 | |
6213 | ut_a(slot->is_reserved == false); |
6214 | |
6215 | ++m_n_reserved; |
6216 | |
6217 | if (m_n_reserved == 1) { |
6218 | os_event_reset(m_is_empty); |
6219 | } |
6220 | |
6221 | if (m_n_reserved == m_slots.size()) { |
6222 | os_event_reset(m_not_full); |
6223 | } |
6224 | |
6225 | slot->is_reserved = true; |
6226 | slot->reservation_time = ut_time(); |
6227 | slot->m1 = m1; |
6228 | slot->m2 = m2; |
6229 | slot->file = file; |
6230 | slot->name = name; |
6231 | #ifdef _WIN32 |
6232 | slot->len = static_cast<DWORD>(len); |
6233 | #else |
6234 | slot->len = static_cast<ulint>(len); |
6235 | #endif /* _WIN32 */ |
6236 | slot->type = type; |
6237 | slot->buf = static_cast<byte*>(buf); |
6238 | slot->ptr = slot->buf; |
6239 | slot->offset = offset; |
6240 | slot->err = DB_SUCCESS; |
6241 | slot->original_len = static_cast<uint32>(len); |
6242 | slot->io_already_done = false; |
6243 | slot->buf = static_cast<byte*>(buf); |
6244 | |
6245 | #ifdef WIN_ASYNC_IO |
6246 | { |
6247 | OVERLAPPED* control; |
6248 | |
6249 | control = &slot->control; |
6250 | control->Offset = (DWORD) offset & 0xFFFFFFFF; |
6251 | control->OffsetHigh = (DWORD) (offset >> 32); |
6252 | } |
6253 | #elif defined(LINUX_NATIVE_AIO) |
6254 | |
6255 | /* If we are not using native AIO skip this part. */ |
6256 | if (srv_use_native_aio) { |
6257 | |
6258 | off_t aio_offset; |
6259 | |
6260 | /* Check if we are dealing with 64 bit arch. |
6261 | If not then make sure that offset fits in 32 bits. */ |
6262 | aio_offset = (off_t) offset; |
6263 | |
6264 | ut_a(sizeof(aio_offset) >= sizeof(offset) |
6265 | || ((os_offset_t) aio_offset) == offset); |
6266 | |
6267 | struct iocb* iocb = &slot->control; |
6268 | |
6269 | if (type.is_read()) { |
6270 | |
6271 | io_prep_pread( |
6272 | iocb, file, slot->ptr, slot->len, aio_offset); |
6273 | } else { |
6274 | ut_ad(type.is_write()); |
6275 | |
6276 | io_prep_pwrite( |
6277 | iocb, file, slot->ptr, slot->len, aio_offset); |
6278 | } |
6279 | |
6280 | iocb->data = slot; |
6281 | |
6282 | slot->n_bytes = 0; |
6283 | slot->ret = 0; |
6284 | } |
6285 | #endif /* LINUX_NATIVE_AIO */ |
6286 | |
6287 | release(); |
6288 | |
6289 | return(slot); |
6290 | } |
6291 | |
6292 | /** Wakes up a simulated aio i/o-handler thread if it has something to do. |
6293 | @param[in] global_segment The number of the segment in the AIO arrays */ |
6294 | void |
6295 | AIO::wake_simulated_handler_thread(ulint global_segment) |
6296 | { |
6297 | ut_ad(!srv_use_native_aio); |
6298 | |
6299 | AIO* array; |
6300 | ulint segment = get_array_and_local_segment(&array, global_segment); |
6301 | |
6302 | array->wake_simulated_handler_thread(global_segment, segment); |
6303 | } |
6304 | |
6305 | /** Wakes up a simulated AIO I/O-handler thread if it has something to do |
6306 | for a local segment in the AIO array. |
6307 | @param[in] global_segment The number of the segment in the AIO arrays |
6308 | @param[in] segment The local segment in the AIO array */ |
6309 | void |
6310 | AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment) |
6311 | { |
6312 | ut_ad(!srv_use_native_aio); |
6313 | |
6314 | ulint n = slots_per_segment(); |
6315 | ulint offset = segment * n; |
6316 | |
6317 | /* Look through n slots after the segment * n'th slot */ |
6318 | |
6319 | acquire(); |
6320 | |
6321 | const Slot* slot = at(offset); |
6322 | |
6323 | for (ulint i = 0; i < n; ++i, ++slot) { |
6324 | |
6325 | if (slot->is_reserved) { |
6326 | |
6327 | /* Found an i/o request */ |
6328 | |
6329 | release(); |
6330 | |
6331 | os_event_t event; |
6332 | |
6333 | event = os_aio_segment_wait_events[global_segment]; |
6334 | |
6335 | os_event_set(event); |
6336 | |
6337 | return; |
6338 | } |
6339 | } |
6340 | |
6341 | release(); |
6342 | } |
6343 | |
6344 | /** Wakes up simulated aio i/o-handler threads if they have something to do. */ |
6345 | void |
6346 | os_aio_simulated_wake_handler_threads() |
6347 | { |
6348 | if (srv_use_native_aio) { |
6349 | /* We do not use simulated aio: do nothing */ |
6350 | |
6351 | return; |
6352 | } |
6353 | |
6354 | os_aio_recommend_sleep_for_read_threads = false; |
6355 | |
6356 | for (ulint i = 0; i < os_aio_n_segments; i++) { |
6357 | AIO::wake_simulated_handler_thread(i); |
6358 | } |
6359 | } |
6360 | |
6361 | /** Select the IO slot array |
6362 | @param[in,out] type Type of IO, READ or WRITE |
6363 | @param[in] read_only true if running in read-only mode |
6364 | @param[in] mode IO mode |
6365 | @return slot array or NULL if invalid mode specified */ |
6366 | AIO* |
6367 | AIO::select_slot_array(IORequest& type, bool read_only, ulint mode) |
6368 | { |
6369 | AIO* array; |
6370 | |
6371 | ut_ad(type.validate()); |
6372 | |
6373 | switch (mode) { |
6374 | case OS_AIO_NORMAL: |
6375 | |
6376 | array = type.is_read() ? AIO::s_reads : AIO::s_writes; |
6377 | break; |
6378 | |
6379 | case OS_AIO_IBUF: |
6380 | ut_ad(type.is_read()); |
6381 | |
6382 | /* Reduce probability of deadlock bugs in connection with ibuf: |
6383 | do not let the ibuf i/o handler sleep */ |
6384 | |
6385 | type.clear_do_not_wake(); |
6386 | |
6387 | array = read_only ? AIO::s_reads : AIO::s_ibuf; |
6388 | break; |
6389 | |
6390 | case OS_AIO_LOG: |
6391 | |
6392 | array = read_only ? AIO::s_reads : AIO::s_log; |
6393 | break; |
6394 | |
6395 | case OS_AIO_SYNC: |
6396 | |
6397 | array = AIO::s_sync; |
6398 | #if defined(LINUX_NATIVE_AIO) |
6399 | /* In Linux native AIO we don't use sync IO array. */ |
6400 | ut_a(!srv_use_native_aio); |
6401 | #endif /* LINUX_NATIVE_AIO */ |
6402 | break; |
6403 | |
6404 | default: |
6405 | ut_error; |
6406 | array = NULL; /* Eliminate compiler warning */ |
6407 | } |
6408 | |
6409 | return(array); |
6410 | } |
6411 | |
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment it wants to wait for.
NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; must not be
				ULINT_UNDEFINED
@param[in]	pos		not used by this function
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example; NULL when the shutdown key was
				received
@param[out]	m2		callback message; NULL when the shutdown key
				was received
@param[out]	type		IO context of the completed request; left
				untouched when the shutdown key was received
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot = NULL;
	BOOL		ret;
	ULONG_PTR	key;

	ut_a(segment != ULINT_UNDEFINED);

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	AIO*		my_array;
	AIO::get_array_and_local_segment(&my_array, segment);

	const HANDLE	port = my_array->m_completion_port;
	ut_ad(port);

	for (;;) {
		DWORD	len;

		ret = GetQueuedCompletionStatus(
			port, &len, &key, (OVERLAPPED**) &slot, INFINITE);

		if (ret && key == IOCP_SHUTDOWN_KEY) {
			/* Shutdown key received: repost it on the same
			port, then exit */
			PostQueuedCompletionStatus(port, 0, key, NULL);
			*m1 = NULL;
			*m2 = NULL;
			return(DB_SUCCESS);
		}

		ut_a(slot);

		if (!ret) {
			/* IO failed */
			break;
		}

		slot->n_bytes = len;
		ut_a(slot->array);

		const HANDLE	slot_port = slot->array->m_completion_port;

		if (slot_port == port) {
			/* The completion belongs to this port. */
			break;
		}

		/* there are no redirections between data and log */
		ut_ad(port == data_completion_port);
		ut_ad(slot_port != log_completion_port);

		/* Redirect completions to the dedicated completion port
		and threads.

		"Write array" threads receive write, read and ibuf
		notifications; read and ibuf completions are redirected.

		Forwarding IO completion this way costs a context switch,
		and this seems tolerable since asynchronous reads are by
		far less frequent. */
		ut_a(PostQueuedCompletionStatus(
			slot_port, len, key, &slot->control));
	}

	ut_a(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;
	*type = slot->type;

	dberr_t		err;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (!os_file_handle_error(slot->name, "Windows aio")) {

		err = DB_IO_ERROR;

	} else {
		/* Retry failed read/write operation synchronously. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		PSI_file_locker_state	state;
		struct PSI_file_locker* locker = NULL;

		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	slot->array->release_with_mutex(slot);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && !buf_page_cleaner_is_active
	    && os_aio_all_slots_free()) {
		/* Last IO, wakeup other io threads */
		AIO::wake_at_shutdown();
	}

	return(err);
}
#endif /* WIN_ASYNC_IO */
6577 | |
6578 | /** |
6579 | NOTE! Use the corresponding macro os_aio(), not directly this function! |
6580 | Requests an asynchronous i/o operation. |
6581 | @param[in,out] type IO request context |
6582 | @param[in] mode IO mode |
6583 | @param[in] name Name of the file or path as NUL terminated |
6584 | string |
6585 | @param[in] file Open file handle |
6586 | @param[out] buf buffer where to read |
6587 | @param[in] offset file offset where to read |
6588 | @param[in] n number of bytes to read |
6589 | @param[in] read_only if true read only mode checks are enforced |
6590 | @param[in,out] m1 Message for the AIO handler, (can be used to |
6591 | identify a completed AIO operation); ignored |
6592 | if mode is OS_AIO_SYNC |
6593 | @param[in,out] m2 message for the AIO handler (can be used to |
6594 | identify a completed AIO operation); ignored |
6595 | if mode is OS_AIO_SYNC |
6596 | |
6597 | @return DB_SUCCESS or error code */ |
6598 | dberr_t |
6599 | os_aio_func( |
6600 | IORequest& type, |
6601 | ulint mode, |
6602 | const char* name, |
6603 | pfs_os_file_t file, |
6604 | void* buf, |
6605 | os_offset_t offset, |
6606 | ulint n, |
6607 | bool read_only, |
6608 | fil_node_t* m1, |
6609 | void* m2) |
6610 | { |
6611 | #ifdef WIN_ASYNC_IO |
6612 | BOOL ret = TRUE; |
6613 | #endif /* WIN_ASYNC_IO */ |
6614 | |
6615 | ut_ad(n > 0); |
6616 | ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0); |
6617 | ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0); |
6618 | ut_ad(os_aio_validate_skip()); |
6619 | |
6620 | #ifdef WIN_ASYNC_IO |
6621 | ut_ad((n & 0xFFFFFFFFUL) == n); |
6622 | #endif /* WIN_ASYNC_IO */ |
6623 | |
6624 | DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28" , |
6625 | mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;); |
6626 | |
6627 | if (mode == OS_AIO_SYNC) { |
6628 | if (type.is_read()) { |
6629 | return(os_file_read_func(type, file, buf, offset, n)); |
6630 | } |
6631 | |
6632 | ut_ad(type.is_write()); |
6633 | |
6634 | return(os_file_write_func(type, name, file, buf, offset, n)); |
6635 | } |
6636 | |
6637 | try_again: |
6638 | |
6639 | AIO* array; |
6640 | |
6641 | array = AIO::select_slot_array(type, read_only, mode); |
6642 | |
6643 | Slot* slot; |
6644 | |
6645 | slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n); |
6646 | |
6647 | if (type.is_read()) { |
6648 | |
6649 | |
6650 | if (srv_use_native_aio) { |
6651 | |
6652 | ++os_n_file_reads; |
6653 | |
6654 | os_bytes_read_since_printout += n; |
6655 | #ifdef WIN_ASYNC_IO |
6656 | ret = ReadFile( |
6657 | file, slot->ptr, slot->len, |
6658 | NULL, &slot->control); |
6659 | #elif defined(LINUX_NATIVE_AIO) |
6660 | if (!array->linux_dispatch(slot)) { |
6661 | goto err_exit; |
6662 | } |
6663 | #endif /* WIN_ASYNC_IO */ |
6664 | } else if (type.is_wake()) { |
6665 | AIO::wake_simulated_handler_thread( |
6666 | AIO::get_segment_no_from_slot(array, slot)); |
6667 | } |
6668 | } else if (type.is_write()) { |
6669 | |
6670 | if (srv_use_native_aio) { |
6671 | ++os_n_file_writes; |
6672 | |
6673 | #ifdef WIN_ASYNC_IO |
6674 | ret = WriteFile( |
6675 | file, slot->ptr, slot->len, |
6676 | NULL, &slot->control); |
6677 | #elif defined(LINUX_NATIVE_AIO) |
6678 | if (!array->linux_dispatch(slot)) { |
6679 | goto err_exit; |
6680 | } |
6681 | #endif /* WIN_ASYNC_IO */ |
6682 | |
6683 | } else if (type.is_wake()) { |
6684 | AIO::wake_simulated_handler_thread( |
6685 | AIO::get_segment_no_from_slot(array, slot)); |
6686 | } |
6687 | } else { |
6688 | ut_error; |
6689 | } |
6690 | |
6691 | #ifdef WIN_ASYNC_IO |
6692 | if (ret || (GetLastError() == ERROR_IO_PENDING)) { |
6693 | /* aio completed or was queued successfully! */ |
6694 | return(DB_SUCCESS); |
6695 | } |
6696 | |
6697 | goto err_exit; |
6698 | |
6699 | #endif /* WIN_ASYNC_IO */ |
6700 | |
6701 | /* AIO request was queued successfully! */ |
6702 | return(DB_SUCCESS); |
6703 | |
6704 | #if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO |
6705 | err_exit: |
6706 | #endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */ |
6707 | |
6708 | array->release_with_mutex(slot); |
6709 | |
6710 | if (os_file_handle_error( |
6711 | name, type.is_read() ? "aio read" : "aio write" )) { |
6712 | |
6713 | goto try_again; |
6714 | } |
6715 | |
6716 | return(DB_IO_ERROR); |
6717 | } |
6718 | |
6719 | /** Simulated AIO handler for reaping IO requests */ |
6720 | class SimulatedAIOHandler { |
6721 | |
6722 | public: |
6723 | |
6724 | /** Constructor |
6725 | @param[in,out] array The AIO array |
6726 | @param[in] segment Local segment in the array */ |
6727 | SimulatedAIOHandler(AIO* array, ulint segment) |
6728 | : |
6729 | m_oldest(), |
6730 | m_n_elems(), |
6731 | m_lowest_offset(IB_UINT64_MAX), |
6732 | m_array(array), |
6733 | m_n_slots(), |
6734 | m_segment(segment), |
6735 | m_ptr(), |
6736 | m_buf() |
6737 | { |
6738 | ut_ad(m_segment < 100); |
6739 | |
6740 | m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE); |
6741 | } |
6742 | |
6743 | /** Destructor */ |
6744 | ~SimulatedAIOHandler() |
6745 | { |
6746 | if (m_ptr != NULL) { |
6747 | ut_free(m_ptr); |
6748 | } |
6749 | } |
6750 | |
6751 | /** Reset the state of the handler |
6752 | @param[in] n_slots Number of pending AIO operations supported */ |
6753 | void init(ulint n_slots) |
6754 | { |
6755 | m_oldest = 0; |
6756 | m_n_elems = 0; |
6757 | m_n_slots = n_slots; |
6758 | m_lowest_offset = IB_UINT64_MAX; |
6759 | |
6760 | if (m_ptr != NULL) { |
6761 | ut_free(m_ptr); |
6762 | m_ptr = m_buf = NULL; |
6763 | } |
6764 | |
6765 | m_slots[0] = NULL; |
6766 | } |
6767 | |
6768 | /** Check if there is a slot for which the i/o has already been done |
6769 | @param[out] n_reserved Number of reserved slots |
6770 | @return the first completed slot that is found. */ |
6771 | Slot* check_completed(ulint* n_reserved) |
6772 | { |
6773 | ulint offset = m_segment * m_n_slots; |
6774 | |
6775 | *n_reserved = 0; |
6776 | |
6777 | Slot* slot; |
6778 | |
6779 | slot = m_array->at(offset); |
6780 | |
6781 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
6782 | |
6783 | if (slot->is_reserved) { |
6784 | |
6785 | if (slot->io_already_done) { |
6786 | |
6787 | ut_a(slot->is_reserved); |
6788 | |
6789 | return(slot); |
6790 | } |
6791 | |
6792 | ++*n_reserved; |
6793 | } |
6794 | } |
6795 | |
6796 | return(NULL); |
6797 | } |
6798 | |
6799 | /** If there are at least 2 seconds old requests, then pick the |
6800 | oldest one to prevent starvation. If several requests have the |
6801 | same age, then pick the one at the lowest offset. |
6802 | @return true if request was selected */ |
6803 | bool select() |
6804 | { |
6805 | if (!select_oldest()) { |
6806 | |
6807 | return(select_lowest_offset()); |
6808 | } |
6809 | |
6810 | return(true); |
6811 | } |
6812 | |
6813 | /** Check if there are several consecutive blocks |
6814 | to read or write. Merge them if found. */ |
6815 | void merge() |
6816 | { |
6817 | /* if m_n_elems != 0, then we have assigned |
6818 | something valid to consecutive_ios[0] */ |
6819 | ut_ad(m_n_elems != 0); |
6820 | ut_ad(first_slot() != NULL); |
6821 | |
6822 | Slot* slot = first_slot(); |
6823 | |
6824 | while (!merge_adjacent(slot)) { |
6825 | /* No op */ |
6826 | } |
6827 | } |
6828 | |
6829 | /** We have now collected n_consecutive I/O requests |
6830 | in the array; allocate a single buffer which can hold |
6831 | all data, and perform the I/O |
6832 | @return the length of the buffer */ |
6833 | ulint allocate_buffer() |
6834 | MY_ATTRIBUTE((warn_unused_result)) |
6835 | { |
6836 | ulint len; |
6837 | Slot* slot = first_slot(); |
6838 | |
6839 | ut_ad(m_ptr == NULL); |
6840 | |
6841 | if (slot->type.is_read() && m_n_elems > 1) { |
6842 | |
6843 | len = 0; |
6844 | |
6845 | for (ulint i = 0; i < m_n_elems; ++i) { |
6846 | len += m_slots[i]->len; |
6847 | } |
6848 | |
6849 | m_ptr = static_cast<byte*>( |
6850 | ut_malloc_nokey(len + srv_page_size)); |
6851 | |
6852 | m_buf = static_cast<byte*>( |
6853 | ut_align(m_ptr, srv_page_size)); |
6854 | |
6855 | } else { |
6856 | len = first_slot()->len; |
6857 | m_buf = first_slot()->buf; |
6858 | } |
6859 | |
6860 | return(len); |
6861 | } |
6862 | |
6863 | /** We have to compress the individual pages and punch |
6864 | holes in them on a page by page basis when writing to |
6865 | tables that can be compresed at the IO level. |
6866 | @param[in] len Value returned by allocate_buffer */ |
6867 | void copy_to_buffer(ulint len) |
6868 | { |
6869 | Slot* slot = first_slot(); |
6870 | |
6871 | if (len > slot->len && slot->type.is_write()) { |
6872 | |
6873 | byte* ptr = m_buf; |
6874 | |
6875 | ut_ad(ptr != slot->buf); |
6876 | |
6877 | /* Copy the buffers to the combined buffer */ |
6878 | for (ulint i = 0; i < m_n_elems; ++i) { |
6879 | |
6880 | slot = m_slots[i]; |
6881 | |
6882 | memmove(ptr, slot->buf, slot->len); |
6883 | |
6884 | ptr += slot->len; |
6885 | } |
6886 | } |
6887 | } |
6888 | |
6889 | /** Do the I/O with ordinary, synchronous i/o functions: |
6890 | @param[in] len Length of buffer for IO */ |
6891 | void io() |
6892 | { |
6893 | if (first_slot()->type.is_write()) { |
6894 | |
6895 | for (ulint i = 0; i < m_n_elems; ++i) { |
6896 | write(m_slots[i]); |
6897 | } |
6898 | |
6899 | } else { |
6900 | |
6901 | for (ulint i = 0; i < m_n_elems; ++i) { |
6902 | read(m_slots[i]); |
6903 | } |
6904 | } |
6905 | } |
6906 | |
6907 | /** Mark the i/os done in slots */ |
6908 | void done() |
6909 | { |
6910 | for (ulint i = 0; i < m_n_elems; ++i) { |
6911 | m_slots[i]->io_already_done = true; |
6912 | } |
6913 | } |
6914 | |
6915 | /** @return the first slot in the consecutive array */ |
6916 | Slot* first_slot() |
6917 | MY_ATTRIBUTE((warn_unused_result)) |
6918 | { |
6919 | ut_a(m_n_elems > 0); |
6920 | |
6921 | return(m_slots[0]); |
6922 | } |
6923 | |
6924 | /** Wait for I/O requests |
6925 | @param[in] global_segment The global segment |
6926 | @param[in,out] event Wait on event if no active requests |
6927 | @return the number of slots */ |
6928 | ulint check_pending( |
6929 | ulint global_segment, |
6930 | os_event_t event) |
6931 | MY_ATTRIBUTE((warn_unused_result)); |
6932 | private: |
6933 | |
6934 | /** Do the file read |
6935 | @param[in,out] slot Slot that has the IO context */ |
6936 | void read(Slot* slot) |
6937 | { |
6938 | dberr_t err = os_file_read( |
6939 | slot->type, |
6940 | slot->file, |
6941 | slot->ptr, |
6942 | slot->offset, |
6943 | slot->len); |
6944 | |
6945 | ut_a(err == DB_SUCCESS); |
6946 | } |
6947 | |
6948 | /** Do the file read |
6949 | @param[in,out] slot Slot that has the IO context */ |
6950 | void write(Slot* slot) |
6951 | { |
6952 | dberr_t err = os_file_write( |
6953 | slot->type, |
6954 | slot->name, |
6955 | slot->file, |
6956 | slot->ptr, |
6957 | slot->offset, |
6958 | slot->len); |
6959 | |
6960 | ut_a(err == DB_SUCCESS); |
6961 | } |
6962 | |
6963 | /** @return true if the slots are adjacent and can be merged */ |
6964 | bool adjacent(const Slot* s1, const Slot* s2) const |
6965 | { |
6966 | return(s1 != s2 |
6967 | && s1->file == s2->file |
6968 | && s2->offset == s1->offset + s1->len |
6969 | && s1->type == s2->type); |
6970 | } |
6971 | |
6972 | /** @return true if merge limit reached or no adjacent slots found. */ |
6973 | bool merge_adjacent(Slot*& current) |
6974 | { |
6975 | Slot* slot; |
6976 | ulint offset = m_segment * m_n_slots; |
6977 | |
6978 | slot = m_array->at(offset); |
6979 | |
6980 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
6981 | |
6982 | if (slot->is_reserved && adjacent(current, slot)) { |
6983 | |
6984 | current = slot; |
6985 | |
6986 | /* Found a consecutive i/o request */ |
6987 | |
6988 | m_slots[m_n_elems] = slot; |
6989 | |
6990 | ++m_n_elems; |
6991 | |
6992 | return(m_n_elems >= m_slots.capacity()); |
6993 | } |
6994 | } |
6995 | |
6996 | return(true); |
6997 | } |
6998 | |
6999 | /** There were no old requests. Look for an I/O request at the lowest |
7000 | offset in the array (we ignore the high 32 bits of the offset in these |
7001 | heuristics) */ |
7002 | bool select_lowest_offset() |
7003 | { |
7004 | ut_ad(m_n_elems == 0); |
7005 | |
7006 | ulint offset = m_segment * m_n_slots; |
7007 | |
7008 | m_lowest_offset = IB_UINT64_MAX; |
7009 | |
7010 | for (ulint i = 0; i < m_n_slots; ++i) { |
7011 | Slot* slot; |
7012 | |
7013 | slot = m_array->at(i + offset); |
7014 | |
7015 | if (slot->is_reserved |
7016 | && slot->offset < m_lowest_offset) { |
7017 | |
7018 | /* Found an i/o request */ |
7019 | m_slots[0] = slot; |
7020 | |
7021 | m_n_elems = 1; |
7022 | |
7023 | m_lowest_offset = slot->offset; |
7024 | } |
7025 | } |
7026 | |
7027 | return(m_n_elems > 0); |
7028 | } |
7029 | |
7030 | /** Select the slot if it is older than the current oldest slot. |
7031 | @param[in] slot The slot to check */ |
7032 | void select_if_older(Slot* slot) |
7033 | { |
7034 | ulint age; |
7035 | |
7036 | age = (ulint) difftime(ut_time(), slot->reservation_time); |
7037 | |
7038 | if ((age >= 2 && age > m_oldest) |
7039 | || (age >= 2 |
7040 | && age == m_oldest |
7041 | && slot->offset < m_lowest_offset)) { |
7042 | |
7043 | /* Found an i/o request */ |
7044 | m_slots[0] = slot; |
7045 | |
7046 | m_n_elems = 1; |
7047 | |
7048 | m_oldest = age; |
7049 | |
7050 | m_lowest_offset = slot->offset; |
7051 | } |
7052 | } |
7053 | |
7054 | /** Select th oldest slot in the array |
7055 | @return true if oldest slot found */ |
7056 | bool select_oldest() |
7057 | { |
7058 | ut_ad(m_n_elems == 0); |
7059 | |
7060 | Slot* slot; |
7061 | ulint offset = m_n_slots * m_segment; |
7062 | |
7063 | slot = m_array->at(offset); |
7064 | |
7065 | for (ulint i = 0; i < m_n_slots; ++i, ++slot) { |
7066 | |
7067 | if (slot->is_reserved) { |
7068 | select_if_older(slot); |
7069 | } |
7070 | } |
7071 | |
7072 | return(m_n_elems > 0); |
7073 | } |
7074 | |
7075 | typedef std::vector<Slot*> slots_t; |
7076 | |
7077 | private: |
7078 | ulint m_oldest; |
7079 | ulint m_n_elems; |
7080 | os_offset_t m_lowest_offset; |
7081 | |
7082 | AIO* m_array; |
7083 | ulint m_n_slots; |
7084 | ulint m_segment; |
7085 | |
7086 | slots_t m_slots; |
7087 | |
7088 | byte* m_ptr; |
7089 | byte* m_buf; |
7090 | }; |
7091 | |
7092 | /** Wait for I/O requests |
7093 | @return the number of slots */ |
7094 | ulint |
7095 | SimulatedAIOHandler::check_pending( |
7096 | ulint global_segment, |
7097 | os_event_t event) |
7098 | { |
7099 | /* NOTE! We only access constant fields in os_aio_array. |
7100 | Therefore we do not have to acquire the protecting mutex yet */ |
7101 | |
7102 | ut_ad(os_aio_validate_skip()); |
7103 | |
7104 | ut_ad(m_segment < m_array->get_n_segments()); |
7105 | |
7106 | /* Look through n slots after the segment * n'th slot */ |
7107 | |
7108 | if (AIO::is_read(m_array) |
7109 | && os_aio_recommend_sleep_for_read_threads) { |
7110 | |
7111 | /* Give other threads chance to add several |
7112 | I/Os to the array at once. */ |
7113 | |
7114 | srv_set_io_thread_op_info( |
7115 | global_segment, "waiting for i/o request" ); |
7116 | |
7117 | os_event_wait(event); |
7118 | |
7119 | return(0); |
7120 | } |
7121 | |
7122 | return(m_array->slots_per_segment()); |
7123 | } |
7124 | |
7125 | /** Does simulated AIO. This function should be called by an i/o-handler |
7126 | thread. |
7127 | |
@param[in]	global_segment	The number of the segment in the aio arrays to wait
7129 | for; segment 0 is the ibuf i/o thread, segment 1 the |
7130 | log i/o thread, then follow the non-ibuf read threads, |
7131 | and as the last are the non-ibuf write threads |
7132 | @param[out] m1 the messages passed with the AIO request; note that |
7133 | also in the case where the AIO operation failed, these |
7134 | output parameters are valid and can be used to restart |
7135 | the operation, for example |
7136 | @param[out] m2 Callback argument |
7137 | @param[in] type IO context |
7138 | @return DB_SUCCESS or error code */ |
7139 | static |
7140 | dberr_t |
7141 | os_aio_simulated_handler( |
7142 | ulint global_segment, |
7143 | fil_node_t** m1, |
7144 | void** m2, |
7145 | IORequest* type) |
7146 | { |
7147 | Slot* slot; |
7148 | AIO* array; |
7149 | ulint segment; |
7150 | os_event_t event = os_aio_segment_wait_events[global_segment]; |
7151 | |
7152 | segment = AIO::get_array_and_local_segment(&array, global_segment); |
7153 | |
7154 | SimulatedAIOHandler handler(array, segment); |
7155 | |
7156 | for (;;) { |
7157 | |
7158 | srv_set_io_thread_op_info( |
7159 | global_segment, "looking for i/o requests (a)" ); |
7160 | |
7161 | ulint n_slots = handler.check_pending(global_segment, event); |
7162 | |
7163 | if (n_slots == 0) { |
7164 | continue; |
7165 | } |
7166 | |
7167 | handler.init(n_slots); |
7168 | |
7169 | srv_set_io_thread_op_info( |
7170 | global_segment, "looking for i/o requests (b)" ); |
7171 | |
7172 | array->acquire(); |
7173 | |
7174 | ulint n_reserved; |
7175 | |
7176 | slot = handler.check_completed(&n_reserved); |
7177 | |
7178 | if (slot != NULL) { |
7179 | |
7180 | break; |
7181 | |
7182 | } else if (n_reserved == 0 |
7183 | && !buf_page_cleaner_is_active |
7184 | && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { |
7185 | |
7186 | /* There is no completed request. If there |
7187 | are no pending request at all, and the system |
7188 | is being shut down, exit. */ |
7189 | |
7190 | array->release(); |
7191 | |
7192 | *m1 = NULL; |
7193 | |
7194 | *m2 = NULL; |
7195 | |
7196 | return(DB_SUCCESS); |
7197 | |
7198 | } else if (handler.select()) { |
7199 | |
7200 | break; |
7201 | } |
7202 | |
7203 | /* No I/O requested at the moment */ |
7204 | |
7205 | srv_set_io_thread_op_info( |
7206 | global_segment, "resetting wait event" ); |
7207 | |
7208 | /* We wait here until tbere are more IO requests |
7209 | for this segment. */ |
7210 | |
7211 | os_event_reset(event); |
7212 | |
7213 | array->release(); |
7214 | |
7215 | srv_set_io_thread_op_info( |
7216 | global_segment, "waiting for i/o request" ); |
7217 | |
7218 | os_event_wait(event); |
7219 | } |
7220 | |
7221 | /** Found a slot that has already completed its IO */ |
7222 | |
7223 | if (slot == NULL) { |
7224 | /* Merge adjacent requests */ |
7225 | handler.merge(); |
7226 | |
7227 | /* Check if there are several consecutive blocks |
7228 | to read or write */ |
7229 | |
7230 | srv_set_io_thread_op_info( |
7231 | global_segment, "consecutive i/o requests" ); |
7232 | |
7233 | // Note: We don't support write combining for simulated AIO. |
7234 | //ulint total_len = handler.allocate_buffer(); |
7235 | |
7236 | /* We release the array mutex for the time of the I/O: NOTE that |
7237 | this assumes that there is just one i/o-handler thread serving |
7238 | a single segment of slots! */ |
7239 | |
7240 | array->release(); |
7241 | |
7242 | // Note: We don't support write combining for simulated AIO. |
7243 | //handler.copy_to_buffer(total_len); |
7244 | |
7245 | srv_set_io_thread_op_info(global_segment, "doing file i/o" ); |
7246 | |
7247 | handler.io(); |
7248 | |
7249 | srv_set_io_thread_op_info(global_segment, "file i/o done" ); |
7250 | |
7251 | array->acquire(); |
7252 | |
7253 | handler.done(); |
7254 | |
7255 | /* We return the messages for the first slot now, and if there |
7256 | were several slots, the messages will be returned with |
7257 | subsequent calls of this function */ |
7258 | |
7259 | slot = handler.first_slot(); |
7260 | } |
7261 | |
7262 | ut_ad(slot->is_reserved); |
7263 | |
7264 | *m1 = slot->m1; |
7265 | *m2 = slot->m2; |
7266 | |
7267 | *type = slot->type; |
7268 | |
7269 | array->release(slot); |
7270 | |
7271 | array->release(); |
7272 | |
7273 | return(DB_SUCCESS); |
7274 | } |
7275 | |
7276 | /** Get the total number of pending IOs |
7277 | @return the total number of pending IOs */ |
7278 | ulint |
7279 | AIO::total_pending_io_count() |
7280 | { |
7281 | ulint count = s_reads->pending_io_count(); |
7282 | |
7283 | if (s_writes != NULL) { |
7284 | count += s_writes->pending_io_count(); |
7285 | } |
7286 | |
7287 | if (s_ibuf != NULL) { |
7288 | count += s_ibuf->pending_io_count(); |
7289 | } |
7290 | |
7291 | if (s_log != NULL) { |
7292 | count += s_log->pending_io_count(); |
7293 | } |
7294 | |
7295 | if (s_sync != NULL) { |
7296 | count += s_sync->pending_io_count(); |
7297 | } |
7298 | |
7299 | return(count); |
7300 | } |
7301 | |
7302 | /** Validates the consistency the aio system. |
7303 | @return true if ok */ |
7304 | static |
7305 | bool |
7306 | os_aio_validate() |
7307 | { |
7308 | /* The methods countds and validates, we ignore the count. */ |
7309 | AIO::total_pending_io_count(); |
7310 | |
7311 | return(true); |
7312 | } |
7313 | |
7314 | /** Prints pending IO requests per segment of an aio array. |
7315 | We probably don't need per segment statistics but they can help us |
7316 | during development phase to see if the IO requests are being |
7317 | distributed as expected. |
7318 | @param[in,out] file File where to print |
7319 | @param[in] segments Pending IO array */ |
7320 | void |
7321 | AIO::print_segment_info( |
7322 | FILE* file, |
7323 | const ulint* segments) |
7324 | { |
7325 | ut_ad(m_n_segments > 0); |
7326 | |
7327 | if (m_n_segments > 1) { |
7328 | |
7329 | fprintf(file, " [" ); |
7330 | |
7331 | for (ulint i = 0; i < m_n_segments; ++i, ++segments) { |
7332 | |
7333 | if (i != 0) { |
7334 | fprintf(file, ", " ); |
7335 | } |
7336 | |
7337 | fprintf(file, ULINTPF, *segments); |
7338 | } |
7339 | |
7340 | fprintf(file, "] " ); |
7341 | } |
7342 | } |
7343 | |
7344 | /** Prints info about the aio array. |
7345 | @param[in,out] file Where to print */ |
7346 | void |
7347 | AIO::print(FILE* file) |
7348 | { |
7349 | ulint count = 0; |
7350 | ulint n_res_seg[SRV_MAX_N_IO_THREADS]; |
7351 | |
7352 | mutex_enter(&m_mutex); |
7353 | |
7354 | ut_a(!m_slots.empty()); |
7355 | ut_a(m_n_segments > 0); |
7356 | |
7357 | memset(n_res_seg, 0x0, sizeof(n_res_seg)); |
7358 | |
7359 | for (ulint i = 0; i < m_slots.size(); ++i) { |
7360 | Slot& slot = m_slots[i]; |
7361 | ulint segment = (i * m_n_segments) / m_slots.size(); |
7362 | |
7363 | if (slot.is_reserved) { |
7364 | |
7365 | ++count; |
7366 | |
7367 | ++n_res_seg[segment]; |
7368 | |
7369 | ut_a(slot.len > 0); |
7370 | } |
7371 | } |
7372 | |
7373 | ut_a(m_n_reserved == count); |
7374 | |
7375 | print_segment_info(file, n_res_seg); |
7376 | |
7377 | mutex_exit(&m_mutex); |
7378 | } |
7379 | |
7380 | /** Print all the AIO segments |
7381 | @param[in,out] file Where to print */ |
7382 | void |
7383 | AIO::print_all(FILE* file) |
7384 | { |
7385 | s_reads->print(file); |
7386 | |
7387 | if (s_writes != NULL) { |
7388 | fputs(", aio writes:" , file); |
7389 | s_writes->print(file); |
7390 | } |
7391 | |
7392 | if (s_ibuf != NULL) { |
7393 | fputs(",\n ibuf aio reads:" , file); |
7394 | s_ibuf->print(file); |
7395 | } |
7396 | |
7397 | if (s_log != NULL) { |
7398 | fputs(", log i/o's:" , file); |
7399 | s_log->print(file); |
7400 | } |
7401 | |
7402 | if (s_sync != NULL) { |
7403 | fputs(", sync i/o's:" , file); |
7404 | s_sync->print(file); |
7405 | } |
7406 | } |
7407 | |
7408 | /** Prints info of the aio arrays. |
7409 | @param[in,out] file file where to print */ |
7410 | void |
7411 | os_aio_print(FILE* file) |
7412 | { |
7413 | time_t current_time; |
7414 | double time_elapsed; |
7415 | double avg_bytes_read; |
7416 | |
7417 | for (ulint i = 0; i < srv_n_file_io_threads; ++i) { |
7418 | fprintf(file, "I/O thread " ULINTPF " state: %s (%s)" , |
7419 | i, |
7420 | srv_io_thread_op_info[i], |
7421 | srv_io_thread_function[i]); |
7422 | |
7423 | #ifndef _WIN32 |
7424 | if (!srv_use_native_aio |
7425 | && os_event_is_set(os_aio_segment_wait_events[i])) { |
7426 | fprintf(file, " ev set" ); |
7427 | } |
7428 | #endif /* _WIN32 */ |
7429 | |
7430 | fprintf(file, "\n" ); |
7431 | } |
7432 | |
7433 | fputs("Pending normal aio reads:" , file); |
7434 | |
7435 | AIO::print_all(file); |
7436 | |
7437 | putc('\n', file); |
7438 | current_time = ut_time(); |
7439 | time_elapsed = 0.001 + difftime(current_time, os_last_printout); |
7440 | |
7441 | fprintf(file, |
7442 | "Pending flushes (fsync) log: " ULINTPF |
7443 | "; buffer pool: " ULINTPF "\n" |
7444 | ULINTPF " OS file reads, " |
7445 | ULINTPF " OS file writes, " |
7446 | ULINTPF " OS fsyncs\n" , |
7447 | fil_n_pending_log_flushes, |
7448 | fil_n_pending_tablespace_flushes, |
7449 | os_n_file_reads, |
7450 | os_n_file_writes, |
7451 | os_n_fsyncs); |
7452 | |
7453 | const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS)); |
7454 | const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); |
7455 | |
7456 | if (n_reads != 0 || n_writes != 0) { |
7457 | fprintf(file, |
7458 | ULINTPF " pending reads, " ULINTPF " pending writes\n" , |
7459 | n_reads, n_writes); |
7460 | } |
7461 | |
7462 | if (os_n_file_reads == os_n_file_reads_old) { |
7463 | avg_bytes_read = 0.0; |
7464 | } else { |
7465 | avg_bytes_read = (double) os_bytes_read_since_printout |
7466 | / (os_n_file_reads - os_n_file_reads_old); |
7467 | } |
7468 | |
7469 | fprintf(file, |
7470 | "%.2f reads/s, " ULINTPF " avg bytes/read," |
7471 | " %.2f writes/s, %.2f fsyncs/s\n" , |
7472 | (os_n_file_reads - os_n_file_reads_old) |
7473 | / time_elapsed, |
7474 | (ulint) avg_bytes_read, |
7475 | (os_n_file_writes - os_n_file_writes_old) |
7476 | / time_elapsed, |
7477 | (os_n_fsyncs - os_n_fsyncs_old) |
7478 | / time_elapsed); |
7479 | |
7480 | os_n_file_reads_old = os_n_file_reads; |
7481 | os_n_file_writes_old = os_n_file_writes; |
7482 | os_n_fsyncs_old = os_n_fsyncs; |
7483 | os_bytes_read_since_printout = 0; |
7484 | |
7485 | os_last_printout = current_time; |
7486 | } |
7487 | |
7488 | /** Refreshes the statistics used to print per-second averages. */ |
7489 | void |
7490 | os_aio_refresh_stats() |
7491 | { |
7492 | os_n_fsyncs_old = os_n_fsyncs; |
7493 | |
7494 | os_bytes_read_since_printout = 0; |
7495 | |
7496 | os_n_file_reads_old = os_n_file_reads; |
7497 | |
7498 | os_n_file_writes_old = os_n_file_writes; |
7499 | |
7500 | os_n_fsyncs_old = os_n_fsyncs; |
7501 | |
7502 | os_bytes_read_since_printout = 0; |
7503 | |
7504 | os_last_printout = ut_time(); |
7505 | } |
7506 | |
7507 | /** Checks that all slots in the system have been freed, that is, there are |
7508 | no pending io operations. |
7509 | @return true if all free */ |
7510 | bool |
7511 | os_aio_all_slots_free() |
7512 | { |
7513 | return(AIO::total_pending_io_count() == 0); |
7514 | } |
7515 | |
7516 | #ifdef UNIV_DEBUG |
7517 | /** Prints all pending IO for the array |
7518 | @param[in] file file where to print |
7519 | @param[in] array array to process */ |
7520 | void |
7521 | AIO::to_file(FILE* file) const |
7522 | { |
7523 | acquire(); |
7524 | |
7525 | fprintf(file, " " ULINTPF "\n" , m_n_reserved); |
7526 | |
7527 | for (ulint i = 0; i < m_slots.size(); ++i) { |
7528 | |
7529 | const Slot& slot = m_slots[i]; |
7530 | |
7531 | if (slot.is_reserved) { |
7532 | |
7533 | fprintf(file, |
7534 | "%s IO for %s (offset=" UINT64PF |
7535 | ", size=%lu)\n" , |
7536 | slot.type.is_read() ? "read" : "write" , |
7537 | slot.name, slot.offset, (unsigned long)(slot.len)); |
7538 | } |
7539 | } |
7540 | |
7541 | release(); |
7542 | } |
7543 | |
7544 | /** Print pending IOs for all arrays */ |
7545 | void |
7546 | AIO::print_to_file(FILE* file) |
7547 | { |
7548 | fprintf(file, "Pending normal aio reads:" ); |
7549 | |
7550 | s_reads->to_file(file); |
7551 | |
7552 | if (s_writes != NULL) { |
7553 | fprintf(file, "Pending normal aio writes:" ); |
7554 | s_writes->to_file(file); |
7555 | } |
7556 | |
7557 | if (s_ibuf != NULL) { |
7558 | fprintf(file, "Pending ibuf aio reads:" ); |
7559 | s_ibuf->to_file(file); |
7560 | } |
7561 | |
7562 | if (s_log != NULL) { |
7563 | fprintf(file, "Pending log i/o's:" ); |
7564 | s_log->to_file(file); |
7565 | } |
7566 | |
7567 | if (s_sync != NULL) { |
7568 | fprintf(file, "Pending sync i/o's:" ); |
7569 | s_sync->to_file(file); |
7570 | } |
7571 | } |
7572 | |
7573 | /** Prints all pending IO |
7574 | @param[in] file File where to print */ |
7575 | void |
7576 | os_aio_print_pending_io( |
7577 | FILE* file) |
7578 | { |
7579 | AIO::print_to_file(file); |
7580 | } |
7581 | |
7582 | #endif /* UNIV_DEBUG */ |
7583 | |
7584 | /** |
7585 | Set the file create umask |
7586 | @param[in] umask The umask to use for file creation. */ |
7587 | void |
7588 | os_file_set_umask(ulint umask) |
7589 | { |
7590 | os_innodb_umask = umask; |
7591 | } |
7592 | |
7593 | #else |
7594 | #include "univ.i" |
7595 | #endif /* !UNIV_INNOCHECKSUM */ |
7596 | |
7597 | /** Normalizes a directory path for the current OS: |
7598 | On Windows, we convert '/' to '\', else we convert '\' to '/'. |
7599 | @param[in,out] str A null-terminated directory and file path */ |
7600 | void |
7601 | os_normalize_path( |
7602 | char* str) |
7603 | { |
7604 | if (str != NULL) { |
7605 | for (; *str; str++) { |
7606 | if (*str == OS_PATH_SEPARATOR_ALT) { |
7607 | *str = OS_PATH_SEPARATOR; |
7608 | } |
7609 | } |
7610 | } |
7611 | } |
7612 | |