1/***********************************************************************
2
3Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2009, Percona Inc.
5Copyright (c) 2013, 2018, MariaDB Corporation.
6
7Portions of this file contain modifications contributed and copyrighted
8by Percona Inc.. Those modifications are
9gratefully acknowledged and are described briefly in the InnoDB
10documentation. The contributions by Percona Inc. are incorporated with
11their permission, and subject to the conditions contained in the file
12COPYING.Percona.
13
14This program is free software; you can redistribute it and/or modify it
15under the terms of the GNU General Public License as published by the
16Free Software Foundation; version 2 of the License.
17
18This program is distributed in the hope that it will be useful, but
19WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
21Public License for more details.
22
23You should have received a copy of the GNU General Public License along with
24this program; if not, write to the Free Software Foundation, Inc.,
2551 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
26
27***********************************************************************/
28
29/**************************************************//**
30@file os/os0file.cc
31The interface to the operating system file i/o primitives
32
33Created 10/21/1995 Heikki Tuuri
34*******************************************************/
35
36#ifndef UNIV_INNOCHECKSUM
37
38#include "ha_prototypes.h"
39#include "sql_const.h"
40
41#include "os0file.h"
42
43#ifdef UNIV_LINUX
44#include <sys/types.h>
45#include <sys/stat.h>
46#endif
47
48#include "srv0srv.h"
49#include "srv0start.h"
50#include "fil0fil.h"
51#include "fil0crypt.h"
52#include "fsp0fsp.h"
53#include "fil0pagecompress.h"
54#include "srv0srv.h"
55#ifdef HAVE_LINUX_UNISTD_H
56#include "unistd.h"
57#endif
58#include "os0event.h"
59#include "os0thread.h"
60
61#include <vector>
62
63#ifdef LINUX_NATIVE_AIO
64#include <libaio.h>
65#endif /* LINUX_NATIVE_AIO */
66
67#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
68# include <fcntl.h>
69# include <linux/falloc.h>
70#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
71
72#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
73# include <sys/ioctl.h>
74# ifndef DFS_IOCTL_ATOMIC_WRITE_SET
75# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
76# endif
77#endif
78
79#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H)
80#include <sys/statvfs.h>
81#endif
82
83#if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H)
84#include <linux/falloc.h>
85#endif
86
87#ifdef _WIN32
88#include <winioctl.h>
89#endif
90
91/** Insert buffer segment id */
92static const ulint IO_IBUF_SEGMENT = 0;
93
94/** Log segment id */
95static const ulint IO_LOG_SEGMENT = 1;
96
97/** Number of retries for partial I/O's */
98static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
99
100/* This specifies the file permissions InnoDB uses when it creates files in
101Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
102my_umask */
103
104#ifndef _WIN32
105/** Umask for creating files */
106static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
107#else
108/** Umask for creating files */
109static ulint os_innodb_umask = 0;
110static HANDLE data_completion_port;
111static HANDLE log_completion_port;
112
113static DWORD fls_sync_io = FLS_OUT_OF_INDEXES;
114#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
115#endif /* _WIN32 */
116
117/** In simulated aio, merge at most this many consecutive i/os */
118static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;
119
120/** Flag indicating if the page_cleaner is in active state. */
121extern bool buf_page_cleaner_is_active;
122
123#ifdef WITH_INNODB_DISALLOW_WRITES
124#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
125#else
126#define WAIT_ALLOW_WRITES() do { } while (0)
127#endif /* WITH_INNODB_DISALLOW_WRITES */
128
129/**********************************************************************
130
131InnoDB AIO Implementation:
132=========================
133
We support native AIO for Windows and Linux. On the rest of the platforms
we simulate AIO with special IO-threads that service the IO-requests.
136
137Simulated AIO:
138==============
139
140On platforms where we 'simulate' AIO, the following is a rough explanation
141of the high level design.
142There are four io-threads (for ibuf, log, read, write).
143All synchronous IO requests are serviced by the calling thread using
144os_file_write/os_file_read. The Asynchronous requests are queued up
145in an array (there are four such arrays) by the calling thread.
146Later these requests are picked up by the IO-thread and are serviced
147synchronously.
148
149Windows native AIO:
150==================
151
If srv_use_native_aio is not set, then Windows follows the same
code path as simulated AIO. If the flag is set, then the native AIO
interface is used. On Windows, one of the limitations is that if a file
is opened for AIO, no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
157There are innodb_file_io_threads helper threads. These threads work
158on the four arrays mentioned above in Simulated AIO. No thread is
159required for the sync array.
160If a synchronous IO request is made, it is first queued in the sync
161array. Then the calling thread itself waits on the request, thus
162making the call synchronous.
163If an AIO request is made the calling thread not only queues it in the
164array but also submits the requests. The helper thread then collects
165the completed IO request and calls completion routine on it.
166
167Linux native AIO:
168=================
169
170If we have libaio installed on the system and innodb_use_native_aio
171is set to true we follow the code path of native AIO, otherwise we
172do simulated AIO.
173There are innodb_file_io_threads helper threads. These threads work
174on the four arrays mentioned above in Simulated AIO.
175If a synchronous IO request is made, it is handled by calling
176os_file_write/os_file_read.
177If an AIO request is made the calling thread not only queues it in the
178array but also submits the requests. The helper thread then collects
179the completed IO request and calls completion routine on it.
180
181**********************************************************************/
182
183
184#ifdef UNIV_PFS_IO
185/* Keys to register InnoDB I/O with performance schema */
186mysql_pfs_key_t innodb_data_file_key;
187mysql_pfs_key_t innodb_log_file_key;
188mysql_pfs_key_t innodb_temp_file_key;
189#endif /* UNIV_PFS_IO */
190
191class AIO;
192
193/** The asynchronous I/O context */
194struct Slot {
195
196#ifdef WIN_ASYNC_IO
197 /** Windows control block for the aio request
198 must be at the very start of Slot, so we can
199 cast Slot* to OVERLAPPED*
200 */
201 OVERLAPPED control;
202#endif
203
204 /** index of the slot in the aio array */
205 uint16_t pos;
206
207 /** true if this slot is reserved */
208 bool is_reserved;
209
210 /** time when reserved */
211 time_t reservation_time;
212
213 /** buffer used in i/o */
214 byte* buf;
215
216 /** Buffer pointer used for actual IO. We advance this
217 when partial IO is required and not buf */
218 byte* ptr;
219
220 /** OS_FILE_READ or OS_FILE_WRITE */
221 IORequest type;
222
223 /** file offset in bytes */
224 os_offset_t offset;
225
226 /** file where to read or write */
227 pfs_os_file_t file;
228
229 /** file name or path */
230 const char* name;
231
232 /** used only in simulated aio: true if the physical i/o
233 already made and only the slot message needs to be passed
234 to the caller of os_aio_simulated_handle */
235 bool io_already_done;
236
237 /*!< file block size */
238 ulint file_block_size;
239
240 /** The file node for which the IO is requested. */
241 fil_node_t* m1;
242
243 /** the requester of an aio operation and which can be used
244 to identify which pending aio operation was completed */
245 void* m2;
246
247 /** AIO completion status */
248 dberr_t err;
249
250#ifdef WIN_ASYNC_IO
251
252 /** bytes written/read */
253 DWORD n_bytes;
254
255 /** length of the block to read or write */
256 DWORD len;
257
258 /** aio array containing this slot */
259 AIO *array;
260#elif defined(LINUX_NATIVE_AIO)
261 /** Linux control block for aio */
262 struct iocb control;
263
264 /** AIO return code */
265 int ret;
266
267 /** bytes written/read. */
268 ssize_t n_bytes;
269
270 /** length of the block to read or write */
271 ulint len;
272#else
273 /** length of the block to read or write */
274 ulint len;
275
276 /** bytes written/read. */
277 ulint n_bytes;
278#endif /* WIN_ASYNC_IO */
279
280 /** Length of the block before it was compressed */
281 uint32 original_len;
282
283};
284
285/** The asynchronous i/o array structure */
286class AIO {
287public:
288 /** Constructor
289 @param[in] id Latch ID
290 @param[in] n_slots Number of slots to configure
291 @param[in] segments Number of segments to configure */
292 AIO(latch_id_t id, ulint n_slots, ulint segments);
293
294 /** Destructor */
295 ~AIO();
296
297 /** Initialize the instance
298 @return DB_SUCCESS or error code */
299 dberr_t init();
300
301 /** Requests for a slot in the aio array. If no slot is available, waits
302 until not_full-event becomes signaled.
303
304 @param[in] type IO context
305 @param[in,out] m1 message to be passed along with the AIO
306 operation
307 @param[in,out] m2 message to be passed along with the AIO
308 operation
309 @param[in] file file handle
310 @param[in] name name of the file or path as a null-terminated
311 string
312 @param[in,out] buf buffer where to read or from which to write
313 @param[in] offset file offset, where to read from or start writing
314 @param[in] len length of the block to read or write
315 @return pointer to slot */
316 Slot* reserve_slot(
317 const IORequest& type,
318 fil_node_t* m1,
319 void* m2,
320 pfs_os_file_t file,
321 const char* name,
322 void* buf,
323 os_offset_t offset,
324 ulint len)
325 MY_ATTRIBUTE((warn_unused_result));
326
327 /** @return number of reserved slots */
328 ulint pending_io_count() const;
329
330 /** Returns a pointer to the nth slot in the aio array.
331 @param[in] index Index of the slot in the array
332 @return pointer to slot */
333 const Slot* at(ulint i) const
334 MY_ATTRIBUTE((warn_unused_result))
335 {
336 ut_a(i < m_slots.size());
337
338 return(&m_slots[i]);
339 }
340
341 /** Non const version */
342 Slot* at(ulint i)
343 MY_ATTRIBUTE((warn_unused_result))
344 {
345 ut_a(i < m_slots.size());
346
347 return(&m_slots[i]);
348 }
349
350 /** Frees a slot in the AIO array, assumes caller owns the mutex.
351 @param[in,out] slot Slot to release */
352 void release(Slot* slot);
353
354 /** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
355 @param[in,out] slot Slot to release */
356 void release_with_mutex(Slot* slot);
357
358 /** Prints info about the aio array.
359 @param[in,out] file Where to print */
360 void print(FILE* file);
361
362 /** @return the number of slots per segment */
363 ulint slots_per_segment() const
364 MY_ATTRIBUTE((warn_unused_result))
365 {
366 return(m_slots.size() / m_n_segments);
367 }
368
369 /** @return accessor for n_segments */
370 ulint get_n_segments() const
371 MY_ATTRIBUTE((warn_unused_result))
372 {
373 return(m_n_segments);
374 }
375
376#ifdef UNIV_DEBUG
377 /** @return true if the thread owns the mutex */
378 bool is_mutex_owned() const
379 MY_ATTRIBUTE((warn_unused_result))
380 {
381 return(mutex_own(&m_mutex));
382 }
383#endif /* UNIV_DEBUG */
384
385 /** Acquire the mutex */
386 void acquire() const
387 {
388 mutex_enter(&m_mutex);
389 }
390
391 /** Release the mutex */
392 void release() const
393 {
394 mutex_exit(&m_mutex);
395 }
396
397 /** Write out the state to the file/stream
398 @param[in, out] file File to write to */
399 void to_file(FILE* file) const;
400
401#ifdef LINUX_NATIVE_AIO
402 /** Dispatch an AIO request to the kernel.
403 @param[in,out] slot an already reserved slot
404 @return true on success. */
405 bool linux_dispatch(Slot* slot)
406 MY_ATTRIBUTE((warn_unused_result));
407
408 /** Accessor for an AIO event
409 @param[in] index Index into the array
410 @return the event at the index */
411 io_event* io_events(ulint index)
412 MY_ATTRIBUTE((warn_unused_result))
413 {
414 ut_a(index < m_events.size());
415
416 return(&m_events[index]);
417 }
418
419 /** Accessor for the AIO context
420 @param[in] segment Segment for which to get the context
421 @return the AIO context for the segment */
422 io_context* io_ctx(ulint segment)
423 MY_ATTRIBUTE((warn_unused_result))
424 {
425 ut_ad(segment < get_n_segments());
426
427 return(m_aio_ctx[segment]);
428 }
429
430 /** Creates an io_context for native linux AIO.
431 @param[in] max_events number of events
432 @param[out] io_ctx io_ctx to initialize.
433 @return true on success. */
434 static bool linux_create_io_ctx(unsigned max_events, io_context_t* io_ctx)
435 MY_ATTRIBUTE((warn_unused_result));
436
437 /** Checks if the system supports native linux aio. On some kernel
438 versions where native aio is supported it won't work on tmpfs. In such
439 cases we can't use native aio as it is not possible to mix simulated
440 and native aio.
441 @return true if supported, false otherwise. */
442 static bool is_linux_native_aio_supported()
443 MY_ATTRIBUTE((warn_unused_result));
444#endif /* LINUX_NATIVE_AIO */
445
446#ifdef WIN_ASYNC_IO
447 HANDLE m_completion_port;
448 /** Wake up all AIO threads in Windows native aio */
449 static void wake_at_shutdown() {
450 AIO *all_arrays[] = {s_reads, s_writes, s_log, s_ibuf };
451 for (size_t i = 0; i < array_elements(all_arrays); i++) {
452 AIO *a = all_arrays[i];
453 if (a) {
454 PostQueuedCompletionStatus(a->m_completion_port, 0,
455 IOCP_SHUTDOWN_KEY, 0);
456 }
457 }
458 }
459#endif /* WIN_ASYNC_IO */
460
461#ifdef _WIN32
462 /** This function can be called if one wants to post a batch of reads
463 and prefers an I/O - handler thread to handle them all at once later.You
464 must call os_aio_simulated_wake_handler_threads later to ensure the
465 threads are not left sleeping! */
466 static void simulated_put_read_threads_to_sleep();
467#endif /* _WIN32 */
468
469 /** Create an instance using new(std::nothrow)
470 @param[in] id Latch ID
471 @param[in] n_slots The number of AIO request slots
472 @param[in] segments The number of segments
473 @return a new AIO instance */
474 static AIO* create(
475 latch_id_t id,
476 ulint n_slots,
477 ulint segments)
478 MY_ATTRIBUTE((warn_unused_result));
479
480 /** Initializes the asynchronous io system. Creates one array each
481 for ibuf and log I/O. Also creates one array each for read and write
482 where each array is divided logically into n_readers and n_writers
483 respectively. The caller must create an i/o handler thread for each
484 segment in these arrays. This function also creates the sync array.
485 No I/O handler thread needs to be created for that
486 @param[in] n_per_seg maximum number of pending aio
487 operations allowed per segment
488 @param[in] n_readers number of reader threads
489 @param[in] n_writers number of writer threads
490 @param[in] n_slots_sync number of slots in the sync aio array
491 @return true if AIO sub-system was started successfully */
492 static bool start(
493 ulint n_per_seg,
494 ulint n_readers,
495 ulint n_writers,
496 ulint n_slots_sync)
497 MY_ATTRIBUTE((warn_unused_result));
498
499 /** Free the AIO arrays */
500 static void shutdown();
501
502 /** Print all the AIO segments
503 @param[in,out] file Where to print */
504 static void print_all(FILE* file);
505
506 /** Calculates local segment number and aio array from global
507 segment number.
508 @param[out] array AIO wait array
509 @param[in] segment global segment number
510 @return local segment number within the aio array */
511 static ulint get_array_and_local_segment(
512 AIO** array,
513 ulint segment)
514 MY_ATTRIBUTE((warn_unused_result));
515
516 /** Select the IO slot array
517 @param[in,out] type Type of IO, READ or WRITE
518 @param[in] read_only true if running in read-only mode
519 @param[in] mode IO mode
520 @return slot array or NULL if invalid mode specified */
521 static AIO* select_slot_array(
522 IORequest& type,
523 bool read_only,
524 ulint mode)
525 MY_ATTRIBUTE((warn_unused_result));
526
527 /** Calculates segment number for a slot.
528 @param[in] array AIO wait array
529 @param[in] slot slot in this array
530 @return segment number (which is the number used by, for example,
531 I/O handler threads) */
532 static ulint get_segment_no_from_slot(
533 const AIO* array,
534 const Slot* slot)
535 MY_ATTRIBUTE((warn_unused_result));
536
537 /** Wakes up a simulated AIO I/O-handler thread if it has something
538 to do.
539 @param[in] global_segment the number of the segment in the
540 AIO arrays */
541 static void wake_simulated_handler_thread(ulint global_segment);
542
543 /** Check if it is a read request
544 @param[in] aio The AIO instance to check
545 @return true if the AIO instance is for reading. */
546 static bool is_read(const AIO* aio)
547 MY_ATTRIBUTE((warn_unused_result))
548 {
549 return(s_reads == aio);
550 }
551
552 /** Wait on an event until no pending writes */
553 static void wait_until_no_pending_writes()
554 {
555 os_event_wait(AIO::s_writes->m_is_empty);
556 }
557
558 /** Print to file
559 @param[in] file File to write to */
560 static void print_to_file(FILE* file);
561
562 /** Check for pending IO. Gets the count and also validates the
563 data structures.
564 @return count of pending IO requests */
565 static ulint total_pending_io_count();
566
567private:
568 /** Initialise the slots
569 @return DB_SUCCESS or error code */
570 dberr_t init_slots()
571 MY_ATTRIBUTE((warn_unused_result));
572
573 /** Wakes up a simulated AIO I/O-handler thread if it has something
574 to do for a local segment in the AIO array.
575 @param[in] global_segment the number of the segment in the
576 AIO arrays
577 @param[in] segment the local segment in the AIO array */
578 void wake_simulated_handler_thread(ulint global_segment, ulint segment);
579
580 /** Prints pending IO requests per segment of an aio array.
581 We probably don't need per segment statistics but they can help us
582 during development phase to see if the IO requests are being
583 distributed as expected.
584 @param[in,out] file file where to print
585 @param[in] segments pending IO array */
586 void print_segment_info(
587 FILE* file,
588 const ulint* segments);
589
590#ifdef LINUX_NATIVE_AIO
591 /** Initialise the Linux native AIO data structures
592 @return DB_SUCCESS or error code */
593 dberr_t init_linux_native_aio()
594 MY_ATTRIBUTE((warn_unused_result));
595#endif /* LINUX_NATIVE_AIO */
596
597private:
598 typedef std::vector<Slot> Slots;
599
600 /** the mutex protecting the aio array */
601 mutable SysMutex m_mutex;
602
603 /** Pointer to the slots in the array.
604 Number of elements must be divisible by n_threads. */
605 Slots m_slots;
606
607 /** Number of segments in the aio array of pending aio requests.
608 A thread can wait separately for any one of the segments. */
609 ulint m_n_segments;
610
611 /** The event which is set to the signaled state when
612 there is space in the aio outside the ibuf segment;
613 os_event_set() and os_event_reset() are protected by AIO::m_mutex */
614 os_event_t m_not_full;
615
616 /** The event which is set to the signaled state when
617 there are no pending i/os in this array;
618 os_event_set() and os_event_reset() are protected by AIO::m_mutex */
619 os_event_t m_is_empty;
620
621 /** Number of reserved slots in the AIO array outside
622 the ibuf segment */
623 ulint m_n_reserved;
624
625
626#if defined(LINUX_NATIVE_AIO)
627 typedef std::vector<io_event> IOEvents;
628
629 /** completion queue for IO. There is one such queue per
630 segment. Each thread will work on one ctx exclusively. */
631 io_context_t* m_aio_ctx;
632
633 /** The array to collect completed IOs. There is one such
634 event for each possible pending IO. The size of the array
635 is equal to m_slots.size(). */
636 IOEvents m_events;
637#endif /* LINUX_NATIV_AIO */
638
639 /** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
640 sync AIO. These are NULL when the module has not yet been
641 initialized. */
642
643 /** Insert buffer */
644 static AIO* s_ibuf;
645
646 /** Redo log */
647 static AIO* s_log;
648
649 /** Reads */
650 static AIO* s_reads;
651
652 /** Writes */
653 static AIO* s_writes;
654
655 /** Synchronous I/O */
656 static AIO* s_sync;
657};
658
659/** Static declarations */
660AIO* AIO::s_reads;
661AIO* AIO::s_writes;
662AIO* AIO::s_ibuf;
663AIO* AIO::s_log;
664AIO* AIO::s_sync;
665
666#if defined(LINUX_NATIVE_AIO)
667/** timeout for each io_getevents() call = 500ms. */
668static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;
669
670/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
671static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
672
673/** number of attempts before giving up on io_setup(). */
674static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
675#endif /* LINUX_NATIVE_AIO */
676
677/** Array of events used in simulated AIO */
678static os_event_t* os_aio_segment_wait_events;
679
680/** Number of asynchronous I/O segments. Set by os_aio_init(). */
681static ulint os_aio_n_segments = ULINT_UNDEFINED;
682
683/** If the following is true, read i/o handler threads try to
684wait until a batch of new read requests have been posted */
685static bool os_aio_recommend_sleep_for_read_threads;
686
687ulint os_n_file_reads;
688static ulint os_bytes_read_since_printout;
689ulint os_n_file_writes;
690ulint os_n_fsyncs;
691static ulint os_n_file_reads_old;
692static ulint os_n_file_writes_old;
693static ulint os_n_fsyncs_old;
694
695static time_t os_last_printout;
696bool os_has_said_disk_full;
697
698/** Default Zip compression level */
699extern uint page_zip_level;
700
701/** Validates the consistency of the aio system.
702@return true if ok */
703static
704bool
705os_aio_validate();
706
707/** Handle errors for file operations.
708@param[in] name name of a file or NULL
709@param[in] operation operation
710@param[in] should_abort whether to abort on an unknown error
711@param[in] on_error_silent whether to suppress reports of non-fatal errors
712@return true if we should retry the operation */
713static MY_ATTRIBUTE((warn_unused_result))
714bool
715os_file_handle_error_cond_exit(
716 const char* name,
717 const char* operation,
718 bool should_abort,
719 bool on_error_silent);
720
721/** Does error handling when a file operation fails.
722@param[in] name name of a file or NULL
723@param[in] operation operation name that failed
724@return true if we should retry the operation */
725static
726bool
727os_file_handle_error(
728 const char* name,
729 const char* operation)
730{
731 /* Exit in case of unknown error */
732 return(os_file_handle_error_cond_exit(name, operation, true, false));
733}
734
735/** Does error handling when a file operation fails.
736@param[in] name name of a file or NULL
737@param[in] operation operation name that failed
738@param[in] on_error_silent if true then don't print any message to the log.
739@return true if we should retry the operation */
740static
741bool
742os_file_handle_error_no_exit(
743 const char* name,
744 const char* operation,
745 bool on_error_silent)
746{
747 /* Don't exit in case of unknown error */
748 return(os_file_handle_error_cond_exit(
749 name, operation, false, on_error_silent));
750}
751
752/** Does simulated AIO. This function should be called by an i/o-handler
753thread.
754
@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
759@param[out] m1 the messages passed with the AIO request; note that
760 also in the case where the AIO operation failed, these
761 output parameters are valid and can be used to restart
762 the operation, for example
763@param[out] m2 Callback argument
764@param[in] type IO context
765@return DB_SUCCESS or error code */
766static
767dberr_t
768os_aio_simulated_handler(
769 ulint global_segment,
770 fil_node_t** m1,
771 void** m2,
772 IORequest* type);
773
774#ifdef _WIN32
775static HANDLE win_get_syncio_event();
776#endif
777
778#ifdef _WIN32
779/**
780 Wrapper around Windows DeviceIoControl() function.
781
782 Works synchronously, also in case for handle opened
783 for async access (i.e with FILE_FLAG_OVERLAPPED).
784
785 Accepts the same parameters as DeviceIoControl(),except
786 last parameter (OVERLAPPED).
787*/
788static
789BOOL
790os_win32_device_io_control(
791 HANDLE handle,
792 DWORD code,
793 LPVOID inbuf,
794 DWORD inbuf_size,
795 LPVOID outbuf,
796 DWORD outbuf_size,
797 LPDWORD bytes_returned
798)
799{
800 OVERLAPPED overlapped = { 0 };
801 overlapped.hEvent = win_get_syncio_event();
802 BOOL result = DeviceIoControl(handle, code, inbuf, inbuf_size, outbuf,
803 outbuf_size, NULL, &overlapped);
804
805 if (result || (GetLastError() == ERROR_IO_PENDING)) {
806 /* Wait for async io to complete */
807 result = GetOverlappedResult(handle, &overlapped, bytes_returned, TRUE);
808 }
809
810 return result;
811}
812
813#endif
814
815/***********************************************************************//**
816Try to get number of bytes per sector from file system.
817@return file block size */
818UNIV_INTERN
819ulint
820os_file_get_block_size(
821/*===================*/
822 os_file_t file, /*!< in: handle to a file */
823 const char* name) /*!< in: file name */
824{
825 ulint fblock_size = 512;
826
827#if defined(UNIV_LINUX)
828 struct stat local_stat;
829 int err;
830
831 err = fstat((int)file, &local_stat);
832
833 if (err != 0) {
834 os_file_handle_error_no_exit(name, "fstat()", FALSE);
835 } else {
836 fblock_size = local_stat.st_blksize;
837 }
838#endif /* UNIV_LINUX */
839#ifdef _WIN32
840
841 fblock_size = 0;
842 BOOL result = false;
843 size_t len = 0;
844 // Open volume for this file, find out it "physical bytes per sector"
845
846 HANDLE volume_handle = INVALID_HANDLE_VALUE;
847 char volume[MAX_PATH + 4]="\\\\.\\"; // Special prefix required for volume names.
848 if (!GetVolumePathName(name , volume + 4, MAX_PATH)) {
849 os_file_handle_error_no_exit(name,
850 "GetVolumePathName()", FALSE);
851 goto end;
852 }
853
854 len = strlen(volume);
855 if (volume[len - 1] == '\\') {
856 // Trim trailing backslash from volume name.
857 volume[len - 1] = 0;
858 }
859
860 volume_handle = CreateFile(volume, FILE_READ_ATTRIBUTES,
861 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
862 0, OPEN_EXISTING, 0, 0);
863
864 if (volume_handle == INVALID_HANDLE_VALUE) {
865 if (GetLastError() != ERROR_ACCESS_DENIED) {
866 os_file_handle_error_no_exit(volume,
867 "CreateFile()", FALSE);
868 }
869 goto end;
870 }
871
872 DWORD tmp;
873 STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR disk_alignment;
874
875 STORAGE_PROPERTY_QUERY storage_query;
876 memset(&storage_query, 0, sizeof(storage_query));
877 storage_query.PropertyId = StorageAccessAlignmentProperty;
878 storage_query.QueryType = PropertyStandardQuery;
879
880 result = os_win32_device_io_control(volume_handle,
881 IOCTL_STORAGE_QUERY_PROPERTY,
882 &storage_query,
883 sizeof(storage_query),
884 &disk_alignment,
885 sizeof(disk_alignment),
886 &tmp);
887
888 if (!result) {
889 DWORD err = GetLastError();
890 if (err != ERROR_INVALID_FUNCTION && err != ERROR_NOT_SUPPORTED) {
891 os_file_handle_error_no_exit(volume,
892 "DeviceIoControl(IOCTL_STORAGE_QUERY_PROPERTY)", FALSE);
893 }
894 goto end;
895 }
896
897 fblock_size = disk_alignment.BytesPerPhysicalSector;
898
899end:
900 if (volume_handle != INVALID_HANDLE_VALUE) {
901 CloseHandle(volume_handle);
902 }
903#endif /* _WIN32 */
904
905 /* Currently we support file block size up to 4Kb */
906 if (fblock_size > 4096 || fblock_size < 512) {
907 if (fblock_size < 512) {
908 fblock_size = 512;
909 } else {
910 fblock_size = 4096;
911 }
912 }
913
914 return fblock_size;
915}
916
917#ifdef WIN_ASYNC_IO
918/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
921into segments. The thread specifies which segment or slot it wants to wait
922for. NOTE: this function will also take care of freeing the aio slot,
923therefore no other thread is allowed to do the freeing!
924@param[in] segment The number of the segment in the aio arrays to
925wait for; segment 0 is the ibuf I/O thread,
926segment 1 the log I/O thread, then follow the
927non-ibuf read threads, and as the last are the
928non-ibuf write threads; if this is
929ULINT_UNDEFINED, then it means that sync AIO
930is used, and this parameter is ignored
931@param[in] pos this parameter is used only in sync AIO:
932wait for the aio slot at this position
933@param[out] m1 the messages passed with the AIO request; note
934that also in the case where the AIO operation
935failed, these output parameters are valid and
936can be used to restart the operation,
937for example
938@param[out] m2 callback message
939@param[out] type OS_FILE_WRITE or ..._READ
940@return DB_SUCCESS or error code */
941static
942dberr_t
943os_aio_windows_handler(
944 ulint segment,
945 ulint pos,
946 fil_node_t** m1,
947 void** m2,
948 IORequest* type);
949#endif /* WIN_ASYNC_IO */
950
951/** Generic AIO Handler methods. Currently handles IO post processing. */
952class AIOHandler {
953public:
954 /** Do any post processing after a read/write
955 @return DB_SUCCESS or error code. */
956 static dberr_t post_io_processing(Slot* slot);
957};
958
959/** Helper class for doing synchronous file IO. Currently, the objective
960is to hide the OS specific code, so that the higher level functions aren't
961peppered with #ifdef. Makes the code flow difficult to follow. */
962class SyncFileIO {
963public:
964 /** Constructor
965 @param[in] fh File handle
966 @param[in,out] buf Buffer to read/write
967 @param[in] n Number of bytes to read/write
968 @param[in] offset Offset where to read or write */
969 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
970 :
971 m_fh(fh),
972 m_buf(buf),
973 m_n(static_cast<ssize_t>(n)),
974 m_offset(offset)
975 {
976 ut_ad(m_n > 0);
977 }
978
979 /** Destructor */
980 ~SyncFileIO()
981 {
982 /* No op */
983 }
984
985 /** Do the read/write
986 @param[in] request The IO context and type
987 @return the number of bytes read/written or negative value on error */
988 ssize_t execute(const IORequest& request);
989
990 /** Do the read/write
991 @param[in,out] slot The IO slot, it has the IO context
992 @return the number of bytes read/written or negative value on error */
993 static ssize_t execute(Slot* slot);
994
995 /** Move the read/write offset up to where the partial IO succeeded.
996 @param[in] n_bytes The number of bytes to advance */
997 void advance(ssize_t n_bytes)
998 {
999 m_offset += n_bytes;
1000
1001 ut_ad(m_n >= n_bytes);
1002
1003 m_n -= n_bytes;
1004
1005 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1006 }
1007
1008private:
1009 /** Open file handle */
1010 os_file_t m_fh;
1011
1012 /** Buffer to read/write */
1013 void* m_buf;
1014
1015 /** Number of bytes to read/write */
1016 ssize_t m_n;
1017
1018 /** Offset from where to read/write */
1019 os_offset_t m_offset;
1020};
1021
1022/** Do any post processing after a read/write
1023@return DB_SUCCESS or error code. */
1024dberr_t
1025AIOHandler::post_io_processing(Slot* slot)
1026{
1027 ut_ad(slot->is_reserved);
1028
1029 /* Total bytes read so far */
1030 ulint n_bytes = ulint(slot->ptr - slot->buf) + slot->n_bytes;
1031
1032 return(n_bytes == slot->original_len ? DB_SUCCESS : DB_FAIL);
1033}
1034
1035/** Count the number of free slots
1036@return number of reserved slots */
1037ulint
1038AIO::pending_io_count() const
1039{
1040 acquire();
1041
1042#ifdef UNIV_DEBUG
1043 ut_a(m_n_segments > 0);
1044 ut_a(!m_slots.empty());
1045
1046 ulint count = 0;
1047
1048 for (ulint i = 0; i < m_slots.size(); ++i) {
1049
1050 const Slot& slot = m_slots[i];
1051
1052 if (slot.is_reserved) {
1053 ++count;
1054 ut_a(slot.len > 0);
1055 }
1056 }
1057
1058 ut_a(m_n_reserved == count);
1059#endif /* UNIV_DEBUG */
1060
1061 ulint reserved = m_n_reserved;
1062
1063 release();
1064
1065 return(reserved);
1066}
1067
1068#ifdef UNIV_DEBUG
1069/** Validates the consistency the aio system some of the time.
1070@return true if ok or the check was skipped */
1071static
1072bool
1073os_aio_validate_skip()
1074{
1075/** Try os_aio_validate() every this many times */
1076# define OS_AIO_VALIDATE_SKIP 13
1077
1078 /** The os_aio_validate() call skip counter.
1079 Use a signed type because of the race condition below. */
1080 static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1081
1082 /* There is a race condition below, but it does not matter,
1083 because this call is only for heuristic purposes. We want to
1084 reduce the call frequency of the costly os_aio_validate()
1085 check in debug builds. */
1086 --os_aio_validate_count;
1087
1088 if (os_aio_validate_count > 0) {
1089 return(true);
1090 }
1091
1092 os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1093 return(os_aio_validate());
1094}
1095#endif /* UNIV_DEBUG */
1096
1097#undef USE_FILE_LOCK
1098#ifndef _WIN32
1099/* On Windows, mandatory locking is used */
1100# define USE_FILE_LOCK
1101#endif
1102#ifdef USE_FILE_LOCK
1103/** Obtain an exclusive lock on a file.
1104@param[in] fd file descriptor
1105@param[in] name file name
1106@return 0 on success */
1107static
1108int
1109os_file_lock(
1110 int fd,
1111 const char* name)
1112{
1113 struct flock lk;
1114
1115 lk.l_type = F_WRLCK;
1116 lk.l_whence = SEEK_SET;
1117 lk.l_start = lk.l_len = 0;
1118
1119 if (fcntl(fd, F_SETLK, &lk) == -1) {
1120
1121 ib::error()
1122 << "Unable to lock " << name
1123 << " error: " << errno;
1124
1125 if (errno == EAGAIN || errno == EACCES) {
1126
1127 ib::info()
1128 << "Check that you do not already have"
1129 " another mysqld process using the"
1130 " same InnoDB data or log files.";
1131 }
1132
1133 return(-1);
1134 }
1135
1136 return(0);
1137}
1138#endif /* USE_FILE_LOCK */
1139
1140/** Calculates local segment number and aio array from global segment number.
1141@param[out] array aio wait array
1142@param[in] segment global segment number
1143@return local segment number within the aio array */
1144ulint
1145AIO::get_array_and_local_segment(
1146 AIO** array,
1147 ulint segment)
1148{
1149 ulint local_segment;
1150 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1151
1152 ut_a(segment < os_aio_n_segments);
1153
1154 if (!srv_read_only_mode && segment < n_extra_segs) {
1155
1156 /* We don't support ibuf/log IO during read only mode. */
1157
1158 if (segment == IO_IBUF_SEGMENT) {
1159
1160 *array = s_ibuf;
1161
1162 } else if (segment == IO_LOG_SEGMENT) {
1163
1164 *array = s_log;
1165
1166 } else {
1167 *array = NULL;
1168 }
1169
1170 local_segment = 0;
1171
1172 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1173
1174 *array = s_reads;
1175 local_segment = segment - n_extra_segs;
1176
1177 } else {
1178 *array = s_writes;
1179
1180 local_segment = segment
1181 - (s_reads->m_n_segments + n_extra_segs);
1182 }
1183
1184 return(local_segment);
1185}
1186
1187/** Frees a slot in the aio array. Assumes caller owns the mutex.
1188@param[in,out] slot Slot to release */
1189void
1190AIO::release(Slot* slot)
1191{
1192 ut_ad(is_mutex_owned());
1193
1194 ut_ad(slot->is_reserved);
1195
1196 slot->is_reserved = false;
1197
1198 --m_n_reserved;
1199
1200 if (m_n_reserved == m_slots.size() - 1) {
1201 os_event_set(m_not_full);
1202 }
1203
1204 if (m_n_reserved == 0) {
1205 os_event_set(m_is_empty);
1206 }
1207
1208#if defined(LINUX_NATIVE_AIO)
1209
1210 if (srv_use_native_aio) {
1211 memset(&slot->control, 0x0, sizeof(slot->control));
1212 slot->ret = 0;
1213 slot->n_bytes = 0;
1214 } else {
1215 /* These fields should not be used if we are not
1216 using native AIO. */
1217 ut_ad(slot->n_bytes == 0);
1218 ut_ad(slot->ret == 0);
1219 }
1220
1221#endif /* WIN_ASYNC_IO */
1222}
1223
/** Frees a slot in the AIO array. Assumes the caller doesn't own the
mutex: this wrapper acquires the array mutex, frees the slot through
release(Slot*), and releases the mutex again.
@param[in,out] slot Slot to release */
void
AIO::release_with_mutex(Slot* slot)
{
 /* release(Slot*) asserts that the array mutex is owned */
 acquire();

 release(slot);

 /* The argument-less overload releases the array mutex */
 release();
}
1235
1236/** Create a temporary file. This function is like tmpfile(3), but
1237the temporary file is created in the in the mysql server configuration
1238parameter (--tmpdir).
1239@return temporary file handle, or NULL on error */
1240FILE*
1241os_file_create_tmpfile()
1242{
1243 FILE* file = NULL;
1244 WAIT_ALLOW_WRITES();
1245 os_file_t fd = innobase_mysql_tmpfile(NULL);
1246
1247 if (fd != OS_FILE_CLOSED) {
1248#ifdef _WIN32
1249 int crt_fd = _open_osfhandle((intptr_t)HANDLE(fd), 0);
1250 if (crt_fd != -1) {
1251 file = fdopen(crt_fd, "w+b");
1252 if (!file) {
1253 close(crt_fd);
1254 }
1255 }
1256#else
1257 file = fdopen(fd, "w+b");
1258 if (!file) {
1259 close(fd);
1260 }
1261#endif
1262 }
1263
1264 if (file == NULL) {
1265
1266 ib::error()
1267 << "Unable to create temporary file; errno: "
1268 << errno;
1269 }
1270
1271 return(file);
1272}
1273
1274/** Rewind file to its start, read at most size - 1 bytes from it to str, and
1275NUL-terminate str. All errors are silently ignored. This function is
1276mostly meant to be used with temporary files.
1277@param[in,out] file File to read from
1278@param[in,out] str Buffer where to read
1279@param[in] size Size of buffer */
1280void
1281os_file_read_string(
1282 FILE* file,
1283 char* str,
1284 ulint size)
1285{
1286 if (size != 0) {
1287 rewind(file);
1288
1289 size_t flen = fread(str, 1, size - 1, file);
1290
1291 str[flen] = '\0';
1292 }
1293}
1294
1295/** This function returns a new path name after replacing the basename
1296in an old path with a new basename. The old_path is a full path
1297name including the extension. The tablename is in the normal
1298form "databasename/tablename". The new base name is found after
1299the forward slash. Both input strings are null terminated.
1300
1301This function allocates memory to be returned. It is the callers
1302responsibility to free the return value after it is no longer needed.
1303
1304@param[in] old_path Pathname
1305@param[in] tablename Contains new base name
1306@return own: new full pathname */
1307char*
1308os_file_make_new_pathname(
1309 const char* old_path,
1310 const char* tablename)
1311{
1312 ulint dir_len;
1313 char* last_slash;
1314 char* base_name;
1315 char* new_path;
1316 ulint new_path_len;
1317
1318 /* Split the tablename into its database and table name components.
1319 They are separated by a '/'. */
1320 last_slash = strrchr((char*) tablename, '/');
1321 base_name = last_slash ? last_slash + 1 : (char*) tablename;
1322
1323 /* Find the offset of the last slash. We will strip off the
1324 old basename.ibd which starts after that slash. */
1325 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
1326 dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
1327
1328 /* allocate a new path and move the old directory path to it. */
1329 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
1330 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
1331 memcpy(new_path, old_path, dir_len);
1332
1333 snprintf(new_path + dir_len, new_path_len - dir_len,
1334 "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
1335
1336 return(new_path);
1337}
1338
1339/** This function reduces a null-terminated full remote path name into
1340the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
1341the 'databasename/tablename.ibd' found at the end of the path with just
1342'tablename'.
1343
1344Since the result is always smaller than the path sent in, no new memory
1345is allocated. The caller should allocate memory for the path sent in.
1346This function manipulates that path in place.
1347
1348If the path format is not as expected, just return. The result is used
1349to inform a SHOW CREATE TABLE command.
1350@param[in,out] data_dir_path Full path/data_dir_path */
1351void
1352os_file_make_data_dir_path(
1353 char* data_dir_path)
1354{
1355 /* Replace the period before the extension with a null byte. */
1356 char* ptr = strrchr((char*) data_dir_path, '.');
1357
1358 if (ptr == NULL) {
1359 return;
1360 }
1361
1362 ptr[0] = '\0';
1363
1364 /* The tablename starts after the last slash. */
1365 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1366
1367 if (ptr == NULL) {
1368 return;
1369 }
1370
1371 ptr[0] = '\0';
1372
1373 char* tablename = ptr + 1;
1374
1375 /* The databasename starts after the next to last slash. */
1376 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
1377
1378 if (ptr == NULL) {
1379 return;
1380 }
1381
1382 ulint tablename_len = ut_strlen(tablename);
1383
1384 ut_memmove(++ptr, tablename, tablename_len);
1385
1386 ptr[tablename_len] = '\0';
1387}
1388
1389/** Check if the path refers to the root of a drive using a pointer
1390to the last directory separator that the caller has fixed.
1391@param[in] path path name
1392@param[in] path last directory separator in the path
1393@return true if this path is a drive root, false if not */
1394UNIV_INLINE
1395bool
1396os_file_is_root(
1397 const char* path,
1398 const char* last_slash)
1399{
1400 return(
1401#ifdef _WIN32
1402 (last_slash == path + 2 && path[1] == ':') ||
1403#endif /* _WIN32 */
1404 last_slash == path);
1405}
1406
1407/** Return the parent directory component of a null-terminated path.
1408Return a new buffer containing the string up to, but not including,
1409the final component of the path.
1410The path returned will not contain a trailing separator.
1411Do not return a root path, return NULL instead.
1412The final component trimmed off may be a filename or a directory name.
1413If the final component is the only component of the path, return NULL.
1414It is the caller's responsibility to free the returned string after it
1415is no longer needed.
1416@param[in] path Path name
1417@return own: parent directory of the path */
1418static
1419char*
1420os_file_get_parent_dir(
1421 const char* path)
1422{
1423 bool has_trailing_slash = false;
1424
1425 /* Find the offset of the last slash */
1426 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
1427
1428 if (!last_slash) {
1429 /* No slash in the path, return NULL */
1430 return(NULL);
1431 }
1432
1433 /* Ok, there is a slash. Is there anything after it? */
1434 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
1435 has_trailing_slash = true;
1436 }
1437
1438 /* Reduce repetative slashes. */
1439 while (last_slash > path
1440 && last_slash[-1] == OS_PATH_SEPARATOR) {
1441 last_slash--;
1442 }
1443
1444 /* Check for the root of a drive. */
1445 if (os_file_is_root(path, last_slash)) {
1446 return(NULL);
1447 }
1448
1449 /* If a trailing slash prevented the first strrchr() from trimming
1450 the last component of the path, trim that component now. */
1451 if (has_trailing_slash) {
1452 /* Back up to the previous slash. */
1453 last_slash--;
1454 while (last_slash > path
1455 && last_slash[0] != OS_PATH_SEPARATOR) {
1456 last_slash--;
1457 }
1458
1459 /* Reduce repetative slashes. */
1460 while (last_slash > path
1461 && last_slash[-1] == OS_PATH_SEPARATOR) {
1462 last_slash--;
1463 }
1464 }
1465
1466 /* Check for the root of a drive. */
1467 if (os_file_is_root(path, last_slash)) {
1468 return(NULL);
1469 }
1470
1471 /* Non-trivial directory component */
1472
1473 return(mem_strdupl(path, ulint(last_slash - path)));
1474}
1475#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
1476
1477/* Test the function os_file_get_parent_dir. */
1478void
1479test_os_file_get_parent_dir(
1480 const char* child_dir,
1481 const char* expected_dir)
1482{
1483 char* child = mem_strdup(child_dir);
1484 char* expected = expected_dir == NULL ? NULL
1485 : mem_strdup(expected_dir);
1486
1487 /* os_file_get_parent_dir() assumes that separators are
1488 converted to OS_PATH_SEPARATOR. */
1489 os_normalize_path(child);
1490 os_normalize_path(expected);
1491
1492 char* parent = os_file_get_parent_dir(child);
1493
1494 bool unexpected = (expected == NULL
1495 ? (parent != NULL)
1496 : (0 != strcmp(parent, expected)));
1497 if (unexpected) {
1498 ib::fatal() << "os_file_get_parent_dir('" << child
1499 << "') returned '" << parent
1500 << "', instead of '" << expected << "'.";
1501 }
1502 ut_free(parent);
1503 ut_free(child);
1504 ut_free(expected);
1505}
1506
1507/* Test the function os_file_get_parent_dir. */
1508void
1509unit_test_os_file_get_parent_dir()
1510{
1511 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
1512 test_os_file_get_parent_dir("/usr/", NULL);
1513 test_os_file_get_parent_dir("//usr//", NULL);
1514 test_os_file_get_parent_dir("usr", NULL);
1515 test_os_file_get_parent_dir("usr//", NULL);
1516 test_os_file_get_parent_dir("/", NULL);
1517 test_os_file_get_parent_dir("//", NULL);
1518 test_os_file_get_parent_dir(".", NULL);
1519 test_os_file_get_parent_dir("..", NULL);
1520# ifdef _WIN32
1521 test_os_file_get_parent_dir("D:", NULL);
1522 test_os_file_get_parent_dir("D:/", NULL);
1523 test_os_file_get_parent_dir("D:\\", NULL);
1524 test_os_file_get_parent_dir("D:/data", NULL);
1525 test_os_file_get_parent_dir("D:/data/", NULL);
1526 test_os_file_get_parent_dir("D:\\data\\", NULL);
1527 test_os_file_get_parent_dir("D:///data/////", NULL);
1528 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
1529 test_os_file_get_parent_dir("D:/data//a", "D:/data");
1530 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
1531 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
1532 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
1533#endif /* _WIN32 */
1534}
1535#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
1536
1537
1538/** Creates all missing subdirectories along the given path.
1539@param[in] path Path name
1540@return DB_SUCCESS if OK, otherwise error code. */
1541dberr_t
1542os_file_create_subdirs_if_needed(
1543 const char* path)
1544{
1545 if (srv_read_only_mode) {
1546
1547 ib::error()
1548 << "read only mode set. Can't create "
1549 << "subdirectories '" << path << "'";
1550
1551 return(DB_READ_ONLY);
1552
1553 }
1554
1555 char* subdir = os_file_get_parent_dir(path);
1556
1557 if (subdir == NULL) {
1558 /* subdir is root or cwd, nothing to do */
1559 return(DB_SUCCESS);
1560 }
1561
1562 /* Test if subdir exists */
1563 os_file_type_t type;
1564 bool subdir_exists;
1565 bool success = os_file_status(subdir, &subdir_exists, &type);
1566
1567 if (success && !subdir_exists) {
1568
1569 /* Subdir does not exist, create it */
1570 dberr_t err = os_file_create_subdirs_if_needed(subdir);
1571
1572 if (err != DB_SUCCESS) {
1573
1574 ut_free(subdir);
1575
1576 return(err);
1577 }
1578
1579 success = os_file_create_directory(subdir, false);
1580 }
1581
1582 ut_free(subdir);
1583
1584 return(success ? DB_SUCCESS : DB_ERROR);
1585}
1586
1587#ifndef _WIN32
1588
1589/** Do the read/write
1590@param[in] request The IO context and type
1591@return the number of bytes read/written or negative value on error */
1592ssize_t
1593SyncFileIO::execute(const IORequest& request)
1594{
1595 ssize_t n_bytes;
1596
1597 if (request.is_read()) {
1598 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
1599 } else {
1600 ut_ad(request.is_write());
1601 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
1602 }
1603
1604 return(n_bytes);
1605}
1606/** Free storage space associated with a section of the file.
1607@param[in] fh Open file handle
1608@param[in] off Starting offset (SEEK_SET)
1609@param[in] len Size of the hole
1610@return DB_SUCCESS or error code */
1611static
1612dberr_t
1613os_file_punch_hole_posix(
1614 os_file_t fh,
1615 os_offset_t off,
1616 os_offset_t len)
1617{
1618
1619#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
1620 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
1621
1622 int ret = fallocate(fh, mode, off, len);
1623
1624 if (ret == 0) {
1625 return(DB_SUCCESS);
1626 }
1627
1628 if (errno == ENOTSUP) {
1629 return(DB_IO_NO_PUNCH_HOLE);
1630 }
1631
1632 ib::warn()
1633 << "fallocate("
1634 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
1635 << off << ", " << len << ") returned errno: "
1636 << errno;
1637
1638 return(DB_IO_ERROR);
1639
1640#elif defined(UNIV_SOLARIS)
1641
1642 // Use F_FREESP
1643
1644#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
1645
1646 return(DB_IO_NO_PUNCH_HOLE);
1647}
1648
1649#if defined(LINUX_NATIVE_AIO)
1650
/** Linux native AIO handler.
An instance is bound to one global segment. The constructor resolves
that segment to the AIO array and the local segment it belongs to;
poll() then waits for a request of that segment to complete. */
class LinuxAIOHandler {
public:
 /** Bind the handler to a global segment and look up the matching
 AIO array, local segment number and number of slots per segment.
 @param[in] global_segment The global segment */
 LinuxAIOHandler(ulint global_segment)
 :
 m_global_segment(global_segment)
 {
 /* Should never be doing Sync IO here. */
 ut_a(m_global_segment != ULINT_UNDEFINED);

 /* Find the array and the local segment. */

 m_segment = AIO::get_array_and_local_segment(
 &m_array, m_global_segment);

 m_n_slots = m_array->slots_per_segment();
 }

 /** Destructor */
 ~LinuxAIOHandler()
 {
 // No op
 }

 /**
 Process a Linux AIO request
 @param[out] m1 the messages passed with the
 @param[out] m2 AIO request; note that in case the
 AIO operation failed, these output
 parameters are valid and can be used to
 restart the operation.
 @param[out] request IO context
 @return DB_SUCCESS or error code */
 dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);

private:
 /** Resubmit an IO request that was only partially successful
 @param[in,out] slot Request to resubmit
 @return DB_SUCCESS, or DB_IO_PARTIAL_FAILED if the IO resubmit
 request failed */
 dberr_t resubmit(Slot* slot);

 /** Check if the AIO succeeded
 @param[in,out] slot The slot to check
 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
 DB_IO_ERROR on all other errors */
 dberr_t check_state(Slot* slot);

 /** @return true if the server has reached the
 SRV_SHUTDOWN_EXIT_THREADS state and the page cleaner is no longer
 active */
 bool is_shutdown() const
 {
 return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
 && !buf_page_cleaner_is_active);
 }

 /** Look for a reserved slot of this segment whose IO has completed.
 If a slot is returned, m_array->m_mutex is still held by the caller;
 if no slot was found then the m_array->m_mutex will be released.
 @param[out] n_pending The number of pending IOs
 @return NULL or a slot that has completed IO */
 Slot* find_completed_slot(ulint* n_pending);

 /** This is called from within the IO-thread. If there are no completed
 IO requests in the slot array, the thread calls this function to
 collect more requests from the Linux kernel.
 The IO-thread waits on io_getevents(), which is a blocking call, with
 a timeout value. Unless the system is very heavily loaded, keeping the
 IO-thread very busy, the IO-thread will spend most of its time waiting
 in this function.
 The IO-thread also exits in this function. It checks server status at
 each wakeup and that is why we use timed wait in io_getevents(). */
 void collect();

private:
 /** Slot array */
 AIO* m_array;

 /** Number of slots in the local segment */
 ulint m_n_slots;

 /** The local segment to check */
 ulint m_segment;

 /** The global segment */
 ulint m_global_segment;
};
1736
1737/** Resubmit an IO request that was only partially successful
1738@param[in,out] slot Request to resubmit
1739@return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
1740dberr_t
1741LinuxAIOHandler::resubmit(Slot* slot)
1742{
1743#ifdef UNIV_DEBUG
1744 /* Bytes already read/written out */
1745 ulint n_bytes = slot->ptr - slot->buf;
1746
1747 ut_ad(m_array->is_mutex_owned());
1748
1749 ut_ad(n_bytes < slot->original_len);
1750 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
1751 /* Partial read or write scenario */
1752 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
1753#endif /* UNIV_DEBUG */
1754
1755 slot->len -= slot->n_bytes;
1756 slot->ptr += slot->n_bytes;
1757 slot->offset += slot->n_bytes;
1758
1759 /* Resetting the bytes read/written */
1760 slot->n_bytes = 0;
1761 slot->io_already_done = false;
1762
1763 struct iocb* iocb = &slot->control;
1764
1765 if (slot->type.is_read()) {
1766
1767 io_prep_pread(
1768 iocb,
1769 slot->file,
1770 slot->ptr,
1771 slot->len,
1772 static_cast<off_t>(slot->offset));
1773 } else {
1774
1775 ut_a(slot->type.is_write());
1776
1777 io_prep_pwrite(
1778 iocb,
1779 slot->file,
1780 slot->ptr,
1781 slot->len,
1782 static_cast<off_t>(slot->offset));
1783 }
1784
1785 iocb->data = slot;
1786
1787 /* Resubmit an I/O request */
1788 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
1789
1790 if (ret < -1) {
1791 errno = -ret;
1792 }
1793
1794 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
1795}
1796
1797/** Check if the AIO succeeded
1798@param[in,out] slot The slot to check
1799@return DB_SUCCESS, DB_FAIL if the operation should be retried or
1800 DB_IO_ERROR on all other errors */
1801dberr_t
1802LinuxAIOHandler::check_state(Slot* slot)
1803{
1804 ut_ad(m_array->is_mutex_owned());
1805
1806 /* Note that it may be that there is more then one completed
1807 IO requests. We process them one at a time. We may have a case
1808 here to improve the performance slightly by dealing with all
1809 requests in one sweep. */
1810
1811 srv_set_io_thread_op_info(
1812 m_global_segment, "processing completed aio requests");
1813
1814 ut_ad(slot->io_already_done);
1815
1816 dberr_t err = DB_SUCCESS;
1817
1818 if (slot->ret == 0) {
1819
1820 err = AIOHandler::post_io_processing(slot);
1821
1822 } else {
1823 errno = -slot->ret;
1824
1825 /* os_file_handle_error does tell us if we should retry
1826 this IO. As it stands now, we don't do this retry when
1827 reaping requests from a different context than
1828 the dispatcher. This non-retry logic is the same for
1829 Windows and Linux native AIO.
1830 We should probably look into this to transparently
1831 re-submit the IO. */
1832 os_file_handle_error(slot->name, "Linux aio");
1833
1834 err = DB_IO_ERROR;
1835 }
1836
1837 return(err);
1838}
1839
1840/** If no slot was found then the m_array->m_mutex will be released.
1841@param[out] n_pending The number of pending IOs
1842@return NULL or a slot that has completed IO */
1843Slot*
1844LinuxAIOHandler::find_completed_slot(ulint* n_pending)
1845{
1846 ulint offset = m_n_slots * m_segment;
1847
1848 *n_pending = 0;
1849
1850 m_array->acquire();
1851
1852 Slot* slot = m_array->at(offset);
1853
1854 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
1855
1856 if (slot->is_reserved) {
1857
1858 ++*n_pending;
1859
1860 if (slot->io_already_done) {
1861
1862 /* Something for us to work on.
1863 Note: We don't release the mutex. */
1864 return(slot);
1865 }
1866 }
1867 }
1868
1869 m_array->release();
1870
1871 return(NULL);
1872}
1873
/** This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavily loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents().
Every event returned by io_getevents() is recorded in its slot (ret,
n_bytes, io_already_done) under the array mutex; the evaluation of the
result is left to the caller. */
void
LinuxAIOHandler::collect()
{
 ut_ad(m_n_slots > 0);
 ut_ad(m_array != NULL);
 ut_ad(m_segment < m_array->get_n_segments());

 /* Which io_context we are going to use. */
 io_context* io_ctx = m_array->io_ctx(m_segment);

 /* Starting point of the m_segment we will be working on. */
 ulint start_pos = m_segment * m_n_slots;

 /* End point. */
 ulint end_pos = start_pos + m_n_slots;

 for (;;) {
 struct io_event* events;

 /* Which part of event array we are going to work on. */
 events = m_array->io_events(m_segment * m_n_slots);

 /* Initialize the events. */
 memset(events, 0, sizeof(*events) * m_n_slots);

 /* The timeout value is arbitrary. We probably need
 to experiment with it a little. */
 struct timespec timeout;

 timeout.tv_sec = 0;
 timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;

 int ret;

 /* Wait for at least one and at most m_n_slots events */
 ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);

 /* A negative or zero ret skips this loop entirely */
 for (int i = 0; i < ret; ++i) {

 struct iocb* iocb;

 iocb = reinterpret_cast<struct iocb*>(events[i].obj);
 ut_a(iocb != NULL);

 /* The slot pointer was stored in iocb->data when the
 request was prepared */
 Slot* slot = reinterpret_cast<Slot*>(iocb->data);

 /* Some sanity checks. */
 ut_a(slot != NULL);
 ut_a(slot->is_reserved);

 /* We are not scribbling previous segment. */
 ut_a(slot->pos >= start_pos);

 /* We have not overstepped to next segment. */
 ut_a(slot->pos < end_pos);

 /* Deallocate unused blocks from file system.
 This is never done to page 0 or to log files.*/
 if (slot->offset > 0
 && !slot->type.is_log()
 && slot->type.is_write()
 && slot->type.punch_hole()) {

 slot->err = slot->type.punch_hole(
 slot->file,
 slot->offset, slot->len);
 } else {
 slot->err = DB_SUCCESS;
 }

 /* Mark this request as completed. The error handling
 will be done in the calling function. */
 m_array->acquire();

 slot->ret = events[i].res2;
 slot->io_already_done = true;
 slot->n_bytes = events[i].res;

 m_array->release();
 }

 /* Return to the caller as soon as something was collected,
 or when the server state checked below says to stop waiting */
 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
 || !buf_page_cleaner_is_active
 || ret > 0) {

 break;
 }

 /* This error handling is for any error in collecting the
 IO requests. The errors, if any, for any particular IO
 request are simply passed on to the calling routine. */

 switch (ret) {
 case -EAGAIN:
 /* Not enough resources! Try again. */

 /* fall through */
 case -EINTR:
 /* Interrupted! The behaviour in case of an interrupt.
 If we have some completed IOs available then the
 return code will be the number of IOs. We get EINTR
 only if there are no completed IOs and we have been
 interrupted. */

 /* fall through */
 case 0:
 /* No pending request! Go back and check again. */

 continue;
 }

 /* All other errors should cause a trap for now. */
 ib::fatal()
 << "Unexpected ret_code[" << ret
 << "] from io_getevents()!";

 break;
 }
}
2000
/** Process a Linux AIO request.
Loops until a request of this segment has completed (collecting events
from the kernel in between), resubmits requests that were only partially
transferred, and finally hands the messages and IO context of the
completed request to the caller and frees its slot.
@param[out] m1 the messages passed with the
@param[out] m2 AIO request; note that in case the
 AIO operation failed, these output
 parameters are valid and can be used to
 restart the operation.
@param[out] request IO context; not written when the function
 returns because of a shutdown
@return DB_SUCCESS or error code */
dberr_t
LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
{
 dberr_t err = DB_SUCCESS;
 Slot* slot;

 /* Loop until we have found a completed request. */
 for (;;) {

 ulint n_pending;

 slot = find_completed_slot(&n_pending);

 if (slot != NULL) {

 /* find_completed_slot() keeps the mutex when it
 returns a slot */
 ut_ad(m_array->is_mutex_owned());

 err = check_state(slot);

 /* DB_FAIL is not a hard error, we should retry */
 if (err != DB_FAIL) {
 /* Leave the loop with the mutex still held */
 break;
 }

 /* Partial IO, resubmit request for
 remaining bytes to read/write */
 err = resubmit(slot);

 if (err != DB_SUCCESS) {
 /* Leave the loop with the mutex still held */
 break;
 }

 /* The remainder is in flight again; let go of the
 mutex and look for a completed request anew */
 m_array->release();

 } else if (is_shutdown() && n_pending == 0) {

 /* There is no completed request. If there is
 no pending request at all, and the system is
 being shut down, exit. */

 *m1 = NULL;
 *m2 = NULL;

 return(DB_SUCCESS);

 } else {

 /* Wait for some request. Note that we return
 from wait if we have found a request. */

 srv_set_io_thread_op_info(
 m_global_segment,
 "waiting for completed aio requests");

 collect();
 }
 }

 if (err == DB_IO_PARTIAL_FAILED) {
 /* Aborting in case of submit failure */
 ib::fatal()
 << "Native Linux AIO interface. "
 "io_submit() call failed when "
 "resubmitting a partial I/O "
 "request on the file " << slot->name
 << ".";
 }

 *m1 = slot->m1;
 *m2 = slot->m2;

 *request = slot->type;

 /* Free the slot, then release the array mutex that has been held
 since find_completed_slot() returned the slot */
 m_array->release(slot);

 m_array->release();

 return(err);
}
2088
2089/** This function is only used in Linux native asynchronous i/o.
2090Waits for an aio operation to complete. This function is used to wait for
2091the completed requests. The aio array of pending requests is divided
2092into segments. The thread specifies which segment or slot it wants to wait
2093for. NOTE: this function will also take care of freeing the aio slot,
2094therefore no other thread is allowed to do the freeing!
2095
2096@param[in] global_seg segment number in the aio array
2097 to wait for; segment 0 is the ibuf
2098 i/o thread, segment 1 is log i/o thread,
2099 then follow the non-ibuf read threads,
2100 and the last are the non-ibuf write
2101 threads.
2102@param[out] m1 the messages passed with the
2103@param[out] m2 AIO request; note that in case the
2104 AIO operation failed, these output
2105 parameters are valid and can be used to
2106 restart the operation.
2107@param[out]xi request IO context
2108@return DB_SUCCESS if the IO was successful */
2109static
2110dberr_t
2111os_aio_linux_handler(
2112 ulint global_segment,
2113 fil_node_t** m1,
2114 void** m2,
2115 IORequest* request)
2116{
2117 return LinuxAIOHandler(global_segment).poll(m1, m2, request);
2118}
2119
2120/** Dispatch an AIO request to the kernel.
2121@param[in,out] slot an already reserved slot
2122@return true on success. */
2123bool
2124AIO::linux_dispatch(Slot* slot)
2125{
2126 ut_a(slot->is_reserved);
2127 ut_ad(slot->type.validate());
2128
2129 /* Find out what we are going to work with.
2130 The iocb struct is directly in the slot.
2131 The io_context is one per segment. */
2132
2133 ulint io_ctx_index;
2134 struct iocb* iocb = &slot->control;
2135
2136 io_ctx_index = (slot->pos * m_n_segments) / m_slots.size();
2137
2138 int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
2139
2140 /* io_submit() returns number of successfully queued requests
2141 or -errno. */
2142
2143 if (ret != 1) {
2144 errno = -ret;
2145 }
2146
2147 return(ret == 1);
2148}
2149
2150/** Creates an io_context for native linux AIO.
2151@param[in] max_events number of events
2152@param[out] io_ctx io_ctx to initialize.
2153@return true on success. */
2154bool
2155AIO::linux_create_io_ctx(
2156 unsigned max_events,
2157 io_context_t* io_ctx)
2158{
2159 ssize_t n_retries = 0;
2160
2161 for (;;) {
2162
2163 memset(io_ctx, 0x0, sizeof(*io_ctx));
2164
2165 /* Initialize the io_ctx. Tell it how many pending
2166 IO requests this context will handle. */
2167
2168 int ret = io_setup(max_events, io_ctx);
2169
2170 if (ret == 0) {
2171 /* Success. Return now. */
2172 return(true);
2173 }
2174
2175 /* If we hit EAGAIN we'll make a few attempts before failing. */
2176
2177 switch (ret) {
2178 case -EAGAIN:
2179 if (n_retries == 0) {
2180 /* First time around. */
2181 ib::warn()
2182 << "io_setup() failed with EAGAIN."
2183 " Will make "
2184 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2185 << " attempts before giving up.";
2186 }
2187
2188 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
2189
2190 ++n_retries;
2191
2192 ib::warn()
2193 << "io_setup() attempt "
2194 << n_retries << ".";
2195
2196 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
2197
2198 continue;
2199 }
2200
2201 /* Have tried enough. Better call it a day. */
2202 ib::error()
2203 << "io_setup() failed with EAGAIN after "
2204 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
2205 << " attempts.";
2206 break;
2207
2208 case -ENOSYS:
2209 ib::error()
2210 << "Linux Native AIO interface"
2211 " is not supported on this platform. Please"
2212 " check your OS documentation and install"
2213 " appropriate binary of InnoDB.";
2214
2215 break;
2216
2217 default:
2218 ib::error()
2219 << "Linux Native AIO setup"
2220 << " returned following error["
2221 << ret << "]";
2222 break;
2223 }
2224
2225 ib::info()
2226 << "You can disable Linux Native AIO by"
2227 " setting innodb_use_native_aio = 0 in my.cnf";
2228
2229 break;
2230 }
2231
2232 return(false);
2233}
2234
2235/** Checks if the system supports native linux aio. On some kernel
2236versions where native aio is supported it won't work on tmpfs. In such
2237cases we can't use native aio as it is not possible to mix simulated
2238and native aio.
2239@return: true if supported, false otherwise. */
2240bool
2241AIO::is_linux_native_aio_supported()
2242{
2243 int fd;
2244 io_context_t io_ctx;
2245 char name[1000];
2246
2247 if (!linux_create_io_ctx(1, &io_ctx)) {
2248
2249 /* The platform does not support native aio. */
2250
2251 return(false);
2252
2253 } else if (!srv_read_only_mode) {
2254
2255 /* Now check if tmpdir supports native aio ops. */
2256 fd = innobase_mysql_tmpfile(NULL);
2257
2258 if (fd < 0) {
2259 ib::warn()
2260 << "Unable to create temp file to check"
2261 " native AIO support.";
2262
2263 return(false);
2264 }
2265 } else {
2266
2267 os_normalize_path(srv_log_group_home_dir);
2268
2269 ulint dirnamelen = strlen(srv_log_group_home_dir);
2270
2271 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
2272
2273 memcpy(name, srv_log_group_home_dir, dirnamelen);
2274
2275 /* Add a path separator if needed. */
2276 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
2277
2278 name[dirnamelen++] = OS_PATH_SEPARATOR;
2279 }
2280
2281 strcpy(name + dirnamelen, "ib_logfile0");
2282
2283 fd = open(name, O_RDONLY | O_CLOEXEC);
2284
2285 if (fd == -1) {
2286
2287 ib::warn()
2288 << "Unable to open"
2289 << " \"" << name << "\" to check native"
2290 << " AIO read support.";
2291
2292 return(false);
2293 }
2294 }
2295
2296 struct io_event io_event;
2297
2298 memset(&io_event, 0x0, sizeof(io_event));
2299
2300 byte* buf = static_cast<byte*>(ut_malloc_nokey(srv_page_size * 2));
2301 byte* ptr = static_cast<byte*>(ut_align(buf, srv_page_size));
2302
2303 struct iocb iocb;
2304
2305 /* Suppress valgrind warning. */
2306 memset(buf, 0x00, srv_page_size * 2);
2307 memset(&iocb, 0x0, sizeof(iocb));
2308
2309 struct iocb* p_iocb = &iocb;
2310
2311 if (!srv_read_only_mode) {
2312
2313 io_prep_pwrite(p_iocb, fd, ptr, srv_page_size, 0);
2314
2315 } else {
2316 ut_a(srv_page_size >= 512);
2317 io_prep_pread(p_iocb, fd, ptr, 512, 0);
2318 }
2319
2320 int err = io_submit(io_ctx, 1, &p_iocb);
2321
2322 if (err >= 1) {
2323 /* Now collect the submitted IO request. */
2324 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
2325 }
2326
2327 ut_free(buf);
2328 close(fd);
2329
2330 switch (err) {
2331 case 1:
2332 return(true);
2333
2334 case -EINVAL:
2335 case -ENOSYS:
2336 ib::error()
2337 << "Linux Native AIO not supported. You can either"
2338 " move "
2339 << (srv_read_only_mode ? name : "tmpdir")
2340 << " to a file system that supports native"
2341 " AIO or you can set innodb_use_native_aio to"
2342 " FALSE to avoid this message.";
2343
2344 /* fall through. */
2345 default:
2346 ib::error()
2347 << "Linux Native AIO check on "
2348 << (srv_read_only_mode ? name : "tmpdir")
2349 << "returned error[" << -err << "]";
2350 }
2351
2352 return(false);
2353}
2354
2355#endif /* LINUX_NATIVE_AIO */
2356
2357/** Retrieves the last error number if an error occurs in a file io function.
2358The number should be retrieved before any other OS calls (because they may
2359overwrite the error number). If the number is not known to this program,
2360the OS error number + 100 is returned.
2361@param[in] report_all_errors true if we want an error message
2362 printed of all errors
2363@param[in] on_error_silent true then don't print any diagnostic
2364 to the log
2365@return error number, or OS error number + 100 */
2366static
2367ulint
2368os_file_get_last_error_low(
2369 bool report_all_errors,
2370 bool on_error_silent)
2371{
2372 int err = errno;
2373
2374 if (err == 0) {
2375 return(0);
2376 }
2377
2378 if (report_all_errors
2379 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
2380
2381 ib::error()
2382 << "Operating system error number "
2383 << err
2384 << " in a file operation.";
2385
2386 if (err == ENOENT) {
2387
2388 ib::error()
2389 << "The error means the system"
2390 " cannot find the path specified.";
2391
2392 if (srv_is_being_started) {
2393
2394 ib::error()
2395 << "If you are installing InnoDB,"
2396 " remember that you must create"
2397 " directories yourself, InnoDB"
2398 " does not create them.";
2399 }
2400 } else if (err == EACCES) {
2401
2402 ib::error()
2403 << "The error means mysqld does not have"
2404 " the access rights to the directory.";
2405
2406 } else {
2407 if (strerror(err) != NULL) {
2408
2409 ib::error()
2410 << "Error number " << err << " means '"
2411 << strerror(err) << "'";
2412 }
2413
2414 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
2415 }
2416 }
2417
2418 switch (err) {
2419 case ENOSPC:
2420 return(OS_FILE_DISK_FULL);
2421 case ENOENT:
2422 return(OS_FILE_NOT_FOUND);
2423 case EEXIST:
2424 return(OS_FILE_ALREADY_EXISTS);
2425 case EXDEV:
2426 case ENOTDIR:
2427 case EISDIR:
2428 return(OS_FILE_PATH_ERROR);
2429 case EAGAIN:
2430 if (srv_use_native_aio) {
2431 return(OS_FILE_AIO_RESOURCES_RESERVED);
2432 }
2433 break;
2434 case EINTR:
2435 if (srv_use_native_aio) {
2436 return(OS_FILE_AIO_INTERRUPTED);
2437 }
2438 break;
2439 case EACCES:
2440 return(OS_FILE_ACCESS_VIOLATION);
2441 }
2442 return(OS_FILE_ERROR_MAX + err);
2443}
2444
2445/** Wrapper to fsync(2) that retries the call on some errors.
2446Returns the value 0 if successful; otherwise the value -1 is returned and
2447the global variable errno is set to indicate the error.
2448@param[in] file open file handle
2449@return 0 if success, -1 otherwise */
2450static
2451int
2452os_file_fsync_posix(
2453 os_file_t file)
2454{
2455 ulint failures = 0;
2456
2457 for (;;) {
2458
2459 ++os_n_fsyncs;
2460
2461 int ret = fsync(file);
2462
2463 if (ret == 0) {
2464 return(ret);
2465 }
2466
2467 switch(errno) {
2468 case ENOLCK:
2469
2470 ++failures;
2471 ut_a(failures < 1000);
2472
2473 if (!(failures % 100)) {
2474
2475 ib::warn()
2476 << "fsync(): "
2477 << "No locks available; retrying";
2478 }
2479
2480 /* 0.2 sec */
2481 os_thread_sleep(200000);
2482 break;
2483
2484 case EIO:
2485
2486 ++failures;
2487 ut_a(failures < 1000);
2488
2489 if (!(failures % 100)) {
2490
2491 ib::warn()
2492 << "fsync(): "
2493 << "An error occurred during "
2494 << "synchronization,"
2495 << " retrying";
2496 }
2497
2498 /* 0.2 sec */
2499 os_thread_sleep(200000);
2500 break;
2501
2502 case EINTR:
2503
2504 ++failures;
2505 ut_a(failures < 2000);
2506 break;
2507
2508 default:
2509 ut_error;
2510 break;
2511 }
2512 }
2513
2514 ut_error;
2515
2516 return(-1);
2517}
2518
2519/** Check the existence and type of the given file.
2520@param[in] path path name of file
2521@param[out] exists true if the file exists
2522@param[out] type Type of the file, if it exists
2523@return true if call succeeded */
2524static
2525bool
2526os_file_status_posix(
2527 const char* path,
2528 bool* exists,
2529 os_file_type_t* type)
2530{
2531 struct stat statinfo;
2532
2533 int ret = stat(path, &statinfo);
2534
2535 *exists = !ret;
2536
2537 if (!ret) {
2538 /* file exists, everything OK */
2539
2540 } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
2541 /* file does not exist */
2542 return(true);
2543
2544 } else {
2545 /* file exists, but stat call failed */
2546 os_file_handle_error_no_exit(path, "stat", false);
2547 return(false);
2548 }
2549
2550 if (S_ISDIR(statinfo.st_mode)) {
2551 *type = OS_FILE_TYPE_DIR;
2552
2553 } else if (S_ISLNK(statinfo.st_mode)) {
2554 *type = OS_FILE_TYPE_LINK;
2555
2556 } else if (S_ISREG(statinfo.st_mode)) {
2557 *type = OS_FILE_TYPE_FILE;
2558 } else {
2559 *type = OS_FILE_TYPE_UNKNOWN;
2560 }
2561
2562 return(true);
2563}
2564
2565/** NOTE! Use the corresponding macro os_file_flush(), not directly this
2566function!
2567Flushes the write buffers of a given file to the disk.
2568@param[in] file handle to a file
2569@return true if success */
2570bool
2571os_file_flush_func(
2572 os_file_t file)
2573{
2574 int ret;
2575
2576 WAIT_ALLOW_WRITES();
2577 ret = os_file_fsync_posix(file);
2578
2579 if (ret == 0) {
2580 return(true);
2581 }
2582
2583 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
2584 we choose to ignore that error if we are using raw disks */
2585
2586 if (srv_start_raw_disk_in_use && errno == EINVAL) {
2587
2588 return(true);
2589 }
2590
2591 ib::error() << "The OS said file flush did not succeed";
2592
2593 os_file_handle_error(NULL, "flush");
2594
2595 /* It is a fatal error if a file flush does not succeed, because then
2596 the database can get corrupt on disk */
2597 ut_error;
2598
2599 return(false);
2600}
2601
2602/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
2603this function!
2604A simple function to open or create a file.
2605@param[in] name name of the file or path as a null-terminated
2606 string
2607@param[in] create_mode create mode
2608@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
2609@param[in] read_only if true, read only checks are enforced
2610@param[out] success true if succeed, false if error
2611@return handle to the file, not defined if error, error number
2612 can be retrieved with os_file_get_last_error */
2613pfs_os_file_t
2614os_file_create_simple_func(
2615 const char* name,
2616 ulint create_mode,
2617 ulint access_type,
2618 bool read_only,
2619 bool* success)
2620{
2621 pfs_os_file_t file;
2622
2623 *success = false;
2624
2625 int create_flag;
2626 const char* mode_str = NULL;
2627
2628 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
2629 WAIT_ALLOW_WRITES();
2630 }
2631
2632 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
2633 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
2634
2635 if (create_mode == OS_FILE_OPEN) {
2636 mode_str = "OPEN";
2637
2638 if (access_type == OS_FILE_READ_ONLY) {
2639
2640 create_flag = O_RDONLY;
2641
2642 } else if (read_only) {
2643
2644 create_flag = O_RDONLY;
2645
2646 } else {
2647 create_flag = O_RDWR;
2648 }
2649
2650 } else if (read_only) {
2651
2652 mode_str = "OPEN";
2653 create_flag = O_RDONLY;
2654
2655 } else if (create_mode == OS_FILE_CREATE) {
2656
2657 mode_str = "CREATE";
2658 create_flag = O_RDWR | O_CREAT | O_EXCL;
2659
2660 } else if (create_mode == OS_FILE_CREATE_PATH) {
2661
2662 mode_str = "CREATE PATH";
2663 /* Create subdirs along the path if needed. */
2664
2665 *success = os_file_create_subdirs_if_needed(name);
2666
2667 if (!*success) {
2668
2669 ib::error()
2670 << "Unable to create subdirectories '"
2671 << name << "'";
2672
2673 return(OS_FILE_CLOSED);
2674 }
2675
2676 create_flag = O_RDWR | O_CREAT | O_EXCL;
2677 create_mode = OS_FILE_CREATE;
2678 } else {
2679
2680 ib::error()
2681 << "Unknown file create mode ("
2682 << create_mode
2683 << " for file '" << name << "'";
2684
2685 return(OS_FILE_CLOSED);
2686 }
2687
2688 bool retry;
2689
2690 do {
2691 file = open(name, create_flag, os_innodb_umask);
2692
2693 if (file == -1) {
2694 *success = false;
2695 retry = os_file_handle_error(
2696 name,
2697 create_mode == OS_FILE_OPEN
2698 ? "open" : "create");
2699 } else {
2700 *success = true;
2701 retry = false;
2702 }
2703
2704 } while (retry);
2705
2706 /* This function is always called for data files, we should disable
2707 OS caching (O_DIRECT) here as we do in os_file_create_func(), so
2708 we open the same file in the same mode, see man page of open(2). */
2709 if (!srv_read_only_mode
2710 && *success
2711 && (srv_file_flush_method == SRV_O_DIRECT
2712 || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
2713
2714 os_file_set_nocache(file, name, mode_str);
2715 }
2716
2717#ifdef USE_FILE_LOCK
2718 if (!read_only
2719 && *success
2720 && (access_type == OS_FILE_READ_WRITE)
2721 && os_file_lock(file, name)) {
2722
2723 *success = false;
2724 close(file);
2725 file = -1;
2726 }
2727#endif /* USE_FILE_LOCK */
2728
2729 return(file);
2730}
2731
2732/** This function attempts to create a directory named pathname. The new
2733directory gets default permissions. On Unix the permissions are
2734(0770 & ~umask). If the directory exists already, nothing is done and
2735the call succeeds, unless the fail_if_exists arguments is true.
2736If another error occurs, such as a permission error, this does not crash,
2737but reports the error and returns false.
2738@param[in] pathname directory name as null-terminated string
2739@param[in] fail_if_exists if true, pre-existing directory is treated as
2740 an error.
2741@return true if call succeeds, false on error */
2742bool
2743os_file_create_directory(
2744 const char* pathname,
2745 bool fail_if_exists)
2746{
2747 int rcode;
2748
2749 WAIT_ALLOW_WRITES();
2750 rcode = mkdir(pathname, 0770);
2751
2752 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
2753 /* failure */
2754 os_file_handle_error_no_exit(pathname, "mkdir", false);
2755
2756 return(false);
2757 }
2758
2759 return(true);
2760}
2761
2762/**
2763The os_file_opendir() function opens a directory stream corresponding to the
2764directory named by the dirname argument. The directory stream is positioned
2765at the first entry. In both Unix and Windows we automatically skip the '.'
2766and '..' items at the start of the directory listing.
2767@param[in] dirname directory name; it must not contain a trailing
2768 '\' or '/'
2769@param[in] is_fatal true if we should treat an error as a fatal
2770 error; if we try to open symlinks then we do
2771 not wish a fatal error if it happens not to be
2772 a directory
2773@return directory stream, NULL if error */
2774os_file_dir_t
2775os_file_opendir(
2776 const char* dirname,
2777 bool error_is_fatal)
2778{
2779 os_file_dir_t dir;
2780 dir = opendir(dirname);
2781
2782 if (dir == NULL && error_is_fatal) {
2783 os_file_handle_error(dirname, "opendir");
2784 }
2785
2786 return(dir);
2787}
2788
2789/** Closes a directory stream.
2790@param[in] dir directory stream
2791@return 0 if success, -1 if failure */
2792int
2793os_file_closedir(
2794 os_file_dir_t dir)
2795{
2796 int ret = closedir(dir);
2797
2798 if (ret != 0) {
2799 os_file_handle_error_no_exit(NULL, "closedir", false);
2800 }
2801
2802 return(ret);
2803}
2804
2805/** This function returns information of the next file in the directory. We jump
2806over the '.' and '..' entries in the directory.
2807@param[in] dirname directory name or path
2808@param[in] dir directory stream
2809@param[out] info buffer where the info is returned
2810@return 0 if ok, -1 if error, 1 if at the end of the directory */
2811int
2812os_file_readdir_next_file(
2813 const char* dirname,
2814 os_file_dir_t dir,
2815 os_file_stat_t* info)
2816{
2817 struct dirent* ent;
2818 char* full_path;
2819 int ret;
2820 struct stat statinfo;
2821
2822next_file:
2823
2824 ent = readdir(dir);
2825
2826 if (ent == NULL) {
2827
2828 return(1);
2829 }
2830
2831 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
2832
2833 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
2834
2835 goto next_file;
2836 }
2837
2838 strcpy(info->name, ent->d_name);
2839
2840 full_path = static_cast<char*>(
2841 ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
2842
2843 sprintf(full_path, "%s/%s", dirname, ent->d_name);
2844
2845 ret = stat(full_path, &statinfo);
2846
2847 if (ret) {
2848
2849 if (errno == ENOENT) {
2850 /* readdir() returned a file that does not exist,
2851 it must have been deleted in the meantime. Do what
2852 would have happened if the file was deleted before
2853 readdir() - ignore and go to the next entry.
2854 If this is the last entry then info->name will still
2855 contain the name of the deleted file when this
2856 function returns, but this is not an issue since the
2857 caller shouldn't be looking at info when end of
2858 directory is returned. */
2859
2860 ut_free(full_path);
2861
2862 goto next_file;
2863 }
2864
2865 os_file_handle_error_no_exit(full_path, "stat", false);
2866
2867 ut_free(full_path);
2868
2869 return(-1);
2870 }
2871
2872 info->size = statinfo.st_size;
2873
2874 if (S_ISDIR(statinfo.st_mode)) {
2875 info->type = OS_FILE_TYPE_DIR;
2876 } else if (S_ISLNK(statinfo.st_mode)) {
2877 info->type = OS_FILE_TYPE_LINK;
2878 } else if (S_ISREG(statinfo.st_mode)) {
2879 info->type = OS_FILE_TYPE_FILE;
2880 } else {
2881 info->type = OS_FILE_TYPE_UNKNOWN;
2882 }
2883
2884 ut_free(full_path);
2885
2886 return(0);
2887}
2888
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
The OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT bits of
create_mode select the error handler used when open() fails; they are
stripped before the mode itself is interpreted.
@param[in]	name		name of the file or path as a null-terminated
				string
@param[in]	create_mode	create mode
@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
				is desired, OS_FILE_NORMAL, if any normal file;
				NOTE that it also depends on type, os_aio_..
				and srv_.. variables whether we really use async
				I/O or unbuffered I/O: look in the function
				source code for the exact rules
@param[in]	type		OS_DATA_FILE, OS_LOG_FILE or OS_DATA_TEMP_FILE
@param[in]	read_only	true, if read only checks should be enforced
@param[out]	success		true if succeeded
@return handle to the file, not defined if error, error number
	can be retrieved with os_file_get_last_error */
pfs_os_file_t
os_file_create_func(
	const char*	name,
	ulint		create_mode,
	ulint		purpose,
	ulint		type,
	bool		read_only,
	bool*		success)
{
	bool		on_error_no_exit;
	bool		on_error_silent;

	*success = false;

	/* Debug hook: when the named debug keyword is active, fail as if
	the disk were full. */
	DBUG_EXECUTE_IF(
		"ib_create_table_fail_disk_full",
		*success = false;
		errno = ENOSPC;
		return(OS_FILE_CLOSED);
	);

	int		create_flag;
	const char*	mode_str	= NULL;

	/* Remember the error handling flags, then remove them so that
	create_mode can be compared against the plain mode constants. */
	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
		? true : false;
	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
		? true : false;

	create_mode &= ulint(~(OS_FILE_ON_ERROR_NO_EXIT
			       | OS_FILE_ON_ERROR_SILENT));

	if (create_mode == OS_FILE_OPEN
	    || create_mode == OS_FILE_OPEN_RAW
	    || create_mode == OS_FILE_OPEN_RETRY) {

		mode_str = "OPEN";

		create_flag = read_only ? O_RDONLY : O_RDWR;

	} else if (read_only) {

		/* In read-only mode nothing is created or truncated. */
		mode_str = "OPEN";

		create_flag = O_RDONLY;

	} else if (create_mode == OS_FILE_CREATE) {

		mode_str = "CREATE";
		create_flag = O_RDWR | O_CREAT | O_EXCL;

	} else if (create_mode == OS_FILE_OVERWRITE) {

		mode_str = "OVERWRITE";
		create_flag = O_RDWR | O_CREAT | O_TRUNC;

	} else {
		ib::error()
			<< "Unknown file create mode (" << create_mode << ")"
			<< " for file '" << name << "'";

		return(OS_FILE_CLOSED);
	}

	ut_a(type == OS_LOG_FILE
	     || type == OS_DATA_FILE
	     || type == OS_DATA_TEMP_FILE);

	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);

#ifdef O_SYNC
	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
	O_SYNC because the datasync options seemed to corrupt files in 2001
	in both Linux and Solaris */

	if (!read_only
	    && type == OS_LOG_FILE
	    && srv_file_flush_method == SRV_O_DSYNC) {

		create_flag |= O_SYNC;
	}
#endif /* O_SYNC */

	os_file_t	file;
	bool		retry;

	/* Keep trying for as long as the error handler asks for a retry. */
	do {
		file = open(name, create_flag, os_innodb_umask);

		if (file == -1) {
			const char*	operation;

			operation = (create_mode == OS_FILE_CREATE
				     && !read_only) ? "create" : "open";

			*success = false;

			if (on_error_no_exit) {
				retry = os_file_handle_error_no_exit(
					name, operation, on_error_silent);
			} else {
				retry = os_file_handle_error(name, operation);
			}
		} else {
			*success = true;
			retry = false;
		}

	} while (retry);

	/* We disable OS caching (O_DIRECT) only on data files */
	if (!read_only
	    && *success
	    && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
	    && (srv_file_flush_method == SRV_O_DIRECT
		|| srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {

		os_file_set_nocache(file, name, mode_str);
	}

#ifdef USE_FILE_LOCK
	/* Files opened for writing (other than OS_FILE_OPEN_RAW) are
	locked. This block is entered only when os_file_lock() reports
	failure (a non-zero result). */
	if (!read_only
	    && *success
	    && create_mode != OS_FILE_OPEN_RAW
	    && os_file_lock(file, name)) {

		if (create_mode == OS_FILE_OPEN_RETRY) {

			ib::info()
				<< "Retrying to lock the first data file";

			/* Retry up to 100 times, sleeping 1000000
			microseconds before each attempt. */
			for (int i = 0; i < 100; i++) {
				os_thread_sleep(1000000);

				if (!os_file_lock(file, name)) {
					*success = true;
					return(file);
				}
			}

			ib::info()
				<< "Unable to open the first data file";
		}

		/* The lock could not be obtained: give the file up. */
		*success = false;
		close(file);
		file = -1;
	}
#endif /* USE_FILE_LOCK */

	return(file);
}
3058
3059/** NOTE! Use the corresponding macro
3060os_file_create_simple_no_error_handling(), not directly this function!
3061A simple function to open or create a file.
3062@param[in] name name of the file or path as a null-terminated
3063 string
3064@param[in] create_mode create mode
3065@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
3066 OS_FILE_READ_ALLOW_DELETE; the last option
3067 is used by a backup program reading the file
3068@param[in] read_only if true read only mode checks are enforced
3069@param[out] success true if succeeded
3070@return own: handle to the file, not defined if error, error number
3071 can be retrieved with os_file_get_last_error */
3072pfs_os_file_t
3073os_file_create_simple_no_error_handling_func(
3074 const char* name,
3075 ulint create_mode,
3076 ulint access_type,
3077 bool read_only,
3078 bool* success)
3079{
3080 os_file_t file;
3081 int create_flag;
3082
3083 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
3084 WAIT_ALLOW_WRITES();
3085 }
3086
3087 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3088 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3089
3090 *success = false;
3091
3092 if (create_mode == OS_FILE_OPEN) {
3093
3094 if (access_type == OS_FILE_READ_ONLY) {
3095
3096 create_flag = O_RDONLY;
3097
3098 } else if (read_only) {
3099
3100 create_flag = O_RDONLY;
3101
3102 } else {
3103
3104 ut_a(access_type == OS_FILE_READ_WRITE
3105 || access_type == OS_FILE_READ_ALLOW_DELETE);
3106
3107 create_flag = O_RDWR;
3108 }
3109
3110 } else if (read_only) {
3111
3112 create_flag = O_RDONLY;
3113
3114 } else if (create_mode == OS_FILE_CREATE) {
3115
3116 create_flag = O_RDWR | O_CREAT | O_EXCL;
3117
3118 } else {
3119
3120 ib::error()
3121 << "Unknown file create mode "
3122 << create_mode << " for file '" << name << "'";
3123
3124 return(OS_FILE_CLOSED);
3125 }
3126
3127 file = open(name, create_flag, os_innodb_umask);
3128
3129 *success = (file != -1);
3130
3131#ifdef USE_FILE_LOCK
3132 if (!read_only
3133 && *success
3134 && access_type == OS_FILE_READ_WRITE
3135 && os_file_lock(file, name)) {
3136
3137 *success = false;
3138 close(file);
3139 file = -1;
3140
3141 }
3142#endif /* USE_FILE_LOCK */
3143
3144 return(file);
3145}
3146
3147/** Deletes a file if it exists. The file has to be closed before calling this.
3148@param[in] name file path as a null-terminated string
3149@param[out] exist indicate if file pre-exist
3150@return true if success */
3151bool
3152os_file_delete_if_exists_func(
3153 const char* name,
3154 bool* exist)
3155{
3156 if (exist != NULL) {
3157 *exist = true;
3158 }
3159
3160 int ret;
3161 WAIT_ALLOW_WRITES();
3162
3163 ret = unlink(name);
3164
3165 if (ret != 0 && errno == ENOENT) {
3166 if (exist != NULL) {
3167 *exist = false;
3168 }
3169 } else if (ret != 0 && errno != ENOENT) {
3170 os_file_handle_error_no_exit(name, "delete", false);
3171
3172 return(false);
3173 }
3174
3175 return(true);
3176}
3177
3178/** Deletes a file. The file has to be closed before calling this.
3179@param[in] name file path as a null-terminated string
3180@return true if success */
3181bool
3182os_file_delete_func(
3183 const char* name)
3184{
3185 int ret;
3186 WAIT_ALLOW_WRITES();
3187
3188 ret = unlink(name);
3189
3190 if (ret != 0) {
3191 os_file_handle_error_no_exit(name, "delete", FALSE);
3192
3193 return(false);
3194 }
3195
3196 return(true);
3197}
3198
3199/** NOTE! Use the corresponding macro os_file_rename(), not directly this
3200function!
3201Renames a file (can also move it to another directory). It is safest that the
3202file is closed before calling this function.
3203@param[in] oldpath old file path as a null-terminated string
3204@param[in] newpath new file path
3205@return true if success */
3206bool
3207os_file_rename_func(
3208 const char* oldpath,
3209 const char* newpath)
3210{
3211#ifdef UNIV_DEBUG
3212 os_file_type_t type;
3213 bool exists;
3214
3215 /* New path must not exist. */
3216 ut_ad(os_file_status(newpath, &exists, &type));
3217 ut_ad(!exists);
3218
3219 /* Old path must exist. */
3220 ut_ad(os_file_status(oldpath, &exists, &type));
3221 ut_ad(exists);
3222#endif /* UNIV_DEBUG */
3223
3224 int ret;
3225 WAIT_ALLOW_WRITES();
3226
3227 ret = rename(oldpath, newpath);
3228
3229 if (ret != 0) {
3230 os_file_handle_error_no_exit(oldpath, "rename", FALSE);
3231
3232 return(false);
3233 }
3234
3235 return(true);
3236}
3237
3238/** NOTE! Use the corresponding macro os_file_close(), not directly this
3239function!
3240Closes a file handle. In case of error, error number can be retrieved with
3241os_file_get_last_error.
3242@param[in] file Handle to close
3243@return true if success */
3244bool
3245os_file_close_func(
3246 os_file_t file)
3247{
3248 int ret = close(file);
3249
3250 if (ret == -1) {
3251 os_file_handle_error(NULL, "close");
3252
3253 return(false);
3254 }
3255
3256 return(true);
3257}
3258
3259/** Gets a file size.
3260@param[in] file handle to an open file
3261@return file size, or (os_offset_t) -1 on failure */
3262os_offset_t
3263os_file_get_size(os_file_t file)
3264{
3265 struct stat statbuf;
3266 return fstat(file, &statbuf) ? os_offset_t(-1) : statbuf.st_size;
3267}
3268
3269/** Gets a file size.
3270@param[in] filename Full path to the filename to check
3271@return file size if OK, else set m_total_size to ~0 and m_alloc_size to
3272 errno */
3273os_file_size_t
3274os_file_get_size(
3275 const char* filename)
3276{
3277 struct stat s;
3278 os_file_size_t file_size;
3279
3280 int ret = stat(filename, &s);
3281
3282 if (ret == 0) {
3283 file_size.m_total_size = s.st_size;
3284 /* st_blocks is in 512 byte sized blocks */
3285 file_size.m_alloc_size = s.st_blocks * 512;
3286 } else {
3287 file_size.m_total_size = ~0U;
3288 file_size.m_alloc_size = (os_offset_t) errno;
3289 }
3290
3291 return(file_size);
3292}
3293
3294/** This function returns information about the specified file
3295@param[in] path pathname of the file
3296@param[out] stat_info information of a file in a directory
3297@param[in,out] statinfo information of a file in a directory
3298@param[in] check_rw_perm for testing whether the file can be opened
3299 in RW mode
3300@param[in] read_only if true read only mode checks are enforced
3301@return DB_SUCCESS if all OK */
3302static
3303dberr_t
3304os_file_get_status_posix(
3305 const char* path,
3306 os_file_stat_t* stat_info,
3307 struct stat* statinfo,
3308 bool check_rw_perm,
3309 bool read_only)
3310{
3311 int ret = stat(path, statinfo);
3312
3313 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
3314 /* file does not exist */
3315
3316 return(DB_NOT_FOUND);
3317
3318 } else if (ret) {
3319 /* file exists, but stat call failed */
3320
3321 os_file_handle_error_no_exit(path, "stat", false);
3322
3323 return(DB_FAIL);
3324 }
3325
3326 switch (statinfo->st_mode & S_IFMT) {
3327 case S_IFDIR:
3328 stat_info->type = OS_FILE_TYPE_DIR;
3329 break;
3330 case S_IFLNK:
3331 stat_info->type = OS_FILE_TYPE_LINK;
3332 break;
3333 case S_IFBLK:
3334 /* Handle block device as regular file. */
3335 case S_IFCHR:
3336 /* Handle character device as regular file. */
3337 case S_IFREG:
3338 stat_info->type = OS_FILE_TYPE_FILE;
3339 break;
3340 default:
3341 stat_info->type = OS_FILE_TYPE_UNKNOWN;
3342 }
3343
3344 stat_info->size = statinfo->st_size;
3345 stat_info->block_size = statinfo->st_blksize;
3346 stat_info->alloc_size = statinfo->st_blocks * 512;
3347
3348 if (check_rw_perm
3349 && (stat_info->type == OS_FILE_TYPE_FILE
3350 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
3351
3352 stat_info->rw_perm = !access(path, read_only
3353 ? R_OK : R_OK | W_OK);
3354 }
3355
3356 return(DB_SUCCESS);
3357}
3358
3359/** Truncates a file to a specified size in bytes.
3360Do nothing if the size to preserve is greater or equal to the current
3361size of the file.
3362@param[in] pathname file path
3363@param[in] file file to be truncated
3364@param[in] size size to preserve in bytes
3365@return true if success */
3366static
3367bool
3368os_file_truncate_posix(
3369 const char* pathname,
3370 os_file_t file,
3371 os_offset_t size)
3372{
3373 int res = ftruncate(file, size);
3374
3375 if (res == -1) {
3376
3377 bool retry;
3378
3379 retry = os_file_handle_error_no_exit(
3380 pathname, "truncate", false);
3381
3382 if (retry) {
3383 ib::warn()
3384 << "Truncate failed for '"
3385 << pathname << "'";
3386 }
3387 }
3388
3389 return(res == 0);
3390}
3391
3392/** Truncates a file at its current position.
3393@return true if success */
3394bool
3395os_file_set_eof(
3396 FILE* file) /*!< in: file to be truncated */
3397{
3398 WAIT_ALLOW_WRITES();
3399 return(!ftruncate(fileno(file), ftell(file)));
3400}
3401
3402#else /* !_WIN32 */
3403
3404#include <WinIoCtl.h>
3405
/*
Windows : Handling synchronous IO on files opened asynchronously.

If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
a completion port, then every IO on this file would normally be enqueued to the
completion port. Sometimes however we would like to do a synchronous IO. This is
possible if we initialize overlapped.hEvent with a valid event and set its
lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)

We'll create this special event once for each thread and store it in thread
local storage.
*/
3418
3419
3420static void __stdcall win_free_syncio_event(void *data) {
3421 if (data) {
3422 CloseHandle((HANDLE)data);
3423 }
3424}
3425
3426
3427/*
3428Retrieve per-thread event for doing synchronous io on asyncronously opened files
3429*/
3430static HANDLE win_get_syncio_event()
3431{
3432 HANDLE h;
3433
3434 h = (HANDLE)FlsGetValue(fls_sync_io);
3435 if (h) {
3436 return h;
3437 }
3438 h = CreateEventA(NULL, FALSE, FALSE, NULL);
3439 ut_a(h);
3440 /* Set low-order bit to keeps I/O completion from being queued */
3441 h = (HANDLE)((uintptr_t)h | 1);
3442 FlsSetValue(fls_sync_io, h);
3443 return h;
3444}
3445
3446
3447/** Do the read/write
3448@param[in] request The IO context and type
3449@return the number of bytes read/written or negative value on error */
3450ssize_t
3451SyncFileIO::execute(const IORequest& request)
3452{
3453 OVERLAPPED seek;
3454
3455 memset(&seek, 0x0, sizeof(seek));
3456
3457 seek.hEvent = win_get_syncio_event();
3458 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
3459 seek.OffsetHigh = (DWORD) (m_offset >> 32);
3460
3461 BOOL ret;
3462 DWORD n_bytes;
3463
3464 if (request.is_read()) {
3465 ret = ReadFile(m_fh, m_buf,
3466 static_cast<DWORD>(m_n), NULL, &seek);
3467
3468 } else {
3469 ut_ad(request.is_write());
3470 ret = WriteFile(m_fh, m_buf,
3471 static_cast<DWORD>(m_n), NULL, &seek);
3472 }
3473 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3474 /* Wait for async io to complete */
3475 ret = GetOverlappedResult(m_fh, &seek, &n_bytes, TRUE);
3476 }
3477
3478 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
3479}
3480
3481/** Do the read/write
3482@param[in,out] slot The IO slot, it has the IO context
3483@return the number of bytes read/written or negative value on error */
3484ssize_t
3485SyncFileIO::execute(Slot* slot)
3486{
3487 BOOL ret;
3488 slot->control.hEvent = win_get_syncio_event();
3489 if (slot->type.is_read()) {
3490
3491 ret = ReadFile(
3492 slot->file, slot->ptr, slot->len,
3493 NULL, &slot->control);
3494
3495 } else {
3496 ut_ad(slot->type.is_write());
3497
3498 ret = WriteFile(
3499 slot->file, slot->ptr, slot->len,
3500 NULL, &slot->control);
3501
3502 }
3503 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
3504 /* Wait for async io to complete */
3505 ret = GetOverlappedResult(slot->file, &slot->control, &slot->n_bytes, TRUE);
3506 }
3507
3508 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
3509}
3510
3511/* Startup/shutdown */
3512
3513struct WinIoInit
3514{
3515 WinIoInit() {
3516 fls_sync_io = FlsAlloc(win_free_syncio_event);
3517 ut_a(fls_sync_io != FLS_OUT_OF_INDEXES);
3518 }
3519
3520 ~WinIoInit() {
3521 FlsFree(fls_sync_io);
3522 }
3523};
3524
3525/* Ensures proper initialization and shutdown */
3526static WinIoInit win_io_init;
3527
3528
3529/** Free storage space associated with a section of the file.
3530@param[in] fh Open file handle
3531@param[in] page_size Tablespace page size
3532@param[in] block_size File system block size
3533@param[in] off Starting offset (SEEK_SET)
3534@param[in] len Size of the hole
3535@return 0 on success or errno */
3536static
3537dberr_t
3538os_file_punch_hole_win32(
3539 os_file_t fh,
3540 os_offset_t off,
3541 os_offset_t len)
3542{
3543 FILE_ZERO_DATA_INFORMATION punch;
3544
3545 punch.FileOffset.QuadPart = off;
3546 punch.BeyondFinalZero.QuadPart = off + len;
3547
3548 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
3549 therefore we pass a dummy parameter. */
3550 DWORD temp;
3551 BOOL success = os_win32_device_io_control(
3552 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
3553 NULL, 0, &temp);
3554
3555 return(success ? DB_SUCCESS: DB_IO_NO_PUNCH_HOLE);
3556}
3557
3558/** Check the existence and type of the given file.
3559@param[in] path path name of file
3560@param[out] exists true if the file exists
3561@param[out] type Type of the file, if it exists
3562@return true if call succeeded */
3563static
3564bool
3565os_file_status_win32(
3566 const char* path,
3567 bool* exists,
3568 os_file_type_t* type)
3569{
3570 int ret;
3571 struct _stat64 statinfo;
3572
3573 ret = _stat64(path, &statinfo);
3574
3575 *exists = !ret;
3576
3577 if (!ret) {
3578 /* file exists, everything OK */
3579
3580 } else if (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG) {
3581 /* file does not exist */
3582 return(true);
3583
3584 } else {
3585 /* file exists, but stat call failed */
3586 os_file_handle_error_no_exit(path, "stat", false);
3587 return(false);
3588 }
3589
3590 if (_S_IFDIR & statinfo.st_mode) {
3591 *type = OS_FILE_TYPE_DIR;
3592
3593 } else if (_S_IFREG & statinfo.st_mode) {
3594 *type = OS_FILE_TYPE_FILE;
3595
3596 } else {
3597 *type = OS_FILE_TYPE_UNKNOWN;
3598 }
3599
3600 return(true);
3601}
3602
3603/** NOTE! Use the corresponding macro os_file_flush(), not directly this
3604function!
3605Flushes the write buffers of a given file to the disk.
3606@param[in] file handle to a file
3607@return true if success */
3608bool
3609os_file_flush_func(
3610 os_file_t file)
3611{
3612 ++os_n_fsyncs;
3613
3614 BOOL ret = FlushFileBuffers(file);
3615
3616 if (ret) {
3617 return(true);
3618 }
3619
3620 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
3621 actually a raw device, we choose to ignore that error if we are using
3622 raw disks */
3623
3624 if (srv_start_raw_disk_in_use && GetLastError()
3625 == ERROR_INVALID_FUNCTION) {
3626 return(true);
3627 }
3628
3629 os_file_handle_error(NULL, "flush");
3630
3631 /* It is a fatal error if a file flush does not succeed, because then
3632 the database can get corrupt on disk */
3633 ut_error;
3634
3635 return(false);
3636}
3637
3638/** Retrieves the last error number if an error occurs in a file io function.
3639The number should be retrieved before any other OS calls (because they may
3640overwrite the error number). If the number is not known to this program,
3641the OS error number + 100 is returned.
3642@param[in] report_all_errors true if we want an error message printed
3643 of all errors
3644@param[in] on_error_silent true then don't print any diagnostic
3645 to the log
3646@return error number, or OS error number + 100 */
3647static
3648ulint
3649os_file_get_last_error_low(
3650 bool report_all_errors,
3651 bool on_error_silent)
3652{
3653 ulint err = (ulint) GetLastError();
3654
3655 if (err == ERROR_SUCCESS) {
3656 return(0);
3657 }
3658
3659 if (report_all_errors
3660 || (!on_error_silent
3661 && err != ERROR_DISK_FULL
3662 && err != ERROR_FILE_EXISTS)) {
3663
3664 ib::error()
3665 << "Operating system error number " << err
3666 << " in a file operation.";
3667
3668 if (err == ERROR_PATH_NOT_FOUND) {
3669 ib::error()
3670 << "The error means the system"
3671 " cannot find the path specified.";
3672
3673 if (srv_is_being_started) {
3674 ib::error()
3675 << "If you are installing InnoDB,"
3676 " remember that you must create"
3677 " directories yourself, InnoDB"
3678 " does not create them.";
3679 }
3680
3681 } else if (err == ERROR_ACCESS_DENIED) {
3682
3683 ib::error()
3684 << "The error means mysqld does not have"
3685 " the access rights to"
3686 " the directory. It may also be"
3687 " you have created a subdirectory"
3688 " of the same name as a data file.";
3689
3690 } else if (err == ERROR_SHARING_VIOLATION
3691 || err == ERROR_LOCK_VIOLATION) {
3692
3693 ib::error()
3694 << "The error means that another program"
3695 " is using InnoDB's files."
3696 " This might be a backup or antivirus"
3697 " software or another instance"
3698 " of MySQL."
3699 " Please close it to get rid of this error.";
3700
3701 } else if (err == ERROR_WORKING_SET_QUOTA
3702 || err == ERROR_NO_SYSTEM_RESOURCES) {
3703
3704 ib::error()
3705 << "The error means that there are no"
3706 " sufficient system resources or quota to"
3707 " complete the operation.";
3708
3709 } else if (err == ERROR_OPERATION_ABORTED) {
3710
3711 ib::error()
3712 << "The error means that the I/O"
3713 " operation has been aborted"
3714 " because of either a thread exit"
3715 " or an application request."
3716 " Retry attempt is made.";
3717 } else {
3718
3719 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3720 }
3721 }
3722
3723 if (err == ERROR_FILE_NOT_FOUND) {
3724 return(OS_FILE_NOT_FOUND);
3725 } else if (err == ERROR_DISK_FULL) {
3726 return(OS_FILE_DISK_FULL);
3727 } else if (err == ERROR_FILE_EXISTS) {
3728 return(OS_FILE_ALREADY_EXISTS);
3729 } else if (err == ERROR_SHARING_VIOLATION
3730 || err == ERROR_LOCK_VIOLATION) {
3731 return(OS_FILE_SHARING_VIOLATION);
3732 } else if (err == ERROR_WORKING_SET_QUOTA
3733 || err == ERROR_NO_SYSTEM_RESOURCES) {
3734 return(OS_FILE_INSUFFICIENT_RESOURCE);
3735 } else if (err == ERROR_OPERATION_ABORTED) {
3736 return(OS_FILE_OPERATION_ABORTED);
3737 } else if (err == ERROR_ACCESS_DENIED) {
3738 return(OS_FILE_ACCESS_VIOLATION);
3739 }
3740
3741 return(OS_FILE_ERROR_MAX + err);
3742}
3743
3744
3745/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3746this function!
3747A simple function to open or create a file.
3748@param[in] name name of the file or path as a null-terminated
3749 string
3750@param[in] create_mode create mode
3751@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3752@param[in] read_only if true read only mode checks are enforced
3753@param[out] success true if succeed, false if error
3754@return handle to the file, not defined if error, error number
3755 can be retrieved with os_file_get_last_error */
3756pfs_os_file_t
3757os_file_create_simple_func(
3758 const char* name,
3759 ulint create_mode,
3760 ulint access_type,
3761 bool read_only,
3762 bool* success)
3763{
3764 os_file_t file;
3765
3766 *success = false;
3767
3768 DWORD access;
3769 DWORD create_flag;
3770 DWORD attributes = 0;
3771
3772 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3773 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3774 ut_ad(srv_operation == SRV_OPERATION_NORMAL);
3775
3776 if (create_mode == OS_FILE_OPEN) {
3777
3778 create_flag = OPEN_EXISTING;
3779
3780 } else if (read_only) {
3781
3782 create_flag = OPEN_EXISTING;
3783
3784 } else if (create_mode == OS_FILE_CREATE) {
3785
3786 create_flag = CREATE_NEW;
3787
3788 } else if (create_mode == OS_FILE_CREATE_PATH) {
3789
3790 /* Create subdirs along the path if needed. */
3791 *success = os_file_create_subdirs_if_needed(name);
3792
3793 if (!*success) {
3794
3795 ib::error()
3796 << "Unable to create subdirectories '"
3797 << name << "'";
3798
3799 return(OS_FILE_CLOSED);
3800 }
3801
3802 create_flag = CREATE_NEW;
3803 create_mode = OS_FILE_CREATE;
3804
3805 } else {
3806
3807 ib::error()
3808 << "Unknown file create mode ("
3809 << create_mode << ") for file '"
3810 << name << "'";
3811
3812 return(OS_FILE_CLOSED);
3813 }
3814
3815 if (access_type == OS_FILE_READ_ONLY) {
3816
3817 access = GENERIC_READ;
3818
3819 } else if (read_only) {
3820
3821 ib::info()
3822 << "Read only mode set. Unable to"
3823 " open file '" << name << "' in RW mode, "
3824 << "trying RO mode";
3825
3826 access = GENERIC_READ;
3827
3828 } else if (access_type == OS_FILE_READ_WRITE) {
3829
3830 access = GENERIC_READ | GENERIC_WRITE;
3831
3832 } else {
3833
3834 ib::error()
3835 << "Unknown file access type (" << access_type << ") "
3836 "for file '" << name << "'";
3837
3838 return(OS_FILE_CLOSED);
3839 }
3840
3841 bool retry;
3842
3843 do {
3844 /* Use default security attributes and no template file. */
3845
3846 file = CreateFile(
3847 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
3848 create_flag, attributes, NULL);
3849
3850 if (file == INVALID_HANDLE_VALUE) {
3851
3852 *success = false;
3853
3854 retry = os_file_handle_error(
3855 name, create_mode == OS_FILE_OPEN ?
3856 "open" : "create");
3857
3858 } else {
3859
3860 retry = false;
3861
3862 *success = true;
3863 }
3864
3865 } while (retry);
3866
3867 return(file);
3868}
3869
3870/** This function attempts to create a directory named pathname. The new
3871directory gets default permissions. On Unix the permissions are
3872(0770 & ~umask). If the directory exists already, nothing is done and
3873the call succeeds, unless the fail_if_exists arguments is true.
3874If another error occurs, such as a permission error, this does not crash,
3875but reports the error and returns false.
3876@param[in] pathname directory name as null-terminated string
3877@param[in] fail_if_exists if true, pre-existing directory is treated
3878 as an error.
3879@return true if call succeeds, false on error */
3880bool
3881os_file_create_directory(
3882 const char* pathname,
3883 bool fail_if_exists)
3884{
3885 BOOL rcode;
3886
3887 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
3888 if (!(rcode != 0
3889 || (GetLastError() == ERROR_ALREADY_EXISTS
3890 && !fail_if_exists))) {
3891
3892 os_file_handle_error_no_exit(
3893 pathname, "CreateDirectory", false);
3894
3895 return(false);
3896 }
3897
3898 return(true);
3899}
3900
3901/** The os_file_opendir() function opens a directory stream corresponding to the
3902directory named by the dirname argument. The directory stream is positioned
3903at the first entry. In both Unix and Windows we automatically skip the '.'
3904and '..' items at the start of the directory listing.
3905@param[in] dirname directory name; it must not contain a trailing
3906 '\' or '/'
3907@param[in] is_fatal true if we should treat an error as a fatal
3908 error; if we try to open symlinks then we do
3909 not wish a fatal error if it happens not to
3910 be a directory
3911@return directory stream, NULL if error */
3912os_file_dir_t
3913os_file_opendir(
3914 const char* dirname,
3915 bool error_is_fatal)
3916{
3917 os_file_dir_t dir;
3918 LPWIN32_FIND_DATA lpFindFileData;
3919 char path[OS_FILE_MAX_PATH + 3];
3920
3921 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
3922
3923 strcpy(path, dirname);
3924 strcpy(path + strlen(path), "\\*");
3925
3926 /* Note that in Windows opening the 'directory stream' also retrieves
3927 the first entry in the directory. Since it is '.', that is no problem,
3928 as we will skip over the '.' and '..' entries anyway. */
3929
3930 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
3931 ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
3932
3933 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
3934
3935 ut_free(lpFindFileData);
3936
3937 if (dir == INVALID_HANDLE_VALUE) {
3938
3939 if (error_is_fatal) {
3940 os_file_handle_error(dirname, "opendir");
3941 }
3942
3943 return(NULL);
3944 }
3945
3946 return(dir);
3947}
3948
3949/** Closes a directory stream.
3950@param[in] dir directory stream
3951@return 0 if success, -1 if failure */
3952int
3953os_file_closedir(
3954 os_file_dir_t dir)
3955{
3956 BOOL ret;
3957
3958 ret = FindClose(dir);
3959
3960 if (!ret) {
3961 os_file_handle_error_no_exit(NULL, "closedir", false);
3962
3963 return(-1);
3964 }
3965
3966 return(0);
3967}
3968
3969/** This function returns information of the next file in the directory. We
3970jump over the '.' and '..' entries in the directory.
3971@param[in] dirname directory name or path
3972@param[in] dir directory stream
3973@param[out] info buffer where the info is returned
3974@return 0 if ok, -1 if error, 1 if at the end of the directory */
3975int
3976os_file_readdir_next_file(
3977 const char* dirname,
3978 os_file_dir_t dir,
3979 os_file_stat_t* info)
3980{
3981 BOOL ret;
3982 int status;
3983 WIN32_FIND_DATA find_data;
3984
3985next_file:
3986
3987 ret = FindNextFile(dir, &find_data);
3988
3989 if (ret > 0) {
3990
3991 const char* name;
3992
3993 name = static_cast<const char*>(find_data.cFileName);
3994
3995 ut_a(strlen(name) < OS_FILE_MAX_PATH);
3996
3997 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
3998
3999 goto next_file;
4000 }
4001
4002 strcpy(info->name, name);
4003
4004 info->size = find_data.nFileSizeHigh;
4005 info->size <<= 32;
4006 info->size |= find_data.nFileSizeLow;
4007
4008 if (find_data.dwFileAttributes
4009 & FILE_ATTRIBUTE_REPARSE_POINT) {
4010
4011 /* TODO: test Windows symlinks */
4012 /* TODO: MySQL has apparently its own symlink
4013 implementation in Windows, dbname.sym can
4014 redirect a database directory:
4015 REFMAN "windows-symbolic-links.html" */
4016
4017 info->type = OS_FILE_TYPE_LINK;
4018
4019 } else if (find_data.dwFileAttributes
4020 & FILE_ATTRIBUTE_DIRECTORY) {
4021
4022 info->type = OS_FILE_TYPE_DIR;
4023
4024 } else {
4025
4026 /* It is probably safest to assume that all other
4027 file types are normal. Better to check them rather
4028 than blindly skip them. */
4029
4030 info->type = OS_FILE_TYPE_FILE;
4031 }
4032
4033 status = 0;
4034
4035 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
4036
4037 status = 1;
4038
4039 } else {
4040
4041 os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
4042
4043 status = -1;
4044 }
4045
4046 return(status);
4047}
4048
4049/** NOTE! Use the corresponding macro os_file_create(), not directly
4050this function!
4051Opens an existing file or creates a new.
4052@param[in] name name of the file or path as a null-terminated
4053 string
4054@param[in] create_mode create mode
4055@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
4056 is desired, OS_FILE_NORMAL, if any normal file;
4057 NOTE that it also depends on type, os_aio_..
4058 and srv_.. variables whether we really use async
4059 I/O or unbuffered I/O: look in the function
4060 source code for the exact rules
4061@param[in] type OS_DATA_FILE or OS_LOG_FILE
4062@param[in] success true if succeeded
4063@return handle to the file, not defined if error, error number
4064 can be retrieved with os_file_get_last_error */
4065pfs_os_file_t
4066os_file_create_func(
4067 const char* name,
4068 ulint create_mode,
4069 ulint purpose,
4070 ulint type,
4071 bool read_only,
4072 bool* success)
4073{
4074 os_file_t file;
4075 bool retry;
4076 bool on_error_no_exit;
4077 bool on_error_silent;
4078
4079 *success = false;
4080
4081 DBUG_EXECUTE_IF(
4082 "ib_create_table_fail_disk_full",
4083 *success = false;
4084 SetLastError(ERROR_DISK_FULL);
4085 return(OS_FILE_CLOSED);
4086 );
4087
4088 DWORD create_flag;
4089 DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL
4090 ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
4091 : FILE_SHARE_READ;
4092
4093 if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) {
4094 WAIT_ALLOW_WRITES();
4095 }
4096
4097 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4098 ? true : false;
4099
4100 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4101 ? true : false;
4102
4103 create_mode &= ~(OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT);
4104
4105 if (create_mode == OS_FILE_OPEN_RAW) {
4106
4107 ut_a(!read_only);
4108
4109 create_flag = OPEN_EXISTING;
4110
4111 /* On Windows Physical devices require admin privileges and
4112 have to have the write-share mode set. See the remarks
4113 section for the CreateFile() function documentation in MSDN. */
4114
4115 share_mode |= FILE_SHARE_WRITE;
4116
4117 } else if (create_mode == OS_FILE_OPEN
4118 || create_mode == OS_FILE_OPEN_RETRY) {
4119
4120 create_flag = OPEN_EXISTING;
4121
4122 } else if (read_only) {
4123
4124 create_flag = OPEN_EXISTING;
4125
4126 } else if (create_mode == OS_FILE_CREATE) {
4127
4128 create_flag = CREATE_NEW;
4129
4130 } else if (create_mode == OS_FILE_OVERWRITE) {
4131
4132 create_flag = CREATE_ALWAYS;
4133
4134 } else {
4135 ib::error()
4136 << "Unknown file create mode (" << create_mode << ") "
4137 << " for file '" << name << "'";
4138
4139 return(OS_FILE_CLOSED);
4140 }
4141
4142 DWORD attributes = 0;
4143
4144 if (purpose == OS_FILE_AIO) {
4145
4146#ifdef WIN_ASYNC_IO
4147 /* If specified, use asynchronous (overlapped) io and no
4148 buffering of writes in the OS */
4149
4150 if (srv_use_native_aio) {
4151 attributes |= FILE_FLAG_OVERLAPPED;
4152 }
4153#endif /* WIN_ASYNC_IO */
4154
4155 } else if (purpose == OS_FILE_NORMAL) {
4156
4157 /* Use default setting. */
4158
4159 } else {
4160
4161 ib::error()
4162 << "Unknown purpose flag (" << purpose << ") "
4163 << "while opening file '" << name << "'";
4164
4165 return(OS_FILE_CLOSED);
4166 }
4167
4168 if (type == OS_LOG_FILE) {
4169 /* There is not reason to use buffered write to logs.*/
4170 attributes |= FILE_FLAG_NO_BUFFERING;
4171 }
4172
4173 switch (srv_file_flush_method)
4174 {
4175 case SRV_O_DSYNC:
4176 if (type == OS_LOG_FILE) {
4177 /* Map O_SYNC to FILE_WRITE_THROUGH */
4178 attributes |= FILE_FLAG_WRITE_THROUGH;
4179 }
4180 break;
4181
4182 case SRV_O_DIRECT_NO_FSYNC:
4183 case SRV_O_DIRECT:
4184 if (type == OS_DATA_FILE) {
4185 attributes |= FILE_FLAG_NO_BUFFERING;
4186 }
4187 break;
4188
4189 case SRV_ALL_O_DIRECT_FSYNC:
4190 /*Traditional Windows behavior, no buffering for any files.*/
4191 attributes |= FILE_FLAG_NO_BUFFERING;
4192 break;
4193
4194 case SRV_FSYNC:
4195 case SRV_LITTLESYNC:
4196 break;
4197
4198 case SRV_NOSYNC:
4199 /* Let Windows cache manager handle all writes.*/
4200 attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
4201 break;
4202
4203 default:
4204 ut_a(false); /* unknown flush mode.*/
4205 }
4206
4207
4208 // TODO: Create a bug, this looks wrong. The flush log
4209 // parameter is dynamic.
4210 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
4211 /* Do not use unbuffered i/o for the log files because
4212 value 2 denotes that we do not flush the log at every
4213 commit, but only once per second */
4214 attributes &= ~(FILE_FLAG_WRITE_THROUGH | FILE_FLAG_NO_BUFFERING);
4215 }
4216
4217
4218 DWORD access = GENERIC_READ;
4219
4220 if (!read_only) {
4221 access |= GENERIC_WRITE;
4222 }
4223
4224 do {
4225 /* Use default security attributes and no template file. */
4226 file = CreateFile(
4227 (LPCTSTR) name, access, share_mode, NULL,
4228 create_flag, attributes, NULL);
4229
4230 if (file == INVALID_HANDLE_VALUE) {
4231 const char* operation;
4232
4233 operation = (create_mode == OS_FILE_CREATE
4234 && !read_only)
4235 ? "create" : "open";
4236
4237 *success = false;
4238
4239 if (on_error_no_exit) {
4240 retry = os_file_handle_error_no_exit(
4241 name, operation, on_error_silent);
4242 } else {
4243 retry = os_file_handle_error(name, operation);
4244 }
4245 } else {
4246
4247 retry = false;
4248
4249 *success = true;
4250
4251 if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) {
4252 /* Bind the file handle to completion port. Completion port
4253 might not be created yet, in some stages of backup, but
4254 must always be there for the server.*/
4255 HANDLE port =(type == OS_LOG_FILE)?
4256 log_completion_port : data_completion_port;
4257 ut_a(port || srv_operation != SRV_OPERATION_NORMAL);
4258 if (port) {
4259 ut_a(CreateIoCompletionPort(file, port, 0, 0));
4260 }
4261 }
4262 }
4263 } while (retry);
4264
4265 return(file);
4266}
4267
4268/** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
4269not directly this function!
4270A simple function to open or create a file.
4271@param[in] name name of the file or path as a null-terminated
4272 string
4273@param[in] create_mode create mode
4274@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4275 OS_FILE_READ_ALLOW_DELETE; the last option is
4276 used by a backup program reading the file
4277@param[out] success true if succeeded
4278@return own: handle to the file, not defined if error, error number
4279 can be retrieved with os_file_get_last_error */
4280pfs_os_file_t
4281os_file_create_simple_no_error_handling_func(
4282 const char* name,
4283 ulint create_mode,
4284 ulint access_type,
4285 bool read_only,
4286 bool* success)
4287{
4288 os_file_t file;
4289
4290 *success = false;
4291
4292 DWORD access;
4293 DWORD create_flag;
4294 DWORD attributes = 0;
4295 DWORD share_mode = srv_operation != SRV_OPERATION_NORMAL
4296 ? FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE
4297 : FILE_SHARE_READ;
4298
4299 ut_a(name);
4300
4301 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4302 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4303
4304 if (create_mode == OS_FILE_OPEN) {
4305
4306 create_flag = OPEN_EXISTING;
4307
4308 } else if (read_only) {
4309
4310 create_flag = OPEN_EXISTING;
4311
4312 } else if (create_mode == OS_FILE_CREATE) {
4313
4314 create_flag = CREATE_NEW;
4315
4316 } else {
4317
4318 ib::error()
4319 << "Unknown file create mode (" << create_mode << ") "
4320 << " for file '" << name << "'";
4321
4322 return(OS_FILE_CLOSED);
4323 }
4324
4325 if (access_type == OS_FILE_READ_ONLY) {
4326
4327 access = GENERIC_READ;
4328
4329 } else if (read_only) {
4330
4331 access = GENERIC_READ;
4332
4333 } else if (access_type == OS_FILE_READ_WRITE) {
4334
4335 access = GENERIC_READ | GENERIC_WRITE;
4336
4337 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
4338
4339 ut_a(!read_only);
4340
4341 access = GENERIC_READ;
4342
4343 /*!< A backup program has to give mysqld the maximum
4344 freedom to do what it likes with the file */
4345
4346 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE
4347 | FILE_SHARE_READ;
4348 } else {
4349
4350 ib::error()
4351 << "Unknown file access type (" << access_type << ") "
4352 << "for file '" << name << "'";
4353
4354 return(OS_FILE_CLOSED);
4355 }
4356
4357 file = CreateFile((LPCTSTR) name,
4358 access,
4359 share_mode,
4360 NULL, // Security attributes
4361 create_flag,
4362 attributes,
4363 NULL); // No template file
4364
4365 *success = (file != INVALID_HANDLE_VALUE);
4366
4367 return(file);
4368}
4369
4370/** Deletes a file if it exists. The file has to be closed before calling this.
4371@param[in] name file path as a null-terminated string
4372@param[out] exist indicate if file pre-exist
4373@return true if success */
4374bool
4375os_file_delete_if_exists_func(
4376 const char* name,
4377 bool* exist)
4378{
4379 ulint count = 0;
4380
4381 if (exist != NULL) {
4382 *exist = true;
4383 }
4384
4385 for (;;) {
4386 /* In Windows, deleting an .ibd file may fail if
4387 the file is being accessed by an external program,
4388 such as a backup tool. */
4389
4390 bool ret = DeleteFile((LPCTSTR) name);
4391
4392 if (ret) {
4393 return(true);
4394 }
4395
4396 DWORD lasterr = GetLastError();
4397
4398 if (lasterr == ERROR_FILE_NOT_FOUND
4399 || lasterr == ERROR_PATH_NOT_FOUND) {
4400
4401 /* the file does not exist, this not an error */
4402 if (exist != NULL) {
4403 *exist = false;
4404 }
4405
4406 return(true);
4407 }
4408
4409 ++count;
4410
4411 if (count > 100 && 0 == (count % 10)) {
4412
4413 /* Print error information */
4414 os_file_get_last_error(true);
4415
4416 ib::warn() << "Delete of file '" << name << "' failed.";
4417 }
4418
4419 /* Sleep for a second */
4420 os_thread_sleep(1000000);
4421
4422 if (count > 2000) {
4423
4424 return(false);
4425 }
4426 }
4427}
4428
4429/** Deletes a file. The file has to be closed before calling this.
4430@param[in] name File path as NUL terminated string
4431@return true if success */
4432bool
4433os_file_delete_func(
4434 const char* name)
4435{
4436 ulint count = 0;
4437
4438 for (;;) {
4439 /* In Windows, deleting an .ibd file may fail if
4440 the file is being accessed by an external program,
4441 such as a backup tool. */
4442
4443 BOOL ret = DeleteFile((LPCTSTR) name);
4444
4445 if (ret) {
4446 return(true);
4447 }
4448
4449 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
4450 /* If the file does not exist, we classify this as
4451 a 'mild' error and return */
4452
4453 return(false);
4454 }
4455
4456 ++count;
4457
4458 if (count > 100 && 0 == (count % 10)) {
4459
4460 /* print error information */
4461 os_file_get_last_error(true);
4462
4463 ib::warn()
4464 << "Cannot delete file '" << name << "'. Is "
4465 << "another program accessing it?";
4466 }
4467
4468 /* sleep for a second */
4469 os_thread_sleep(1000000);
4470
4471 if (count > 2000) {
4472
4473 return(false);
4474 }
4475 }
4476
4477 ut_error;
4478 return(false);
4479}
4480
4481/** NOTE! Use the corresponding macro os_file_rename(), not directly this
4482function!
4483Renames a file (can also move it to another directory). It is safest that the
4484file is closed before calling this function.
4485@param[in] oldpath old file path as a null-terminated string
4486@param[in] newpath new file path
4487@return true if success */
4488bool
4489os_file_rename_func(
4490 const char* oldpath,
4491 const char* newpath)
4492{
4493#ifdef UNIV_DEBUG
4494 os_file_type_t type;
4495 bool exists;
4496
4497 /* New path must not exist. */
4498 ut_ad(os_file_status(newpath, &exists, &type));
4499 ut_ad(!exists);
4500
4501 /* Old path must exist. */
4502 ut_ad(os_file_status(oldpath, &exists, &type));
4503 ut_ad(exists);
4504#endif /* UNIV_DEBUG */
4505
4506 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
4507 return(true);
4508 }
4509
4510 os_file_handle_error_no_exit(oldpath, "rename", false);
4511
4512 return(false);
4513}
4514
4515/** NOTE! Use the corresponding macro os_file_close(), not directly
4516this function!
4517Closes a file handle. In case of error, error number can be retrieved with
4518os_file_get_last_error.
4519@param[in,own] file Handle to a file
4520@return true if success */
4521bool
4522os_file_close_func(
4523 os_file_t file)
4524{
4525 ut_a(file);
4526
4527 if (CloseHandle(file)) {
4528 return(true);
4529 }
4530
4531 os_file_handle_error(NULL, "close");
4532
4533 return(false);
4534}
4535
4536/** Gets a file size.
4537@param[in] file Handle to a file
4538@return file size, or (os_offset_t) -1 on failure */
4539os_offset_t
4540os_file_get_size(
4541 os_file_t file)
4542{
4543 DWORD high;
4544 DWORD low = GetFileSize(file, &high);
4545
4546 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
4547 return((os_offset_t) -1);
4548 }
4549
4550 return(os_offset_t(low | (os_offset_t(high) << 32)));
4551}
4552
4553/** Gets a file size.
4554@param[in] filename Full path to the filename to check
4555@return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4556 errno */
4557os_file_size_t
4558os_file_get_size(
4559 const char* filename)
4560{
4561 struct __stat64 s;
4562 os_file_size_t file_size;
4563
4564 int ret = _stat64(filename, &s);
4565
4566 if (ret == 0) {
4567
4568 file_size.m_total_size = s.st_size;
4569
4570 DWORD low_size;
4571 DWORD high_size;
4572
4573 low_size = GetCompressedFileSize(filename, &high_size);
4574
4575 if (low_size != INVALID_FILE_SIZE) {
4576
4577 file_size.m_alloc_size = high_size;
4578 file_size.m_alloc_size <<= 32;
4579 file_size.m_alloc_size |= low_size;
4580
4581 } else {
4582 ib::error()
4583 << "GetCompressedFileSize("
4584 << filename << ", ..) failed.";
4585
4586 file_size.m_alloc_size = (os_offset_t) -1;
4587 }
4588 } else {
4589 file_size.m_total_size = ~0;
4590 file_size.m_alloc_size = (os_offset_t) ret;
4591 }
4592
4593 return(file_size);
4594}
4595
4596/** This function returns information about the specified file
4597@param[in] path pathname of the file
4598@param[out] stat_info information of a file in a directory
4599@param[in,out] statinfo information of a file in a directory
4600@param[in] check_rw_perm for testing whether the file can be opened
4601 in RW mode
4602@param[in] read_only true if the file is opened in read-only mode
4603@return DB_SUCCESS if all OK */
static
dberr_t
os_file_get_status_win32(
	const char*	path,
	os_file_stat_t* stat_info,
	struct _stat64*	statinfo,
	bool		check_rw_perm,
	bool		read_only)
{
	/* Fill *statinfo for the caller; its st_mode decides the type below. */
	int	ret = _stat64(path, statinfo);

	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
		/* file does not exist */

		return(DB_NOT_FOUND);

	} else if (ret) {
		/* file exists, but stat call failed */

		os_file_handle_error_no_exit(path, "STAT", false);

		return(DB_FAIL);

	} else if (_S_IFDIR & statinfo->st_mode) {

		/* For a directory only the type is filled in here. */
		stat_info->type = OS_FILE_TYPE_DIR;

	} else if (_S_IFREG & statinfo->st_mode) {

		/* Requested access for the permission probe: read access,
		plus write access unless read_only was requested. */
		DWORD	access = GENERIC_READ;

		if (!read_only) {
			access |= GENERIC_WRITE;
		}

		stat_info->type = OS_FILE_TYPE_FILE;

		/* Check if we can open the file with the access computed
		above. stat_info->rw_perm is only written when
		check_rw_perm is set. */

		if (check_rw_perm) {
			HANDLE	fh;

			fh = CreateFile(
				(LPCTSTR) path,		// File to open
				access,
				FILE_SHARE_READ | FILE_SHARE_WRITE
				| FILE_SHARE_DELETE,	// Full sharing
				NULL,			// Default security
				OPEN_EXISTING,		// Existing file only
				FILE_ATTRIBUTE_NORMAL,	// Normal file
				NULL);			// No attr. template

			if (fh == INVALID_HANDLE_VALUE) {
				stat_info->rw_perm = false;
			} else {
				stat_info->rw_perm = true;
				/* The handle was opened only as a probe. */
				CloseHandle(fh);
			}
		}

		/* Resolve the volume that holds the file so that its
		cluster geometry can be queried below. */
		char	volname[MAX_PATH];
		BOOL	result = GetVolumePathName(path, volname, MAX_PATH);

		if (!result) {

			ib::error()
				<< "os_file_get_status_win32: "
				<< "Failed to get the volume path name for: "
				<< path
				<< "- OS error number " << GetLastError();

			return(DB_FAIL);
		}

		DWORD	sectorsPerCluster;
		DWORD	bytesPerSector;
		DWORD	numberOfFreeClusters;
		DWORD	totalNumberOfClusters;

		result = GetDiskFreeSpace(
			(LPCSTR) volname,
			&sectorsPerCluster,
			&bytesPerSector,
			&numberOfFreeClusters,
			&totalNumberOfClusters);

		if (!result) {

			ib::error()
				<< "GetDiskFreeSpace(" << volname << ",...) "
				<< "failed "
				<< "- OS error number " << GetLastError();

			return(DB_FAIL);
		}

		/* The block size is reported as the cluster size in bytes. */
		stat_info->block_size = bytesPerSector * sectorsPerCluster;
	} else {
		/* Neither a directory nor a regular file. */
		stat_info->type = OS_FILE_TYPE_UNKNOWN;
	}

	return(DB_SUCCESS);
}
4707
4708/**
4709Sets a sparse flag on Windows file.
4710@param[in] file file handle
4711@return true on success, false on error
4712*/
4713#include <versionhelpers.h>
4714bool os_file_set_sparse_win32(os_file_t file, bool is_sparse)
4715{
4716 if (!is_sparse && !IsWindows8OrGreater()) {
4717 /* Cannot unset sparse flag on older Windows.
4718 Until Windows8 it is documented to produce unpredictable results,
4719 if there are unallocated ranges in file.*/
4720 return false;
4721 }
4722 DWORD temp;
4723 FILE_SET_SPARSE_BUFFER sparse_buffer;
4724 sparse_buffer.SetSparse = is_sparse;
4725 return os_win32_device_io_control(file,
4726 FSCTL_SET_SPARSE, &sparse_buffer, sizeof(sparse_buffer), 0, 0,&temp);
4727}
4728
4729
4730/**
4731Change file size on Windows.
4732
4733If file is extended, the bytes between old and new EOF
4734are zeros.
4735
4736If file is sparse, "virtual" block is added at the end of
4737allocated area.
4738
4739If file is normal, file system allocates storage.
4740
4741@param[in] pathname file path
4742@param[in] file file handle
4743@param[in] size size to preserve in bytes
4744@return true if success */
4745bool
4746os_file_change_size_win32(
4747 const char* pathname,
4748 os_file_t file,
4749 os_offset_t size)
4750{
4751 LARGE_INTEGER length;
4752
4753 length.QuadPart = size;
4754
4755 BOOL success = SetFilePointerEx(file, length, NULL, FILE_BEGIN);
4756
4757 if (!success) {
4758 os_file_handle_error_no_exit(
4759 pathname, "SetFilePointerEx", false);
4760 } else {
4761 success = SetEndOfFile(file);
4762 if (!success) {
4763 os_file_handle_error_no_exit(
4764 pathname, "SetEndOfFile", false);
4765 }
4766 }
4767 return(success);
4768}
4769
4770/** Truncates a file at its current position.
4771@param[in] file Handle to be truncated
4772@return true if success */
4773bool
4774os_file_set_eof(
4775 FILE* file)
4776{
4777 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
4778
4779 return(SetEndOfFile(h));
4780}
4781
4782/** This function can be called if one wants to post a batch of reads and
4783prefers an i/o-handler thread to handle them all at once later. You must
4784call os_aio_simulated_wake_handler_threads later to ensure the threads
4785are not left sleeping! */
4786void
4787os_aio_simulated_put_read_threads_to_sleep()
4788{
4789 AIO::simulated_put_read_threads_to_sleep();
4790}
4791
4792/** This function can be called if one wants to post a batch of reads and
4793prefers an i/o-handler thread to handle them all at once later. You must
4794call os_aio_simulated_wake_handler_threads later to ensure the threads
4795are not left sleeping! */
4796void
4797AIO::simulated_put_read_threads_to_sleep()
4798{
4799 /* The idea of putting background IO threads to sleep is only for
4800 Windows when using simulated AIO. Windows XP seems to schedule
4801 background threads too eagerly to allow for coalescing during
4802 readahead requests. */
4803
4804 if (srv_use_native_aio) {
4805 /* We do not use simulated AIO: do nothing */
4806
4807 return;
4808 }
4809
4810 os_aio_recommend_sleep_for_read_threads = true;
4811
4812 for (ulint i = 0; i < os_aio_n_segments; i++) {
4813 AIO* array;
4814
4815 get_array_and_local_segment(&array, i);
4816
4817 if (array == s_reads) {
4818
4819 os_event_reset(os_aio_segment_wait_events[i]);
4820 }
4821 }
4822}
4823
4824#endif /* !_WIN32*/
4825
/** Does a synchronous read or write depending upon the type specified.
In case of partial reads/writes the function tries
NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
@param[in]	in_type		IO flags; a local copy is used for the operation
@param[in]	file		handle to an open file
@param[in,out]	buf		buffer to read into or write from
@param[in]	n		number of bytes to transfer, starting from offset
@param[in]	offset		file offset from the start where to read/write
@param[out]	err		DB_SUCCESS or the punch hole result when all
				n bytes were transferred, DB_IO_ERROR otherwise
@return n when all bytes were transferred; otherwise the number of bytes
transferred before the function gave up */
static MY_ATTRIBUTE((warn_unused_result))
ssize_t
os_file_io(
	const IORequest&in_type,
	os_file_t	file,
	void*		buf,
	ulint		n,
	os_offset_t	offset,
	dberr_t*	err)
{
	ssize_t		original_n = ssize_t(n);
	IORequest	type = in_type;
	ssize_t		bytes_returned = 0;

	SyncFileIO	sync_file_io(file, buf, n, offset);

	for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {

		ssize_t	n_bytes = sync_file_io.execute(type);

		/* Check for a hard error. Not much we can do now.
		Leaving the loop reports DB_IO_ERROR below. */
		if (n_bytes < 0) {

			break;

		} else if (n_bytes + bytes_returned == ssize_t(n)) {

			/* Everything has been transferred. */
			bytes_returned += n_bytes;

			/* For a non-log write at a nonzero offset with
			punch hole enabled, *err is the outcome of the
			punch hole call; otherwise it is DB_SUCCESS. */
			if (offset > 0
			    && !type.is_log()
			    && type.is_write()
			    && type.punch_hole()) {
				*err = type.punch_hole(file, offset, n);

			} else {
				*err = DB_SUCCESS;
			}

			return(original_n);
		}

		/* Handle partial read/write. */

		ut_ad(ulint(n_bytes + bytes_returned) < n);

		bytes_returned += n_bytes;

		if (!type.is_partial_io_warning_disabled()) {

			const char*	op = type.is_read()
				? "read" : "written";

			ib::warn()
				<< n
				<< " bytes should have been " << op << ". Only "
				<< bytes_returned
				<< " bytes " << op << ". Retrying"
				<< " for the remaining bytes.";
		}

		/* Advance the offset and buffer by n_bytes */
		sync_file_io.advance(n_bytes);
	}

	/* Reached after a hard error or once the retries are used up. */
	*err = DB_IO_ERROR;

	if (!type.is_partial_io_warning_disabled()) {
		ib::warn()
			<< "Retry attempts for "
			<< (type.is_read() ? "reading" : "writing")
			<< " partial data failed.";
	}

	return(bytes_returned);
}
4912
4913/** Does a synchronous write operation in Posix.
4914@param[in] type IO context
4915@param[in] file handle to an open file
4916@param[out] buf buffer from which to write
4917@param[in] n number of bytes to read, starting from offset
4918@param[in] offset file offset from the start where to read
4919@param[out] err DB_SUCCESS or error code
4920@return number of bytes written, -1 if error */
4921static MY_ATTRIBUTE((warn_unused_result))
4922ssize_t
4923os_file_pwrite(
4924 const IORequest& type,
4925 os_file_t file,
4926 const byte* buf,
4927 ulint n,
4928 os_offset_t offset,
4929 dberr_t* err)
4930{
4931 ut_ad(type.validate());
4932 ut_ad(type.is_write());
4933
4934 ++os_n_file_writes;
4935
4936 const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
4937 MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
4938 ssize_t n_bytes = os_file_io(type, file, const_cast<byte*>(buf),
4939 n, offset, err);
4940 MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
4941
4942 return(n_bytes);
4943}
4944
4945/** NOTE! Use the corresponding macro os_file_write(), not directly
4946Requests a synchronous write operation.
4947@param[in] type IO flags
4948@param[in] file handle to an open file
4949@param[out] buf buffer from which to write
4950@param[in] offset file offset from the start where to read
4951@param[in] n number of bytes to read, starting from offset
4952@return DB_SUCCESS if request was successful, false if fail */
4953dberr_t
4954os_file_write_func(
4955 const IORequest& type,
4956 const char* name,
4957 os_file_t file,
4958 const void* buf,
4959 os_offset_t offset,
4960 ulint n)
4961{
4962 dberr_t err;
4963
4964 ut_ad(type.validate());
4965 ut_ad(n > 0);
4966
4967 WAIT_ALLOW_WRITES();
4968
4969 ssize_t n_bytes = os_file_pwrite(type, file, (byte*)buf, n, offset, &err);
4970
4971 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
4972
4973 ib::error()
4974 << "Write to file " << name << " failed at offset "
4975 << offset << ", " << n
4976 << " bytes should have been written,"
4977 " only " << n_bytes << " were written."
4978 " Operating system error number " << errno << "."
4979 " Check that your OS and file system"
4980 " support files of this size."
4981 " Check also that the disk is not full"
4982 " or a disk quota exceeded.";
4983
4984 if (strerror(errno) != NULL) {
4985
4986 ib::error()
4987 << "Error number " << errno
4988 << " means '" << strerror(errno) << "'";
4989 }
4990
4991 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4992
4993 os_has_said_disk_full = true;
4994 }
4995
4996 return(err);
4997}
4998
4999/** Does a synchronous read operation in Posix.
5000@param[in] type IO flags
5001@param[in] file handle to an open file
5002@param[out] buf buffer where to read
5003@param[in] offset file offset from the start where to read
5004@param[in] n number of bytes to read, starting from offset
5005@param[out] err DB_SUCCESS or error code
5006@return number of bytes read, -1 if error */
5007static MY_ATTRIBUTE((warn_unused_result))
5008ssize_t
5009os_file_pread(
5010 const IORequest& type,
5011 os_file_t file,
5012 void* buf,
5013 ulint n,
5014 os_offset_t offset,
5015 dberr_t* err)
5016{
5017 ut_ad(type.is_read());
5018
5019 ++os_n_file_reads;
5020
5021 const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
5022 MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
5023 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
5024 MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
5025
5026 return(n_bytes);
5027}
5028
5029/** Requests a synchronous positioned read operation.
5030@return DB_SUCCESS if request was successful, false if fail
5031@param[in] type IO flags
5032@param[in] file handle to an open file
5033@param[out] buf buffer where to read
5034@param[in] offset file offset from the start where to read
5035@param[in] n number of bytes to read, starting from offset
5036@param[out] o number of bytes actually read
5037@param[in] exit_on_err if true then exit on error
5038@return DB_SUCCESS or error code */
5039static MY_ATTRIBUTE((warn_unused_result))
5040dberr_t
5041os_file_read_page(
5042 const IORequest& type,
5043 os_file_t file,
5044 void* buf,
5045 os_offset_t offset,
5046 ulint n,
5047 ulint* o,
5048 bool exit_on_err)
5049{
5050 dberr_t err;
5051
5052 os_bytes_read_since_printout += n;
5053
5054 ut_ad(type.validate());
5055 ut_ad(n > 0);
5056
5057 ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
5058
5059 if (o) {
5060 *o = n_bytes;
5061 }
5062
5063 if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
5064 return err;
5065 }
5066
5067 ib::error() << "Tried to read " << n << " bytes at offset "
5068 << offset << ", but was only able to read " << n_bytes;
5069
5070 if (!os_file_handle_error_cond_exit(
5071 NULL, "read", exit_on_err, false)) {
5072 ib::fatal()
5073 << "Cannot read from file. OS error number "
5074 << errno << ".";
5075 }
5076
5077 if (err == DB_SUCCESS) {
5078 err = DB_IO_ERROR;
5079 }
5080
5081 return err;
5082}
5083
5084/** Retrieves the last error number if an error occurs in a file io function.
5085The number should be retrieved before any other OS calls (because they may
5086overwrite the error number). If the number is not known to this program,
5087the OS error number + 100 is returned.
5088@param[in] report_all_errors true if we want an error printed
5089 for all errors
5090@return error number, or OS error number + 100 */
5091ulint
5092os_file_get_last_error(
5093 bool report_all_errors)
5094{
5095 return(os_file_get_last_error_low(report_all_errors, false));
5096}
5097
5098/** Handle errors for file operations.
5099@param[in] name name of a file or NULL
5100@param[in] operation operation
5101@param[in] should_abort whether to abort on an unknown error
5102@param[in] on_error_silent whether to suppress reports of non-fatal errors
5103@return true if we should retry the operation */
5104static MY_ATTRIBUTE((warn_unused_result))
5105bool
5106os_file_handle_error_cond_exit(
5107 const char* name,
5108 const char* operation,
5109 bool should_abort,
5110 bool on_error_silent)
5111{
5112 ulint err;
5113
5114 err = os_file_get_last_error_low(false, on_error_silent);
5115
5116 switch (err) {
5117 case OS_FILE_DISK_FULL:
5118 /* We only print a warning about disk full once */
5119
5120 if (os_has_said_disk_full) {
5121
5122 return(false);
5123 }
5124
5125 /* Disk full error is reported irrespective of the
5126 on_error_silent setting. */
5127
5128 if (name) {
5129
5130 ib::error()
5131 << "Encountered a problem with file '"
5132 << name << "'";
5133 }
5134
5135 ib::error()
5136 << "Disk is full. Try to clean the disk to free space.";
5137
5138 os_has_said_disk_full = true;
5139
5140 return(false);
5141
5142 case OS_FILE_AIO_RESOURCES_RESERVED:
5143 case OS_FILE_AIO_INTERRUPTED:
5144
5145 return(true);
5146
5147 case OS_FILE_PATH_ERROR:
5148 case OS_FILE_ALREADY_EXISTS:
5149 case OS_FILE_ACCESS_VIOLATION:
5150
5151 return(false);
5152
5153 case OS_FILE_SHARING_VIOLATION:
5154
5155 os_thread_sleep(10000000); /* 10 sec */
5156 return(true);
5157
5158 case OS_FILE_OPERATION_ABORTED:
5159 case OS_FILE_INSUFFICIENT_RESOURCE:
5160
5161 os_thread_sleep(100000); /* 100 ms */
5162 return(true);
5163
5164 default:
5165
5166 /* If it is an operation that can crash on error then it
5167 is better to ignore on_error_silent and print an error message
5168 to the log. */
5169
5170 if (should_abort || !on_error_silent) {
5171 ib::error() << "File "
5172 << (name != NULL ? name : "(unknown)")
5173 << ": '" << operation << "'"
5174 " returned OS error " << err << "."
5175 << (should_abort
5176 ? " Cannot continue operation" : "");
5177 }
5178
5179 if (should_abort) {
5180 abort();
5181 }
5182 }
5183
5184 return(false);
5185}
5186
5187#ifndef _WIN32
5188/** Tries to disable OS caching on an opened file descriptor.
5189@param[in] fd file descriptor to alter
5190@param[in] file_name file name, used in the diagnostic message
5191@param[in] name "open" or "create"; used in the diagnostic
5192 message */
5193void
5194os_file_set_nocache(
5195 int fd MY_ATTRIBUTE((unused)),
5196 const char* file_name MY_ATTRIBUTE((unused)),
5197 const char* operation_name MY_ATTRIBUTE((unused)))
5198{
5199 /* some versions of Solaris may not have DIRECTIO_ON */
5200#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
5201 if (directio(fd, DIRECTIO_ON) == -1) {
5202 int errno_save = errno;
5203
5204 ib::error()
5205 << "Failed to set DIRECTIO_ON on file "
5206 << file_name << "; " << operation_name << ": "
5207 << strerror(errno_save) << ","
5208 " continuing anyway.";
5209 }
5210#elif defined(O_DIRECT)
5211 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
5212 int errno_save = errno;
5213 static bool warning_message_printed = false;
5214 if (errno_save == EINVAL) {
5215 if (!warning_message_printed) {
5216 warning_message_printed = true;
5217# ifdef UNIV_LINUX
5218 ib::warn()
5219 << "Failed to set O_DIRECT on file"
5220 << file_name << "; " << operation_name
5221 << ": " << strerror(errno_save) << ", "
5222 "continuing anyway. O_DIRECT is "
5223 "known to result in 'Invalid argument' "
5224 "on Linux on tmpfs, "
5225 "see MySQL Bug#26662.";
5226# else /* UNIV_LINUX */
5227 goto short_warning;
5228# endif /* UNIV_LINUX */
5229 }
5230 } else {
5231# ifndef UNIV_LINUX
5232short_warning:
5233# endif
5234 ib::warn()
5235 << "Failed to set O_DIRECT on file "
5236 << file_name << "; " << operation_name
5237 << " : " << strerror(errno_save)
5238 << ", continuing anyway.";
5239 }
5240 }
5241#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
5242}
5243
5244#endif /* _WIN32 */
5245
5246/** Extend a file.
5247
5248On Windows, extending a file allocates blocks for the file,
5249unless the file is sparse.
5250
5251On Unix, we will extend the file with ftruncate(), if
5252file needs to be sparse. Otherwise posix_fallocate() is used
5253when available, and if not, binary zeroes are added to the end
5254of file.
5255
5256@param[in] name file name
5257@param[in] file file handle
5258@param[in] size desired file size
5259@param[in] sparse whether to create a sparse file (no preallocating)
5260@return whether the operation succeeded */
5261bool
5262os_file_set_size(
5263 const char* name,
5264 os_file_t file,
5265 os_offset_t size,
5266 bool is_sparse)
5267{
5268#ifdef _WIN32
5269 /* On Windows, changing file size works well and as expected for both
5270 sparse and normal files.
5271
5272 However, 10.2 up until 10.2.9 made every file sparse in innodb,
5273 causing NTFS fragmentation issues(MDEV-13941). We try to undo
5274 the damage, and unsparse the file.*/
5275
5276 if (!is_sparse && os_is_sparse_file_supported(file)) {
5277 if (!os_file_set_sparse_win32(file, false))
5278 /* Unsparsing file failed. Fallback to writing binary
5279 zeros, to avoid even higher fragmentation.*/
5280 goto fallback;
5281 }
5282
5283 return os_file_change_size_win32(name, file, size);
5284
5285fallback:
5286#else
5287 if (is_sparse) {
5288 bool success = !ftruncate(file, size);
5289 if (!success) {
5290 ib::error() << "ftruncate of file " << name << " to "
5291 << size << " bytes failed with error "
5292 << errno;
5293 }
5294 return(success);
5295 }
5296
5297# ifdef HAVE_POSIX_FALLOCATE
5298 int err;
5299 do {
5300 os_offset_t current_size = os_file_get_size(file);
5301 err = current_size >= size
5302 ? 0 : posix_fallocate(file, current_size,
5303 size - current_size);
5304 } while (err == EINTR
5305 && srv_shutdown_state == SRV_SHUTDOWN_NONE);
5306
5307 switch (err) {
5308 case 0:
5309 return true;
5310 default:
5311 ib::error() << "preallocating "
5312 << size << " bytes for file " << name
5313 << " failed with error " << err;
5314 /* fall through */
5315 case EINTR:
5316 errno = err;
5317 return false;
5318 case EINVAL:
5319 /* fall back to the code below */
5320 break;
5321 }
5322# endif /* HAVE_POSIX_ALLOCATE */
5323#endif /* _WIN32*/
5324
5325 /* Write up to 1 megabyte at a time. */
5326 ulint buf_size = ut_min(ulint(64),
5327 ulint(size >> srv_page_size_shift))
5328 << srv_page_size_shift;
5329
5330 /* Align the buffer for possible raw i/o */
5331 byte* buf2;
5332
5333 buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + srv_page_size));
5334
5335 byte* buf = static_cast<byte*>(ut_align(buf2, srv_page_size));
5336
5337 /* Write buffer full of zeros */
5338 memset(buf, 0, buf_size);
5339
5340 os_offset_t current_size = os_file_get_size(file);
5341
5342 while (current_size < size
5343 && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
5344 ulint n_bytes;
5345
5346 if (size - current_size < (os_offset_t) buf_size) {
5347 n_bytes = (ulint) (size - current_size);
5348 } else {
5349 n_bytes = buf_size;
5350 }
5351
5352 dberr_t err;
5353 IORequest request(IORequest::WRITE);
5354
5355 err = os_file_write(
5356 request, name, file, buf, current_size, n_bytes);
5357
5358 if (err != DB_SUCCESS) {
5359 break;
5360 }
5361
5362 current_size += n_bytes;
5363 }
5364
5365 ut_free(buf2);
5366
5367 return(current_size >= size && os_file_flush(file));
5368}
5369
5370/** Truncates a file to a specified size in bytes.
5371Do nothing if the size to preserve is greater or equal to the current
5372size of the file.
5373@param[in] pathname file path
5374@param[in] file file to be truncated
5375@param[in] size size to preserve in bytes
5376@return true if success */
5377bool
5378os_file_truncate(
5379 const char* pathname,
5380 os_file_t file,
5381 os_offset_t size)
5382{
5383 /* Do nothing if the size preserved is larger than or equal to the
5384 current size of file */
5385 os_offset_t size_bytes = os_file_get_size(file);
5386
5387 if (size >= size_bytes) {
5388 return(true);
5389 }
5390
5391#ifdef _WIN32
5392 return(os_file_change_size_win32(pathname, file, size));
5393#else /* _WIN32 */
5394 return(os_file_truncate_posix(pathname, file, size));
5395#endif /* _WIN32 */
5396}
5397
5398/** NOTE! Use the corresponding macro os_file_read(), not directly this
5399function!
5400Requests a synchronous positioned read operation.
5401@return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5402@param[in] type IO flags
5403@param[in] file handle to an open file
5404@param[out] buf buffer where to read
5405@param[in] offset file offset from the start where to read
5406@param[in] n number of bytes to read, starting from offset
5407@return DB_SUCCESS or error code */
5408dberr_t
5409os_file_read_func(
5410 const IORequest& type,
5411 os_file_t file,
5412 void* buf,
5413 os_offset_t offset,
5414 ulint n)
5415{
5416 return(os_file_read_page(type, file, buf, offset, n, NULL, true));
5417}
5418
5419/** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
5420not directly this function!
5421Requests a synchronous positioned read operation.
5422@return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
5423@param[in] type IO flags
5424@param[in] file handle to an open file
5425@param[out] buf buffer where to read
5426@param[in] offset file offset from the start where to read
5427@param[in] n number of bytes to read, starting from offset
5428@param[out] o number of bytes actually read
5429@return DB_SUCCESS or error code */
5430dberr_t
5431os_file_read_no_error_handling_func(
5432 const IORequest& type,
5433 os_file_t file,
5434 void* buf,
5435 os_offset_t offset,
5436 ulint n,
5437 ulint* o)
5438{
5439 return(os_file_read_page(type, file, buf, offset, n, o, false));
5440}
5441
5442/** Check the existence and type of the given file.
5443@param[in] path path name of file
5444@param[out] exists true if the file exists
5445@param[out] type Type of the file, if it exists
5446@return true if call succeeded */
5447bool
5448os_file_status(
5449 const char* path,
5450 bool* exists,
5451 os_file_type_t* type)
5452{
5453#ifdef _WIN32
5454 return(os_file_status_win32(path, exists, type));
5455#else
5456 return(os_file_status_posix(path, exists, type));
5457#endif /* _WIN32 */
5458}
5459
5460/** Free storage space associated with a section of the file.
5461@param[in] fh Open file handle
5462@param[in] off Starting offset (SEEK_SET)
5463@param[in] len Size of the hole
5464@return DB_SUCCESS or error code */
5465dberr_t
5466os_file_punch_hole(
5467 os_file_t fh,
5468 os_offset_t off,
5469 os_offset_t len)
5470{
5471 dberr_t err;
5472
5473#ifdef _WIN32
5474 err = os_file_punch_hole_win32(fh, off, len);
5475#else
5476 err = os_file_punch_hole_posix(fh, off, len);
5477#endif /* _WIN32 */
5478
5479 return (err);
5480}
5481
5482/** Free storage space associated with a section of the file.
5483@param[in] fh Open file handle
5484@param[in] off Starting offset (SEEK_SET)
5485@param[in] len Size of the hole
5486@return DB_SUCCESS or error code */
5487dberr_t
5488IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len)
5489{
5490 /* In this debugging mode, we act as if punch hole is supported,
5491 and then skip any calls to actually punch a hole here.
5492 In this way, Transparent Page Compression is still being tested. */
5493 DBUG_EXECUTE_IF("ignore_punch_hole",
5494 return(DB_SUCCESS);
5495 );
5496
5497 ulint trim_len = get_trim_length(len);
5498
5499 if (trim_len == 0) {
5500 return(DB_SUCCESS);
5501 }
5502
5503 off += len;
5504
5505 /* Check does file system support punching holes for this
5506 tablespace. */
5507 if (!should_punch_hole()) {
5508 return DB_IO_NO_PUNCH_HOLE;
5509 }
5510
5511 dberr_t err = os_file_punch_hole(fh, off, trim_len);
5512
5513 if (err == DB_SUCCESS) {
5514 srv_stats.page_compressed_trim_op.inc();
5515 } else {
5516 /* If punch hole is not supported,
5517 set space so that it is not used. */
5518 if (err == DB_IO_NO_PUNCH_HOLE) {
5519 space_no_punch_hole();
5520 err = DB_SUCCESS;
5521 }
5522 }
5523
5524 return (err);
5525}
5526
5527/** Check if the file system supports sparse files.
5528
5529Warning: On POSIX systems we try and punch a hole from offset 0 to
5530the system configured page size. This should only be called on an empty
5531file.
5532@param[in] fh File handle for the file - if opened
5533@return true if the file system supports sparse files */
5534bool
5535os_is_sparse_file_supported(os_file_t fh)
5536{
5537 /* In this debugging mode, we act as if punch hole is supported,
5538 then we skip any calls to actually punch a hole. In this way,
5539 Transparent Page Compression is still being tested. */
5540 DBUG_EXECUTE_IF("ignore_punch_hole",
5541 return(true);
5542 );
5543
5544#ifdef _WIN32
5545 FILE_ATTRIBUTE_TAG_INFO info;
5546 if (GetFileInformationByHandleEx(fh, FileAttributeTagInfo,
5547 &info, (DWORD)sizeof(info))) {
5548 if (info.FileAttributes != INVALID_FILE_ATTRIBUTES) {
5549 return (info.FileAttributes & FILE_ATTRIBUTE_SPARSE_FILE) != 0;
5550 }
5551 }
5552 return false;
5553#else
5554 dberr_t err;
5555
5556 /* We don't know the FS block size, use the sector size. The FS
5557 will do the magic. */
5558 err = os_file_punch_hole_posix(fh, 0, srv_page_size);
5559
5560 return(err == DB_SUCCESS);
5561#endif /* _WIN32 */
5562}
5563
5564/** This function returns information about the specified file
5565@param[in] path pathname of the file
5566@param[out] stat_info information of a file in a directory
5567@param[in] check_rw_perm for testing whether the file can be opened
5568 in RW mode
5569@param[in] read_only true if file is opened in read-only mode
5570@return DB_SUCCESS if all OK */
5571dberr_t
5572os_file_get_status(
5573 const char* path,
5574 os_file_stat_t* stat_info,
5575 bool check_rw_perm,
5576 bool read_only)
5577{
5578 dberr_t ret;
5579
5580#ifdef _WIN32
5581 struct _stat64 info;
5582
5583 ret = os_file_get_status_win32(
5584 path, stat_info, &info, check_rw_perm, read_only);
5585
5586#else
5587 struct stat info;
5588
5589 ret = os_file_get_status_posix(
5590 path, stat_info, &info, check_rw_perm, read_only);
5591
5592#endif /* _WIN32 */
5593
5594 if (ret == DB_SUCCESS) {
5595 stat_info->ctime = info.st_ctime;
5596 stat_info->atime = info.st_atime;
5597 stat_info->mtime = info.st_mtime;
5598 stat_info->size = info.st_size;
5599 }
5600
5601 return(ret);
5602}
5603
5604/**
5605Waits for an AIO operation to complete. This function is used to wait the
5606for completed requests. The aio array of pending requests is divided
5607into segments. The thread specifies which segment or slot it wants to wait
5608for. NOTE: this function will also take care of freeing the aio slot,
5609therefore no other thread is allowed to do the freeing!
5610@param[in] segment The number of the segment in the aio arrays to
5611 wait for; segment 0 is the ibuf I/O thread,
5612 segment 1 the log I/O thread, then follow the
5613 non-ibuf read threads, and as the last are the
5614 non-ibuf write threads; if this is
5615 ULINT_UNDEFINED, then it means that sync AIO
5616 is used, and this parameter is ignored
5617@param[out] m1 the messages passed with the AIO request; note
5618 that also in the case where the AIO operation
5619 failed, these output parameters are valid and
5620 can be used to restart the operation,
5621 for example
5622@param[out] m2 callback message
5623@param[out] type OS_FILE_WRITE or ..._READ
5624@return DB_SUCCESS or error code */
5625dberr_t
5626os_aio_handler(
5627 ulint segment,
5628 fil_node_t** m1,
5629 void** m2,
5630 IORequest* request)
5631{
5632 dberr_t err;
5633
5634 if (srv_use_native_aio) {
5635 srv_set_io_thread_op_info(segment, "native aio handle");
5636
5637#ifdef WIN_ASYNC_IO
5638
5639 err = os_aio_windows_handler(segment, 0, m1, m2, request);
5640
5641#elif defined(LINUX_NATIVE_AIO)
5642
5643 err = os_aio_linux_handler(segment, m1, m2, request);
5644
5645#else
5646 ut_error;
5647
5648 err = DB_ERROR; /* Eliminate compiler warning */
5649
5650#endif /* WIN_ASYNC_IO */
5651
5652 } else {
5653 srv_set_io_thread_op_info(segment, "simulated aio handle");
5654
5655 err = os_aio_simulated_handler(segment, m1, m2, request);
5656 }
5657
5658 return(err);
5659}
5660
#ifdef WIN_ASYNC_IO
/** Creates a new I/O completion port that is not yet associated with
any file handle. The result is asserted to be non-NULL.
@return the completion port handle */
static HANDLE new_completion_port()
{
	HANDLE	port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0);

	ut_a(port != NULL);

	return port;
}
#endif
5669
/** Constructor
Sizes the slot array, creates the mutex and the "not full" and
"is empty" events, zero-fills the slots and marks the array as empty.
@param[in]	id		The latch ID
@param[in]	n		Number of AIO slots
@param[in]	segments	Number of segments */
AIO::AIO(
	latch_id_t	id,
	ulint		n,
	ulint		segments)
	:
	m_slots(n),
	m_n_segments(segments),
	m_n_reserved()
# ifdef LINUX_NATIVE_AIO
	,m_aio_ctx(),
	/* One event entry per slot. */
	m_events(m_slots.size())
# endif /* LINUX_NATIVE_AIO */
#ifdef WIN_ASYNC_IO
	,m_completion_port(new_completion_port())
#endif
{
	ut_a(n > 0);
	ut_a(m_n_segments > 0);

	mutex_create(id, &m_mutex);

	m_not_full = os_event_create("aio_not_full");
	m_is_empty = os_event_create("aio_is_empty");

	/* Zero-fill all slots; n > 0 was asserted above, so element 0
	exists. */
	memset(&m_slots[0], 0x0, sizeof(m_slots[0]) * m_slots.size());
#ifdef LINUX_NATIVE_AIO
	memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
#endif /* LINUX_NATIVE_AIO */

	/* A freshly constructed array is signalled as empty. */
	os_event_set(m_is_empty);
}
5706/** Initialise the slots */
5707dberr_t
5708AIO::init_slots()
5709{
5710 for (ulint i = 0; i < m_slots.size(); ++i) {
5711 Slot& slot = m_slots[i];
5712
5713 slot.pos = static_cast<uint16_t>(i);
5714
5715 slot.is_reserved = false;
5716
5717#ifdef WIN_ASYNC_IO
5718
5719 slot.array = this;
5720
5721#elif defined(LINUX_NATIVE_AIO)
5722
5723 slot.ret = 0;
5724
5725 slot.n_bytes = 0;
5726
5727 memset(&slot.control, 0x0, sizeof(slot.control));
5728
5729#endif /* WIN_ASYNC_IO */
5730 }
5731
5732 return(DB_SUCCESS);
5733}
5734
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface.
Allocates one io_context per segment. If creating a context fails,
native AIO is switched off and DB_SUCCESS is still returned.
@return DB_SUCCESS, or DB_OUT_OF_MEMORY if the context array could not
be allocated */
dberr_t
AIO::init_linux_native_aio()
{
	ut_a(m_aio_ctx == NULL);

	/* The io_context array holds one io_context per segment. */
	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	const ulint	max_events = slots_per_segment();

	for (ulint seg = 0; seg < m_n_segments; ++seg) {

		if (linux_create_io_ctx(max_events, &m_aio_ctx[seg])) {
			continue;
		}

		/* If something bad happened during aio setup
		we disable linux native aio.
		The disadvantage will be a small memory leak
		at shutdown but that's ok compared to a crash
		or a not working server.
		This frequently happens when running the test suite
		with many threads on a system with low fs.aio-max-nr!
		*/

		ib::warn()
			<< "Warning: Linux Native AIO disabled "
			<< "because _linux_create_io_ctx() "
			<< "failed. To get rid of this warning you can "
			<< "try increasing system "
			<< "fs.aio-max-nr to 1048576 or larger or "
			<< "setting innodb_use_native_aio = 0 in my.cnf";

		ut_free(m_aio_ctx);
		m_aio_ctx = 0;
		srv_use_native_aio = FALSE;

		return(DB_SUCCESS);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
5784
5785/** Initialise the array */
5786dberr_t
5787AIO::init()
5788{
5789 ut_a(!m_slots.empty());
5790
5791
5792 if (srv_use_native_aio) {
5793#ifdef LINUX_NATIVE_AIO
5794 dberr_t err = init_linux_native_aio();
5795
5796 if (err != DB_SUCCESS) {
5797 return(err);
5798 }
5799
5800#endif /* LINUX_NATIVE_AIO */
5801 }
5802
5803 return(init_slots());
5804}
5805
5806/** Creates an aio wait array. Note that we return NULL in case of failure.
5807We don't care about freeing memory here because we assume that a
5808failure will result in server refusing to start up.
5809@param[in] id Latch ID
5810@param[in] n maximum number of pending AIO operations
5811 allowed; n must be divisible by m_n_segments
5812@param[in] n_segments number of segments in the AIO array
5813@return own: AIO array, NULL on failure */
5814AIO*
5815AIO::create(
5816 latch_id_t id,
5817 ulint n,
5818 ulint n_segments)
5819{
5820 if ((n % n_segments)) {
5821
5822 ib::error()
5823 << "Maximum number of AIO operations must be "
5824 << "divisible by number of segments";
5825
5826 return(NULL);
5827 }
5828
5829 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
5830
5831 if (array != NULL && array->init() != DB_SUCCESS) {
5832
5833 UT_DELETE(array);
5834
5835 array = NULL;
5836 }
5837
5838 return(array);
5839}
5840
5841/** AIO destructor */
5842AIO::~AIO()
5843{
5844 mutex_destroy(&m_mutex);
5845
5846 os_event_destroy(m_not_full);
5847 os_event_destroy(m_is_empty);
5848
5849#if defined(LINUX_NATIVE_AIO)
5850 if (srv_use_native_aio) {
5851 m_events.clear();
5852 ut_free(m_aio_ctx);
5853 }
5854#endif /* LINUX_NATIVE_AIO */
5855#if defined(WIN_ASYNC_IO)
5856 CloseHandle(m_completion_port);
5857#endif
5858
5859 m_slots.clear();
5860}
5861
5862/** Initializes the asynchronous io system. Creates one array each for ibuf
5863and log i/o. Also creates one array each for read and write where each
5864array is divided logically into n_readers and n_writers
5865respectively. The caller must create an i/o handler thread for each
5866segment in these arrays. This function also creates the sync array.
5867No i/o handler thread needs to be created for that
5868@param[in] n_per_seg maximum number of pending aio
5869 operations allowed per segment
5870@param[in] n_readers number of reader threads
5871@param[in] n_writers number of writer threads
5872@param[in] n_slots_sync number of slots in the sync aio array
5873@return true if the AIO sub-system was started successfully */
5874bool
5875AIO::start(
5876 ulint n_per_seg,
5877 ulint n_readers,
5878 ulint n_writers,
5879 ulint n_slots_sync)
5880{
5881#if defined(LINUX_NATIVE_AIO)
5882 /* Check if native aio is supported on this system and tmpfs */
5883 if (srv_use_native_aio && !is_linux_native_aio_supported()) {
5884
5885 ib::warn() << "Linux Native AIO disabled.";
5886
5887 srv_use_native_aio = FALSE;
5888 }
5889#endif /* LINUX_NATIVE_AIO */
5890
5891 srv_reset_io_thread_op_info();
5892
5893 s_reads = create(
5894 LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
5895
5896 if (s_reads == NULL) {
5897 return(false);
5898 }
5899
5900 ulint start = srv_read_only_mode ? 0 : 2;
5901 ulint n_segs = n_readers + start;
5902
5903 /* 0 is the ibuf segment and 1 is the redo log segment. */
5904 for (ulint i = start; i < n_segs; ++i) {
5905 ut_a(i < SRV_MAX_N_IO_THREADS);
5906 srv_io_thread_function[i] = "read thread";
5907 }
5908
5909 ulint n_segments = n_readers;
5910
5911 if (!srv_read_only_mode) {
5912
5913 s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
5914
5915 if (s_ibuf == NULL) {
5916 return(false);
5917 }
5918
5919 ++n_segments;
5920
5921 srv_io_thread_function[0] = "insert buffer thread";
5922
5923 s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
5924
5925 if (s_log == NULL) {
5926 return(false);
5927 }
5928
5929 ++n_segments;
5930
5931 srv_io_thread_function[1] = "log thread";
5932
5933 } else {
5934 s_ibuf = s_log = NULL;
5935 }
5936
5937 s_writes = create(
5938 LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
5939
5940 if (s_writes == NULL) {
5941 return(false);
5942 }
5943
5944#ifdef WIN_ASYNC_IO
5945 data_completion_port = s_writes->m_completion_port;
5946 log_completion_port =
5947 s_log ? s_log->m_completion_port : data_completion_port;
5948#endif
5949
5950 n_segments += n_writers;
5951
5952 for (ulint i = start + n_readers; i < n_segments; ++i) {
5953 ut_a(i < SRV_MAX_N_IO_THREADS);
5954 srv_io_thread_function[i] = "write thread";
5955 }
5956
5957 ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
5958
5959 s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
5960
5961 if (s_sync == NULL) {
5962
5963 return(false);
5964 }
5965
5966 os_aio_n_segments = n_segments;
5967
5968 os_aio_validate();
5969
5970 os_last_printout = ut_time();
5971
5972 if (srv_use_native_aio) {
5973 return(true);
5974 }
5975
5976 os_aio_segment_wait_events = static_cast<os_event_t*>(
5977 ut_zalloc_nokey(
5978 n_segments * sizeof *os_aio_segment_wait_events));
5979
5980 if (os_aio_segment_wait_events == NULL) {
5981
5982 return(false);
5983 }
5984
5985 for (ulint i = 0; i < n_segments; ++i) {
5986 os_aio_segment_wait_events[i] = os_event_create(0);
5987 }
5988
5989 return(true);
5990}
5991
5992/** Free the AIO arrays */
5993void
5994AIO::shutdown()
5995{
5996 UT_DELETE(s_ibuf);
5997 s_ibuf = NULL;
5998
5999 UT_DELETE(s_log);
6000 s_log = NULL;
6001
6002 UT_DELETE(s_writes);
6003 s_writes = NULL;
6004
6005 UT_DELETE(s_sync);
6006 s_sync = NULL;
6007
6008 UT_DELETE(s_reads);
6009 s_reads = NULL;
6010}
6011
6012/** Initializes the asynchronous io system. Creates one array each for ibuf
6013and log i/o. Also creates one array each for read and write where each
6014array is divided logically into n_readers and n_writers
6015respectively. The caller must create an i/o handler thread for each
6016segment in these arrays. This function also creates the sync array.
6017No i/o handler thread needs to be created for that
6018@param[in] n_readers number of reader threads
6019@param[in] n_writers number of writer threads
6020@param[in] n_slots_sync number of slots in the sync aio array */
6021bool
6022os_aio_init(
6023 ulint n_readers,
6024 ulint n_writers,
6025 ulint n_slots_sync)
6026{
6027 /* Maximum number of pending aio operations allowed per segment */
6028 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
6029
6030 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
6031}
6032
6033/** Frees the asynchronous io system. */
6034void
6035os_aio_free()
6036{
6037 AIO::shutdown();
6038
6039 ut_ad(!os_aio_segment_wait_events || !srv_use_native_aio);
6040 ut_ad(srv_use_native_aio || os_aio_segment_wait_events
6041 || !srv_was_started);
6042
6043 if (!srv_use_native_aio && os_aio_segment_wait_events) {
6044 for (ulint i = 0; i < os_aio_n_segments; i++) {
6045 os_event_destroy(os_aio_segment_wait_events[i]);
6046 }
6047
6048 ut_free(os_aio_segment_wait_events);
6049 os_aio_segment_wait_events = 0;
6050 }
6051 os_aio_n_segments = 0;
6052}
6053
6054/** Wakes up all async i/o threads so that they know to exit themselves in
6055shutdown. */
6056void
6057os_aio_wake_all_threads_at_shutdown()
6058{
6059#ifdef WIN_ASYNC_IO
6060 AIO::wake_at_shutdown();
6061#elif defined(LINUX_NATIVE_AIO)
6062 /* When using native AIO interface the io helper threads
6063 wait on io_getevents with a timeout value of 500ms. At
6064 each wake up these threads check the server status.
6065 No need to do anything to wake them up. */
6066#endif /* !WIN_ASYNC_AIO */
6067
6068 if (srv_use_native_aio) {
6069 return;
6070 }
6071
6072 /* This loop wakes up all simulated ai/o threads */
6073
6074 for (ulint i = 0; i < os_aio_n_segments; ++i) {
6075
6076 os_event_set(os_aio_segment_wait_events[i]);
6077 }
6078}
6079
6080/** Waits until there are no pending writes in AIO::s_writes. There can
6081be other, synchronous, pending writes. */
6082void
6083os_aio_wait_until_no_pending_writes()
6084{
6085 AIO::wait_until_no_pending_writes();
6086}
6087
6088/** Calculates segment number for a slot.
6089@param[in] array AIO wait array
6090@param[in] slot slot in this array
6091@return segment number (which is the number used by, for example,
6092 I/O-handler threads) */
6093ulint
6094AIO::get_segment_no_from_slot(
6095 const AIO* array,
6096 const Slot* slot)
6097{
6098 ulint segment;
6099 ulint seg_len;
6100
6101 if (array == s_ibuf) {
6102 ut_ad(!srv_read_only_mode);
6103
6104 segment = IO_IBUF_SEGMENT;
6105
6106 } else if (array == s_log) {
6107 ut_ad(!srv_read_only_mode);
6108
6109 segment = IO_LOG_SEGMENT;
6110
6111 } else if (array == s_reads) {
6112 seg_len = s_reads->slots_per_segment();
6113
6114 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6115 } else {
6116 ut_a(array == s_writes);
6117
6118 seg_len = s_writes->slots_per_segment();
6119
6120 segment = s_reads->m_n_segments
6121 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
6122 }
6123
6124 return(segment);
6125}
6126
6127/** Requests for a slot in the aio array. If no slot is available, waits until
6128not_full-event becomes signaled.
6129
6130@param[in] type IO context
6131@param[in,out] m1 message to be passed along with the AIO
6132 operation
6133@param[in,out] m2 message to be passed along with the AIO
6134 operation
6135@param[in] file file handle
6136@param[in] name name of the file or path as a NUL-terminated
6137 string
6138@param[in,out] buf buffer where to read or from which to write
6139@param[in] offset file offset, where to read from or start writing
6140@param[in] len length of the block to read or write
6141@return pointer to slot */
6142Slot*
6143AIO::reserve_slot(
6144 const IORequest& type,
6145 fil_node_t* m1,
6146 void* m2,
6147 pfs_os_file_t file,
6148 const char* name,
6149 void* buf,
6150 os_offset_t offset,
6151 ulint len)
6152{
6153#ifdef WIN_ASYNC_IO
6154 ut_a((len & 0xFFFFFFFFUL) == len);
6155#endif /* WIN_ASYNC_IO */
6156
6157 /* No need of a mutex. Only reading constant fields */
6158 ulint slots_per_seg;
6159
6160 ut_ad(type.validate());
6161
6162 slots_per_seg = slots_per_segment();
6163
6164 /* We attempt to keep adjacent blocks in the same local
6165 segment. This can help in merging IO requests when we are
6166 doing simulated AIO */
6167 ulint local_seg;
6168
6169 local_seg = (offset >> (srv_page_size_shift + 6)) % m_n_segments;
6170
6171 for (;;) {
6172
6173 acquire();
6174
6175 if (m_n_reserved != m_slots.size()) {
6176 break;
6177 }
6178
6179 release();
6180
6181 if (!srv_use_native_aio) {
6182 /* If the handler threads are suspended,
6183 wake them so that we get more slots */
6184
6185 os_aio_simulated_wake_handler_threads();
6186 }
6187
6188 os_event_wait(m_not_full);
6189 }
6190
6191 ulint counter = 0;
6192 Slot* slot = NULL;
6193
6194 /* We start our search for an available slot from our preferred
6195 local segment and do a full scan of the array. We are
6196 guaranteed to find a slot in full scan. */
6197 for (ulint i = local_seg * slots_per_seg;
6198 counter < m_slots.size();
6199 ++i, ++counter) {
6200
6201 i %= m_slots.size();
6202
6203 slot = at(i);
6204
6205 if (slot->is_reserved == false) {
6206 break;
6207 }
6208 }
6209
6210 /* We MUST always be able to get hold of a reserved slot. */
6211 ut_a(counter < m_slots.size());
6212
6213 ut_a(slot->is_reserved == false);
6214
6215 ++m_n_reserved;
6216
6217 if (m_n_reserved == 1) {
6218 os_event_reset(m_is_empty);
6219 }
6220
6221 if (m_n_reserved == m_slots.size()) {
6222 os_event_reset(m_not_full);
6223 }
6224
6225 slot->is_reserved = true;
6226 slot->reservation_time = ut_time();
6227 slot->m1 = m1;
6228 slot->m2 = m2;
6229 slot->file = file;
6230 slot->name = name;
6231#ifdef _WIN32
6232 slot->len = static_cast<DWORD>(len);
6233#else
6234 slot->len = static_cast<ulint>(len);
6235#endif /* _WIN32 */
6236 slot->type = type;
6237 slot->buf = static_cast<byte*>(buf);
6238 slot->ptr = slot->buf;
6239 slot->offset = offset;
6240 slot->err = DB_SUCCESS;
6241 slot->original_len = static_cast<uint32>(len);
6242 slot->io_already_done = false;
6243 slot->buf = static_cast<byte*>(buf);
6244
6245#ifdef WIN_ASYNC_IO
6246 {
6247 OVERLAPPED* control;
6248
6249 control = &slot->control;
6250 control->Offset = (DWORD) offset & 0xFFFFFFFF;
6251 control->OffsetHigh = (DWORD) (offset >> 32);
6252 }
6253#elif defined(LINUX_NATIVE_AIO)
6254
6255 /* If we are not using native AIO skip this part. */
6256 if (srv_use_native_aio) {
6257
6258 off_t aio_offset;
6259
6260 /* Check if we are dealing with 64 bit arch.
6261 If not then make sure that offset fits in 32 bits. */
6262 aio_offset = (off_t) offset;
6263
6264 ut_a(sizeof(aio_offset) >= sizeof(offset)
6265 || ((os_offset_t) aio_offset) == offset);
6266
6267 struct iocb* iocb = &slot->control;
6268
6269 if (type.is_read()) {
6270
6271 io_prep_pread(
6272 iocb, file, slot->ptr, slot->len, aio_offset);
6273 } else {
6274 ut_ad(type.is_write());
6275
6276 io_prep_pwrite(
6277 iocb, file, slot->ptr, slot->len, aio_offset);
6278 }
6279
6280 iocb->data = slot;
6281
6282 slot->n_bytes = 0;
6283 slot->ret = 0;
6284 }
6285#endif /* LINUX_NATIVE_AIO */
6286
6287 release();
6288
6289 return(slot);
6290}
6291
6292/** Wakes up a simulated aio i/o-handler thread if it has something to do.
6293@param[in] global_segment The number of the segment in the AIO arrays */
6294void
6295AIO::wake_simulated_handler_thread(ulint global_segment)
6296{
6297 ut_ad(!srv_use_native_aio);
6298
6299 AIO* array;
6300 ulint segment = get_array_and_local_segment(&array, global_segment);
6301
6302 array->wake_simulated_handler_thread(global_segment, segment);
6303}
6304
6305/** Wakes up a simulated AIO I/O-handler thread if it has something to do
6306for a local segment in the AIO array.
6307@param[in] global_segment The number of the segment in the AIO arrays
6308@param[in] segment The local segment in the AIO array */
6309void
6310AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
6311{
6312 ut_ad(!srv_use_native_aio);
6313
6314 ulint n = slots_per_segment();
6315 ulint offset = segment * n;
6316
6317 /* Look through n slots after the segment * n'th slot */
6318
6319 acquire();
6320
6321 const Slot* slot = at(offset);
6322
6323 for (ulint i = 0; i < n; ++i, ++slot) {
6324
6325 if (slot->is_reserved) {
6326
6327 /* Found an i/o request */
6328
6329 release();
6330
6331 os_event_t event;
6332
6333 event = os_aio_segment_wait_events[global_segment];
6334
6335 os_event_set(event);
6336
6337 return;
6338 }
6339 }
6340
6341 release();
6342}
6343
6344/** Wakes up simulated aio i/o-handler threads if they have something to do. */
6345void
6346os_aio_simulated_wake_handler_threads()
6347{
6348 if (srv_use_native_aio) {
6349 /* We do not use simulated aio: do nothing */
6350
6351 return;
6352 }
6353
6354 os_aio_recommend_sleep_for_read_threads = false;
6355
6356 for (ulint i = 0; i < os_aio_n_segments; i++) {
6357 AIO::wake_simulated_handler_thread(i);
6358 }
6359}
6360
6361/** Select the IO slot array
6362@param[in,out] type Type of IO, READ or WRITE
6363@param[in] read_only true if running in read-only mode
6364@param[in] mode IO mode
6365@return slot array or NULL if invalid mode specified */
6366AIO*
6367AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
6368{
6369 AIO* array;
6370
6371 ut_ad(type.validate());
6372
6373 switch (mode) {
6374 case OS_AIO_NORMAL:
6375
6376 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
6377 break;
6378
6379 case OS_AIO_IBUF:
6380 ut_ad(type.is_read());
6381
6382 /* Reduce probability of deadlock bugs in connection with ibuf:
6383 do not let the ibuf i/o handler sleep */
6384
6385 type.clear_do_not_wake();
6386
6387 array = read_only ? AIO::s_reads : AIO::s_ibuf;
6388 break;
6389
6390 case OS_AIO_LOG:
6391
6392 array = read_only ? AIO::s_reads : AIO::s_log;
6393 break;
6394
6395 case OS_AIO_SYNC:
6396
6397 array = AIO::s_sync;
6398#if defined(LINUX_NATIVE_AIO)
6399 /* In Linux native AIO we don't use sync IO array. */
6400 ut_a(!srv_use_native_aio);
6401#endif /* LINUX_NATIVE_AIO */
6402 break;
6403
6404 default:
6405 ut_error;
6406 array = NULL; /* Eliminate compiler warning */
6407 }
6408
6409 return(array);
6410}
6411
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits on the completion port of the array that owns the given segment
until an aio operation completes. A completion that belongs to an array
with a different completion port is re-posted to that port and the wait
continues. NOTE: this function will also take care of freeing the aio
slot, therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; must not be
				ULINT_UNDEFINED
@param[in]	pos		not read by this function
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example; set to NULL when the shutdown
				key is received
@param[out]	m2		callback message; set to NULL when the
				shutdown key is received
@param[out]	type		the IO request stored in the completed slot
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	ut_a(segment != ULINT_UNDEFINED);

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate_skip());

	AIO*	my_array;
	AIO::get_array_and_local_segment(&my_array, segment);

	HANDLE	port = my_array->m_completion_port;
	ut_ad(port);

	Slot*	slot = NULL;
	BOOL	ret;

	for (;;) {
		DWORD		len;
		ULONG_PTR	key;

		ret = GetQueuedCompletionStatus(
			port, &len, &key,
			reinterpret_cast<OVERLAPPED**>(&slot), INFINITE);

		/* If shutdown key was received, repost the shutdown
		message and exit */
		if (ret && key == IOCP_SHUTDOWN_KEY) {
			PostQueuedCompletionStatus(port, 0, key, NULL);
			*m1 = NULL;
			*m2 = NULL;
			return(DB_SUCCESS);
		}

		ut_a(slot);

		if (!ret) {
			/* IO failed */
			break;
		}

		slot->n_bytes = len;
		ut_a(slot->array);

		HANDLE	slot_port = slot->array->m_completion_port;

		if (slot_port == port) {
			/* The completion belongs to this port */
			break;
		}

		/* there are no redirections between data and log */
		ut_ad(port == data_completion_port);
		ut_ad(slot_port != log_completion_port);

		/* Redirect completions to the dedicated completion port
		and threads.

		"Write array" threads receive write,read and ibuf
		notifications, read and ibuf completions are redirected.

		Forwarding IO completion this way costs a context switch,
		and this seems tolerable since asynchronous reads are by
		far less frequent. */
		ut_a(PostQueuedCompletionStatus(
			     slot_port, len, key, &slot->control));
	}

	ut_a(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	dberr_t	err;

	if (ret && slot->n_bytes == slot->len) {

		err = DB_SUCCESS;

	} else if (!os_file_handle_error(slot->name, "Windows aio")) {

		err = DB_IO_ERROR;

	} else {
		/* Retry failed read/write operation synchronously. */

#ifdef UNIV_PFS_IO
		/* This read/write does not go through os_file_read
		and os_file_write APIs, need to register with
		performance schema explicitly here. */
		PSI_file_locker_state	state;
		struct PSI_file_locker* locker = NULL;

		register_pfs_file_io_begin(
			&state, locker, slot->file, slot->len,
			slot->type.is_write()
			? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
#endif /* UNIV_PFS_IO */

		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);

		ssize_t	n_bytes = SyncFileIO::execute(slot);

#ifdef UNIV_PFS_IO
		register_pfs_file_io_end(locker, slot->len);
#endif /* UNIV_PFS_IO */

		err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
	}

	if (err == DB_SUCCESS) {
		err = AIOHandler::post_io_processing(slot);
	}

	slot->array->release_with_mutex(slot);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
	    && !buf_page_cleaner_is_active
	    && os_aio_all_slots_free()) {
		/* Last IO, wakeup other io threads */
		AIO::wake_at_shutdown();
	}

	return(err);
}
#endif /* WIN_ASYNC_IO */
6577
6578/**
6579NOTE! Use the corresponding macro os_aio(), not directly this function!
6580Requests an asynchronous i/o operation.
6581@param[in,out] type IO request context
6582@param[in] mode IO mode
6583@param[in] name Name of the file or path as NUL terminated
6584 string
6585@param[in] file Open file handle
6586@param[out] buf buffer where to read
6587@param[in] offset file offset where to read
6588@param[in] n number of bytes to read
6589@param[in] read_only if true read only mode checks are enforced
6590@param[in,out] m1 Message for the AIO handler, (can be used to
6591 identify a completed AIO operation); ignored
6592 if mode is OS_AIO_SYNC
6593@param[in,out] m2 message for the AIO handler (can be used to
6594 identify a completed AIO operation); ignored
6595 if mode is OS_AIO_SYNC
6596
6597@return DB_SUCCESS or error code */
6598dberr_t
6599os_aio_func(
6600 IORequest& type,
6601 ulint mode,
6602 const char* name,
6603 pfs_os_file_t file,
6604 void* buf,
6605 os_offset_t offset,
6606 ulint n,
6607 bool read_only,
6608 fil_node_t* m1,
6609 void* m2)
6610{
6611#ifdef WIN_ASYNC_IO
6612 BOOL ret = TRUE;
6613#endif /* WIN_ASYNC_IO */
6614
6615 ut_ad(n > 0);
6616 ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
6617 ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
6618 ut_ad(os_aio_validate_skip());
6619
6620#ifdef WIN_ASYNC_IO
6621 ut_ad((n & 0xFFFFFFFFUL) == n);
6622#endif /* WIN_ASYNC_IO */
6623
6624 DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
6625 mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
6626
6627 if (mode == OS_AIO_SYNC) {
6628 if (type.is_read()) {
6629 return(os_file_read_func(type, file, buf, offset, n));
6630 }
6631
6632 ut_ad(type.is_write());
6633
6634 return(os_file_write_func(type, name, file, buf, offset, n));
6635 }
6636
6637try_again:
6638
6639 AIO* array;
6640
6641 array = AIO::select_slot_array(type, read_only, mode);
6642
6643 Slot* slot;
6644
6645 slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n);
6646
6647 if (type.is_read()) {
6648
6649
6650 if (srv_use_native_aio) {
6651
6652 ++os_n_file_reads;
6653
6654 os_bytes_read_since_printout += n;
6655#ifdef WIN_ASYNC_IO
6656 ret = ReadFile(
6657 file, slot->ptr, slot->len,
6658 NULL, &slot->control);
6659#elif defined(LINUX_NATIVE_AIO)
6660 if (!array->linux_dispatch(slot)) {
6661 goto err_exit;
6662 }
6663#endif /* WIN_ASYNC_IO */
6664 } else if (type.is_wake()) {
6665 AIO::wake_simulated_handler_thread(
6666 AIO::get_segment_no_from_slot(array, slot));
6667 }
6668 } else if (type.is_write()) {
6669
6670 if (srv_use_native_aio) {
6671 ++os_n_file_writes;
6672
6673#ifdef WIN_ASYNC_IO
6674 ret = WriteFile(
6675 file, slot->ptr, slot->len,
6676 NULL, &slot->control);
6677#elif defined(LINUX_NATIVE_AIO)
6678 if (!array->linux_dispatch(slot)) {
6679 goto err_exit;
6680 }
6681#endif /* WIN_ASYNC_IO */
6682
6683 } else if (type.is_wake()) {
6684 AIO::wake_simulated_handler_thread(
6685 AIO::get_segment_no_from_slot(array, slot));
6686 }
6687 } else {
6688 ut_error;
6689 }
6690
6691#ifdef WIN_ASYNC_IO
6692 if (ret || (GetLastError() == ERROR_IO_PENDING)) {
6693 /* aio completed or was queued successfully! */
6694 return(DB_SUCCESS);
6695 }
6696
6697 goto err_exit;
6698
6699#endif /* WIN_ASYNC_IO */
6700
6701 /* AIO request was queued successfully! */
6702 return(DB_SUCCESS);
6703
6704#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
6705err_exit:
6706#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
6707
6708 array->release_with_mutex(slot);
6709
6710 if (os_file_handle_error(
6711 name, type.is_read() ? "aio read" : "aio write")) {
6712
6713 goto try_again;
6714 }
6715
6716 return(DB_IO_ERROR);
6717}
6718
6719/** Simulated AIO handler for reaping IO requests */
6720class SimulatedAIOHandler {
6721
6722public:
6723
6724 /** Constructor
6725 @param[in,out] array The AIO array
6726 @param[in] segment Local segment in the array */
6727 SimulatedAIOHandler(AIO* array, ulint segment)
6728 :
6729 m_oldest(),
6730 m_n_elems(),
6731 m_lowest_offset(IB_UINT64_MAX),
6732 m_array(array),
6733 m_n_slots(),
6734 m_segment(segment),
6735 m_ptr(),
6736 m_buf()
6737 {
6738 ut_ad(m_segment < 100);
6739
6740 m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
6741 }
6742
6743 /** Destructor */
6744 ~SimulatedAIOHandler()
6745 {
6746 if (m_ptr != NULL) {
6747 ut_free(m_ptr);
6748 }
6749 }
6750
6751 /** Reset the state of the handler
6752 @param[in] n_slots Number of pending AIO operations supported */
6753 void init(ulint n_slots)
6754 {
6755 m_oldest = 0;
6756 m_n_elems = 0;
6757 m_n_slots = n_slots;
6758 m_lowest_offset = IB_UINT64_MAX;
6759
6760 if (m_ptr != NULL) {
6761 ut_free(m_ptr);
6762 m_ptr = m_buf = NULL;
6763 }
6764
6765 m_slots[0] = NULL;
6766 }
6767
6768 /** Check if there is a slot for which the i/o has already been done
6769 @param[out] n_reserved Number of reserved slots
6770 @return the first completed slot that is found. */
6771 Slot* check_completed(ulint* n_reserved)
6772 {
6773 ulint offset = m_segment * m_n_slots;
6774
6775 *n_reserved = 0;
6776
6777 Slot* slot;
6778
6779 slot = m_array->at(offset);
6780
6781 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6782
6783 if (slot->is_reserved) {
6784
6785 if (slot->io_already_done) {
6786
6787 ut_a(slot->is_reserved);
6788
6789 return(slot);
6790 }
6791
6792 ++*n_reserved;
6793 }
6794 }
6795
6796 return(NULL);
6797 }
6798
6799 /** If there are at least 2 seconds old requests, then pick the
6800 oldest one to prevent starvation. If several requests have the
6801 same age, then pick the one at the lowest offset.
6802 @return true if request was selected */
6803 bool select()
6804 {
6805 if (!select_oldest()) {
6806
6807 return(select_lowest_offset());
6808 }
6809
6810 return(true);
6811 }
6812
6813 /** Check if there are several consecutive blocks
6814 to read or write. Merge them if found. */
6815 void merge()
6816 {
6817 /* if m_n_elems != 0, then we have assigned
6818 something valid to consecutive_ios[0] */
6819 ut_ad(m_n_elems != 0);
6820 ut_ad(first_slot() != NULL);
6821
6822 Slot* slot = first_slot();
6823
6824 while (!merge_adjacent(slot)) {
6825 /* No op */
6826 }
6827 }
6828
6829 /** We have now collected n_consecutive I/O requests
6830 in the array; allocate a single buffer which can hold
6831 all data, and perform the I/O
6832 @return the length of the buffer */
6833 ulint allocate_buffer()
6834 MY_ATTRIBUTE((warn_unused_result))
6835 {
6836 ulint len;
6837 Slot* slot = first_slot();
6838
6839 ut_ad(m_ptr == NULL);
6840
6841 if (slot->type.is_read() && m_n_elems > 1) {
6842
6843 len = 0;
6844
6845 for (ulint i = 0; i < m_n_elems; ++i) {
6846 len += m_slots[i]->len;
6847 }
6848
6849 m_ptr = static_cast<byte*>(
6850 ut_malloc_nokey(len + srv_page_size));
6851
6852 m_buf = static_cast<byte*>(
6853 ut_align(m_ptr, srv_page_size));
6854
6855 } else {
6856 len = first_slot()->len;
6857 m_buf = first_slot()->buf;
6858 }
6859
6860 return(len);
6861 }
6862
6863 /** We have to compress the individual pages and punch
6864 holes in them on a page by page basis when writing to
6865 tables that can be compresed at the IO level.
6866 @param[in] len Value returned by allocate_buffer */
6867 void copy_to_buffer(ulint len)
6868 {
6869 Slot* slot = first_slot();
6870
6871 if (len > slot->len && slot->type.is_write()) {
6872
6873 byte* ptr = m_buf;
6874
6875 ut_ad(ptr != slot->buf);
6876
6877 /* Copy the buffers to the combined buffer */
6878 for (ulint i = 0; i < m_n_elems; ++i) {
6879
6880 slot = m_slots[i];
6881
6882 memmove(ptr, slot->buf, slot->len);
6883
6884 ptr += slot->len;
6885 }
6886 }
6887 }
6888
6889 /** Do the I/O with ordinary, synchronous i/o functions:
6890 @param[in] len Length of buffer for IO */
6891 void io()
6892 {
6893 if (first_slot()->type.is_write()) {
6894
6895 for (ulint i = 0; i < m_n_elems; ++i) {
6896 write(m_slots[i]);
6897 }
6898
6899 } else {
6900
6901 for (ulint i = 0; i < m_n_elems; ++i) {
6902 read(m_slots[i]);
6903 }
6904 }
6905 }
6906
6907 /** Mark the i/os done in slots */
6908 void done()
6909 {
6910 for (ulint i = 0; i < m_n_elems; ++i) {
6911 m_slots[i]->io_already_done = true;
6912 }
6913 }
6914
6915 /** @return the first slot in the consecutive array */
6916 Slot* first_slot()
6917 MY_ATTRIBUTE((warn_unused_result))
6918 {
6919 ut_a(m_n_elems > 0);
6920
6921 return(m_slots[0]);
6922 }
6923
6924 /** Wait for I/O requests
6925 @param[in] global_segment The global segment
6926 @param[in,out] event Wait on event if no active requests
6927 @return the number of slots */
6928 ulint check_pending(
6929 ulint global_segment,
6930 os_event_t event)
6931 MY_ATTRIBUTE((warn_unused_result));
6932private:
6933
6934 /** Do the file read
6935 @param[in,out] slot Slot that has the IO context */
6936 void read(Slot* slot)
6937 {
6938 dberr_t err = os_file_read(
6939 slot->type,
6940 slot->file,
6941 slot->ptr,
6942 slot->offset,
6943 slot->len);
6944
6945 ut_a(err == DB_SUCCESS);
6946 }
6947
6948 /** Do the file read
6949 @param[in,out] slot Slot that has the IO context */
6950 void write(Slot* slot)
6951 {
6952 dberr_t err = os_file_write(
6953 slot->type,
6954 slot->name,
6955 slot->file,
6956 slot->ptr,
6957 slot->offset,
6958 slot->len);
6959
6960 ut_a(err == DB_SUCCESS);
6961 }
6962
6963 /** @return true if the slots are adjacent and can be merged */
6964 bool adjacent(const Slot* s1, const Slot* s2) const
6965 {
6966 return(s1 != s2
6967 && s1->file == s2->file
6968 && s2->offset == s1->offset + s1->len
6969 && s1->type == s2->type);
6970 }
6971
6972 /** @return true if merge limit reached or no adjacent slots found. */
6973 bool merge_adjacent(Slot*& current)
6974 {
6975 Slot* slot;
6976 ulint offset = m_segment * m_n_slots;
6977
6978 slot = m_array->at(offset);
6979
6980 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
6981
6982 if (slot->is_reserved && adjacent(current, slot)) {
6983
6984 current = slot;
6985
6986 /* Found a consecutive i/o request */
6987
6988 m_slots[m_n_elems] = slot;
6989
6990 ++m_n_elems;
6991
6992 return(m_n_elems >= m_slots.capacity());
6993 }
6994 }
6995
6996 return(true);
6997 }
6998
6999 /** There were no old requests. Look for an I/O request at the lowest
7000 offset in the array (we ignore the high 32 bits of the offset in these
7001 heuristics) */
7002 bool select_lowest_offset()
7003 {
7004 ut_ad(m_n_elems == 0);
7005
7006 ulint offset = m_segment * m_n_slots;
7007
7008 m_lowest_offset = IB_UINT64_MAX;
7009
7010 for (ulint i = 0; i < m_n_slots; ++i) {
7011 Slot* slot;
7012
7013 slot = m_array->at(i + offset);
7014
7015 if (slot->is_reserved
7016 && slot->offset < m_lowest_offset) {
7017
7018 /* Found an i/o request */
7019 m_slots[0] = slot;
7020
7021 m_n_elems = 1;
7022
7023 m_lowest_offset = slot->offset;
7024 }
7025 }
7026
7027 return(m_n_elems > 0);
7028 }
7029
7030 /** Select the slot if it is older than the current oldest slot.
7031 @param[in] slot The slot to check */
7032 void select_if_older(Slot* slot)
7033 {
7034 ulint age;
7035
7036 age = (ulint) difftime(ut_time(), slot->reservation_time);
7037
7038 if ((age >= 2 && age > m_oldest)
7039 || (age >= 2
7040 && age == m_oldest
7041 && slot->offset < m_lowest_offset)) {
7042
7043 /* Found an i/o request */
7044 m_slots[0] = slot;
7045
7046 m_n_elems = 1;
7047
7048 m_oldest = age;
7049
7050 m_lowest_offset = slot->offset;
7051 }
7052 }
7053
7054 /** Select th oldest slot in the array
7055 @return true if oldest slot found */
7056 bool select_oldest()
7057 {
7058 ut_ad(m_n_elems == 0);
7059
7060 Slot* slot;
7061 ulint offset = m_n_slots * m_segment;
7062
7063 slot = m_array->at(offset);
7064
7065 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
7066
7067 if (slot->is_reserved) {
7068 select_if_older(slot);
7069 }
7070 }
7071
7072 return(m_n_elems > 0);
7073 }
7074
7075 typedef std::vector<Slot*> slots_t;
7076
7077private:
7078 ulint m_oldest;
7079 ulint m_n_elems;
7080 os_offset_t m_lowest_offset;
7081
7082 AIO* m_array;
7083 ulint m_n_slots;
7084 ulint m_segment;
7085
7086 slots_t m_slots;
7087
7088 byte* m_ptr;
7089 byte* m_buf;
7090};
7091
7092/** Wait for I/O requests
7093@return the number of slots */
7094ulint
7095SimulatedAIOHandler::check_pending(
7096 ulint global_segment,
7097 os_event_t event)
7098{
7099 /* NOTE! We only access constant fields in os_aio_array.
7100 Therefore we do not have to acquire the protecting mutex yet */
7101
7102 ut_ad(os_aio_validate_skip());
7103
7104 ut_ad(m_segment < m_array->get_n_segments());
7105
7106 /* Look through n slots after the segment * n'th slot */
7107
7108 if (AIO::is_read(m_array)
7109 && os_aio_recommend_sleep_for_read_threads) {
7110
7111 /* Give other threads chance to add several
7112 I/Os to the array at once. */
7113
7114 srv_set_io_thread_op_info(
7115 global_segment, "waiting for i/o request");
7116
7117 os_event_wait(event);
7118
7119 return(0);
7120 }
7121
7122 return(m_array->slots_per_segment());
7123}
7124
/** Does simulated AIO. This function should be called by an i/o-handler
thread. It loops until it either gets a slot from
SimulatedAIOHandler::check_completed(), selects and executes pending
request(s) itself, or detects that the server is shutting down with no
requests reserved.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example; set to NULL when the
			function returns because of shutdown
@param[out]	m2	Callback argument; set to NULL when the function
			returns because of shutdown
@param[out]	type	IO context copied from the returned slot; not
			written when the function returns because of shutdown
@return DB_SUCCESS; no other value is returned by this function */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	/* Map the global segment number to its array and to the segment
	number local to that array */
	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			/* check_pending() waited on the event instead of
			reporting slots to scan; ask again */
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* Leave the loop with the array still acquired;
			the slot is handed to the caller below */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* Request(s) were selected; slot is still NULL and
			the array is still acquired */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset before the array is
		released and waited on only afterwards. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* The array is acquired here. If slot != NULL it came from
	check_completed() and is returned as it is. If slot == NULL,
	select() picked request(s) that are executed below. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		/* Re-acquire the array before touching the slots again */
		array->acquire();

		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	/* Copy the request data out before the slot is given back */
	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* First give the slot back to the array, ... */
	array->release(slot);

	/* ... then release the array itself */
	array->release();

	return(DB_SUCCESS);
}
7275
7276/** Get the total number of pending IOs
7277@return the total number of pending IOs */
7278ulint
7279AIO::total_pending_io_count()
7280{
7281 ulint count = s_reads->pending_io_count();
7282
7283 if (s_writes != NULL) {
7284 count += s_writes->pending_io_count();
7285 }
7286
7287 if (s_ibuf != NULL) {
7288 count += s_ibuf->pending_io_count();
7289 }
7290
7291 if (s_log != NULL) {
7292 count += s_log->pending_io_count();
7293 }
7294
7295 if (s_sync != NULL) {
7296 count += s_sync->pending_io_count();
7297 }
7298
7299 return(count);
7300}
7301
7302/** Validates the consistency the aio system.
7303@return true if ok */
7304static
7305bool
7306os_aio_validate()
7307{
7308 /* The methods countds and validates, we ignore the count. */
7309 AIO::total_pending_io_count();
7310
7311 return(true);
7312}
7313
7314/** Prints pending IO requests per segment of an aio array.
7315We probably don't need per segment statistics but they can help us
7316during development phase to see if the IO requests are being
7317distributed as expected.
7318@param[in,out] file File where to print
7319@param[in] segments Pending IO array */
7320void
7321AIO::print_segment_info(
7322 FILE* file,
7323 const ulint* segments)
7324{
7325 ut_ad(m_n_segments > 0);
7326
7327 if (m_n_segments > 1) {
7328
7329 fprintf(file, " [");
7330
7331 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
7332
7333 if (i != 0) {
7334 fprintf(file, ", ");
7335 }
7336
7337 fprintf(file, ULINTPF, *segments);
7338 }
7339
7340 fprintf(file, "] ");
7341 }
7342}
7343
7344/** Prints info about the aio array.
7345@param[in,out] file Where to print */
7346void
7347AIO::print(FILE* file)
7348{
7349 ulint count = 0;
7350 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
7351
7352 mutex_enter(&m_mutex);
7353
7354 ut_a(!m_slots.empty());
7355 ut_a(m_n_segments > 0);
7356
7357 memset(n_res_seg, 0x0, sizeof(n_res_seg));
7358
7359 for (ulint i = 0; i < m_slots.size(); ++i) {
7360 Slot& slot = m_slots[i];
7361 ulint segment = (i * m_n_segments) / m_slots.size();
7362
7363 if (slot.is_reserved) {
7364
7365 ++count;
7366
7367 ++n_res_seg[segment];
7368
7369 ut_a(slot.len > 0);
7370 }
7371 }
7372
7373 ut_a(m_n_reserved == count);
7374
7375 print_segment_info(file, n_res_seg);
7376
7377 mutex_exit(&m_mutex);
7378}
7379
7380/** Print all the AIO segments
7381@param[in,out] file Where to print */
7382void
7383AIO::print_all(FILE* file)
7384{
7385 s_reads->print(file);
7386
7387 if (s_writes != NULL) {
7388 fputs(", aio writes:", file);
7389 s_writes->print(file);
7390 }
7391
7392 if (s_ibuf != NULL) {
7393 fputs(",\n ibuf aio reads:", file);
7394 s_ibuf->print(file);
7395 }
7396
7397 if (s_log != NULL) {
7398 fputs(", log i/o's:", file);
7399 s_log->print(file);
7400 }
7401
7402 if (s_sync != NULL) {
7403 fputs(", sync i/o's:", file);
7404 s_sync->print(file);
7405 }
7406}
7407
7408/** Prints info of the aio arrays.
7409@param[in,out] file file where to print */
7410void
7411os_aio_print(FILE* file)
7412{
7413 time_t current_time;
7414 double time_elapsed;
7415 double avg_bytes_read;
7416
7417 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
7418 fprintf(file, "I/O thread " ULINTPF " state: %s (%s)",
7419 i,
7420 srv_io_thread_op_info[i],
7421 srv_io_thread_function[i]);
7422
7423#ifndef _WIN32
7424 if (!srv_use_native_aio
7425 && os_event_is_set(os_aio_segment_wait_events[i])) {
7426 fprintf(file, " ev set");
7427 }
7428#endif /* _WIN32 */
7429
7430 fprintf(file, "\n");
7431 }
7432
7433 fputs("Pending normal aio reads:", file);
7434
7435 AIO::print_all(file);
7436
7437 putc('\n', file);
7438 current_time = ut_time();
7439 time_elapsed = 0.001 + difftime(current_time, os_last_printout);
7440
7441 fprintf(file,
7442 "Pending flushes (fsync) log: " ULINTPF
7443 "; buffer pool: " ULINTPF "\n"
7444 ULINTPF " OS file reads, "
7445 ULINTPF " OS file writes, "
7446 ULINTPF " OS fsyncs\n",
7447 fil_n_pending_log_flushes,
7448 fil_n_pending_tablespace_flushes,
7449 os_n_file_reads,
7450 os_n_file_writes,
7451 os_n_fsyncs);
7452
7453 const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
7454 const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
7455
7456 if (n_reads != 0 || n_writes != 0) {
7457 fprintf(file,
7458 ULINTPF " pending reads, " ULINTPF " pending writes\n",
7459 n_reads, n_writes);
7460 }
7461
7462 if (os_n_file_reads == os_n_file_reads_old) {
7463 avg_bytes_read = 0.0;
7464 } else {
7465 avg_bytes_read = (double) os_bytes_read_since_printout
7466 / (os_n_file_reads - os_n_file_reads_old);
7467 }
7468
7469 fprintf(file,
7470 "%.2f reads/s, " ULINTPF " avg bytes/read,"
7471 " %.2f writes/s, %.2f fsyncs/s\n",
7472 (os_n_file_reads - os_n_file_reads_old)
7473 / time_elapsed,
7474 (ulint) avg_bytes_read,
7475 (os_n_file_writes - os_n_file_writes_old)
7476 / time_elapsed,
7477 (os_n_fsyncs - os_n_fsyncs_old)
7478 / time_elapsed);
7479
7480 os_n_file_reads_old = os_n_file_reads;
7481 os_n_file_writes_old = os_n_file_writes;
7482 os_n_fsyncs_old = os_n_fsyncs;
7483 os_bytes_read_since_printout = 0;
7484
7485 os_last_printout = current_time;
7486}
7487
7488/** Refreshes the statistics used to print per-second averages. */
7489void
7490os_aio_refresh_stats()
7491{
7492 os_n_fsyncs_old = os_n_fsyncs;
7493
7494 os_bytes_read_since_printout = 0;
7495
7496 os_n_file_reads_old = os_n_file_reads;
7497
7498 os_n_file_writes_old = os_n_file_writes;
7499
7500 os_n_fsyncs_old = os_n_fsyncs;
7501
7502 os_bytes_read_since_printout = 0;
7503
7504 os_last_printout = ut_time();
7505}
7506
7507/** Checks that all slots in the system have been freed, that is, there are
7508no pending io operations.
7509@return true if all free */
7510bool
7511os_aio_all_slots_free()
7512{
7513 return(AIO::total_pending_io_count() == 0);
7514}
7515
7516#ifdef UNIV_DEBUG
7517/** Prints all pending IO for the array
7518@param[in] file file where to print
7519@param[in] array array to process */
7520void
7521AIO::to_file(FILE* file) const
7522{
7523 acquire();
7524
7525 fprintf(file, " " ULINTPF "\n", m_n_reserved);
7526
7527 for (ulint i = 0; i < m_slots.size(); ++i) {
7528
7529 const Slot& slot = m_slots[i];
7530
7531 if (slot.is_reserved) {
7532
7533 fprintf(file,
7534 "%s IO for %s (offset=" UINT64PF
7535 ", size=%lu)\n",
7536 slot.type.is_read() ? "read" : "write",
7537 slot.name, slot.offset, (unsigned long)(slot.len));
7538 }
7539 }
7540
7541 release();
7542}
7543
7544/** Print pending IOs for all arrays */
7545void
7546AIO::print_to_file(FILE* file)
7547{
7548 fprintf(file, "Pending normal aio reads:");
7549
7550 s_reads->to_file(file);
7551
7552 if (s_writes != NULL) {
7553 fprintf(file, "Pending normal aio writes:");
7554 s_writes->to_file(file);
7555 }
7556
7557 if (s_ibuf != NULL) {
7558 fprintf(file, "Pending ibuf aio reads:");
7559 s_ibuf->to_file(file);
7560 }
7561
7562 if (s_log != NULL) {
7563 fprintf(file, "Pending log i/o's:");
7564 s_log->to_file(file);
7565 }
7566
7567 if (s_sync != NULL) {
7568 fprintf(file, "Pending sync i/o's:");
7569 s_sync->to_file(file);
7570 }
7571}
7572
7573/** Prints all pending IO
7574@param[in] file File where to print */
7575void
7576os_aio_print_pending_io(
7577 FILE* file)
7578{
7579 AIO::print_to_file(file);
7580}
7581
7582#endif /* UNIV_DEBUG */
7583
7584/**
7585Set the file create umask
7586@param[in] umask The umask to use for file creation. */
7587void
7588os_file_set_umask(ulint umask)
7589{
7590 os_innodb_umask = umask;
7591}
7592
7593#else
7594#include "univ.i"
7595#endif /* !UNIV_INNOCHECKSUM */
7596
7597/** Normalizes a directory path for the current OS:
7598On Windows, we convert '/' to '\', else we convert '\' to '/'.
7599@param[in,out] str A null-terminated directory and file path */
7600void
7601os_normalize_path(
7602 char* str)
7603{
7604 if (str != NULL) {
7605 for (; *str; str++) {
7606 if (*str == OS_PATH_SEPARATOR_ALT) {
7607 *str = OS_PATH_SEPARATOR;
7608 }
7609 }
7610 }
7611}
7612