1 | /* |
2 | Copyright (c) 2017, Facebook, Inc. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
16 | |
17 | /* This C++ file's header */ |
18 | #include "./rdb_io_watchdog.h" |
19 | |
20 | /* C++ standard header files */ |
21 | #include <string> |
22 | #include <vector> |
23 | |
24 | /* Rdb_io_watchdog doesn't work on Windows [yet] */ |
25 | #ifdef HAVE_TIMER_DELETE |
26 | |
27 | namespace myrocks { |
28 | |
29 | void Rdb_io_watchdog::expire_io_callback(union sigval timer_data) { |
30 | DBUG_ASSERT(timer_data.sival_ptr != nullptr); |
31 | |
32 | // The treatment of any pending signal generated by the deleted timer is |
33 | // unspecified. Therefore we still need to handle the rare case where we |
34 | // finished the I/O operation right before the timer was deleted and callback |
35 | // was in flight. |
36 | if (!m_io_in_progress.load()) { |
37 | return; |
38 | } |
39 | |
40 | // At this point we know that I/O has been stuck in `write()` for more than |
41 | // `m_write_timeout` seconds. We'll log a message and shut down the service. |
42 | // NO_LINT_DEBUG |
43 | sql_print_error("MyRocks has detected a combination of I/O requests which " |
44 | "have cumulatively been blocking for more than %u seconds. " |
45 | "Shutting the service down." , |
46 | m_write_timeout); |
47 | |
48 | abort(); |
49 | } |
50 | |
51 | void Rdb_io_watchdog::io_check_callback(union sigval timer_data) { |
52 | RDB_MUTEX_LOCK_CHECK(m_reset_mutex); |
53 | |
54 | DBUG_ASSERT(timer_data.sival_ptr != nullptr); |
55 | |
56 | struct sigevent e; |
57 | |
58 | e.sigev_notify = SIGEV_THREAD; |
59 | e.sigev_notify_function = &Rdb_io_watchdog::expire_io_callback_wrapper; |
60 | e.sigev_value.sival_ptr = this; |
61 | e.sigev_notify_attributes = nullptr; |
62 | |
63 | int ret = timer_create(CLOCK_MONOTONIC, &e, &m_io_check_watchdog_timer); |
64 | |
65 | if (unlikely(ret)) { |
66 | // NO_LINT_DEBUG |
67 | sql_print_warning("Creating a watchdog I/O timer failed with %d." , errno); |
68 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
69 | return; |
70 | } |
71 | |
72 | struct itimerspec timer_spec; |
73 | memset(&timer_spec, 0, sizeof(timer_spec)); |
74 | |
75 | // One time execution only for the watchdog. No interval. |
76 | timer_spec.it_value.tv_sec = m_write_timeout; |
77 | |
78 | ret = timer_settime(m_io_check_watchdog_timer, 0, &timer_spec, nullptr); |
79 | |
80 | if (unlikely(ret)) { |
81 | // NO_LINT_DEBUG |
82 | sql_print_warning("Setting time for a watchdog I/O timer failed with %d." , |
83 | errno); |
84 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
85 | return; |
86 | } |
87 | |
88 | m_io_in_progress.store(true); |
89 | |
90 | // Verify the write access to all directories we care about. |
91 | for (const std::string &directory : m_dirs_to_check) { |
92 | ret = check_write_access(directory); |
93 | |
94 | // We'll log a warning and attept to continue to see if the problem happens |
95 | // in other cases as well. |
96 | if (unlikely(ret != HA_EXIT_SUCCESS)) { |
97 | // NO_LINT_DEBUG |
98 | sql_print_warning("Unable to verify write access to %s (error code %d)." , |
99 | directory.c_str(), ret); |
100 | } |
101 | } |
102 | |
103 | m_io_in_progress.store(false); |
104 | |
105 | // Clean up the watchdog timer. |
106 | ret = timer_delete(m_io_check_watchdog_timer); |
107 | |
108 | if (unlikely(ret)) { |
109 | // NO_LINT_DEBUG |
110 | sql_print_warning("Deleting the watchdog I/O timer failed with %d." , errno); |
111 | } |
112 | |
113 | m_io_check_watchdog_timer = nullptr; |
114 | |
115 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
116 | } |
117 | |
118 | int Rdb_io_watchdog::check_write_access(const std::string &dirname) const { |
119 | DBUG_ASSERT(!dirname.empty()); |
120 | DBUG_ASSERT(m_buf != nullptr); |
121 | |
122 | const std::string fname = dirname + FN_DIRSEP + RDB_IO_DUMMY_FILE_NAME; |
123 | |
124 | // O_DIRECT is a key flag here to make sure that we'll bypass the kernel's |
125 | // buffer cache. |
126 | int fd = open(fname.c_str(), O_WRONLY | O_DIRECT | O_CREAT | O_SYNC, |
127 | S_IRWXU | S_IWUSR); |
128 | |
129 | if (unlikely(fd == -1)) { |
130 | return fd; |
131 | } |
132 | |
133 | int ret = write(fd, m_buf, RDB_IO_WRITE_BUFFER_SIZE); |
134 | |
135 | if (unlikely(ret != RDB_IO_WRITE_BUFFER_SIZE)) { |
136 | return ret; |
137 | } |
138 | |
139 | ret = close(fd); |
140 | |
141 | if (unlikely(ret)) { |
142 | return ret; |
143 | } |
144 | |
145 | ret = unlink(fname.c_str()); |
146 | |
147 | if (unlikely(ret)) { |
148 | return ret; |
149 | } |
150 | |
151 | return HA_EXIT_SUCCESS; |
152 | } |
153 | |
154 | int Rdb_io_watchdog::reset_timeout(const uint32_t &write_timeout) { |
155 | // This function will be called either from a thread initializing MyRocks |
156 | // engine or handling system variable changes. We need to account for the |
157 | // possibility of I/O callback executing at the same time. If that happens |
158 | // then we'll wait for it to finish. |
159 | RDB_MUTEX_LOCK_CHECK(m_reset_mutex); |
160 | |
161 | struct sigevent e; |
162 | |
163 | // In all the cases all the active timers needs to be stopped. |
164 | int ret = stop_timers(); |
165 | |
166 | if (unlikely(ret)) { |
167 | // NO_LINT_DEBUG |
168 | sql_print_warning("Stopping I/O timers failed with %d." , errno); |
169 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
170 | return ret; |
171 | } |
172 | |
173 | m_write_timeout = write_timeout; |
174 | m_io_in_progress.store(false); |
175 | |
176 | // Zero means that the I/O timer will be disabled. Therefore there's nothing |
177 | // for us to do here. |
178 | if (!write_timeout) { |
179 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
180 | return HA_EXIT_SUCCESS; |
181 | } |
182 | |
183 | free(m_buf); |
184 | |
185 | ret = posix_memalign(reinterpret_cast<void **>(&m_buf), |
186 | RDB_IO_WRITE_BUFFER_SIZE, RDB_IO_WRITE_BUFFER_SIZE); |
187 | |
188 | if (unlikely(ret)) { |
189 | m_buf = nullptr; |
190 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
191 | // NB! The value of errno is not set. |
192 | return ret; |
193 | } |
194 | |
195 | DBUG_ASSERT(m_buf != nullptr); |
196 | memset(m_buf, 0, RDB_IO_WRITE_BUFFER_SIZE); |
197 | |
198 | // Common case gets handled here - we'll create a timer with a specific |
199 | // interval to check a set of directories for write access. |
200 | DBUG_ASSERT(m_dirs_to_check.size() > 0); |
201 | |
202 | e.sigev_notify = SIGEV_THREAD; |
203 | e.sigev_notify_function = &Rdb_io_watchdog::io_check_callback_wrapper; |
204 | e.sigev_value.sival_ptr = this; |
205 | e.sigev_notify_attributes = nullptr; |
206 | |
207 | ret = timer_create(CLOCK_MONOTONIC, &e, &m_io_check_timer); |
208 | |
209 | if (unlikely(ret)) { |
210 | // NO_LINT_DEBUG |
211 | sql_print_warning("Creating a I/O timer failed with %d." , errno); |
212 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
213 | return ret; |
214 | } |
215 | |
216 | struct itimerspec timer_spec; |
217 | memset(&timer_spec, 0, sizeof(timer_spec)); |
218 | |
219 | // I/O timer will need to execute on a certain interval. |
220 | timer_spec.it_value.tv_sec = m_write_timeout; |
221 | timer_spec.it_interval.tv_sec = m_write_timeout; |
222 | |
223 | ret = timer_settime(m_io_check_timer, 0, &timer_spec, nullptr); |
224 | |
225 | if (unlikely(ret)) { |
226 | // NO_LINT_DEBUG |
227 | sql_print_warning("Setting time for a watchdog I/O timer failed with %d." , |
228 | errno); |
229 | } |
230 | |
231 | RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); |
232 | |
233 | return HA_EXIT_SUCCESS; |
234 | } |
235 | |
236 | } // namespace myrocks |
237 | |
238 | #endif |
239 | |
240 | |