1 | /* |
2 | Copyright (c) 2015, Facebook, Inc. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
16 | |
17 | #include <my_global.h> |
18 | |
19 | /* This C++ file's header file */ |
20 | #include "./rdb_mutex_wrapper.h" |
21 | |
22 | /* The following are for THD_ENTER_COND: */ |
23 | #define MYSQL_SERVER 1 |
24 | #include "sql_priv.h" |
25 | #include "my_decimal.h" |
26 | #include "sql_class.h" |
27 | //psergey-merge-todo: does MariaDB have/need: #include "../sql/replication.h" |
28 | |
29 | |
30 | /* MyRocks header files */ |
31 | #include "./ha_rocksdb.h" |
32 | #include "./rdb_utils.h" |
33 | |
34 | |
35 | using namespace rocksdb; |
36 | |
37 | namespace myrocks { |
38 | |
39 | static PSI_stage_info stage_waiting_on_row_lock2 = {0, "Waiting for row lock" , |
40 | 0}; |
41 | |
// Number of microseconds in one second.
static constexpr int64_t ONE_SECOND_IN_MICROSECS = 1000 * 1000;

// Number of seconds in a 365-day (non-leap) year.
static constexpr int64_t SECONDS_IN_NON_LEAP_YEAR = 60 * 60 * 24 * 365;

// One full non-leap year expressed in microseconds. A timeout this long is
// as good as an infinite timeout.
static constexpr int64_t ONE_YEAR_IN_MICROSECS =
    ONE_SECOND_IN_MICROSECS * SECONDS_IN_NON_LEAP_YEAR;
47 | |
48 | Rdb_cond_var::Rdb_cond_var() { mysql_cond_init(0, &m_cond, nullptr); } |
49 | |
50 | Rdb_cond_var::~Rdb_cond_var() { mysql_cond_destroy(&m_cond); } |
51 | |
52 | Status Rdb_cond_var::Wait(const std::shared_ptr<TransactionDBMutex> mutex_arg) { |
53 | return WaitFor(mutex_arg, ONE_YEAR_IN_MICROSECS); |
54 | } |
55 | |
56 | /* |
57 | @brief |
58 | Wait on condition variable. The caller must make sure that we own |
59 | *mutex_ptr. The mutex is released and re-acquired by the wait function. |
60 | |
61 | @param |
62 | timeout_micros Timeout in microseconds. Negative value means no timeout. |
63 | |
64 | @return |
65 | Status::OK() - Wait successfull |
66 | Status::TimedOut() - Timed out or wait killed (the caller can check |
67 | thd_killed() to determine which occurred) |
68 | */ |
69 | |
70 | Status |
71 | Rdb_cond_var::WaitFor(const std::shared_ptr<TransactionDBMutex> mutex_arg, |
72 | int64_t timeout_micros) { |
73 | auto *mutex_obj = reinterpret_cast<Rdb_mutex *>(mutex_arg.get()); |
74 | DBUG_ASSERT(mutex_obj != nullptr); |
75 | |
76 | mysql_mutex_t *const mutex_ptr = &mutex_obj->m_mutex; |
77 | |
78 | int res = 0; |
79 | struct timespec wait_timeout; |
80 | |
81 | if (timeout_micros < 0) |
82 | timeout_micros = ONE_YEAR_IN_MICROSECS; |
83 | set_timespec_nsec(wait_timeout, timeout_micros * 1000); |
84 | |
85 | #ifndef STANDALONE_UNITTEST |
86 | PSI_stage_info old_stage; |
87 | mysql_mutex_assert_owner(mutex_ptr); |
88 | |
89 | if (current_thd && mutex_obj->m_old_stage_info.count(current_thd) == 0) { |
90 | THD_ENTER_COND(current_thd, &m_cond, mutex_ptr, &stage_waiting_on_row_lock2, |
91 | &old_stage); |
92 | /* |
93 | After the mysql_cond_timedwait we need make this call |
94 | |
95 | THD_EXIT_COND(thd, &old_stage); |
96 | |
97 | to inform the SQL layer that KILLable wait has ended. However, |
98 | that will cause mutex to be released. Defer the release until the mutex |
99 | that is unlocked by RocksDB's Pessimistic Transactions system. |
100 | */ |
101 | mutex_obj->set_unlock_action(&old_stage); |
102 | } |
103 | |
104 | #endif |
105 | bool killed = false; |
106 | |
107 | do { |
108 | res = mysql_cond_timedwait(&m_cond, mutex_ptr, &wait_timeout); |
109 | |
110 | #ifndef STANDALONE_UNITTEST |
111 | if (current_thd) |
112 | killed= thd_killed(current_thd); |
113 | #endif |
114 | } while (!killed && res == EINTR); |
115 | |
116 | if (res || killed) |
117 | return Status::TimedOut(); |
118 | else |
119 | return Status::OK(); |
120 | } |
121 | |
122 | /* |
123 | |
124 | @note |
125 | This function may be called while not holding the mutex that is used to wait |
126 | on the condition variable. |
127 | |
128 | The manual page says ( http://linux.die.net/man/3/pthread_cond_signal): |
129 | |
130 | The pthread_cond_broadcast() or pthread_cond_signal() functions may be called |
131 | by a thread whether or not it currently owns the mutex that threads calling |
132 | pthread_cond_wait() or pthread_cond_timedwait() have associated with the |
133 | condition variable during their waits; however, IF PREDICTABLE SCHEDULING |
134 | BEHAVIOR IS REQUIRED, THEN THAT MUTEX SHALL BE LOCKED by the thread calling |
135 | pthread_cond_broadcast() or pthread_cond_signal(). |
136 | |
137 | What's "predicate scheduling" and do we need it? The explanation is here: |
138 | |
139 | https://groups.google.com/forum/?hl=ky#!msg/comp.programming.threads/wEUgPq541v8/ZByyyS8acqMJ |
140 | "The problem (from the realtime side) with condition variables is that |
141 | if you can signal/broadcast without holding the mutex, and any thread |
142 | currently running can acquire an unlocked mutex and check a predicate |
143 | without reference to the condition variable, then you can have an |
144 | indirect priority inversion." |
145 | |
146 | Another possible consequence is that one can create spurious wake-ups when |
147 | there are multiple threads signaling the condition. |
148 | |
149 | None of this looks like a problem for our use case. |
150 | */ |
151 | |
152 | void Rdb_cond_var::Notify() { mysql_cond_signal(&m_cond); } |
153 | |
154 | /* |
155 | @note |
156 | This is called without holding the mutex that's used for waiting on the |
157 | condition. See ::Notify(). |
158 | */ |
159 | void Rdb_cond_var::NotifyAll() { mysql_cond_broadcast(&m_cond); } |
160 | |
161 | Rdb_mutex::Rdb_mutex() { |
162 | mysql_mutex_init(0 /* Don't register in P_S. */, &m_mutex, |
163 | MY_MUTEX_INIT_FAST); |
164 | } |
165 | |
166 | Rdb_mutex::~Rdb_mutex() { mysql_mutex_destroy(&m_mutex); } |
167 | |
168 | Status Rdb_mutex::Lock() { |
169 | RDB_MUTEX_LOCK_CHECK(m_mutex); |
170 | DBUG_ASSERT(m_old_stage_info.count(current_thd) == 0); |
171 | return Status::OK(); |
172 | } |
173 | |
174 | // Attempt to acquire lock. If timeout is non-negative, operation may be |
175 | // failed after this many milliseconds. |
176 | // If implementing a custom version of this class, the implementation may |
177 | // choose to ignore the timeout. |
178 | // Return OK on success, or other Status on failure. |
179 | Status Rdb_mutex::TryLockFor(int64_t timeout_time MY_ATTRIBUTE((__unused__))) { |
180 | /* |
181 | Note: PThreads API has pthread_mutex_timedlock(), but mysql's |
182 | mysql_mutex_* wrappers do not wrap that function. |
183 | */ |
184 | RDB_MUTEX_LOCK_CHECK(m_mutex); |
185 | return Status::OK(); |
186 | } |
187 | |
188 | #ifndef STANDALONE_UNITTEST |
189 | void Rdb_mutex::set_unlock_action(const PSI_stage_info *const old_stage_arg) { |
190 | DBUG_ASSERT(old_stage_arg != nullptr); |
191 | |
192 | mysql_mutex_assert_owner(&m_mutex); |
193 | DBUG_ASSERT(m_old_stage_info.count(current_thd) == 0); |
194 | |
195 | m_old_stage_info[current_thd] = |
196 | std::make_shared<PSI_stage_info>(*old_stage_arg); |
197 | } |
198 | #endif |
199 | |
200 | // Unlock Mutex that was successfully locked by Lock() or TryLockUntil() |
201 | void Rdb_mutex::UnLock() { |
202 | #ifndef STANDALONE_UNITTEST |
203 | if (m_old_stage_info.count(current_thd) > 0) { |
204 | const std::shared_ptr<PSI_stage_info> old_stage = |
205 | m_old_stage_info[current_thd]; |
206 | m_old_stage_info.erase(current_thd); |
207 | /* The following will call mysql_mutex_unlock */ |
208 | THD_EXIT_COND(current_thd, old_stage.get()); |
209 | return; |
210 | } |
211 | #endif |
212 | RDB_MUTEX_UNLOCK_CHECK(m_mutex); |
213 | } |
214 | |
215 | } // namespace myrocks |
216 | |