1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
3 | #ident "$Id$" |
4 | /*====== |
5 | This file is part of PerconaFT. |
6 | |
7 | |
8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
9 | |
10 | PerconaFT is free software: you can redistribute it and/or modify |
11 | it under the terms of the GNU General Public License, version 2, |
12 | as published by the Free Software Foundation. |
13 | |
14 | PerconaFT is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
21 | |
22 | ---------------------------------------- |
23 | |
24 | PerconaFT is free software: you can redistribute it and/or modify |
25 | it under the terms of the GNU Affero General Public License, version 3, |
26 | as published by the Free Software Foundation. |
27 | |
28 | PerconaFT is distributed in the hope that it will be useful, |
29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
31 | GNU Affero General Public License for more details. |
32 | |
33 | You should have received a copy of the GNU Affero General Public License |
34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
35 | ======= */ |
36 | |
37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
38 | |
39 | #pragma once |
40 | |
41 | #include <toku_assert.h> |
42 | #include <toku_portability.h> |
43 | #include <toku_instrumentation.h> |
44 | |
45 | /* Readers/writers locks implementation |
46 | * |
47 | ***************************************** |
48 | * Overview |
49 | ***************************************** |
50 | * |
51 | * PerconaFT employs readers/writers locks for the ephemeral locks (e.g., |
 * on FT nodes). Why not just use the toku_pthread_rwlock API?
53 | * |
54 | * 1) we need multiprocess rwlocks (not just multithreaded) |
55 | * |
56 | * 2) pthread rwlocks are very slow since they entail a system call |
57 | * (about 2000ns on a 2GHz T2500.) |
58 | * |
59 | * Related: We expect the common case to be that the lock is |
60 | * granted |
61 | * |
62 | * 3) We are willing to employ machine-specific instructions (such |
63 | * as atomic exchange, and mfence, each of which runs in about |
64 | * 10ns.) |
65 | * |
66 | * 4) We want to guarantee nonstarvation (many rwlock |
67 | * implementations can starve the writers because another reader |
 * comes along before all the other readers have unlocked.)
69 | * |
70 | ***************************************** |
71 | * How it works |
72 | ***************************************** |
73 | * |
74 | * We arrange that the rwlock object is in the address space of both |
75 | * threads or processes. For processes we use mmap(). |
76 | * |
77 | * The rwlock struct comprises the following fields |
78 | * |
79 | * a long mutex field (which is accessed using xchgl() or other |
80 | * machine-specific instructions. This is a spin lock. |
81 | * |
82 | * a read counter (how many readers currently have the lock?) |
83 | * |
84 | * a write boolean (does a writer have the lock?) |
85 | * |
86 | * a singly linked list of semaphores for waiting requesters. This |
87 | * list is sorted oldest requester first. Each list element |
88 | * contains a semaphore (which is provided by the requestor) and a |
89 | * boolean indicating whether it is a reader or a writer. |
90 | * |
91 | * To lock a read rwlock: |
92 | * |
93 | * 1) Acquire the mutex. |
94 | * |
95 | * 2) If the linked list is not empty or the writer boolean is true |
96 | * then |
97 | * |
98 | * a) initialize your semaphore (to 0), |
99 | * b) add your list element to the end of the list (with rw="read") |
100 | * c) release the mutex |
101 | * d) wait on the semaphore |
102 | * e) when the semaphore release, return success. |
103 | * |
104 | * 3) Otherwise increment the reader count, release the mutex, and |
105 | * return success. |
106 | * |
107 | * To lock the write rwlock is almost the same. |
108 | * 1) Acquire the mutex |
109 | * 2) If the list is not empty or the reader count is nonzero |
110 | * a) initialize semaphore |
111 | * b) add to end of list (with rw="write") |
112 | * c) release mutex |
113 | * d) wait on the semaphore |
114 | * e) return success when the semaphore releases |
115 | * 3) Otherwise set writer=true, release mutex and return success. |
116 | * |
117 | * To unlock a read rwlock: |
118 | * 1) Acquire mutex |
119 | * 2) Decrement reader count |
120 | * 3) If the count is still positive or the list is empty then |
121 | * return success |
122 | * 4) Otherwise (count==zero and the list is nonempty): |
123 | * a) If the first element of the list is a reader: |
124 | * i) while the first element is a reader: |
125 | * x) pop the list |
126 | * y) increment the reader count |
127 | * z) increment the semaphore (releasing it for some waiter) |
128 | * ii) return success |
129 | * b) Else if the first element is a writer |
130 | * i) pop the list |
131 | * ii) set writer to true |
132 | * iii) increment the semaphore |
133 | * iv) return success |
134 | */ |
135 | |
136 | //Use case: |
137 | // A read lock is acquired by threads that get and pin an entry in the |
138 | // cachetable. A write lock is acquired by the writer thread when an entry |
// is evicted from the cachetable and is being written to storage.
140 | |
141 | //Use case: |
142 | // General purpose reader writer lock with properties: |
143 | // 1. multiple readers, no writers |
144 | // 2. one writer at a time |
145 | // 3. pending writers have priority over pending readers |
146 | |
147 | // An external mutex must be locked when using these functions. An alternate |
148 | // design would bury a mutex into the rwlock itself. While this may |
149 | // increase parallelism at the expense of single thread performance, we |
150 | // are experimenting with a single higher level lock. |
151 | |
152 | extern toku_instr_key *rwlock_cond_key; |
153 | extern toku_instr_key *rwlock_wait_read_key; |
154 | extern toku_instr_key *rwlock_wait_write_key; |
155 | |
typedef struct st_rwlock *RWLOCK;
// Reader/writer lock state. All fields are protected by an external
// mutex supplied by the caller (see file header comment).
struct st_rwlock {
    int reader;     // the number of readers holding the lock
    int want_read;  // the number of blocked readers
    toku_cond_t wait_read;   // signaled/broadcast to wake blocked readers
    int writer;     // the number of writers holding the lock (0 or 1)
    int want_write;  // the number of blocked writers
    toku_cond_t wait_write;  // signaled to wake one blocked writer
    // if non-NULL, signaled by the unlock paths when the user count
    // reaches zero (see rwlock_wait_for_users)
    toku_cond_t *wait_users_go_to_zero;
#if defined(TOKU_MYSQL_WITH_PFS)
    toku_pthread_rwlock_t prwlock;  // performance-schema instrumentation shadow lock
#endif
};
169 | |
170 | // returns: the sum of the number of readers, pending readers, writers, and |
171 | // pending writers |
172 | |
173 | static inline int rwlock_users(RWLOCK rwlock) { |
174 | return rwlock->reader + rwlock->want_read + rwlock->writer + |
175 | rwlock->want_write; |
176 | } |
177 | |
// rwlock_init forwards to inline_rwlock_init; in PFS builds the
// instrumentation key K is passed through, otherwise it is dropped so
// callers can use one spelling in both configurations.
#if defined(TOKU_MYSQL_WITH_PFS)
#define rwlock_init(K, R) inline_rwlock_init(K, R)
#else
#define rwlock_init(K, R) inline_rwlock_init(R)
#endif
183 | |
// initialize a read write lock
// In PFS builds, rwlock_instr_key identifies the lock class for the
// performance-schema instrumentation of the shadow prwlock.
static inline __attribute__((__unused__)) void inline_rwlock_init(
#if defined(TOKU_MYSQL_WITH_PFS)
    const toku_instr_key &rwlock_instr_key,
#endif
    RWLOCK rwlock) {
#if defined(TOKU_MYSQL_WITH_PFS)
    toku_pthread_rwlock_init(rwlock_instr_key, &rwlock->prwlock, nullptr);
#endif
    // start with no holders and no waiters
    rwlock->reader = rwlock->want_read = 0;
    rwlock->writer = rwlock->want_write = 0;
    toku_cond_init(toku_uninstrumented, &rwlock->wait_read, nullptr);
    toku_cond_init(toku_uninstrumented, &rwlock->wait_write, nullptr);
    rwlock->wait_users_go_to_zero = NULL;
}
199 | |
// destroy a read write lock
// expects: no holders and no waiters remain (asserted below)

static inline __attribute__((__unused__)) void rwlock_destroy(RWLOCK rwlock) {
    paranoid_invariant(rwlock->reader == 0);
    paranoid_invariant(rwlock->want_read == 0);
    paranoid_invariant(rwlock->writer == 0);
    paranoid_invariant(rwlock->want_write == 0);
    toku_cond_destroy(&rwlock->wait_read);
    toku_cond_destroy(&rwlock->wait_write);
#if defined(TOKU_MYSQL_WITH_PFS)
    toku_pthread_rwlock_destroy(&rwlock->prwlock);
#endif
}
213 | |
214 | // obtain a read lock |
215 | // expects: mutex is locked |
216 | |
217 | static inline void rwlock_read_lock(RWLOCK rwlock, toku_mutex_t *mutex) { |
218 | #ifdef TOKU_MYSQL_WITH_PFS |
219 | /* Instrumentation start */ |
220 | toku_rwlock_instrumentation rwlock_instr; |
221 | // TODO: pull location information up to caller |
222 | toku_instr_rwlock_rdlock_wait_start( |
223 | rwlock_instr, rwlock->prwlock, __FILE__, __LINE__); |
224 | |
225 | #endif |
226 | |
227 | paranoid_invariant(!rwlock->wait_users_go_to_zero); |
228 | if (rwlock->writer || rwlock->want_write) { |
229 | rwlock->want_read++; |
230 | while (rwlock->writer || rwlock->want_write) { |
231 | toku_cond_wait(&rwlock->wait_read, mutex); |
232 | } |
233 | rwlock->want_read--; |
234 | } |
235 | rwlock->reader++; |
236 | #ifdef TOKU_MYSQL_WITH_PFS |
237 | /* Instrumentation end */ |
238 | toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0); |
239 | #endif |
240 | } |
241 | |
242 | // release a read lock |
243 | // expects: mutex is locked |
244 | |
245 | static inline void rwlock_read_unlock(RWLOCK rwlock) { |
246 | #ifdef TOKU_MYSQL_WITH_PFS |
247 | toku_instr_rwlock_unlock(rwlock->prwlock); |
248 | #endif |
249 | paranoid_invariant(rwlock->reader > 0); |
250 | paranoid_invariant(rwlock->writer == 0); |
251 | rwlock->reader--; |
252 | if (rwlock->reader == 0 && rwlock->want_write) { |
253 | toku_cond_signal(&rwlock->wait_write); |
254 | } |
255 | if (rwlock->wait_users_go_to_zero && rwlock_users(rwlock) == 0) { |
256 | toku_cond_signal(rwlock->wait_users_go_to_zero); |
257 | } |
258 | } |
259 | |
260 | // obtain a write lock |
261 | // expects: mutex is locked |
262 | |
263 | static inline void rwlock_write_lock(RWLOCK rwlock, toku_mutex_t *mutex) { |
264 | #ifdef TOKU_MYSQL_WITH_PFS |
265 | /* Instrumentation start */ |
266 | toku_rwlock_instrumentation rwlock_instr; |
267 | toku_instr_rwlock_wrlock_wait_start( |
268 | rwlock_instr, rwlock->prwlock, __FILE__, __LINE__); |
269 | #endif |
270 | paranoid_invariant(!rwlock->wait_users_go_to_zero); |
271 | if (rwlock->reader || rwlock->writer) { |
272 | rwlock->want_write++; |
273 | while (rwlock->reader || rwlock->writer) { |
274 | toku_cond_wait(&rwlock->wait_write, mutex); |
275 | } |
276 | rwlock->want_write--; |
277 | } |
278 | rwlock->writer++; |
279 | #if defined(TOKU_MYSQL_WITH_PFS) |
280 | /* Instrumentation end */ |
281 | toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0); |
282 | #endif |
283 | } |
284 | |
285 | // release a write lock |
286 | // expects: mutex is locked |
287 | |
288 | static inline void rwlock_write_unlock(RWLOCK rwlock) { |
289 | #if defined(TOKU_MYSQL_WITH_PFS) |
290 | toku_instr_rwlock_unlock(rwlock->prwlock); |
291 | #endif |
292 | paranoid_invariant(rwlock->reader == 0); |
293 | paranoid_invariant(rwlock->writer == 1); |
294 | rwlock->writer--; |
295 | if (rwlock->want_write) { |
296 | toku_cond_signal(&rwlock->wait_write); |
297 | } else if (rwlock->want_read) { |
298 | toku_cond_broadcast(&rwlock->wait_read); |
299 | } |
300 | if (rwlock->wait_users_go_to_zero && rwlock_users(rwlock) == 0) { |
301 | toku_cond_signal(rwlock->wait_users_go_to_zero); |
302 | } |
303 | } |
304 | |
305 | // returns: the number of readers |
306 | |
307 | static inline int rwlock_readers(RWLOCK rwlock) { |
308 | return rwlock->reader; |
309 | } |
310 | |
311 | // returns: the number of readers who are waiting for the lock |
312 | |
313 | static inline int rwlock_blocked_readers(RWLOCK rwlock) { |
314 | return rwlock->want_read; |
315 | } |
316 | |
317 | // returns: the number of writers who are waiting for the lock |
318 | |
319 | static inline int rwlock_blocked_writers(RWLOCK rwlock) { |
320 | return rwlock->want_write; |
321 | } |
322 | |
323 | // returns: the number of writers |
324 | |
325 | static inline int rwlock_writers(RWLOCK rwlock) { |
326 | return rwlock->writer; |
327 | } |
328 | |
329 | static inline bool rwlock_write_will_block(RWLOCK rwlock) { |
330 | return (rwlock->writer > 0 || rwlock->reader > 0); |
331 | } |
332 | |
333 | static inline int rwlock_read_will_block(RWLOCK rwlock) { |
334 | return (rwlock->writer > 0 || rwlock->want_write > 0); |
335 | } |
336 | |
// Block until every user (holders and waiters, per rwlock_users) of the
// rwlock has released it.
// expects: mutex is locked; it is atomically released while waiting
// inside toku_cond_wait and re-acquired before returning.
static inline void rwlock_wait_for_users(RWLOCK rwlock, toku_mutex_t *mutex) {
    // only one thread at a time may use this facility
    paranoid_invariant(!rwlock->wait_users_go_to_zero);
    toku_cond_t cond;
    toku_cond_init(toku_uninstrumented, &cond, nullptr);
    while (rwlock_users(rwlock) > 0) {
        // publish the condition variable so the unlock paths signal it
        // when the user count drops to zero
        rwlock->wait_users_go_to_zero = &cond;
        toku_cond_wait(&cond, mutex);
    }
    rwlock->wait_users_go_to_zero = NULL;
    toku_cond_destroy(&cond);
}
348 | |
349 | |