1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of PerconaFT.
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 PerconaFT is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 PerconaFT is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
21
22----------------------------------------
23
24 PerconaFT is free software: you can redistribute it and/or modify
25 it under the terms of the GNU Affero General Public License, version 3,
26 as published by the Free Software Foundation.
27
28 PerconaFT is distributed in the hope that it will be useful,
29 but WITHOUT ANY WARRANTY; without even the implied warranty of
30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 GNU Affero General Public License for more details.
32
33 You should have received a copy of the GNU Affero General Public License
34 along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.
35======= */
36
37#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38
39#pragma once
40
41#include <toku_assert.h>
42#include <toku_portability.h>
43#include <toku_instrumentation.h>
44
45/* Readers/writers locks implementation
46 *
47 *****************************************
48 * Overview
49 *****************************************
50 *
51 * PerconaFT employs readers/writers locks for the ephemeral locks (e.g.,
52 * on FT nodes) Why not just use the toku_pthread_rwlock API?
53 *
54 * 1) we need multiprocess rwlocks (not just multithreaded)
55 *
56 * 2) pthread rwlocks are very slow since they entail a system call
57 * (about 2000ns on a 2GHz T2500.)
58 *
59 * Related: We expect the common case to be that the lock is
60 * granted
61 *
62 * 3) We are willing to employ machine-specific instructions (such
63 * as atomic exchange, and mfence, each of which runs in about
64 * 10ns.)
65 *
66 * 4) We want to guarantee nonstarvation (many rwlock
67 * implementations can starve the writers because another reader
 * comes along before all the other readers have unlocked.)
69 *
70 *****************************************
71 * How it works
72 *****************************************
73 *
74 * We arrange that the rwlock object is in the address space of both
75 * threads or processes. For processes we use mmap().
76 *
77 * The rwlock struct comprises the following fields
78 *
79 * a long mutex field (which is accessed using xchgl() or other
80 * machine-specific instructions. This is a spin lock.
81 *
82 * a read counter (how many readers currently have the lock?)
83 *
84 * a write boolean (does a writer have the lock?)
85 *
86 * a singly linked list of semaphores for waiting requesters. This
87 * list is sorted oldest requester first. Each list element
88 * contains a semaphore (which is provided by the requestor) and a
89 * boolean indicating whether it is a reader or a writer.
90 *
91 * To lock a read rwlock:
92 *
93 * 1) Acquire the mutex.
94 *
95 * 2) If the linked list is not empty or the writer boolean is true
96 * then
97 *
98 * a) initialize your semaphore (to 0),
99 * b) add your list element to the end of the list (with rw="read")
100 * c) release the mutex
101 * d) wait on the semaphore
 *    e) when the semaphore is released, return success.
103 *
104 * 3) Otherwise increment the reader count, release the mutex, and
105 * return success.
106 *
107 * To lock the write rwlock is almost the same.
108 * 1) Acquire the mutex
109 * 2) If the list is not empty or the reader count is nonzero
110 * a) initialize semaphore
111 * b) add to end of list (with rw="write")
112 * c) release mutex
113 * d) wait on the semaphore
114 * e) return success when the semaphore releases
115 * 3) Otherwise set writer=true, release mutex and return success.
116 *
117 * To unlock a read rwlock:
118 * 1) Acquire mutex
119 * 2) Decrement reader count
120 * 3) If the count is still positive or the list is empty then
121 * return success
122 * 4) Otherwise (count==zero and the list is nonempty):
123 * a) If the first element of the list is a reader:
124 * i) while the first element is a reader:
125 * x) pop the list
126 * y) increment the reader count
127 * z) increment the semaphore (releasing it for some waiter)
128 * ii) return success
129 * b) Else if the first element is a writer
130 * i) pop the list
131 * ii) set writer to true
132 * iii) increment the semaphore
133 * iv) return success
134 */
135
136//Use case:
137// A read lock is acquired by threads that get and pin an entry in the
138// cachetable. A write lock is acquired by the writer thread when an entry
// is evicted from the cachetable and is being written to storage.
140
141//Use case:
142// General purpose reader writer lock with properties:
143// 1. multiple readers, no writers
144// 2. one writer at a time
145// 3. pending writers have priority over pending readers
146
147// An external mutex must be locked when using these functions. An alternate
148// design would bury a mutex into the rwlock itself. While this may
149// increase parallelism at the expense of single thread performance, we
150// are experimenting with a single higher level lock.
151
152extern toku_instr_key *rwlock_cond_key;
153extern toku_instr_key *rwlock_wait_read_key;
154extern toku_instr_key *rwlock_wait_write_key;
155
typedef struct st_rwlock *RWLOCK;
struct st_rwlock {
    int reader;     // the number of readers currently holding the lock
    int want_read;  // the number of readers blocked waiting for the lock
    toku_cond_t wait_read;   // blocked readers wait here
    int writer;     // the number of writers currently holding the lock
    int want_write; // the number of writers blocked waiting for the lock
    toku_cond_t wait_write;  // blocked writers wait here
    // Non-NULL only while rwlock_wait_for_users() is draining the lock;
    // signaled by the unlock paths when rwlock_users() reaches zero.
    toku_cond_t *wait_users_go_to_zero;
#if defined(TOKU_MYSQL_WITH_PFS)
    toku_pthread_rwlock_t prwlock;  // performance-schema instrumentation handle
#endif
};
169
170// returns: the sum of the number of readers, pending readers, writers, and
171// pending writers
172
173static inline int rwlock_users(RWLOCK rwlock) {
174 return rwlock->reader + rwlock->want_read + rwlock->writer +
175 rwlock->want_write;
176}
177
// rwlock_init(K, R) adapts to whether performance-schema support is
// compiled in: with PFS the caller's instrumentation key K is forwarded,
// without PFS it is dropped so call sites need not change.
#if defined(TOKU_MYSQL_WITH_PFS)
#define rwlock_init(K, R) inline_rwlock_init(K, R)
#else
#define rwlock_init(K, R) inline_rwlock_init(R)
#endif
183
184// initialize a read write lock
static inline __attribute__((__unused__)) void inline_rwlock_init(
#if defined(TOKU_MYSQL_WITH_PFS)
    const toku_instr_key &rwlock_instr_key,
#endif
    RWLOCK rwlock) {
#if defined(TOKU_MYSQL_WITH_PFS)
    // Register this lock with the performance-schema instrumentation.
    toku_pthread_rwlock_init(rwlock_instr_key, &rwlock->prwlock, nullptr);
#endif
    // Initially no thread holds the lock and no thread is waiting.
    rwlock->reader = rwlock->want_read = 0;
    rwlock->writer = rwlock->want_write = 0;
    // The condition variables themselves are created uninstrumented.
    toku_cond_init(toku_uninstrumented, &rwlock->wait_read, nullptr);
    toku_cond_init(toku_uninstrumented, &rwlock->wait_write, nullptr);
    rwlock->wait_users_go_to_zero = NULL;
}
199
200// destroy a read write lock
201
202static inline __attribute__((__unused__)) void rwlock_destroy(RWLOCK rwlock) {
203 paranoid_invariant(rwlock->reader == 0);
204 paranoid_invariant(rwlock->want_read == 0);
205 paranoid_invariant(rwlock->writer == 0);
206 paranoid_invariant(rwlock->want_write == 0);
207 toku_cond_destroy(&rwlock->wait_read);
208 toku_cond_destroy(&rwlock->wait_write);
209#if defined(TOKU_MYSQL_WITH_PFS)
210 toku_pthread_rwlock_destroy(&rwlock->prwlock);
211#endif
212}
213
214// obtain a read lock
215// expects: mutex is locked
216
217static inline void rwlock_read_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
218#ifdef TOKU_MYSQL_WITH_PFS
219 /* Instrumentation start */
220 toku_rwlock_instrumentation rwlock_instr;
221 // TODO: pull location information up to caller
222 toku_instr_rwlock_rdlock_wait_start(
223 rwlock_instr, rwlock->prwlock, __FILE__, __LINE__);
224
225#endif
226
227 paranoid_invariant(!rwlock->wait_users_go_to_zero);
228 if (rwlock->writer || rwlock->want_write) {
229 rwlock->want_read++;
230 while (rwlock->writer || rwlock->want_write) {
231 toku_cond_wait(&rwlock->wait_read, mutex);
232 }
233 rwlock->want_read--;
234 }
235 rwlock->reader++;
236#ifdef TOKU_MYSQL_WITH_PFS
237 /* Instrumentation end */
238 toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0);
239#endif
240}
241
242// release a read lock
243// expects: mutex is locked
244
245static inline void rwlock_read_unlock(RWLOCK rwlock) {
246#ifdef TOKU_MYSQL_WITH_PFS
247 toku_instr_rwlock_unlock(rwlock->prwlock);
248#endif
249 paranoid_invariant(rwlock->reader > 0);
250 paranoid_invariant(rwlock->writer == 0);
251 rwlock->reader--;
252 if (rwlock->reader == 0 && rwlock->want_write) {
253 toku_cond_signal(&rwlock->wait_write);
254 }
255 if (rwlock->wait_users_go_to_zero && rwlock_users(rwlock) == 0) {
256 toku_cond_signal(rwlock->wait_users_go_to_zero);
257 }
258}
259
260// obtain a write lock
261// expects: mutex is locked
262
263static inline void rwlock_write_lock(RWLOCK rwlock, toku_mutex_t *mutex) {
264#ifdef TOKU_MYSQL_WITH_PFS
265 /* Instrumentation start */
266 toku_rwlock_instrumentation rwlock_instr;
267 toku_instr_rwlock_wrlock_wait_start(
268 rwlock_instr, rwlock->prwlock, __FILE__, __LINE__);
269#endif
270 paranoid_invariant(!rwlock->wait_users_go_to_zero);
271 if (rwlock->reader || rwlock->writer) {
272 rwlock->want_write++;
273 while (rwlock->reader || rwlock->writer) {
274 toku_cond_wait(&rwlock->wait_write, mutex);
275 }
276 rwlock->want_write--;
277 }
278 rwlock->writer++;
279#if defined(TOKU_MYSQL_WITH_PFS)
280 /* Instrumentation end */
281 toku_instr_rwlock_wrlock_wait_end(rwlock_instr, 0);
282#endif
283}
284
285// release a write lock
286// expects: mutex is locked
287
288static inline void rwlock_write_unlock(RWLOCK rwlock) {
289#if defined(TOKU_MYSQL_WITH_PFS)
290 toku_instr_rwlock_unlock(rwlock->prwlock);
291#endif
292 paranoid_invariant(rwlock->reader == 0);
293 paranoid_invariant(rwlock->writer == 1);
294 rwlock->writer--;
295 if (rwlock->want_write) {
296 toku_cond_signal(&rwlock->wait_write);
297 } else if (rwlock->want_read) {
298 toku_cond_broadcast(&rwlock->wait_read);
299 }
300 if (rwlock->wait_users_go_to_zero && rwlock_users(rwlock) == 0) {
301 toku_cond_signal(rwlock->wait_users_go_to_zero);
302 }
303}
304
305// returns: the number of readers
306
static inline int rwlock_readers(RWLOCK rwlock) {
    // Number of threads currently holding the lock for read.
    return rwlock->reader;
}
310
311// returns: the number of readers who are waiting for the lock
312
static inline int rwlock_blocked_readers(RWLOCK rwlock) {
    // Number of threads blocked waiting to acquire the lock for read.
    return rwlock->want_read;
}
316
317// returns: the number of writers who are waiting for the lock
318
static inline int rwlock_blocked_writers(RWLOCK rwlock) {
    // Number of threads blocked waiting to acquire the lock for write.
    return rwlock->want_write;
}
322
323// returns: the number of writers
324
static inline int rwlock_writers(RWLOCK rwlock) {
    // Number of threads currently holding the lock for write.
    return rwlock->writer;
}
328
329static inline bool rwlock_write_will_block(RWLOCK rwlock) {
330 return (rwlock->writer > 0 || rwlock->reader > 0);
331}
332
333static inline int rwlock_read_will_block(RWLOCK rwlock) {
334 return (rwlock->writer > 0 || rwlock->want_write > 0);
335}
336
// Block until no thread holds or is waiting for the lock.
// expects: mutex is locked
static inline void rwlock_wait_for_users(RWLOCK rwlock, toku_mutex_t *mutex) {
    // Only one drainer at a time is supported.
    paranoid_invariant(!rwlock->wait_users_go_to_zero);
    toku_cond_t cond;
    toku_cond_init(toku_uninstrumented, &cond, nullptr);
    while (rwlock_users(rwlock) > 0) {
        // Publish our condition variable so the unlock paths signal it
        // when the user count reaches zero (see rwlock_read_unlock /
        // rwlock_write_unlock).
        rwlock->wait_users_go_to_zero = &cond;
        toku_cond_wait(&cond, mutex);
    }
    rwlock->wait_users_go_to_zero = NULL;
    toku_cond_destroy(&cond);
}
348
349