/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include <db.h>

#include <locktree/lock_request.h>

#include "ydb-internal.h"
#include "ydb_txn.h"
#include "ydb_row_lock.h"

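// Overview (added summary, inferred from the functions below): this file wires
// DB/DB_TXN row locking to the locktree. It acquires range locks on behalf of
// the root of a nested transaction tree, remembers the acquired ranges in the
// txn's lt_map so they can be released at commit or abort, and keeps that map
// consistent when the locktree escalates locks.
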
/*
    Used for partial implementation of nested transactions.
    Work is done by children as normal, but all locking is done by the
    root of the nested txn tree.
    This may hold extra locks, and will not work as expected when
    a node has two non-completed txns at any time.
*/
static DB_TXN *txn_oldest_ancester(DB_TXN* txn) {
    while (txn && txn->parent) {
        txn = txn->parent;
    }
    return txn;
}

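// Comparison callback for the txn's lt_map OMT: orders txn_lt_key_ranges
// entries by locktree so find_zero() can locate the entry for a given locktree.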
int find_key_ranges_by_lt(const txn_lt_key_ranges &ranges,
                          const toku::locktree *const &find_lt);
int find_key_ranges_by_lt(const txn_lt_key_ranges &ranges,
                          const toku::locktree *const &find_lt) {
    return ranges.lt->compare(find_lt);
}

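// Record that txn now holds a lock on the range [left_key, right_key] in this
// DB's locktree, so the range can be released when the txn completes. The
// caller must have already acquired the lock, and txn should be the root of
// its nested transaction tree.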
static void db_txn_note_row_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key) {
    const toku::locktree *lt = db->i->lt;

    toku_mutex_lock(&db_txn_struct_i(txn)->txn_mutex);

    uint32_t idx;
    txn_lt_key_ranges ranges;
    toku::omt<txn_lt_key_ranges> *map = &db_txn_struct_i(txn)->lt_map;

    // if this txn has not yet referenced this locktree,
    // then add it to this txn's locktree map
    int r = map->find_zero<const toku::locktree *, find_key_ranges_by_lt>(lt, &ranges, &idx);
    if (r == DB_NOTFOUND) {
        ranges.lt = db->i->lt;
        XMALLOC(ranges.buffer);
        ranges.buffer->create();
        map->insert_at(ranges, idx);

        // let the manager know we're referencing this lt
        toku::locktree_manager *ltm = &txn->mgrp->i->ltm;
        ltm->reference_lt(ranges.lt);
    } else {
        invariant_zero(r);
    }

    // add a new lock range to this txn's row lock buffer
    size_t old_mem_size = ranges.buffer->total_memory_size();
    ranges.buffer->append(left_key, right_key);
    size_t new_mem_size = ranges.buffer->total_memory_size();
    invariant(new_mem_size > old_mem_size);
    lt->get_manager()->note_mem_used(new_mem_size - old_mem_size);

    toku_mutex_unlock(&db_txn_struct_i(txn)->txn_mutex);
}

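// Callback invoked by the locktree when it escalates the locks held by txnid.
// It replaces the txn's stored range buffer for this locktree with the
// escalated ranges in 'buffer', so the txn later releases the escalated locks
// rather than the stale pre-escalation ranges.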
void toku_db_txn_escalate_callback(TXNID txnid, const toku::locktree *lt, const toku::range_buffer &buffer, void *extra) {
    DB_ENV *CAST_FROM_VOIDP(env, extra);

    // Get the TOKUTXN and DB_TXN for this txnid from the environment's txn manager.
    // Only the parent id is used in the search.
    TOKUTXN ttxn;
    TXNID_PAIR txnid_pair = { .parent_id64 = txnid, .child_id64 = 0 };
    TXN_MANAGER txn_manager = toku_logger_get_txn_manager(env->i->logger);

    toku_txn_manager_suspend(txn_manager);
    toku_txn_manager_id2txn_unlocked(txn_manager, txnid_pair, &ttxn);

    // We are still holding the txn manager lock. If we couldn't find the txn,
    // then we lost a race with a committing transaction that got removed
    // from the txn manager before it released its locktree locks. In this
    // case we do nothing - that transaction has released or is just about to
    // release its locks and be gone, so there's no point in updating its lt_map
    // with the new escalated ranges. It will go about releasing the old
    // locks it thinks it had, and will succeed as if nothing happened.
    //
    // If we did find the transaction, then it has not yet been removed
    // from the manager and therefore has not yet released its locks.
    // We must try to replace the range buffer associated with this locktree,
    // if it exists. This is important, otherwise it can grow out of
    // control (ticket 5961).

    if (ttxn != nullptr) {
        DB_TXN *txn = toku_txn_get_container_db_txn(ttxn);

        // One subtle point is that if the transaction is still live, it is impossible
        // to deadlock on the txn mutex, even though we are holding the locktree's root
        // mutex and releasing locks takes them in the opposite order.
        //
        // Proof: releasing locks takes the txn mutex and then acquires the locktree's
        // root mutex, while escalation takes the root mutex and possibly takes the txn
        // mutex. Releasing locks implies the txn is not live, and a non-live txn implies
        // we will not need to take the txn mutex, so the deadlock is avoided.
        toku_mutex_lock(&db_txn_struct_i(txn)->txn_mutex);

        uint32_t idx;
        txn_lt_key_ranges ranges;
        toku::omt<txn_lt_key_ranges> *map = &db_txn_struct_i(txn)->lt_map;
        int r = map->find_zero<const toku::locktree *, find_key_ranges_by_lt>(lt, &ranges, &idx);
        if (r == 0) {
            // Destroy the old range buffer, create a new one, and insert the new ranges.
            //
            // We could theoretically steal the memory from the caller instead of copying
            // it, but it's simpler to have a callback API that doesn't transfer memory ownership.
            lt->get_manager()->note_mem_released(ranges.buffer->total_memory_size());
            ranges.buffer->destroy();
            ranges.buffer->create();
            toku::range_buffer::iterator iter(&buffer);
            toku::range_buffer::iterator::record rec;
            while (iter.current(&rec)) {
                ranges.buffer->append(rec.get_left_key(), rec.get_right_key());
                iter.next();
            }
            lt->get_manager()->note_mem_used(ranges.buffer->total_memory_size());
        } else {
            // In rare cases, we may not find the associated locktree, because we are
            // racing with the transaction trying to add this locktree to the lt map
            // after acquiring its first lock. The escalated lock set must be the single
            // lock that this txnid just acquired. Do nothing here and let the txn
            // take care of adding this locktree and range to its lt map as usual.
            invariant(buffer.get_num_ranges() == 1);
        }

        toku_mutex_unlock(&db_txn_struct_i(txn)->txn_mutex);
    }

    toku_txn_manager_resume(txn_manager);
}

// Get a range lock.
// Return when the range lock is acquired or the default lock tree timeout has expired.
int toku_db_get_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key,
        toku::lock_request::type lock_type) {
    toku::lock_request request;
    request.create();
    int r = toku_db_start_range_lock(db, txn, left_key, right_key, lock_type, &request);
    if (r == DB_LOCK_NOTGRANTED) {
        toku_debug_sync(db_txn_struct_i(txn)->tokutxn,
                        "toku_range_lock_before_wait");
        r = toku_db_wait_range_lock(db, txn, &request);
        if (r == DB_LOCK_NOTGRANTED)
            toku_debug_sync(db_txn_struct_i(txn)->tokutxn,
                            "toku_range_lock_not_granted_after_wait");
    } else if (r == 0) {
        toku_debug_sync(db_txn_struct_i(txn)->tokutxn,
                        "toku_range_lock_granted_immediately");
    }

    request.destroy();
    return r;
}
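
// A minimal usage sketch (hypothetical caller; assumes db, txn, and the DBT
// endpoints have already been set up, e.g. with toku_fill_dbt):
//
//     int r = toku_db_get_range_lock(db, txn, &left, &right,
//                                    toku::lock_request::type::READ);
//     if (r == DB_LOCK_NOTGRANTED) {
//         // the lock could not be acquired before the lock timeout expired
//     }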

// Setup and start an asynchronous lock request.
int toku_db_start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key,
        toku::lock_request::type lock_type, toku::lock_request *request) {
    uint64_t client_id;
    void *client_extra;
    DB_TXN *txn_anc = txn_oldest_ancester(txn);
    TXNID txn_anc_id = txn_anc->id64(txn_anc);
    txn->get_client_id(txn, &client_id, &client_extra);
    request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type,
                 toku_is_big_txn(txn_anc), client_extra);

    const int r = request->start();
    if (r == 0) {
        db_txn_note_row_lock(db, txn_anc, left_key, right_key);
    } else if (r == DB_LOCK_DEADLOCK) {
        lock_timeout_callback callback = txn->mgrp->i->lock_wait_timeout_callback;
        if (callback != nullptr) {
            callback(db, txn_anc_id, left_key, right_key,
                     request->get_conflicting_txnid());
        }
    }
    return r;
}

// Complete a lock request by waiting until the request is ready
// and then storing the acquired lock if successful.
int toku_db_wait_range_lock(DB *db, DB_TXN *txn, toku::lock_request *request) {
    DB_TXN *txn_anc = txn_oldest_ancester(txn);
    const DBT *left_key = request->get_left_key();
    const DBT *right_key = request->get_right_key();
    DB_ENV *env = db->dbenv;
    uint64_t wait_time_msec = env->i->default_lock_timeout_msec;
    if (env->i->get_lock_timeout_callback)
        wait_time_msec = env->i->get_lock_timeout_callback(wait_time_msec);
    uint64_t killed_time_msec = env->i->default_killed_time_msec;
    if (env->i->get_killed_time_callback)
        killed_time_msec = env->i->get_killed_time_callback(killed_time_msec);
    const int r = request->wait(wait_time_msec, killed_time_msec, env->i->killed_callback,
                                env->i->lock_wait_needed_callback);
    if (r == 0) {
        db_txn_note_row_lock(db, txn_anc, left_key, right_key);
    } else if (r == DB_LOCK_NOTGRANTED) {
        lock_timeout_callback callback = txn->mgrp->i->lock_wait_timeout_callback;
        if (callback != nullptr) {
            callback(db, txn_anc->id64(txn_anc), left_key, right_key,
                     request->get_conflicting_txnid());
        }
    }
    return r;
}

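// Acquire a write lock on a single key by requesting the degenerate range
// [key, key], waiting if necessary.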
int toku_db_get_point_write_lock(DB *db, DB_TXN *txn, const DBT *key) {
    return toku_db_get_range_lock(db, txn, key, key, toku::lock_request::type::WRITE);
}

// acquire a point write lock on the key for a given txn.
// this does not block the calling thread.
void toku_db_grab_write_lock(DB *db, DBT *key, TOKUTXN tokutxn) {
    uint64_t client_id;
    void *client_extra;
    DB_TXN *txn = toku_txn_get_container_db_txn(tokutxn);
    DB_TXN *txn_anc = txn_oldest_ancester(txn);
    TXNID txn_anc_id = txn_anc->id64(txn_anc);

    // This lock request must succeed, so we do not want to wait
    toku::lock_request request;
    request.create();
    txn->get_client_id(txn, &client_id, &client_extra);
    request.set(db->i->lt, txn_anc_id, key, key,
                toku::lock_request::type::WRITE, toku_is_big_txn(txn_anc),
                client_extra);
    int r = request.start();
    invariant_zero(r);
    db_txn_note_row_lock(db, txn_anc, key, key);
    request.destroy();
}

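// Release every lock this txn holds in the given locktree, free the range
// buffer that tracked them, retry pending lock requests that may now be
// grantable, and drop the txn's reference on the locktree.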
void toku_db_release_lt_key_ranges(DB_TXN *txn, txn_lt_key_ranges *ranges) {
    toku::locktree *lt = ranges->lt;
    TXNID txnid = txn->id64(txn);

    // release all of the locks this txn has ever successfully
    // acquired and stored in the range buffer for this locktree
    lt->release_locks(txnid, ranges->buffer);
    lt->get_manager()->note_mem_released(ranges->buffer->total_memory_size());
    ranges->buffer->destroy();
    toku_free(ranges->buffer);

    // all of our locks have been released, so first try to wake up
    // pending lock requests, then release our reference on the lt
    toku::lock_request::retry_all_lock_requests(lt, txn->mgrp->i->lock_wait_needed_callback);

    // Release our reference on this locktree
    toku::locktree_manager *ltm = &txn->mgrp->i->ltm;
    ltm->release_lt(lt);
}