| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * @a M. L. Kersten, P. Boncz, N. J. Nes |
| 11 | * |
| 12 | * @* Transaction management |
| 13 | * The Transaction Manager maintains the buffer of (permanent) BATS |
| 14 | * held resident. Entries from the BAT buffer are always accessed by |
| 15 | * BAT id. A BAT becomes permanent by assigning a name with |
| 16 | * @%BBPrename@. Access to the transaction table is regulated by a |
| 17 | * semaphore. |
| 18 | */ |
| 19 | #include "monetdb_config.h" |
| 20 | #include "gdk.h" |
| 21 | #include "gdk_private.h" |
| 22 | #include "gdk_tm.h" |
| 23 | |
| 24 | /* |
| 25 | * The physical (disk) commit protocol is handled mostly by |
| 26 | * BBPsync. Once a commit succeeded, there is the task of removing |
| 27 | * ex-persistent bats (those that still were persistent in the |
| 28 | * previous commit, but were made transient in this transaction). |
| 29 | * Notice that such ex- (i.e. non-) persistent bats are not backed up |
| 30 | * by the BBPsync protocol, so we cannot start deleting until we know |
| 31 | * the commit has succeeded. |
| 32 | * |
| 33 | * Another hairy issue is the delta statuses in BATs. These provide a |
| 34 | * fast way to perform a transaction abort (HOT-abort, instead of |
| 35 | * COLD-abort, which is achieved by the BBP recovery in a database |
| 36 | * restart). Hot-abort functionality has not been important in MonetDB |
| 37 | * for now, so it is not well-tested. The problem here is that if a |
| 38 | * commit fails in the physical part (BBPsync), we do not have sufficient |
| 39 | * information to roll back the delta statuses. |
| 40 | * |
| 41 | * So a 'feature' of the abort is that after a failed commit, |
| 42 | * in-memory we *will* commit the transaction. Subsequent commits can |
| 43 | * retry to achieve a physical commit. The only way to abort in such a |
| 44 | * situation is COLD-abort: quit the server and restart, so you get |
| 45 | * the recovered disk images. |
| 46 | */ |
| 47 | /* in the commit prelude, the delta status in the memory image of all |
| 48 | * bats is commited */ |
| 49 | static gdk_return |
| 50 | prelude(int cnt, bat *subcommit) |
| 51 | { |
| 52 | int i = 0; |
| 53 | |
| 54 | while (++i < cnt) { |
| 55 | bat bid = subcommit ? subcommit[i] : i; |
| 56 | |
| 57 | if (BBP_status(bid) & BBPPERSISTENT) { |
| 58 | BAT *b = BBP_cache(bid); |
| 59 | |
| 60 | if (b == NULL && (BBP_status(bid) & BBPSWAPPED)) { |
| 61 | b = BBPquickdesc(bid, true); |
| 62 | if (b == NULL) |
| 63 | return GDK_FAIL; |
| 64 | } |
| 65 | if (b) { |
| 66 | assert(!isVIEW(b)); |
| 67 | assert(b->batRole == PERSISTENT); |
| 68 | BATcommit(b); |
| 69 | } |
| 70 | } |
| 71 | } |
| 72 | return GDK_SUCCEED; |
| 73 | } |
| 74 | |
| 75 | /* in the commit epilogue, the BBP-status of the bats is changed to |
| 76 | * reflect their presence in the succeeded checkpoint. Also bats from |
| 77 | * the previous checkpoint that were deleted now are physically |
| 78 | * destroyed. |
| 79 | */ |
| 80 | static void |
| 81 | epilogue(int cnt, bat *subcommit) |
| 82 | { |
| 83 | int i = 0; |
| 84 | |
| 85 | while (++i < cnt) { |
| 86 | bat bid = subcommit ? subcommit[i] : i; |
| 87 | |
| 88 | if (BBP_status(bid) & BBPPERSISTENT) { |
| 89 | BBP_status_on(bid, BBPEXISTING, subcommit ? "TMsubcommit" : "TMcommit" ); |
| 90 | } else if (BBP_status(bid) & BBPDELETED) { |
| 91 | /* check mmap modes of bats that are now |
| 92 | * transient. this has to be done after the |
| 93 | * commit succeeded, because the mmap modes |
| 94 | * allowed on transient bats would be |
| 95 | * dangerous on persistent bats. If the commit |
| 96 | * failed, the already processed bats that |
| 97 | * would become transient after the commit, |
| 98 | * but didn't due to the failure, would be a |
| 99 | * consistency risk. |
| 100 | */ |
| 101 | BAT *b = BBP_cache(bid); |
| 102 | if (b) { |
| 103 | /* check mmap modes */ |
| 104 | if (BATcheckmodes(b, true) != GDK_SUCCEED) |
| 105 | fprintf(stderr, "#epilogue: BATcheckmodes failed\n" ); |
| 106 | } |
| 107 | } |
| 108 | if ((BBP_status(bid) & BBPDELETED) && BBP_refs(bid) <= 0 && BBP_lrefs(bid) <= 0) { |
| 109 | BAT *b = BBPquickdesc(bid, true); |
| 110 | |
| 111 | /* the unloaded ones are deleted without |
| 112 | * loading deleted disk images */ |
| 113 | if (b) { |
| 114 | BATdelete(b); |
| 115 | if (BBP_cache(bid)) { |
| 116 | /* those that quickdesc |
| 117 | * decides to load => free |
| 118 | * memory */ |
| 119 | BATfree(b); |
| 120 | } |
| 121 | } |
| 122 | BBPclear(bid); /* clear with locking */ |
| 123 | } |
| 124 | BBP_status_off(bid, BBPDELETED | BBPSWAPPED | BBPNEW, subcommit ? "TMsubcommit" : "TMcommit" ); |
| 125 | } |
| 126 | GDKclrerr(); |
| 127 | } |
| 128 | |
| 129 | /* |
| 130 | * @- TMcommit |
| 131 | * global commit without any multi-threaded access assumptions, thus |
| 132 | * taking all BBP locks. It creates a new database checkpoint. |
| 133 | */ |
| 134 | gdk_return |
| 135 | TMcommit(void) |
| 136 | { |
| 137 | gdk_return ret = GDK_FAIL; |
| 138 | |
| 139 | /* commit with the BBP globally locked */ |
| 140 | BBPlock(); |
| 141 | if (prelude(getBBPsize(), NULL) == GDK_SUCCEED && |
| 142 | BBPsync(getBBPsize(), NULL) == GDK_SUCCEED) { |
| 143 | epilogue(getBBPsize(), NULL); |
| 144 | ret = GDK_SUCCEED; |
| 145 | } |
| 146 | BBPunlock(); |
| 147 | return ret; |
| 148 | } |
| 149 | |
| 150 | /* |
| 151 | * @- TMsubcommit |
| 152 | * |
| 153 | * Create a new checkpoint that is equal to the previous, with the |
| 154 | * exception that for the passed list of batnames, the current state |
| 155 | * will be reflected in the new checkpoint. |
| 156 | * |
| 157 | * On the bats in this list we assume exclusive access during the |
| 158 | * operation. |
| 159 | * |
| 160 | * This operation is useful for e.g. adding a new XQuery document or |
| 161 | * SQL table to the committed state (after bulk-load). Or for dropping |
| 162 | * a table or doc, without forcing the total database to be clean, |
| 163 | * which may require a lot of I/O. |
| 164 | * |
| 165 | * We expect the globally locked phase (BBPsync) to take little time |
| 166 | * (<100ms) as only the BBP.dir is written out; and for the existing |
| 167 | * bats that were modified, only some heap moves are done (moved from |
| 168 | * BAKDIR to SUBDIR). The atomic commit for sub-commit is the rename |
| 169 | * of SUBDIR to DELDIR. |
| 170 | * |
| 171 | * As it does not take the BBP-locks (thanks to the assumption that |
| 172 | * access is exclusive), the concurrency impact of subcommit is also |
| 173 | * much lighter to ongoing concurrent query and update facilities than |
| 174 | * a real global TMcommit. |
| 175 | */ |
| 176 | gdk_return |
| 177 | TMsubcommit_list(bat *subcommit, int cnt) |
| 178 | { |
| 179 | int xx; |
| 180 | gdk_return ret = GDK_FAIL; |
| 181 | |
| 182 | assert(cnt > 0); |
| 183 | assert(subcommit[0] == 0); /* BBP artifact: slot 0 in the array will be ignored */ |
| 184 | |
| 185 | if (GDKinmemory()) |
| 186 | return GDK_SUCCEED; |
| 187 | |
| 188 | /* sort the list on BAT id */ |
| 189 | GDKqsort(subcommit + 1, NULL, NULL, cnt - 1, sizeof(bat), 0, TYPE_bat, false, false); |
| 190 | |
| 191 | assert(cnt == 1 || subcommit[1] > 0); /* all values > 0 */ |
| 192 | /* de-duplication of BAT ids in subcommit list |
| 193 | * this is needed because of legacy reasons (database |
| 194 | * upgrade) */ |
| 195 | for (xx = 2; xx < cnt; xx++) { |
| 196 | if (subcommit[xx-1] == subcommit[xx]) { |
| 197 | int i; |
| 198 | cnt--; |
| 199 | for (i = xx; i < cnt; i++) |
| 200 | subcommit[i] = subcommit[i+1]; |
| 201 | } |
| 202 | } |
| 203 | if (prelude(cnt, subcommit) == GDK_SUCCEED) { /* save the new bats outside the lock */ |
| 204 | /* lock just prevents BBPtrims, and other global |
| 205 | * (sub-)commits */ |
| 206 | for (xx = 0; xx <= BBP_THREADMASK; xx++) |
| 207 | MT_lock_set(&GDKtrimLock(xx)); |
| 208 | if (BBPsync(cnt, subcommit) == GDK_SUCCEED) { /* write BBP.dir (++) */ |
| 209 | epilogue(cnt, subcommit); |
| 210 | ret = GDK_SUCCEED; |
| 211 | } |
| 212 | for (xx = BBP_THREADMASK; xx >= 0; xx--) |
| 213 | MT_lock_unset(&GDKtrimLock(xx)); |
| 214 | } |
| 215 | return ret; |
| 216 | } |
| 217 | |
| 218 | gdk_return |
| 219 | TMsubcommit(BAT *b) |
| 220 | { |
| 221 | int cnt = 1; |
| 222 | gdk_return ret = GDK_FAIL; |
| 223 | bat *subcommit; |
| 224 | BUN p, q; |
| 225 | BATiter bi = bat_iterator(b); |
| 226 | |
| 227 | subcommit = GDKmalloc((BATcount(b) + 1) * sizeof(bat)); |
| 228 | if (subcommit == NULL) |
| 229 | return GDK_FAIL; |
| 230 | |
| 231 | subcommit[0] = 0; /* BBP artifact: slot 0 in the array will be ignored */ |
| 232 | /* collect the list and save the new bats outside any |
| 233 | * locking */ |
| 234 | BATloop(b, p, q) { |
| 235 | bat bid = BBPindex((str) BUNtvar(bi, p)); |
| 236 | |
| 237 | if (bid) |
| 238 | subcommit[cnt++] = bid; |
| 239 | } |
| 240 | |
| 241 | ret = TMsubcommit_list(subcommit, cnt); |
| 242 | GDKfree(subcommit); |
| 243 | return ret; |
| 244 | } |
| 245 | |
| 246 | /* |
| 247 | * @- TMabort |
| 248 | * Transaction abort is cheap. We use the delta statuses to go back to |
| 249 | * the previous version of each BAT. Also for BATs that are currently |
| 250 | * swapped out. Persistent BATs that were made transient in this |
| 251 | * transaction become persistent again. |
| 252 | */ |
| 253 | void |
| 254 | TMabort(void) |
| 255 | { |
| 256 | int i; |
| 257 | |
| 258 | BBPlock(); |
| 259 | for (i = 1; i < getBBPsize(); i++) { |
| 260 | if (BBP_status(i) & BBPNEW) { |
| 261 | BAT *b = BBPquickdesc(i, false); |
| 262 | |
| 263 | if (b) { |
| 264 | if (!b->batTransient) |
| 265 | BBPrelease(i); |
| 266 | b->batTransient = true; |
| 267 | b->batDirtydesc = true; |
| 268 | } |
| 269 | } |
| 270 | } |
| 271 | for (i = 1; i < getBBPsize(); i++) { |
| 272 | if (BBP_status(i) & (BBPPERSISTENT | BBPDELETED | BBPSWAPPED)) { |
| 273 | BAT *b = BBPquickdesc(i, true); |
| 274 | |
| 275 | if (b == NULL) |
| 276 | continue; |
| 277 | |
| 278 | BBPfix(i); |
| 279 | if (BATdirty(b) || DELTAdirty(b)) { |
| 280 | /* BUN move-backes need a real BAT! */ |
| 281 | /* Stefan: |
| 282 | * Actually, in case DELTAdirty(b), |
| 283 | * i.e., a BAT with differences that |
| 284 | * is saved/swapped-out but not yet |
| 285 | * committed, we (AFAIK) don't have to |
| 286 | * load the BAT and apply the undo, |
| 287 | * but rather could simply discard the |
| 288 | * delta and revive the backup; |
| 289 | * however, I don't know how to do |
| 290 | * this (yet), hence we stick with |
| 291 | * this solution for the time being |
| 292 | * --- it should be correct though it |
| 293 | * might not be the most efficient |
| 294 | * way... |
| 295 | */ |
| 296 | b = BBPdescriptor(i); |
| 297 | BATundo(b); |
| 298 | } |
| 299 | if (BBP_status(i) & BBPDELETED) { |
| 300 | BBP_status_on(i, BBPEXISTING, "TMabort" ); |
| 301 | if (b->batTransient) |
| 302 | BBPretain(i); |
| 303 | b->batTransient = false; |
| 304 | b->batDirtydesc = true; |
| 305 | } |
| 306 | BBPunfix(i); |
| 307 | } |
| 308 | BBP_status_off(i, BBPDELETED | BBPSWAPPED | BBPNEW, "TMabort" ); |
| 309 | } |
| 310 | BBPunlock(); |
| 311 | GDKclrerr(); |
| 312 | } |
| 313 | |