1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * @a M. L. Kersten, P. Boncz, N. J. Nes
11 *
12 * @* Transaction management
13 * The Transaction Manager maintains the buffer of (permanent) BATS
14 * held resident. Entries from the BAT buffer are always accessed by
15 * BAT id. A BAT becomes permanent by assigning a name with
16 * @%BBPrename@. Access to the transaction table is regulated by a
17 * semaphore.
18 */
19#include "monetdb_config.h"
20#include "gdk.h"
21#include "gdk_private.h"
22#include "gdk_tm.h"
23
24/*
25 * The physical (disk) commit protocol is handled mostly by
26 * BBPsync. Once a commit succeeded, there is the task of removing
27 * ex-persistent bats (those that still were persistent in the
28 * previous commit, but were made transient in this transaction).
 * Notice that such ex- (i.e. non-) persistent bats are not backed up
 * by the BBPsync protocol, so we cannot start deleting them until we
 * know the commit has succeeded.
32 *
 * Another hairy issue is the delta statuses in BATs. These provide a
 * fast way to perform a transaction abort (HOT-abort, instead of
 * COLD-abort, which is achieved by the BBP recovery in a database
 * restart). Hot-abort functionality has not been important in MonetDB
 * for now, so it is not well-tested. The problem here is that if a
 * commit fails in the physical part (BBPsync), we do not have
 * sufficient information to roll back the delta statuses.
40 *
41 * So a 'feature' of the abort is that after a failed commit,
42 * in-memory we *will* commit the transaction. Subsequent commits can
43 * retry to achieve a physical commit. The only way to abort in such a
44 * situation is COLD-abort: quit the server and restart, so you get
45 * the recovered disk images.
46 */
47/* in the commit prelude, the delta status in the memory image of all
48 * bats is commited */
49static gdk_return
50prelude(int cnt, bat *subcommit)
51{
52 int i = 0;
53
54 while (++i < cnt) {
55 bat bid = subcommit ? subcommit[i] : i;
56
57 if (BBP_status(bid) & BBPPERSISTENT) {
58 BAT *b = BBP_cache(bid);
59
60 if (b == NULL && (BBP_status(bid) & BBPSWAPPED)) {
61 b = BBPquickdesc(bid, true);
62 if (b == NULL)
63 return GDK_FAIL;
64 }
65 if (b) {
66 assert(!isVIEW(b));
67 assert(b->batRole == PERSISTENT);
68 BATcommit(b);
69 }
70 }
71 }
72 return GDK_SUCCEED;
73}
74
75/* in the commit epilogue, the BBP-status of the bats is changed to
76 * reflect their presence in the succeeded checkpoint. Also bats from
77 * the previous checkpoint that were deleted now are physically
78 * destroyed.
79 */
80static void
81epilogue(int cnt, bat *subcommit)
82{
83 int i = 0;
84
85 while (++i < cnt) {
86 bat bid = subcommit ? subcommit[i] : i;
87
88 if (BBP_status(bid) & BBPPERSISTENT) {
89 BBP_status_on(bid, BBPEXISTING, subcommit ? "TMsubcommit" : "TMcommit");
90 } else if (BBP_status(bid) & BBPDELETED) {
91 /* check mmap modes of bats that are now
92 * transient. this has to be done after the
93 * commit succeeded, because the mmap modes
94 * allowed on transient bats would be
95 * dangerous on persistent bats. If the commit
96 * failed, the already processed bats that
97 * would become transient after the commit,
98 * but didn't due to the failure, would be a
99 * consistency risk.
100 */
101 BAT *b = BBP_cache(bid);
102 if (b) {
103 /* check mmap modes */
104 if (BATcheckmodes(b, true) != GDK_SUCCEED)
105 fprintf(stderr, "#epilogue: BATcheckmodes failed\n");
106 }
107 }
108 if ((BBP_status(bid) & BBPDELETED) && BBP_refs(bid) <= 0 && BBP_lrefs(bid) <= 0) {
109 BAT *b = BBPquickdesc(bid, true);
110
111 /* the unloaded ones are deleted without
112 * loading deleted disk images */
113 if (b) {
114 BATdelete(b);
115 if (BBP_cache(bid)) {
116 /* those that quickdesc
117 * decides to load => free
118 * memory */
119 BATfree(b);
120 }
121 }
122 BBPclear(bid); /* clear with locking */
123 }
124 BBP_status_off(bid, BBPDELETED | BBPSWAPPED | BBPNEW, subcommit ? "TMsubcommit" : "TMcommit");
125 }
126 GDKclrerr();
127}
128
129/*
130 * @- TMcommit
131 * global commit without any multi-threaded access assumptions, thus
132 * taking all BBP locks. It creates a new database checkpoint.
133 */
134gdk_return
135TMcommit(void)
136{
137 gdk_return ret = GDK_FAIL;
138
139 /* commit with the BBP globally locked */
140 BBPlock();
141 if (prelude(getBBPsize(), NULL) == GDK_SUCCEED &&
142 BBPsync(getBBPsize(), NULL) == GDK_SUCCEED) {
143 epilogue(getBBPsize(), NULL);
144 ret = GDK_SUCCEED;
145 }
146 BBPunlock();
147 return ret;
148}
149
150/*
151 * @- TMsubcommit
152 *
153 * Create a new checkpoint that is equal to the previous, with the
154 * exception that for the passed list of batnames, the current state
155 * will be reflected in the new checkpoint.
156 *
157 * On the bats in this list we assume exclusive access during the
158 * operation.
159 *
160 * This operation is useful for e.g. adding a new XQuery document or
161 * SQL table to the committed state (after bulk-load). Or for dropping
162 * a table or doc, without forcing the total database to be clean,
163 * which may require a lot of I/O.
164 *
165 * We expect the globally locked phase (BBPsync) to take little time
166 * (<100ms) as only the BBP.dir is written out; and for the existing
167 * bats that were modified, only some heap moves are done (moved from
168 * BAKDIR to SUBDIR). The atomic commit for sub-commit is the rename
169 * of SUBDIR to DELDIR.
170 *
171 * As it does not take the BBP-locks (thanks to the assumption that
172 * access is exclusive), the concurrency impact of subcommit is also
173 * much lighter to ongoing concurrent query and update facilities than
174 * a real global TMcommit.
175 */
176gdk_return
177TMsubcommit_list(bat *subcommit, int cnt)
178{
179 int xx;
180 gdk_return ret = GDK_FAIL;
181
182 assert(cnt > 0);
183 assert(subcommit[0] == 0); /* BBP artifact: slot 0 in the array will be ignored */
184
185 if (GDKinmemory())
186 return GDK_SUCCEED;
187
188 /* sort the list on BAT id */
189 GDKqsort(subcommit + 1, NULL, NULL, cnt - 1, sizeof(bat), 0, TYPE_bat, false, false);
190
191 assert(cnt == 1 || subcommit[1] > 0); /* all values > 0 */
192 /* de-duplication of BAT ids in subcommit list
193 * this is needed because of legacy reasons (database
194 * upgrade) */
195 for (xx = 2; xx < cnt; xx++) {
196 if (subcommit[xx-1] == subcommit[xx]) {
197 int i;
198 cnt--;
199 for (i = xx; i < cnt; i++)
200 subcommit[i] = subcommit[i+1];
201 }
202 }
203 if (prelude(cnt, subcommit) == GDK_SUCCEED) { /* save the new bats outside the lock */
204 /* lock just prevents BBPtrims, and other global
205 * (sub-)commits */
206 for (xx = 0; xx <= BBP_THREADMASK; xx++)
207 MT_lock_set(&GDKtrimLock(xx));
208 if (BBPsync(cnt, subcommit) == GDK_SUCCEED) { /* write BBP.dir (++) */
209 epilogue(cnt, subcommit);
210 ret = GDK_SUCCEED;
211 }
212 for (xx = BBP_THREADMASK; xx >= 0; xx--)
213 MT_lock_unset(&GDKtrimLock(xx));
214 }
215 return ret;
216}
217
218gdk_return
219TMsubcommit(BAT *b)
220{
221 int cnt = 1;
222 gdk_return ret = GDK_FAIL;
223 bat *subcommit;
224 BUN p, q;
225 BATiter bi = bat_iterator(b);
226
227 subcommit = GDKmalloc((BATcount(b) + 1) * sizeof(bat));
228 if (subcommit == NULL)
229 return GDK_FAIL;
230
231 subcommit[0] = 0; /* BBP artifact: slot 0 in the array will be ignored */
232 /* collect the list and save the new bats outside any
233 * locking */
234 BATloop(b, p, q) {
235 bat bid = BBPindex((str) BUNtvar(bi, p));
236
237 if (bid)
238 subcommit[cnt++] = bid;
239 }
240
241 ret = TMsubcommit_list(subcommit, cnt);
242 GDKfree(subcommit);
243 return ret;
244}
245
246/*
247 * @- TMabort
248 * Transaction abort is cheap. We use the delta statuses to go back to
249 * the previous version of each BAT. Also for BATs that are currently
250 * swapped out. Persistent BATs that were made transient in this
251 * transaction become persistent again.
252 */
253void
254TMabort(void)
255{
256 int i;
257
258 BBPlock();
259 for (i = 1; i < getBBPsize(); i++) {
260 if (BBP_status(i) & BBPNEW) {
261 BAT *b = BBPquickdesc(i, false);
262
263 if (b) {
264 if (!b->batTransient)
265 BBPrelease(i);
266 b->batTransient = true;
267 b->batDirtydesc = true;
268 }
269 }
270 }
271 for (i = 1; i < getBBPsize(); i++) {
272 if (BBP_status(i) & (BBPPERSISTENT | BBPDELETED | BBPSWAPPED)) {
273 BAT *b = BBPquickdesc(i, true);
274
275 if (b == NULL)
276 continue;
277
278 BBPfix(i);
279 if (BATdirty(b) || DELTAdirty(b)) {
280 /* BUN move-backes need a real BAT! */
281 /* Stefan:
282 * Actually, in case DELTAdirty(b),
283 * i.e., a BAT with differences that
284 * is saved/swapped-out but not yet
285 * committed, we (AFAIK) don't have to
286 * load the BAT and apply the undo,
287 * but rather could simply discard the
288 * delta and revive the backup;
289 * however, I don't know how to do
290 * this (yet), hence we stick with
291 * this solution for the time being
292 * --- it should be correct though it
293 * might not be the most efficient
294 * way...
295 */
296 b = BBPdescriptor(i);
297 BATundo(b);
298 }
299 if (BBP_status(i) & BBPDELETED) {
300 BBP_status_on(i, BBPEXISTING, "TMabort");
301 if (b->batTransient)
302 BBPretain(i);
303 b->batTransient = false;
304 b->batDirtydesc = true;
305 }
306 BBPunfix(i);
307 }
308 BBP_status_off(i, BBPDELETED | BBPSWAPPED | BBPNEW, "TMabort");
309 }
310 BBPunlock();
311 GDKclrerr();
312}
313