1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * @a M. L. Kersten, P. Boncz, N. J. Nes |
11 | * |
12 | * @* Transaction management |
13 | * The Transaction Manager maintains the buffer of (permanent) BATS |
14 | * held resident. Entries from the BAT buffer are always accessed by |
15 | * BAT id. A BAT becomes permanent by assigning a name with |
16 | * @%BBPrename@. Access to the transaction table is regulated by a |
17 | * semaphore. |
18 | */ |
19 | #include "monetdb_config.h" |
20 | #include "gdk.h" |
21 | #include "gdk_private.h" |
22 | #include "gdk_tm.h" |
23 | |
24 | /* |
25 | * The physical (disk) commit protocol is handled mostly by |
26 | * BBPsync. Once a commit succeeded, there is the task of removing |
27 | * ex-persistent bats (those that still were persistent in the |
28 | * previous commit, but were made transient in this transaction). |
29 | * Notice that such ex- (i.e. non-) persistent bats are not backed up |
30 | * by the BBPsync protocol, so we cannot start deleting until we know |
31 | * the commit will succeed. |
32 | * |
33 | * Another hairy issue is the delta statuses in BATs. These provide a |
34 | * fast way to perform a transaction abort (HOT-abort, instead of |
35 | * COLD-abort, which is achieved by the BBP recovery in a database |
36 | * restart). Hot-abort functionality has not been important in MonetDB |
37 | * for now, so it is not well-tested. The problem here is that if a |
38 | * commit fails in the physical part (BBPsync), we do not have sufficient |
39 | * information to roll back the delta statuses. |
40 | * |
41 | * So a 'feature' of the abort is that after a failed commit, |
42 | * in-memory we *will* commit the transaction. Subsequent commits can |
43 | * retry to achieve a physical commit. The only way to abort in such a |
44 | * situation is COLD-abort: quit the server and restart, so you get |
45 | * the recovered disk images. |
46 | */ |
47 | /* in the commit prelude, the delta status in the memory image of all |
48 | * bats is commited */ |
49 | static gdk_return |
50 | prelude(int cnt, bat *subcommit) |
51 | { |
52 | int i = 0; |
53 | |
54 | while (++i < cnt) { |
55 | bat bid = subcommit ? subcommit[i] : i; |
56 | |
57 | if (BBP_status(bid) & BBPPERSISTENT) { |
58 | BAT *b = BBP_cache(bid); |
59 | |
60 | if (b == NULL && (BBP_status(bid) & BBPSWAPPED)) { |
61 | b = BBPquickdesc(bid, true); |
62 | if (b == NULL) |
63 | return GDK_FAIL; |
64 | } |
65 | if (b) { |
66 | assert(!isVIEW(b)); |
67 | assert(b->batRole == PERSISTENT); |
68 | BATcommit(b); |
69 | } |
70 | } |
71 | } |
72 | return GDK_SUCCEED; |
73 | } |
74 | |
75 | /* in the commit epilogue, the BBP-status of the bats is changed to |
76 | * reflect their presence in the succeeded checkpoint. Also bats from |
77 | * the previous checkpoint that were deleted now are physically |
78 | * destroyed. |
79 | */ |
80 | static void |
81 | epilogue(int cnt, bat *subcommit) |
82 | { |
83 | int i = 0; |
84 | |
85 | while (++i < cnt) { |
86 | bat bid = subcommit ? subcommit[i] : i; |
87 | |
88 | if (BBP_status(bid) & BBPPERSISTENT) { |
89 | BBP_status_on(bid, BBPEXISTING, subcommit ? "TMsubcommit" : "TMcommit" ); |
90 | } else if (BBP_status(bid) & BBPDELETED) { |
91 | /* check mmap modes of bats that are now |
92 | * transient. this has to be done after the |
93 | * commit succeeded, because the mmap modes |
94 | * allowed on transient bats would be |
95 | * dangerous on persistent bats. If the commit |
96 | * failed, the already processed bats that |
97 | * would become transient after the commit, |
98 | * but didn't due to the failure, would be a |
99 | * consistency risk. |
100 | */ |
101 | BAT *b = BBP_cache(bid); |
102 | if (b) { |
103 | /* check mmap modes */ |
104 | if (BATcheckmodes(b, true) != GDK_SUCCEED) |
105 | fprintf(stderr, "#epilogue: BATcheckmodes failed\n" ); |
106 | } |
107 | } |
108 | if ((BBP_status(bid) & BBPDELETED) && BBP_refs(bid) <= 0 && BBP_lrefs(bid) <= 0) { |
109 | BAT *b = BBPquickdesc(bid, true); |
110 | |
111 | /* the unloaded ones are deleted without |
112 | * loading deleted disk images */ |
113 | if (b) { |
114 | BATdelete(b); |
115 | if (BBP_cache(bid)) { |
116 | /* those that quickdesc |
117 | * decides to load => free |
118 | * memory */ |
119 | BATfree(b); |
120 | } |
121 | } |
122 | BBPclear(bid); /* clear with locking */ |
123 | } |
124 | BBP_status_off(bid, BBPDELETED | BBPSWAPPED | BBPNEW, subcommit ? "TMsubcommit" : "TMcommit" ); |
125 | } |
126 | GDKclrerr(); |
127 | } |
128 | |
129 | /* |
130 | * @- TMcommit |
131 | * global commit without any multi-threaded access assumptions, thus |
132 | * taking all BBP locks. It creates a new database checkpoint. |
133 | */ |
134 | gdk_return |
135 | TMcommit(void) |
136 | { |
137 | gdk_return ret = GDK_FAIL; |
138 | |
139 | /* commit with the BBP globally locked */ |
140 | BBPlock(); |
141 | if (prelude(getBBPsize(), NULL) == GDK_SUCCEED && |
142 | BBPsync(getBBPsize(), NULL) == GDK_SUCCEED) { |
143 | epilogue(getBBPsize(), NULL); |
144 | ret = GDK_SUCCEED; |
145 | } |
146 | BBPunlock(); |
147 | return ret; |
148 | } |
149 | |
150 | /* |
151 | * @- TMsubcommit |
152 | * |
153 | * Create a new checkpoint that is equal to the previous, with the |
154 | * exception that for the passed list of batnames, the current state |
155 | * will be reflected in the new checkpoint. |
156 | * |
157 | * On the bats in this list we assume exclusive access during the |
158 | * operation. |
159 | * |
160 | * This operation is useful for e.g. adding a new XQuery document or |
161 | * SQL table to the committed state (after bulk-load). Or for dropping |
162 | * a table or doc, without forcing the total database to be clean, |
163 | * which may require a lot of I/O. |
164 | * |
165 | * We expect the globally locked phase (BBPsync) to take little time |
166 | * (<100ms) as only the BBP.dir is written out; and for the existing |
167 | * bats that were modified, only some heap moves are done (moved from |
168 | * BAKDIR to SUBDIR). The atomic commit for sub-commit is the rename |
169 | * of SUBDIR to DELDIR. |
170 | * |
171 | * As it does not take the BBP-locks (thanks to the assumption that |
172 | * access is exclusive), the concurrency impact of subcommit is also |
173 | * much lighter to ongoing concurrent query and update facilities than |
174 | * a real global TMcommit. |
175 | */ |
176 | gdk_return |
177 | TMsubcommit_list(bat *subcommit, int cnt) |
178 | { |
179 | int xx; |
180 | gdk_return ret = GDK_FAIL; |
181 | |
182 | assert(cnt > 0); |
183 | assert(subcommit[0] == 0); /* BBP artifact: slot 0 in the array will be ignored */ |
184 | |
185 | if (GDKinmemory()) |
186 | return GDK_SUCCEED; |
187 | |
188 | /* sort the list on BAT id */ |
189 | GDKqsort(subcommit + 1, NULL, NULL, cnt - 1, sizeof(bat), 0, TYPE_bat, false, false); |
190 | |
191 | assert(cnt == 1 || subcommit[1] > 0); /* all values > 0 */ |
192 | /* de-duplication of BAT ids in subcommit list |
193 | * this is needed because of legacy reasons (database |
194 | * upgrade) */ |
195 | for (xx = 2; xx < cnt; xx++) { |
196 | if (subcommit[xx-1] == subcommit[xx]) { |
197 | int i; |
198 | cnt--; |
199 | for (i = xx; i < cnt; i++) |
200 | subcommit[i] = subcommit[i+1]; |
201 | } |
202 | } |
203 | if (prelude(cnt, subcommit) == GDK_SUCCEED) { /* save the new bats outside the lock */ |
204 | /* lock just prevents BBPtrims, and other global |
205 | * (sub-)commits */ |
206 | for (xx = 0; xx <= BBP_THREADMASK; xx++) |
207 | MT_lock_set(&GDKtrimLock(xx)); |
208 | if (BBPsync(cnt, subcommit) == GDK_SUCCEED) { /* write BBP.dir (++) */ |
209 | epilogue(cnt, subcommit); |
210 | ret = GDK_SUCCEED; |
211 | } |
212 | for (xx = BBP_THREADMASK; xx >= 0; xx--) |
213 | MT_lock_unset(&GDKtrimLock(xx)); |
214 | } |
215 | return ret; |
216 | } |
217 | |
218 | gdk_return |
219 | TMsubcommit(BAT *b) |
220 | { |
221 | int cnt = 1; |
222 | gdk_return ret = GDK_FAIL; |
223 | bat *subcommit; |
224 | BUN p, q; |
225 | BATiter bi = bat_iterator(b); |
226 | |
227 | subcommit = GDKmalloc((BATcount(b) + 1) * sizeof(bat)); |
228 | if (subcommit == NULL) |
229 | return GDK_FAIL; |
230 | |
231 | subcommit[0] = 0; /* BBP artifact: slot 0 in the array will be ignored */ |
232 | /* collect the list and save the new bats outside any |
233 | * locking */ |
234 | BATloop(b, p, q) { |
235 | bat bid = BBPindex((str) BUNtvar(bi, p)); |
236 | |
237 | if (bid) |
238 | subcommit[cnt++] = bid; |
239 | } |
240 | |
241 | ret = TMsubcommit_list(subcommit, cnt); |
242 | GDKfree(subcommit); |
243 | return ret; |
244 | } |
245 | |
246 | /* |
247 | * @- TMabort |
248 | * Transaction abort is cheap. We use the delta statuses to go back to |
249 | * the previous version of each BAT. Also for BATs that are currently |
250 | * swapped out. Persistent BATs that were made transient in this |
251 | * transaction become persistent again. |
252 | */ |
253 | void |
254 | TMabort(void) |
255 | { |
256 | int i; |
257 | |
258 | BBPlock(); |
259 | for (i = 1; i < getBBPsize(); i++) { |
260 | if (BBP_status(i) & BBPNEW) { |
261 | BAT *b = BBPquickdesc(i, false); |
262 | |
263 | if (b) { |
264 | if (!b->batTransient) |
265 | BBPrelease(i); |
266 | b->batTransient = true; |
267 | b->batDirtydesc = true; |
268 | } |
269 | } |
270 | } |
271 | for (i = 1; i < getBBPsize(); i++) { |
272 | if (BBP_status(i) & (BBPPERSISTENT | BBPDELETED | BBPSWAPPED)) { |
273 | BAT *b = BBPquickdesc(i, true); |
274 | |
275 | if (b == NULL) |
276 | continue; |
277 | |
278 | BBPfix(i); |
279 | if (BATdirty(b) || DELTAdirty(b)) { |
280 | /* BUN move-backes need a real BAT! */ |
281 | /* Stefan: |
282 | * Actually, in case DELTAdirty(b), |
283 | * i.e., a BAT with differences that |
284 | * is saved/swapped-out but not yet |
285 | * committed, we (AFAIK) don't have to |
286 | * load the BAT and apply the undo, |
287 | * but rather could simply discard the |
288 | * delta and revive the backup; |
289 | * however, I don't know how to do |
290 | * this (yet), hence we stick with |
291 | * this solution for the time being |
292 | * --- it should be correct though it |
293 | * might not be the most efficient |
294 | * way... |
295 | */ |
296 | b = BBPdescriptor(i); |
297 | BATundo(b); |
298 | } |
299 | if (BBP_status(i) & BBPDELETED) { |
300 | BBP_status_on(i, BBPEXISTING, "TMabort" ); |
301 | if (b->batTransient) |
302 | BBPretain(i); |
303 | b->batTransient = false; |
304 | b->batDirtydesc = true; |
305 | } |
306 | BBPunfix(i); |
307 | } |
308 | BBP_status_off(i, BBPDELETED | BBPSWAPPED | BBPNEW, "TMabort" ); |
309 | } |
310 | BBPunlock(); |
311 | GDKclrerr(); |
312 | } |
313 | |