1 | /* |
2 | * Live block commit |
3 | * |
4 | * Copyright Red Hat, Inc. 2012 |
5 | * |
6 | * Authors: |
7 | * Jeff Cody <jcody@redhat.com> |
8 | * Based on stream.c by Stefan Hajnoczi |
9 | * |
10 | * This work is licensed under the terms of the GNU LGPL, version 2 or later. |
11 | * See the COPYING.LIB file in the top-level directory. |
12 | * |
13 | */ |
14 | |
15 | #include "qemu/osdep.h" |
16 | #include "qemu/cutils.h" |
17 | #include "trace.h" |
18 | #include "block/block_int.h" |
19 | #include "block/blockjob_int.h" |
20 | #include "qapi/error.h" |
21 | #include "qapi/qmp/qerror.h" |
22 | #include "qemu/ratelimit.h" |
23 | #include "sysemu/block-backend.h" |
24 | |
enum {
    /*
     * Size of data buffer for populating the image file. This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     *
     * Used as the per-iteration copy granularity in commit_run().
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};
33 | |
/* State of a single live commit block job */
typedef struct CommitBlockJob {
    BlockJob common;
    /* Filter node inserted above 'top' by commit_start() */
    BlockDriverState *commit_top_bs;
    /* BlockBackend used to read the data that gets committed */
    BlockBackend *top;
    /* BlockBackend used to write into the base image; dropped early in
     * commit_prepare() to release its WRITE/RESIZE permissions */
    BlockBackend *base;
    /* The node wrapped by 'base'; still needed after blk_unref(base) */
    BlockDriverState *base_bs;
    BlockdevOnError on_error;
    /* true if base was read-only before the job and must be restored
     * to read-only in commit_clean() */
    bool base_read_only;
    /* true while the commit_top_bs..base_bs chain is frozen */
    bool chain_frozen;
    /* Passed through to bdrv_drop_intermediate() on completion */
    char *backing_file_str;
} CommitBlockJob;
45 | |
46 | static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base, |
47 | int64_t offset, uint64_t bytes, |
48 | void *buf) |
49 | { |
50 | int ret = 0; |
51 | |
52 | assert(bytes < SIZE_MAX); |
53 | |
54 | ret = blk_co_pread(bs, offset, bytes, buf, 0); |
55 | if (ret < 0) { |
56 | return ret; |
57 | } |
58 | |
59 | ret = blk_co_pwrite(base, offset, bytes, buf, 0); |
60 | if (ret < 0) { |
61 | return ret; |
62 | } |
63 | |
64 | return 0; |
65 | } |
66 | |
/*
 * Job .prepare callback, run on successful completion: drop the now
 * redundant intermediate nodes [commit_top_bs, base_bs) from the chain.
 */
static int commit_prepare(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    /* The chain must be modifiable again for bdrv_drop_intermediate() */
    bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    s->chain_frozen = false;

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);
    s->base = NULL;

    /* FIXME: bdrv_drop_intermediate treats total failures and partial failures
     * identically. Further work is needed to disambiguate these cases. */
    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
                                  s->backing_file_str);
}
84 | |
/*
 * Job .abort callback: undo the graph manipulation done in commit_start()
 * after the job has failed or been cancelled.
 */
static void commit_abort(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    BlockDriverState *top_bs = blk_bs(s->top);

    /* commit_prepare() may already have unfrozen the chain */
    if (s->chain_frozen) {
        bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    }

    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
    bdrv_ref(top_bs);
    bdrv_ref(s->commit_top_bs);

    /* Drop the base BlockBackend if commit_prepare() didn't already */
    if (s->base) {
        blk_unref(s->base);
    }

    /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
     * can succeed */
    block_job_remove_all_bdrv(&s->common);

    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
     * commit filter driver from the backing chain now. Do this as the final
     * step so that the 'consistent read' permission can be granted.
     *
     * XXX Can (or should) we somehow keep 'consistent read' blocked even
     * after the failed/cancelled commit job is gone? If we already wrote
     * something to base, the intermediate images aren't valid any more. */
    bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs),
                      &error_abort);

    /* Release the references taken above */
    bdrv_unref(s->commit_top_bs);
    bdrv_unref(top_bs);
}
119 | |
/*
 * Job .clean callback: free the remaining job resources and, if
 * commit_start() switched base to read-write, switch it back.
 */
static void commit_clean(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    /* restore base open flags here if appropriate (e.g., change the base back
     * to r/o). These reopens do not need to be atomic, since we won't abort
     * even on failure here */
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(s->base_bs, true, NULL);
    }

    g_free(s->backing_file_str);
    blk_unref(s->top);
}
134 | |
/*
 * Main loop of the commit job, run in a coroutine: copy every region of
 * 'top' that is allocated above 'base' down into 'base'.
 *
 * Returns 0 on success or cancellation, a negative errno on error.
 */
static int coroutine_fn commit_run(Job *job, Error **errp)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    int64_t offset;
    uint64_t delay_ns = 0;
    int ret = 0;
    int64_t n = 0; /* bytes */
    void *buf = NULL;
    int bytes_written = 0;
    int64_t len, base_len;

    ret = len = blk_getlength(s->top);
    if (len < 0) {
        goto out;
    }
    job_progress_set_remaining(&s->common.job, len);

    ret = base_len = blk_getlength(s->base);
    if (base_len < 0) {
        goto out;
    }

    /* Grow base so that all of top's data fits into it */
    if (base_len < len) {
        ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
        if (ret) {
            goto out;
        }
    }

    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (offset = 0; offset < len; offset += n) {
        bool copy;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        job_sleep_ns(&s->common.job, delay_ns);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }
        /* Copy if allocated above the base */
        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false,
                                      offset, COMMIT_BUFFER_SIZE, &n);
        copy = (ret == 1);
        trace_commit_one_iteration(s, offset, n, ret);
        if (copy) {
            ret = commit_populate(s->top, s->base, offset, n, buf);
            bytes_written += n;
        }
        if (ret < 0) {
            /* Honour the configured on-error policy.  Any action other
             * than REPORT retries the same offset (n = 0 keeps the loop
             * variable in place). */
            BlockErrorAction action =
                block_job_error_action(&s->common, false, s->on_error, -ret);
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                goto out;
            } else {
                n = 0;
                continue;
            }
        }
        /* Publish progress */
        job_progress_update(&s->common.job, n);

        /* Delay the next iteration only if data was actually copied */
        if (copy) {
            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
        } else {
            delay_ns = 0;
        }
    }

    ret = 0;

out:
    qemu_vfree(buf);

    return ret;
}
212 | |
/* Driver for the live commit job; callbacks are defined above */
static const BlockJobDriver commit_job_driver = {
    .job_driver = {
        .instance_size = sizeof(CommitBlockJob),
        .job_type      = JOB_TYPE_COMMIT,
        .free          = block_job_free,
        .user_resume   = block_job_user_resume,
        .drain         = block_job_drain,
        .run           = commit_run,
        .prepare       = commit_prepare,
        .abort         = commit_abort,
        .clean         = commit_clean
    },
};
226 | |
227 | static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs, |
228 | uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) |
229 | { |
230 | return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags); |
231 | } |
232 | |
233 | static void bdrv_commit_top_refresh_filename(BlockDriverState *bs) |
234 | { |
235 | pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), |
236 | bs->backing->bs->filename); |
237 | } |
238 | |
239 | static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c, |
240 | const BdrvChildRole *role, |
241 | BlockReopenQueue *reopen_queue, |
242 | uint64_t perm, uint64_t shared, |
243 | uint64_t *nperm, uint64_t *nshared) |
244 | { |
245 | *nperm = 0; |
246 | *nshared = BLK_PERM_ALL; |
247 | } |
248 | |
/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain.
 * Shared by the live commit job (commit_start()) and the synchronous
 * bdrv_commit() below. */
static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top" ,
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,
};
258 | |
/*
 * Start a live commit job: copy the data of the nodes between 'top'
 * (inclusive) and 'base' (exclusive) into 'base', then drop those
 * intermediate nodes from the backing chain of 'bs'.
 *
 * On failure, *errp is set and no job is started; all setup done so far
 * is rolled back in the 'fail' path below.
 */
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top,
                  int creation_flags, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockDriverState *iter;
    BlockDriverState *commit_top_bs = NULL;
    Error *local_err = NULL;
    int ret;

    assert(top != bs);
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same" );
        return;
    }

    s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
                         speed, creation_flags, NULL, NULL, errp);
    if (!s) {
        return;
    }

    /* convert base to r/w, if necessary */
    s->base_read_only = bdrv_is_read_only(base);
    if (s->base_read_only) {
        if (bdrv_reopen_set_read_only(base, false, errp) != 0) {
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    if (!filter_node_name) {
        /* No user-supplied node name: hide the filter from the user */
        commit_top_bs->implicit = true;
    }

    /* So that we can always drop this node */
    commit_top_bs->never_freeze = true;

    commit_top_bs->total_sectors = top->total_sectors;

    bdrv_append(commit_top_bs, top, &local_err);
    if (local_err) {
        /* Node was not appended; skip the bdrv_replace_node() in 'fail' */
        commit_top_bs = NULL;
        error_propagate(errp, local_err);
        goto fail;
    }

    s->commit_top_bs = commit_top_bs;

    /* Block all nodes between top and base, because they will
     * disappear from the chain after this operation. */
    assert(bdrv_chain_contains(top, base));
    for (iter = top; iter != base; iter = backing_bs(iter)) {
        /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
         * at s->base (if writes are blocked for a node, they are also blocked
         * for its backing file). The other options would be a second filter
         * driver above s->base. */
        ret = block_job_add_bdrv(&s->common, "intermediate node" , iter, 0,
                                 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                 errp);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Keep the chain stable while the job runs; unfrozen again in
     * commit_prepare() / commit_abort() */
    if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
        goto fail;
    }
    s->chain_frozen = true;

    ret = block_job_add_bdrv(&s->common, "base" , base, 0, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

    /* WRITE and RESIZE are needed for the copy loop and the blk_truncate()
     * call in commit_run() */
    s->base = blk_new(s->common.job.aio_context,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE
                      | BLK_PERM_RESIZE,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_GRAPH_MOD
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->base, true);
    s->base_bs = base;

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->top, true);

    s->backing_file_str = g_strdup(backing_file_str);
    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    job_start(&s->common.job);
    return;

fail:
    /* Roll back the setup above, in reverse order */
    if (s->chain_frozen) {
        bdrv_unfreeze_backing_chain(commit_top_bs, base);
    }
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(base, true, NULL);
    }
    job_early_fail(&s->common.job);
    /* commit_top_bs has to be replaced after deleting the block job,
     * otherwise this would fail because of lack of permissions. */
    if (commit_top_bs) {
        bdrv_replace_node(commit_top_bs, top, &error_abort);
    }
}
391 | |
392 | |
/* Copy granularity of the synchronous bdrv_commit() below */
#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)

/* commit COW file into the raw image */
/*
 * Synchronously commit all data of 'bs' into its backing file.
 *
 * Returns 0 on success; -ENOMEDIUM if bs has no driver, -ENOTSUP if it has
 * no backing file, -EBUSY if either node has the commit operation blocked,
 * -EACCES if a read-only backing file cannot be reopened read-write, or
 * another negative errno on I/O failure.
 */
int bdrv_commit(BlockDriverState *bs)
{
    BlockBackend *src, *backing;
    BlockDriverState *backing_file_bs = NULL;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriver *drv = bs->drv;
    AioContext *ctx;
    int64_t offset, length, backing_length;
    int ro;
    int64_t n;
    int ret = 0;
    uint8_t *buf = NULL;
    Error *local_err = NULL;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
        return -EBUSY;
    }

    /* Remember whether the backing file must be restored to read-only */
    ro = bs->backing->bs->read_only;

    if (ro) {
        if (bdrv_reopen_set_read_only(bs->backing->bs, false, NULL)) {
            return -EACCES;
        }
    }

    ctx = bdrv_get_aio_context(bs);
    src = blk_new(ctx, BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
    backing = blk_new(ctx, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);

    ret = blk_insert_bs(src, bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    /* Insert commit_top block node above backing, so we can write to it */
    backing_file_bs = backing_bs(bs);

    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
                                         &local_err);
    if (commit_top_bs == NULL) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
    bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);

    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    length = blk_getlength(src);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = blk_getlength(backing);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible. If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = blk_truncate(backing, length, PREALLOC_MODE_OFF, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            goto ro_cleanup;
        }
    }

    /* blk_try_blockalign() for src will choose an alignment that works for
     * backing as well, so no need to compare the alignment manually. */
    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Copy every allocated chunk of the top image down into backing */
    for (offset = 0; offset < length; offset += n) {
        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = blk_pread(src, offset, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = blk_pwrite(backing, offset, buf, n, 0);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    /* All data is now in backing; empty the top image if the format
     * supports it */
    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        blk_flush(src);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    blk_flush(backing);

    ret = 0;
ro_cleanup:
    /* Shared cleanup for success and error paths: restore the original
     * graph and release temporary references */
    qemu_vfree(buf);

    blk_unref(backing);
    if (backing_file_bs) {
        bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
    }
    bdrv_unref(commit_top_bs);
    blk_unref(src);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen_set_read_only(bs->backing->bs, true, NULL);
    }

    return ret;
}
540 | |