1 | /* |
2 | * QEMU live block migration |
3 | * |
4 | * Copyright IBM, Corp. 2009 |
5 | * |
6 | * Authors: |
7 | * Liran Schour <lirans@il.ibm.com> |
8 | * |
9 | * This work is licensed under the terms of the GNU GPL, version 2. See |
10 | * the COPYING file in the top-level directory. |
11 | * |
12 | * Contributions after 2012-01-13 are licensed under the terms of the |
13 | * GNU GPL, version 2 or (at your option) any later version. |
14 | */ |
15 | |
16 | #include "qemu/osdep.h" |
17 | #include "qapi/error.h" |
18 | #include "qemu/error-report.h" |
19 | #include "qemu/main-loop.h" |
20 | #include "qemu/cutils.h" |
21 | #include "qemu/queue.h" |
22 | #include "block.h" |
23 | #include "migration/misc.h" |
24 | #include "migration.h" |
25 | #include "migration/register.h" |
26 | #include "qemu-file.h" |
27 | #include "migration/vmstate.h" |
28 | #include "sysemu/block-backend.h" |
29 | |
30 | #define BLOCK_SIZE (1 << 20) |
31 | #define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) |
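/* Block migration moves disk data in 1 MiB chunks; BDRV_SECTORS_PER_DIRTY_CHUNK
* is that same chunk expressed in 512-byte sectors, the granularity used by the
* dirty and in-flight bitmaps below.
*/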
32 | |
33 | #define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 |
34 | #define BLK_MIG_FLAG_EOS 0x02 |
35 | #define BLK_MIG_FLAG_PROGRESS 0x04 |
36 | #define BLK_MIG_FLAG_ZERO_BLOCK 0x08 |
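/* On the wire every chunk starts with a big-endian 64-bit word holding
* (sector << BDRV_SECTOR_BITS) | flags.  A DEVICE_BLOCK chunk is followed by a
* one-byte device name length, the name itself and, unless ZERO_BLOCK is set,
* BLOCK_SIZE bytes of data.  A PROGRESS chunk carries a percentage in place of
* the sector number, and EOS terminates a section.
*/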
37 | |
38 | #define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE) |
39 | |
40 | #define MAX_IO_BUFFERS 512 |
41 | #define MAX_PARALLEL_IO 16 |
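/* Read-ahead throttling: at most MAX_PARALLEL_IO asynchronous reads may be in
* flight at once, and at most MAX_IO_BUFFERS chunks may be buffered in total
* (in flight plus read but not yet written to the stream).  See the
* rate-control loop in block_save_iterate().
*/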
42 | |
43 | //#define DEBUG_BLK_MIGRATION |
44 | |
45 | #ifdef DEBUG_BLK_MIGRATION |
46 | #define DPRINTF(fmt, ...) \ |
47 | do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0) |
48 | #else |
49 | #define DPRINTF(fmt, ...) \ |
50 | do { } while (0) |
51 | #endif |
52 | |
53 | typedef struct BlkMigDevState { |
54 | /* Written during setup phase. Can be read without a lock. */ |
55 | BlockBackend *blk; |
56 | char *blk_name; |
57 | int shared_base; |
58 | int64_t total_sectors; |
59 | QSIMPLEQ_ENTRY(BlkMigDevState) entry; |
60 | Error *blocker; |
61 | |
62 | /* Only used by migration thread. Does not need a lock. */ |
63 | int bulk_completed; |
64 | int64_t cur_sector; |
65 | int64_t cur_dirty; |
66 | |
/* Data in the aio_bitmap is protected by the block migration lock.
68 | * Allocation and free happen during setup and cleanup respectively. |
69 | */ |
70 | unsigned long *aio_bitmap; |
71 | |
72 | /* Protected by block migration lock. */ |
73 | int64_t completed_sectors; |
74 | |
/* During migration this is protected by the iothread lock / AioContext.
76 | * Allocation and free happen during setup and cleanup respectively. |
77 | */ |
78 | BdrvDirtyBitmap *dirty_bitmap; |
79 | } BlkMigDevState; |
80 | |
81 | typedef struct BlkMigBlock { |
82 | /* Only used by migration thread. */ |
83 | uint8_t *buf; |
84 | BlkMigDevState *bmds; |
85 | int64_t sector; |
86 | int nr_sectors; |
87 | QEMUIOVector qiov; |
88 | BlockAIOCB *aiocb; |
89 | |
90 | /* Protected by block migration lock. */ |
91 | int ret; |
92 | QSIMPLEQ_ENTRY(BlkMigBlock) entry; |
93 | } BlkMigBlock; |
94 | |
95 | typedef struct BlkMigState { |
96 | QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list; |
97 | int64_t total_sector_sum; |
98 | bool zero_blocks; |
99 | |
100 | /* Protected by lock. */ |
101 | QSIMPLEQ_HEAD(, BlkMigBlock) blk_list; |
102 | int submitted; |
103 | int read_done; |
104 | |
105 | /* Only used by migration thread. Does not need a lock. */ |
106 | int transferred; |
107 | int prev_progress; |
108 | int bulk_completed; |
109 | |
110 | /* Lock must be taken _inside_ the iothread lock and any AioContexts. */ |
111 | QemuMutex lock; |
112 | } BlkMigState; |
113 | |
114 | static BlkMigState block_mig_state; |
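/* Single global state, shared between the migration thread and the AIO
* completion callbacks; the helpers below serialise access to the fields
* documented above as protected by the lock.
*/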
115 | |
116 | static void blk_mig_lock(void) |
117 | { |
118 | qemu_mutex_lock(&block_mig_state.lock); |
119 | } |
120 | |
121 | static void blk_mig_unlock(void) |
122 | { |
123 | qemu_mutex_unlock(&block_mig_state.lock); |
124 | } |
125 | |
126 | /* Must run outside of the iothread lock during the bulk phase, |
127 | * or the VM will stall. |
128 | */ |
129 | |
130 | static void blk_send(QEMUFile *f, BlkMigBlock * blk) |
131 | { |
132 | int len; |
133 | uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK; |
134 | |
135 | if (block_mig_state.zero_blocks && |
136 | buffer_is_zero(blk->buf, BLOCK_SIZE)) { |
137 | flags |= BLK_MIG_FLAG_ZERO_BLOCK; |
138 | } |
139 | |
140 | /* sector number and flags */ |
141 | qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS) |
142 | | flags); |
143 | |
144 | /* device name */ |
145 | len = strlen(blk->bmds->blk_name); |
146 | qemu_put_byte(f, len); |
147 | qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len); |
148 | |
/* If a block is zero we need to flush here since the network
* bandwidth is now a lot higher than the storage device bandwidth.
* Thus if we queue zero blocks we slow down the migration. */
152 | if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { |
153 | qemu_fflush(f); |
154 | return; |
155 | } |
156 | |
157 | qemu_put_buffer(f, blk->buf, BLOCK_SIZE); |
158 | } |
159 | |
160 | int blk_mig_active(void) |
161 | { |
162 | return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list); |
163 | } |
164 | |
165 | int blk_mig_bulk_active(void) |
166 | { |
167 | return blk_mig_active() && !block_mig_state.bulk_completed; |
168 | } |
169 | |
170 | uint64_t blk_mig_bytes_transferred(void) |
171 | { |
172 | BlkMigDevState *bmds; |
173 | uint64_t sum = 0; |
174 | |
175 | blk_mig_lock(); |
176 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
177 | sum += bmds->completed_sectors; |
178 | } |
179 | blk_mig_unlock(); |
180 | return sum << BDRV_SECTOR_BITS; |
181 | } |
182 | |
183 | uint64_t blk_mig_bytes_remaining(void) |
184 | { |
185 | return blk_mig_bytes_total() - blk_mig_bytes_transferred(); |
186 | } |
187 | |
188 | uint64_t blk_mig_bytes_total(void) |
189 | { |
190 | BlkMigDevState *bmds; |
191 | uint64_t sum = 0; |
192 | |
193 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
194 | sum += bmds->total_sectors; |
195 | } |
196 | return sum << BDRV_SECTOR_BITS; |
197 | } |
198 | |
199 | |
200 | /* Called with migration lock held. */ |
201 | |
202 | static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) |
203 | { |
204 | int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; |
205 | |
206 | if (sector < blk_nb_sectors(bmds->blk)) { |
207 | return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & |
208 | (1UL << (chunk % (sizeof(unsigned long) * 8)))); |
209 | } else { |
210 | return 0; |
211 | } |
212 | } |
213 | |
214 | /* Called with migration lock held. */ |
215 | |
216 | static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, |
217 | int nb_sectors, int set) |
218 | { |
219 | int64_t start, end; |
220 | unsigned long val, idx, bit; |
221 | |
222 | start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; |
223 | end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; |
224 | |
225 | for (; start <= end; start++) { |
226 | idx = start / (sizeof(unsigned long) * 8); |
227 | bit = start % (sizeof(unsigned long) * 8); |
228 | val = bmds->aio_bitmap[idx]; |
229 | if (set) { |
230 | val |= 1UL << bit; |
231 | } else { |
232 | val &= ~(1UL << bit); |
233 | } |
234 | bmds->aio_bitmap[idx] = val; |
235 | } |
236 | } |
237 | |
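/* One bit per dirty chunk, rounded up to whole bytes; the bitmap records which
* chunks currently have an asynchronous read in flight.
*/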
238 | static void alloc_aio_bitmap(BlkMigDevState *bmds) |
239 | { |
240 | BlockBackend *bb = bmds->blk; |
241 | int64_t bitmap_size; |
242 | |
243 | bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; |
244 | bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; |
245 | |
246 | bmds->aio_bitmap = g_malloc0(bitmap_size); |
247 | } |
248 | |
249 | /* Never hold migration lock when yielding to the main loop! */ |
250 | |
251 | static void blk_mig_read_cb(void *opaque, int ret) |
252 | { |
253 | BlkMigBlock *blk = opaque; |
254 | |
255 | blk_mig_lock(); |
256 | blk->ret = ret; |
257 | |
258 | QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); |
259 | bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0); |
260 | |
261 | block_mig_state.submitted--; |
262 | block_mig_state.read_done++; |
263 | assert(block_mig_state.submitted >= 0); |
264 | blk_mig_unlock(); |
265 | } |
266 | |
267 | /* Called with no lock taken. */ |
268 | |
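/* Queue an asynchronous read for the next bulk chunk of this device.  Returns
* 1 once the device's bulk phase is finished, 0 otherwise.
*/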
269 | static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) |
270 | { |
271 | int64_t total_sectors = bmds->total_sectors; |
272 | int64_t cur_sector = bmds->cur_sector; |
273 | BlockBackend *bb = bmds->blk; |
274 | BlkMigBlock *blk; |
275 | int nr_sectors; |
276 | int64_t count; |
277 | |
278 | if (bmds->shared_base) { |
279 | qemu_mutex_lock_iothread(); |
280 | aio_context_acquire(blk_get_aio_context(bb)); |
281 | /* Skip unallocated sectors; intentionally treats failure or |
282 | * partial sector as an allocated sector */ |
283 | while (cur_sector < total_sectors && |
284 | !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE, |
285 | MAX_IS_ALLOCATED_SEARCH, &count)) { |
286 | if (count < BDRV_SECTOR_SIZE) { |
287 | break; |
288 | } |
289 | cur_sector += count >> BDRV_SECTOR_BITS; |
290 | } |
291 | aio_context_release(blk_get_aio_context(bb)); |
292 | qemu_mutex_unlock_iothread(); |
293 | } |
294 | |
295 | if (cur_sector >= total_sectors) { |
296 | bmds->cur_sector = bmds->completed_sectors = total_sectors; |
297 | return 1; |
298 | } |
299 | |
300 | bmds->completed_sectors = cur_sector; |
301 | |
302 | cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1); |
303 | |
304 | /* we are going to transfer a full block even if it is not allocated */ |
305 | nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; |
306 | |
307 | if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { |
308 | nr_sectors = total_sectors - cur_sector; |
309 | } |
310 | |
311 | blk = g_new(BlkMigBlock, 1); |
312 | blk->buf = g_malloc(BLOCK_SIZE); |
313 | blk->bmds = bmds; |
314 | blk->sector = cur_sector; |
315 | blk->nr_sectors = nr_sectors; |
316 | |
317 | qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE); |
318 | |
319 | blk_mig_lock(); |
320 | block_mig_state.submitted++; |
321 | blk_mig_unlock(); |
322 | |
323 | /* We do not know if bs is under the main thread (and thus does |
324 | * not acquire the AioContext when doing AIO) or rather under |
325 | * dataplane. Thus acquire both the iothread mutex and the |
326 | * AioContext. |
327 | * |
328 | * This is ugly and will disappear when we make bdrv_* thread-safe, |
329 | * without the need to acquire the AioContext. |
330 | */ |
331 | qemu_mutex_lock_iothread(); |
332 | aio_context_acquire(blk_get_aio_context(bmds->blk)); |
333 | bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE, |
334 | nr_sectors * BDRV_SECTOR_SIZE); |
335 | blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov, |
336 | 0, blk_mig_read_cb, blk); |
337 | aio_context_release(blk_get_aio_context(bmds->blk)); |
338 | qemu_mutex_unlock_iothread(); |
339 | |
340 | bmds->cur_sector = cur_sector + nr_sectors; |
341 | return (bmds->cur_sector >= total_sectors); |
342 | } |
343 | |
344 | /* Called with iothread lock taken. */ |
345 | |
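/* Create one dirty bitmap per device with BLOCK_SIZE (1 MiB) granularity, so
* that writes landing after the bulk copy can be resent chunk by chunk during
* the dirty phase.
*/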
346 | static int set_dirty_tracking(void) |
347 | { |
348 | BlkMigDevState *bmds; |
349 | int ret; |
350 | |
351 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
352 | bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk), |
353 | BLOCK_SIZE, NULL, NULL); |
354 | if (!bmds->dirty_bitmap) { |
355 | ret = -errno; |
356 | goto fail; |
357 | } |
358 | } |
359 | return 0; |
360 | |
361 | fail: |
362 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
363 | if (bmds->dirty_bitmap) { |
364 | bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); |
365 | } |
366 | } |
367 | return ret; |
368 | } |
369 | |
370 | /* Called with iothread lock taken. */ |
371 | |
372 | static void unset_dirty_tracking(void) |
373 | { |
374 | BlkMigDevState *bmds; |
375 | |
376 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
377 | bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap); |
378 | } |
379 | } |
380 | |
381 | static int init_blk_migration(QEMUFile *f) |
382 | { |
383 | BlockDriverState *bs; |
384 | BlkMigDevState *bmds; |
385 | int64_t sectors; |
386 | BdrvNextIterator it; |
387 | int i, num_bs = 0; |
388 | struct { |
389 | BlkMigDevState *bmds; |
390 | BlockDriverState *bs; |
391 | } *bmds_bs; |
392 | Error *local_err = NULL; |
393 | int ret; |
394 | |
395 | block_mig_state.submitted = 0; |
396 | block_mig_state.read_done = 0; |
397 | block_mig_state.transferred = 0; |
398 | block_mig_state.total_sector_sum = 0; |
399 | block_mig_state.prev_progress = -1; |
400 | block_mig_state.bulk_completed = 0; |
401 | block_mig_state.zero_blocks = migrate_zero_blocks(); |
402 | |
403 | for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { |
404 | num_bs++; |
405 | } |
406 | bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs)); |
407 | |
408 | for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) { |
409 | if (bdrv_is_read_only(bs)) { |
410 | continue; |
411 | } |
412 | |
413 | sectors = bdrv_nb_sectors(bs); |
414 | if (sectors <= 0) { |
415 | ret = sectors; |
416 | bdrv_next_cleanup(&it); |
417 | goto out; |
418 | } |
419 | |
420 | bmds = g_new0(BlkMigDevState, 1); |
421 | bmds->blk = blk_new(qemu_get_aio_context(), |
422 | BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL); |
423 | bmds->blk_name = g_strdup(bdrv_get_device_name(bs)); |
424 | bmds->bulk_completed = 0; |
425 | bmds->total_sectors = sectors; |
426 | bmds->completed_sectors = 0; |
427 | bmds->shared_base = migrate_use_block_incremental(); |
428 | |
429 | assert(i < num_bs); |
430 | bmds_bs[i].bmds = bmds; |
431 | bmds_bs[i].bs = bs; |
432 | |
433 | block_mig_state.total_sector_sum += sectors; |
434 | |
435 | if (bmds->shared_base) { |
DPRINTF("Start migration for %s with shared base image\n",
437 | bdrv_get_device_name(bs)); |
438 | } else { |
DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
440 | } |
441 | |
442 | QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry); |
443 | } |
444 | |
445 | /* Can only insert new BDSes now because doing so while iterating block |
446 | * devices may end up in a deadlock (iterating the new BDSes, too). */ |
447 | for (i = 0; i < num_bs; i++) { |
448 | BlkMigDevState *bmds = bmds_bs[i].bmds; |
449 | BlockDriverState *bs = bmds_bs[i].bs; |
450 | |
451 | if (bmds) { |
452 | ret = blk_insert_bs(bmds->blk, bs, &local_err); |
453 | if (ret < 0) { |
454 | error_report_err(local_err); |
455 | goto out; |
456 | } |
457 | |
458 | alloc_aio_bitmap(bmds); |
error_setg(&bmds->blocker, "block device is in use by migration");
460 | bdrv_op_block_all(bs, bmds->blocker); |
461 | } |
462 | } |
463 | |
464 | ret = 0; |
465 | out: |
466 | g_free(bmds_bs); |
467 | return ret; |
468 | } |
469 | |
470 | /* Called with no lock taken. */ |
471 | |
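/* Advance the bulk phase by one chunk on the first device that has not yet
* completed it, and emit a PROGRESS chunk whenever the overall percentage
* changes.  Returns 0 only once every device has finished its bulk phase,
* 1 otherwise.
*/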
472 | static int blk_mig_save_bulked_block(QEMUFile *f) |
473 | { |
474 | int64_t completed_sector_sum = 0; |
475 | BlkMigDevState *bmds; |
476 | int progress; |
477 | int ret = 0; |
478 | |
479 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
480 | if (bmds->bulk_completed == 0) { |
481 | if (mig_save_device_bulk(f, bmds) == 1) { |
482 | /* completed bulk section for this device */ |
483 | bmds->bulk_completed = 1; |
484 | } |
485 | completed_sector_sum += bmds->completed_sectors; |
486 | ret = 1; |
487 | break; |
488 | } else { |
489 | completed_sector_sum += bmds->completed_sectors; |
490 | } |
491 | } |
492 | |
493 | if (block_mig_state.total_sector_sum != 0) { |
494 | progress = completed_sector_sum * 100 / |
495 | block_mig_state.total_sector_sum; |
496 | } else { |
497 | progress = 100; |
498 | } |
499 | if (progress != block_mig_state.prev_progress) { |
500 | block_mig_state.prev_progress = progress; |
501 | qemu_put_be64(f, (progress << BDRV_SECTOR_BITS) |
502 | | BLK_MIG_FLAG_PROGRESS); |
DPRINTF("Completed %d %%\r", progress);
504 | } |
505 | |
506 | return ret; |
507 | } |
508 | |
509 | static void blk_mig_reset_dirty_cursor(void) |
510 | { |
511 | BlkMigDevState *bmds; |
512 | |
513 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
514 | bmds->cur_dirty = 0; |
515 | } |
516 | } |
517 | |
518 | /* Called with iothread lock and AioContext taken. */ |
519 | |
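/* Send at most one dirty chunk of this device, scanning from bmds->cur_dirty.
* In async mode the read is queued and later written out by flush_blks();
* otherwise the chunk is read and sent synchronously.  If an earlier
* asynchronous read of the chunk is still in flight, the device is drained
* first.  Returns 1 once the cursor has reached the end of the device,
* 0 otherwise, or a negative error value on a failed read.
*/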
520 | static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, |
521 | int is_async) |
522 | { |
523 | BlkMigBlock *blk; |
524 | int64_t total_sectors = bmds->total_sectors; |
525 | int64_t sector; |
526 | int nr_sectors; |
527 | int ret = -EIO; |
528 | |
529 | for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { |
530 | blk_mig_lock(); |
531 | if (bmds_aio_inflight(bmds, sector)) { |
532 | blk_mig_unlock(); |
533 | blk_drain(bmds->blk); |
534 | } else { |
535 | blk_mig_unlock(); |
536 | } |
537 | bdrv_dirty_bitmap_lock(bmds->dirty_bitmap); |
538 | if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap, |
539 | sector * BDRV_SECTOR_SIZE)) { |
540 | if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { |
541 | nr_sectors = total_sectors - sector; |
542 | } else { |
543 | nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; |
544 | } |
545 | bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap, |
546 | sector * BDRV_SECTOR_SIZE, |
547 | nr_sectors * BDRV_SECTOR_SIZE); |
548 | bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); |
549 | |
550 | blk = g_new(BlkMigBlock, 1); |
551 | blk->buf = g_malloc(BLOCK_SIZE); |
552 | blk->bmds = bmds; |
553 | blk->sector = sector; |
554 | blk->nr_sectors = nr_sectors; |
555 | |
556 | if (is_async) { |
557 | qemu_iovec_init_buf(&blk->qiov, blk->buf, |
558 | nr_sectors * BDRV_SECTOR_SIZE); |
559 | |
560 | blk->aiocb = blk_aio_preadv(bmds->blk, |
561 | sector * BDRV_SECTOR_SIZE, |
562 | &blk->qiov, 0, blk_mig_read_cb, |
563 | blk); |
564 | |
565 | blk_mig_lock(); |
566 | block_mig_state.submitted++; |
567 | bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); |
568 | blk_mig_unlock(); |
569 | } else { |
570 | ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, blk->buf, |
571 | nr_sectors * BDRV_SECTOR_SIZE); |
572 | if (ret < 0) { |
573 | goto error; |
574 | } |
575 | blk_send(f, blk); |
576 | |
577 | g_free(blk->buf); |
578 | g_free(blk); |
579 | } |
580 | |
581 | sector += nr_sectors; |
582 | bmds->cur_dirty = sector; |
583 | break; |
584 | } |
585 | |
586 | bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); |
587 | sector += BDRV_SECTORS_PER_DIRTY_CHUNK; |
588 | bmds->cur_dirty = sector; |
589 | } |
590 | |
591 | return (bmds->cur_dirty >= bmds->total_sectors); |
592 | |
593 | error: |
DPRINTF("Error reading sector %" PRId64 "\n", sector);
595 | g_free(blk->buf); |
596 | g_free(blk); |
597 | return ret; |
598 | } |
599 | |
600 | /* Called with iothread lock taken. |
601 | * |
602 | * return value: |
603 | * 0: too much data for max_downtime |
* 1: little enough data for max_downtime
605 | */ |
606 | static int blk_mig_save_dirty_block(QEMUFile *f, int is_async) |
607 | { |
608 | BlkMigDevState *bmds; |
609 | int ret = 1; |
610 | |
611 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
612 | aio_context_acquire(blk_get_aio_context(bmds->blk)); |
613 | ret = mig_save_device_dirty(f, bmds, is_async); |
614 | aio_context_release(blk_get_aio_context(bmds->blk)); |
615 | if (ret <= 0) { |
616 | break; |
617 | } |
618 | } |
619 | |
620 | return ret; |
621 | } |
622 | |
623 | /* Called with no locks taken. */ |
624 | |
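/* Drain the list of completed asynchronous reads onto the migration stream,
* stopping early when the rate limit is hit.  Returns 0, or the error code of
* the first failed read.
*/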
625 | static int flush_blks(QEMUFile *f) |
626 | { |
627 | BlkMigBlock *blk; |
628 | int ret = 0; |
629 | |
DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
631 | __func__, block_mig_state.submitted, block_mig_state.read_done, |
632 | block_mig_state.transferred); |
633 | |
634 | blk_mig_lock(); |
635 | while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { |
636 | if (qemu_file_rate_limit(f)) { |
637 | break; |
638 | } |
639 | if (blk->ret < 0) { |
640 | ret = blk->ret; |
641 | break; |
642 | } |
643 | |
644 | QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); |
645 | blk_mig_unlock(); |
646 | blk_send(f, blk); |
647 | blk_mig_lock(); |
648 | |
649 | g_free(blk->buf); |
650 | g_free(blk); |
651 | |
652 | block_mig_state.read_done--; |
653 | block_mig_state.transferred++; |
654 | assert(block_mig_state.read_done >= 0); |
655 | } |
656 | blk_mig_unlock(); |
657 | |
DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __func__,
659 | block_mig_state.submitted, block_mig_state.read_done, |
660 | block_mig_state.transferred); |
661 | return ret; |
662 | } |
663 | |
664 | /* Called with iothread lock taken. */ |
665 | |
666 | static int64_t get_remaining_dirty(void) |
667 | { |
668 | BlkMigDevState *bmds; |
669 | int64_t dirty = 0; |
670 | |
671 | QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { |
672 | aio_context_acquire(blk_get_aio_context(bmds->blk)); |
673 | dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); |
674 | aio_context_release(blk_get_aio_context(bmds->blk)); |
675 | } |
676 | |
677 | return dirty; |
678 | } |
679 | |
680 | |
681 | |
682 | /* Called with iothread lock taken. */ |
683 | static void block_migration_cleanup_bmds(void) |
684 | { |
685 | BlkMigDevState *bmds; |
686 | AioContext *ctx; |
687 | |
688 | unset_dirty_tracking(); |
689 | |
690 | while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { |
691 | QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); |
692 | bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker); |
693 | error_free(bmds->blocker); |
694 | |
695 | /* Save ctx, because bmds->blk can disappear during blk_unref. */ |
696 | ctx = blk_get_aio_context(bmds->blk); |
697 | aio_context_acquire(ctx); |
698 | blk_unref(bmds->blk); |
699 | aio_context_release(ctx); |
700 | |
701 | g_free(bmds->blk_name); |
702 | g_free(bmds->aio_bitmap); |
703 | g_free(bmds); |
704 | } |
705 | } |
706 | |
707 | /* Called with iothread lock taken. */ |
708 | static void block_migration_cleanup(void *opaque) |
709 | { |
710 | BlkMigBlock *blk; |
711 | |
712 | bdrv_drain_all(); |
713 | |
714 | block_migration_cleanup_bmds(); |
715 | |
716 | blk_mig_lock(); |
717 | while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { |
718 | QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); |
719 | g_free(blk->buf); |
720 | g_free(blk); |
721 | } |
722 | blk_mig_unlock(); |
723 | } |
724 | |
725 | static int block_save_setup(QEMUFile *f, void *opaque) |
726 | { |
727 | int ret; |
728 | |
DPRINTF("Enter save live setup submitted %d transferred %d\n",
730 | block_mig_state.submitted, block_mig_state.transferred); |
731 | |
732 | qemu_mutex_lock_iothread(); |
733 | ret = init_blk_migration(f); |
734 | if (ret < 0) { |
735 | qemu_mutex_unlock_iothread(); |
736 | return ret; |
737 | } |
738 | |
/* start tracking dirty blocks */
740 | ret = set_dirty_tracking(); |
741 | |
742 | qemu_mutex_unlock_iothread(); |
743 | |
744 | if (ret) { |
745 | return ret; |
746 | } |
747 | |
748 | ret = flush_blks(f); |
749 | blk_mig_reset_dirty_cursor(); |
750 | qemu_put_be64(f, BLK_MIG_FLAG_EOS); |
751 | |
752 | return ret; |
753 | } |
754 | |
755 | static int block_save_iterate(QEMUFile *f, void *opaque) |
756 | { |
757 | int ret; |
758 | int64_t last_ftell = qemu_ftell(f); |
759 | int64_t delta_ftell; |
760 | |
DPRINTF("Enter save live iterate submitted %d transferred %d\n",
762 | block_mig_state.submitted, block_mig_state.transferred); |
763 | |
764 | ret = flush_blks(f); |
765 | if (ret) { |
766 | return ret; |
767 | } |
768 | |
769 | blk_mig_reset_dirty_cursor(); |
770 | |
771 | /* control the rate of transfer */ |
772 | blk_mig_lock(); |
773 | while (block_mig_state.read_done * BLOCK_SIZE < |
774 | qemu_file_get_rate_limit(f) && |
775 | block_mig_state.submitted < MAX_PARALLEL_IO && |
776 | (block_mig_state.submitted + block_mig_state.read_done) < |
777 | MAX_IO_BUFFERS) { |
778 | blk_mig_unlock(); |
779 | if (block_mig_state.bulk_completed == 0) { |
780 | /* first finish the bulk phase */ |
781 | if (blk_mig_save_bulked_block(f) == 0) { |
782 | /* finished saving bulk on all devices */ |
783 | block_mig_state.bulk_completed = 1; |
784 | } |
785 | ret = 0; |
786 | } else { |
787 | /* Always called with iothread lock taken for |
788 | * simplicity, block_save_complete also calls it. |
789 | */ |
790 | qemu_mutex_lock_iothread(); |
791 | ret = blk_mig_save_dirty_block(f, 1); |
792 | qemu_mutex_unlock_iothread(); |
793 | } |
794 | if (ret < 0) { |
795 | return ret; |
796 | } |
797 | blk_mig_lock(); |
798 | if (ret != 0) { |
799 | /* no more dirty blocks */ |
800 | break; |
801 | } |
802 | } |
803 | blk_mig_unlock(); |
804 | |
805 | ret = flush_blks(f); |
806 | if (ret) { |
807 | return ret; |
808 | } |
809 | |
810 | qemu_put_be64(f, BLK_MIG_FLAG_EOS); |
811 | delta_ftell = qemu_ftell(f) - last_ftell; |
812 | if (delta_ftell > 0) { |
813 | return 1; |
814 | } else if (delta_ftell < 0) { |
815 | return -1; |
816 | } else { |
817 | return 0; |
818 | } |
819 | } |
820 | |
821 | /* Called with iothread lock taken. */ |
822 | |
823 | static int block_save_complete(QEMUFile *f, void *opaque) |
824 | { |
825 | int ret; |
826 | |
DPRINTF("Enter save live complete submitted %d transferred %d\n",
828 | block_mig_state.submitted, block_mig_state.transferred); |
829 | |
830 | ret = flush_blks(f); |
831 | if (ret) { |
832 | return ret; |
833 | } |
834 | |
835 | blk_mig_reset_dirty_cursor(); |
836 | |
/* We know for sure that the bulk save has completed and
all async reads have completed */
839 | blk_mig_lock(); |
840 | assert(block_mig_state.submitted == 0); |
841 | blk_mig_unlock(); |
842 | |
843 | do { |
844 | ret = blk_mig_save_dirty_block(f, 0); |
845 | if (ret < 0) { |
846 | return ret; |
847 | } |
848 | } while (ret == 0); |
849 | |
850 | /* report completion */ |
851 | qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS); |
852 | |
DPRINTF("Block migration completed\n");
854 | |
855 | qemu_put_be64(f, BLK_MIG_FLAG_EOS); |
856 | |
857 | /* Make sure that our BlockBackends are gone, so that the block driver |
858 | * nodes can be inactivated. */ |
859 | block_migration_cleanup_bmds(); |
860 | |
861 | return 0; |
862 | } |
863 | |
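/* The pending estimate is the number of dirty bytes still to be sent plus
* everything already read into buffers but not yet written out.  During the
* bulk phase it is forced above max_size so that migration does not try to
* converge before the bulk copy has finished.
*/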
864 | static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, |
865 | uint64_t *res_precopy_only, |
866 | uint64_t *res_compatible, |
867 | uint64_t *res_postcopy_only) |
868 | { |
869 | /* Estimate pending number of bytes to send */ |
870 | uint64_t pending; |
871 | |
872 | qemu_mutex_lock_iothread(); |
873 | pending = get_remaining_dirty(); |
874 | qemu_mutex_unlock_iothread(); |
875 | |
876 | blk_mig_lock(); |
877 | pending += block_mig_state.submitted * BLOCK_SIZE + |
878 | block_mig_state.read_done * BLOCK_SIZE; |
879 | blk_mig_unlock(); |
880 | |
881 | /* Report at least one block pending during bulk phase */ |
882 | if (pending <= max_size && !block_mig_state.bulk_completed) { |
883 | pending = max_size + BLOCK_SIZE; |
884 | } |
885 | |
DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
887 | /* We don't do postcopy */ |
888 | *res_precopy_only += pending; |
889 | } |
890 | |
891 | static int block_load(QEMUFile *f, void *opaque, int version_id) |
892 | { |
893 | static int banner_printed; |
894 | int len, flags; |
895 | char device_name[256]; |
896 | int64_t addr; |
897 | BlockBackend *blk, *blk_prev = NULL; |
898 | Error *local_err = NULL; |
899 | uint8_t *buf; |
900 | int64_t total_sectors = 0; |
901 | int nr_sectors; |
902 | int ret; |
903 | BlockDriverInfo bdi; |
904 | int cluster_size = BLOCK_SIZE; |
905 | |
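/* Consume chunks until EOS: read the 64-bit header, dispatch on the flags
* and write DEVICE_BLOCK payloads to the named BlockBackend.  Each 1 MiB
* buffer is written in cluster-sized pieces so that all-zero clusters can
* be punched with blk_pwrite_zeroes() instead of being written out.
*/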
906 | do { |
907 | addr = qemu_get_be64(f); |
908 | |
909 | flags = addr & ~BDRV_SECTOR_MASK; |
910 | addr >>= BDRV_SECTOR_BITS; |
911 | |
912 | if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) { |
913 | /* get device name */ |
914 | len = qemu_get_byte(f); |
915 | qemu_get_buffer(f, (uint8_t *)device_name, len); |
916 | device_name[len] = '\0'; |
917 | |
918 | blk = blk_by_name(device_name); |
919 | if (!blk) { |
fprintf(stderr, "Error unknown block device %s\n",
921 | device_name); |
922 | return -EINVAL; |
923 | } |
924 | |
925 | if (blk != blk_prev) { |
926 | blk_prev = blk; |
927 | total_sectors = blk_nb_sectors(blk); |
928 | if (total_sectors <= 0) { |
error_report("Error getting length of block device %s",
930 | device_name); |
931 | return -EINVAL; |
932 | } |
933 | |
934 | blk_invalidate_cache(blk, &local_err); |
935 | if (local_err) { |
936 | error_report_err(local_err); |
937 | return -EINVAL; |
938 | } |
939 | |
940 | ret = bdrv_get_info(blk_bs(blk), &bdi); |
941 | if (ret == 0 && bdi.cluster_size > 0 && |
942 | bdi.cluster_size <= BLOCK_SIZE && |
943 | BLOCK_SIZE % bdi.cluster_size == 0) { |
944 | cluster_size = bdi.cluster_size; |
945 | } else { |
946 | cluster_size = BLOCK_SIZE; |
947 | } |
948 | } |
949 | |
950 | if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) { |
951 | nr_sectors = total_sectors - addr; |
952 | } else { |
953 | nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; |
954 | } |
955 | |
956 | if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { |
957 | ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE, |
958 | nr_sectors * BDRV_SECTOR_SIZE, |
959 | BDRV_REQ_MAY_UNMAP); |
960 | } else { |
961 | int i; |
962 | int64_t cur_addr; |
963 | uint8_t *cur_buf; |
964 | |
965 | buf = g_malloc(BLOCK_SIZE); |
966 | qemu_get_buffer(f, buf, BLOCK_SIZE); |
967 | for (i = 0; i < BLOCK_SIZE / cluster_size; i++) { |
968 | cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size; |
969 | cur_buf = buf + i * cluster_size; |
970 | |
971 | if ((!block_mig_state.zero_blocks || |
972 | cluster_size < BLOCK_SIZE) && |
973 | buffer_is_zero(cur_buf, cluster_size)) { |
974 | ret = blk_pwrite_zeroes(blk, cur_addr, |
975 | cluster_size, |
976 | BDRV_REQ_MAY_UNMAP); |
977 | } else { |
978 | ret = blk_pwrite(blk, cur_addr, cur_buf, |
979 | cluster_size, 0); |
980 | } |
981 | if (ret < 0) { |
982 | break; |
983 | } |
984 | } |
985 | g_free(buf); |
986 | } |
987 | |
988 | if (ret < 0) { |
989 | return ret; |
990 | } |
991 | } else if (flags & BLK_MIG_FLAG_PROGRESS) { |
992 | if (!banner_printed) { |
printf("Receiving block device images\n");
994 | banner_printed = 1; |
995 | } |
printf("Completed %d %%%c", (int)addr,
997 | (addr == 100) ? '\n' : '\r'); |
998 | fflush(stdout); |
999 | } else if (!(flags & BLK_MIG_FLAG_EOS)) { |
fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
1001 | return -EINVAL; |
1002 | } |
1003 | ret = qemu_file_get_error(f); |
1004 | if (ret != 0) { |
1005 | return ret; |
1006 | } |
1007 | } while (!(flags & BLK_MIG_FLAG_EOS)); |
1008 | |
1009 | return 0; |
1010 | } |
1011 | |
1012 | static bool block_is_active(void *opaque) |
1013 | { |
1014 | return migrate_use_block(); |
1015 | } |
1016 | |
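/* Block migration is registered as a live savevm section in blk_mig_init();
* is_active makes the whole section a no-op unless block migration was
* requested (migrate_use_block()).
*/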
1017 | static SaveVMHandlers savevm_block_handlers = { |
1018 | .save_setup = block_save_setup, |
1019 | .save_live_iterate = block_save_iterate, |
1020 | .save_live_complete_precopy = block_save_complete, |
1021 | .save_live_pending = block_save_pending, |
1022 | .load_state = block_load, |
1023 | .save_cleanup = block_migration_cleanup, |
1024 | .is_active = block_is_active, |
1025 | }; |
1026 | |
1027 | void blk_mig_init(void) |
1028 | { |
1029 | QSIMPLEQ_INIT(&block_mig_state.bmds_list); |
1030 | QSIMPLEQ_INIT(&block_mig_state.blk_list); |
1031 | qemu_mutex_init(&block_mig_state.lock); |
1032 | |
register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
1034 | &block_mig_state); |
1035 | } |
1036 | |