1 | /* |
2 | * Write logging blk driver based on blkverify and blkdebug. |
3 | * |
4 | * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com> |
5 | * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com> |
6 | * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com> |
7 | * |
8 | * This work is licensed under the terms of the GNU GPL, version 2 or later. |
9 | * See the COPYING file in the top-level directory. |
10 | */ |
11 | |
12 | #include "qemu/osdep.h" |
13 | #include "qapi/error.h" |
14 | #include "qemu/sockets.h" /* for EINPROGRESS on Windows */ |
15 | #include "block/block_int.h" |
16 | #include "qapi/qmp/qdict.h" |
17 | #include "qapi/qmp/qstring.h" |
18 | #include "qemu/cutils.h" |
19 | #include "qemu/module.h" |
20 | #include "qemu/option.h" |
21 | |
22 | /* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */ |
23 | |
/*
 * Flag bits of log_write_entry.flags (stored little-endian on disk).
 * In this driver LOG_FLUSH_FLAG marks entries logged for flush requests and
 * LOG_DISCARD_FLAG marks discard entries, which carry no data sectors.
 */
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)
/* All flag bits known to this driver; any other bit makes an entry invalid. */
#define LOG_FLAG_MASK (LOG_FLUSH_FLAG \
                       | LOG_FUA_FLAG \
                       | LOG_DISCARD_FLAG \
                       | LOG_MARK_FLAG)

/* Values expected in the version and magic fields of the log superblock. */
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
35 | |
36 | /* All fields are little-endian. */ |
/*
 * Log superblock. The driver writes it at byte offset 0 of the log file,
 * zero-padded to one full log sector.
 */
struct log_write_super {
    uint64_t magic;      /* must equal WRITE_LOG_MAGIC */
    uint64_t version;    /* must equal WRITE_LOG_VERSION */
    uint64_t nr_entries; /* number of log entries recorded in the log */
    uint32_t sectorsize; /* log sector size in bytes */
} QEMU_PACKED;
43 | |
/*
 * Header of a single log entry. Each header occupies one log sector
 * (zero-padded) and is followed by the logged data, except for discards.
 */
struct log_write_entry {
    uint64_t sector;     /* request offset, in units of log sectors */
    uint64_t nr_sectors; /* request length, in units of log sectors */
    uint64_t flags;      /* combination of LOG_*_FLAG bits */
    uint64_t data_len;   /* always written as 0 by this driver */
} QEMU_PACKED;
50 | |
51 | /* End of disk format structures. */ |
52 | |
/* Per-node driver state (bs->opaque). */
typedef struct {
    BdrvChild *log_file;      /* child opened from the "log" option */
    uint32_t sectorsize;      /* log sector size in bytes (power of two) */
    uint32_t sectorbits;      /* log2 of sectorsize */
    uint64_t cur_log_sector;  /* log sector where the next entry is written */
    uint64_t nr_entries;      /* number of entries logged so far */
    uint64_t update_interval; /* superblock rewritten every N entries */
} BDRVBlkLogWritesState;
61 | |
62 | static QemuOptsList runtime_opts = { |
63 | .name = "blklogwrites" , |
64 | .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), |
65 | .desc = { |
66 | { |
67 | .name = "log-append" , |
68 | .type = QEMU_OPT_BOOL, |
69 | .help = "Append to an existing log" , |
70 | }, |
71 | { |
72 | .name = "log-sector-size" , |
73 | .type = QEMU_OPT_SIZE, |
74 | .help = "Log sector size" , |
75 | }, |
76 | { |
77 | .name = "log-super-update-interval" , |
78 | .type = QEMU_OPT_NUMBER, |
79 | .help = "Log superblock update interval (# of write requests)" , |
80 | }, |
81 | { /* end of list */ } |
82 | }, |
83 | }; |
84 | |
/*
 * Return the index of the highest set bit of @value, i.e. floor(log2(value)).
 * @value must be non-zero.
 */
static inline uint32_t blk_log_writes_log2(uint32_t value)
{
    int leading_zeroes;

    assert(value > 0);

    leading_zeroes = clz32(value);
    return 31 - leading_zeroes;
}
90 | |
91 | static inline bool blk_log_writes_sector_size_valid(uint32_t sector_size) |
92 | { |
93 | return is_power_of_2(sector_size) && |
94 | sector_size >= sizeof(struct log_write_super) && |
95 | sector_size >= sizeof(struct log_write_entry) && |
96 | sector_size < (1ull << 24); |
97 | } |
98 | |
99 | static uint64_t blk_log_writes_find_cur_log_sector(BdrvChild *log, |
100 | uint32_t sector_size, |
101 | uint64_t nr_entries, |
102 | Error **errp) |
103 | { |
104 | uint64_t cur_sector = 1; |
105 | uint64_t cur_idx = 0; |
106 | uint32_t sector_bits = blk_log_writes_log2(sector_size); |
107 | struct log_write_entry cur_entry; |
108 | |
109 | while (cur_idx < nr_entries) { |
110 | int read_ret = bdrv_pread(log, cur_sector << sector_bits, &cur_entry, |
111 | sizeof(cur_entry)); |
112 | if (read_ret < 0) { |
113 | error_setg_errno(errp, -read_ret, |
114 | "Failed to read log entry %" PRIu64, cur_idx); |
115 | return (uint64_t)-1ull; |
116 | } |
117 | |
118 | if (cur_entry.flags & ~cpu_to_le64(LOG_FLAG_MASK)) { |
119 | error_setg(errp, "Invalid flags 0x%" PRIx64" in log entry %" PRIu64, |
120 | le64_to_cpu(cur_entry.flags), cur_idx); |
121 | return (uint64_t)-1ull; |
122 | } |
123 | |
124 | /* Account for the sector of the entry itself */ |
125 | ++cur_sector; |
126 | |
127 | /* |
128 | * Account for the data of the write. |
129 | * For discards, this data is not present. |
130 | */ |
131 | if (!(cur_entry.flags & cpu_to_le64(LOG_DISCARD_FLAG))) { |
132 | cur_sector += le64_to_cpu(cur_entry.nr_sectors); |
133 | } |
134 | |
135 | ++cur_idx; |
136 | } |
137 | |
138 | return cur_sector; |
139 | } |
140 | |
141 | static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags, |
142 | Error **errp) |
143 | { |
144 | BDRVBlkLogWritesState *s = bs->opaque; |
145 | QemuOpts *opts; |
146 | Error *local_err = NULL; |
147 | int ret; |
148 | uint64_t log_sector_size; |
149 | bool log_append; |
150 | |
151 | opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); |
152 | qemu_opts_absorb_qdict(opts, options, &local_err); |
153 | if (local_err) { |
154 | ret = -EINVAL; |
155 | error_propagate(errp, local_err); |
156 | goto fail; |
157 | } |
158 | |
159 | /* Open the file */ |
160 | bs->file = bdrv_open_child(NULL, options, "file" , bs, &child_file, false, |
161 | &local_err); |
162 | if (local_err) { |
163 | ret = -EINVAL; |
164 | error_propagate(errp, local_err); |
165 | goto fail; |
166 | } |
167 | |
168 | /* Open the log file */ |
169 | s->log_file = bdrv_open_child(NULL, options, "log" , bs, &child_file, false, |
170 | &local_err); |
171 | if (local_err) { |
172 | ret = -EINVAL; |
173 | error_propagate(errp, local_err); |
174 | goto fail; |
175 | } |
176 | |
177 | log_append = qemu_opt_get_bool(opts, "log-append" , false); |
178 | |
179 | if (log_append) { |
180 | struct log_write_super log_sb = { 0, 0, 0, 0 }; |
181 | |
182 | if (qemu_opt_find(opts, "log-sector-size" )) { |
183 | ret = -EINVAL; |
184 | error_setg(errp, "log-append and log-sector-size are mutually " |
185 | "exclusive" ); |
186 | goto fail_log; |
187 | } |
188 | |
189 | /* Read log superblock or fake one for an empty log */ |
190 | if (!bdrv_getlength(s->log_file->bs)) { |
191 | log_sb.magic = cpu_to_le64(WRITE_LOG_MAGIC); |
192 | log_sb.version = cpu_to_le64(WRITE_LOG_VERSION); |
193 | log_sb.nr_entries = cpu_to_le64(0); |
194 | log_sb.sectorsize = cpu_to_le32(BDRV_SECTOR_SIZE); |
195 | } else { |
196 | ret = bdrv_pread(s->log_file, 0, &log_sb, sizeof(log_sb)); |
197 | if (ret < 0) { |
198 | error_setg_errno(errp, -ret, "Could not read log superblock" ); |
199 | goto fail_log; |
200 | } |
201 | } |
202 | |
203 | if (log_sb.magic != cpu_to_le64(WRITE_LOG_MAGIC)) { |
204 | ret = -EINVAL; |
205 | error_setg(errp, "Invalid log superblock magic" ); |
206 | goto fail_log; |
207 | } |
208 | |
209 | if (log_sb.version != cpu_to_le64(WRITE_LOG_VERSION)) { |
210 | ret = -EINVAL; |
211 | error_setg(errp, "Unsupported log version %" PRIu64, |
212 | le64_to_cpu(log_sb.version)); |
213 | goto fail_log; |
214 | } |
215 | |
216 | log_sector_size = le32_to_cpu(log_sb.sectorsize); |
217 | s->cur_log_sector = 1; |
218 | s->nr_entries = 0; |
219 | |
220 | if (blk_log_writes_sector_size_valid(log_sector_size)) { |
221 | s->cur_log_sector = |
222 | blk_log_writes_find_cur_log_sector(s->log_file, log_sector_size, |
223 | le64_to_cpu(log_sb.nr_entries), &local_err); |
224 | if (local_err) { |
225 | ret = -EINVAL; |
226 | error_propagate(errp, local_err); |
227 | goto fail_log; |
228 | } |
229 | |
230 | s->nr_entries = le64_to_cpu(log_sb.nr_entries); |
231 | } |
232 | } else { |
233 | log_sector_size = qemu_opt_get_size(opts, "log-sector-size" , |
234 | BDRV_SECTOR_SIZE); |
235 | s->cur_log_sector = 1; |
236 | s->nr_entries = 0; |
237 | } |
238 | |
239 | if (!blk_log_writes_sector_size_valid(log_sector_size)) { |
240 | ret = -EINVAL; |
241 | error_setg(errp, "Invalid log sector size %" PRIu64, log_sector_size); |
242 | goto fail_log; |
243 | } |
244 | |
245 | s->sectorsize = log_sector_size; |
246 | s->sectorbits = blk_log_writes_log2(log_sector_size); |
247 | s->update_interval = qemu_opt_get_number(opts, "log-super-update-interval" , |
248 | 4096); |
249 | if (!s->update_interval) { |
250 | ret = -EINVAL; |
251 | error_setg(errp, "Invalid log superblock update interval %" PRIu64, |
252 | s->update_interval); |
253 | goto fail_log; |
254 | } |
255 | |
256 | ret = 0; |
257 | fail_log: |
258 | if (ret < 0) { |
259 | bdrv_unref_child(bs, s->log_file); |
260 | s->log_file = NULL; |
261 | } |
262 | fail: |
263 | if (ret < 0) { |
264 | bdrv_unref_child(bs, bs->file); |
265 | bs->file = NULL; |
266 | } |
267 | qemu_opts_del(opts); |
268 | return ret; |
269 | } |
270 | |
271 | static void blk_log_writes_close(BlockDriverState *bs) |
272 | { |
273 | BDRVBlkLogWritesState *s = bs->opaque; |
274 | |
275 | bdrv_unref_child(bs, s->log_file); |
276 | s->log_file = NULL; |
277 | } |
278 | |
279 | static int64_t blk_log_writes_getlength(BlockDriverState *bs) |
280 | { |
281 | return bdrv_getlength(bs->file->bs); |
282 | } |
283 | |
284 | static void blk_log_writes_child_perm(BlockDriverState *bs, BdrvChild *c, |
285 | const BdrvChildRole *role, |
286 | BlockReopenQueue *ro_q, |
287 | uint64_t perm, uint64_t shrd, |
288 | uint64_t *nperm, uint64_t *nshrd) |
289 | { |
290 | if (!c) { |
291 | *nperm = perm & DEFAULT_PERM_PASSTHROUGH; |
292 | *nshrd = (shrd & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED; |
293 | return; |
294 | } |
295 | |
296 | if (!strcmp(c->name, "log" )) { |
297 | bdrv_format_default_perms(bs, c, role, ro_q, perm, shrd, nperm, nshrd); |
298 | } else { |
299 | bdrv_filter_default_perms(bs, c, role, ro_q, perm, shrd, nperm, nshrd); |
300 | } |
301 | } |
302 | |
303 | static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp) |
304 | { |
305 | BDRVBlkLogWritesState *s = bs->opaque; |
306 | bs->bl.request_alignment = s->sectorsize; |
307 | } |
308 | |
309 | static int coroutine_fn |
310 | blk_log_writes_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, |
311 | QEMUIOVector *qiov, int flags) |
312 | { |
313 | return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); |
314 | } |
315 | |
/* Description of the request to perform on the "file" child. */
typedef struct BlkLogWritesFileReq {
    BlockDriverState *bs;   /* the blklogwrites node */
    uint64_t offset;        /* request offset in bytes */
    uint64_t bytes;         /* request length in bytes */
    int file_flags;         /* request flags handed to the file operation */
    QEMUIOVector *qiov;     /* data vector; NULL for requests without data */
    /* Operation that carries out the request; its result goes to file_ret */
    int (*func)(struct BlkLogWritesFileReq *r);
    int file_ret;           /* return value of func */
} BlkLogWritesFileReq;
325 | |
/* Description of the entry to append to the log. */
typedef struct {
    BlockDriverState *bs;           /* the blklogwrites node */
    QEMUIOVector *qiov;             /* padded entry header plus logged data */
    struct log_write_entry entry;   /* entry header, fields little-endian */
    uint64_t zero_size;             /* bytes of zeroes to log after qiov */
    int log_ret;                    /* result of writing to the log */
} BlkLogWritesLogReq;
333 | |
334 | static void coroutine_fn blk_log_writes_co_do_log(BlkLogWritesLogReq *lr) |
335 | { |
336 | BDRVBlkLogWritesState *s = lr->bs->opaque; |
337 | uint64_t cur_log_offset = s->cur_log_sector << s->sectorbits; |
338 | |
339 | s->nr_entries++; |
340 | s->cur_log_sector += |
341 | ROUND_UP(lr->qiov->size, s->sectorsize) >> s->sectorbits; |
342 | |
343 | lr->log_ret = bdrv_co_pwritev(s->log_file, cur_log_offset, lr->qiov->size, |
344 | lr->qiov, 0); |
345 | |
346 | /* Logging for the "write zeroes" operation */ |
347 | if (lr->log_ret == 0 && lr->zero_size) { |
348 | cur_log_offset = s->cur_log_sector << s->sectorbits; |
349 | s->cur_log_sector += |
350 | ROUND_UP(lr->zero_size, s->sectorsize) >> s->sectorbits; |
351 | |
352 | lr->log_ret = bdrv_co_pwrite_zeroes(s->log_file, cur_log_offset, |
353 | lr->zero_size, 0); |
354 | } |
355 | |
356 | /* Update super block on flush or every update interval */ |
357 | if (lr->log_ret == 0 && ((lr->entry.flags & LOG_FLUSH_FLAG) |
358 | || (s->nr_entries % s->update_interval == 0))) |
359 | { |
360 | struct log_write_super super = { |
361 | .magic = cpu_to_le64(WRITE_LOG_MAGIC), |
362 | .version = cpu_to_le64(WRITE_LOG_VERSION), |
363 | .nr_entries = cpu_to_le64(s->nr_entries), |
364 | .sectorsize = cpu_to_le32(s->sectorsize), |
365 | }; |
366 | void *zeroes = g_malloc0(s->sectorsize - sizeof(super)); |
367 | QEMUIOVector qiov; |
368 | |
369 | qemu_iovec_init(&qiov, 2); |
370 | qemu_iovec_add(&qiov, &super, sizeof(super)); |
371 | qemu_iovec_add(&qiov, zeroes, s->sectorsize - sizeof(super)); |
372 | |
373 | lr->log_ret = |
374 | bdrv_co_pwritev(s->log_file, 0, s->sectorsize, &qiov, 0); |
375 | if (lr->log_ret == 0) { |
376 | lr->log_ret = bdrv_co_flush(s->log_file->bs); |
377 | } |
378 | qemu_iovec_destroy(&qiov); |
379 | g_free(zeroes); |
380 | } |
381 | } |
382 | |
383 | static void coroutine_fn blk_log_writes_co_do_file(BlkLogWritesFileReq *fr) |
384 | { |
385 | fr->file_ret = fr->func(fr); |
386 | } |
387 | |
388 | static int coroutine_fn |
389 | blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t bytes, |
390 | QEMUIOVector *qiov, int flags, |
391 | int (*file_func)(BlkLogWritesFileReq *r), |
392 | uint64_t entry_flags, bool is_zero_write) |
393 | { |
394 | QEMUIOVector log_qiov; |
395 | size_t niov = qiov ? qiov->niov : 0; |
396 | BDRVBlkLogWritesState *s = bs->opaque; |
397 | BlkLogWritesFileReq fr = { |
398 | .bs = bs, |
399 | .offset = offset, |
400 | .bytes = bytes, |
401 | .file_flags = flags, |
402 | .qiov = qiov, |
403 | .func = file_func, |
404 | }; |
405 | BlkLogWritesLogReq lr = { |
406 | .bs = bs, |
407 | .qiov = &log_qiov, |
408 | .entry = { |
409 | .sector = cpu_to_le64(offset >> s->sectorbits), |
410 | .nr_sectors = cpu_to_le64(bytes >> s->sectorbits), |
411 | .flags = cpu_to_le64(entry_flags), |
412 | .data_len = 0, |
413 | }, |
414 | .zero_size = is_zero_write ? bytes : 0, |
415 | }; |
416 | void *zeroes = g_malloc0(s->sectorsize - sizeof(lr.entry)); |
417 | |
418 | assert((1 << s->sectorbits) == s->sectorsize); |
419 | assert(bs->bl.request_alignment == s->sectorsize); |
420 | assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)); |
421 | assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment)); |
422 | |
423 | qemu_iovec_init(&log_qiov, niov + 2); |
424 | qemu_iovec_add(&log_qiov, &lr.entry, sizeof(lr.entry)); |
425 | qemu_iovec_add(&log_qiov, zeroes, s->sectorsize - sizeof(lr.entry)); |
426 | if (qiov) { |
427 | qemu_iovec_concat(&log_qiov, qiov, 0, qiov->size); |
428 | } |
429 | |
430 | blk_log_writes_co_do_file(&fr); |
431 | blk_log_writes_co_do_log(&lr); |
432 | |
433 | qemu_iovec_destroy(&log_qiov); |
434 | g_free(zeroes); |
435 | |
436 | if (lr.log_ret < 0) { |
437 | return lr.log_ret; |
438 | } |
439 | |
440 | return fr.file_ret; |
441 | } |
442 | |
443 | static int coroutine_fn |
444 | blk_log_writes_co_do_file_pwritev(BlkLogWritesFileReq *fr) |
445 | { |
446 | return bdrv_co_pwritev(fr->bs->file, fr->offset, fr->bytes, |
447 | fr->qiov, fr->file_flags); |
448 | } |
449 | |
450 | static int coroutine_fn |
451 | blk_log_writes_co_do_file_pwrite_zeroes(BlkLogWritesFileReq *fr) |
452 | { |
453 | return bdrv_co_pwrite_zeroes(fr->bs->file, fr->offset, fr->bytes, |
454 | fr->file_flags); |
455 | } |
456 | |
457 | static int coroutine_fn blk_log_writes_co_do_file_flush(BlkLogWritesFileReq *fr) |
458 | { |
459 | return bdrv_co_flush(fr->bs->file->bs); |
460 | } |
461 | |
462 | static int coroutine_fn |
463 | blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr) |
464 | { |
465 | return bdrv_co_pdiscard(fr->bs->file, fr->offset, fr->bytes); |
466 | } |
467 | |
468 | static int coroutine_fn |
469 | blk_log_writes_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, |
470 | QEMUIOVector *qiov, int flags) |
471 | { |
472 | return blk_log_writes_co_log(bs, offset, bytes, qiov, flags, |
473 | blk_log_writes_co_do_file_pwritev, 0, false); |
474 | } |
475 | |
476 | static int coroutine_fn |
477 | blk_log_writes_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, |
478 | BdrvRequestFlags flags) |
479 | { |
480 | return blk_log_writes_co_log(bs, offset, bytes, NULL, flags, |
481 | blk_log_writes_co_do_file_pwrite_zeroes, 0, |
482 | true); |
483 | } |
484 | |
485 | static int coroutine_fn blk_log_writes_co_flush_to_disk(BlockDriverState *bs) |
486 | { |
487 | return blk_log_writes_co_log(bs, 0, 0, NULL, 0, |
488 | blk_log_writes_co_do_file_flush, |
489 | LOG_FLUSH_FLAG, false); |
490 | } |
491 | |
492 | static int coroutine_fn |
493 | blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) |
494 | { |
495 | return blk_log_writes_co_log(bs, offset, count, NULL, 0, |
496 | blk_log_writes_co_do_file_pdiscard, |
497 | LOG_DISCARD_FLAG, false); |
498 | } |
499 | |
/*
 * NULL-terminated list of option names registered as the driver's
 * strong_runtime_opts. "log-super-update-interval" is not part of it.
 */
static const char *const blk_log_writes_strong_runtime_opts[] = {
    "log-append",
    "log-sector-size",

    NULL
};
506 | |
507 | static BlockDriver bdrv_blk_log_writes = { |
508 | .format_name = "blklogwrites" , |
509 | .instance_size = sizeof(BDRVBlkLogWritesState), |
510 | |
511 | .bdrv_open = blk_log_writes_open, |
512 | .bdrv_close = blk_log_writes_close, |
513 | .bdrv_getlength = blk_log_writes_getlength, |
514 | .bdrv_child_perm = blk_log_writes_child_perm, |
515 | .bdrv_refresh_limits = blk_log_writes_refresh_limits, |
516 | |
517 | .bdrv_co_preadv = blk_log_writes_co_preadv, |
518 | .bdrv_co_pwritev = blk_log_writes_co_pwritev, |
519 | .bdrv_co_pwrite_zeroes = blk_log_writes_co_pwrite_zeroes, |
520 | .bdrv_co_flush_to_disk = blk_log_writes_co_flush_to_disk, |
521 | .bdrv_co_pdiscard = blk_log_writes_co_pdiscard, |
522 | .bdrv_co_block_status = bdrv_co_block_status_from_file, |
523 | |
524 | .is_filter = true, |
525 | .strong_runtime_opts = blk_log_writes_strong_runtime_opts, |
526 | }; |
527 | |
/* Register the blklogwrites driver with the block layer. */
static void bdrv_blk_log_writes_init(void)
{
    bdrv_register(&bdrv_blk_log_writes);
}

/* Hook the registration function into the block module initialization. */
block_init(bdrv_blk_log_writes_init);
534 | |