qed.c source code [qemu/block/qed.c]

1	/*
2	* QEMU Enhanced Disk Format
3	*
4	* Copyright IBM, Corp. 2010
5	*
6	* Authors:
7	* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
8	* Anthony Liguori <aliguori@us.ibm.com>
9	*
10	* This work is licensed under the terms of the GNU LGPL, version 2 or later.
11	* See the COPYING.LIB file in the top-level directory.
12	*
13	*/
14
15	#include "qemu/osdep.h"
16	#include "block/qdict.h"
17	#include "qapi/error.h"
18	#include "qemu/timer.h"
19	#include "qemu/bswap.h"
20	#include "qemu/main-loop.h"
21	#include "qemu/module.h"
22	#include "qemu/option.h"
23	#include "trace.h"
24	#include "qed.h"
25	#include "sysemu/block-backend.h"
26	#include "qapi/qmp/qdict.h"
27	#include "qapi/qobject-input-visitor.h"
28	#include "qapi/qapi-visit-block-core.h"
29
30	static QemuOptsList qed_create_opts;
31
32	static int bdrv_qed_probe(const uint8_t buf, int* buf_size,
33	const char *filename)
34	{
35	const QEDHeader header = (const* QEDHeader *)buf;
36
37	if (buf_size < sizeof(*header)) {
38	return `0`;
39	}
40	if (le32_to_cpu(header->magic) != QED_MAGIC) {
41	return `0`;
42	}
43	return `100`;
44	}
45
/**
 * Check whether an image format is raw
 *
 * @fmt: Backing file format, may be NULL
 *
 * Returns true only if @fmt is non-NULL and equals the string "raw" exactly.
 */
static bool qed_fmt_is_raw(const char *fmt)
{
    return fmt && strcmp(fmt, "raw") == 0;
}
55
56	static void qed_header_le_to_cpu(const QEDHeader le, QEDHeader cpu)
57	{
58	cpu->magic = le32_to_cpu(le->magic);
59	cpu->cluster_size = le32_to_cpu(le->cluster_size);
60	cpu->table_size = le32_to_cpu(le->table_size);
61	cpu->header_size = le32_to_cpu(le->header_size);
62	cpu->features = le64_to_cpu(le->features);
63	cpu->compat_features = le64_to_cpu(le->compat_features);
64	cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
65	cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
66	cpu->image_size = le64_to_cpu(le->image_size);
67	cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
68	cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
69	}
70
71	static void qed_header_cpu_to_le(const QEDHeader cpu, QEDHeader le)
72	{
73	le->magic = cpu_to_le32(cpu->magic);
74	le->cluster_size = cpu_to_le32(cpu->cluster_size);
75	le->table_size = cpu_to_le32(cpu->table_size);
76	le->header_size = cpu_to_le32(cpu->header_size);
77	le->features = cpu_to_le64(cpu->features);
78	le->compat_features = cpu_to_le64(cpu->compat_features);
79	le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
80	le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
81	le->image_size = cpu_to_le64(cpu->image_size);
82	le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
83	le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
84	}
85
86	int qed_write_header_sync(BDRVQEDState *s)
87	{
88	QEDHeader le;
89	int ret;
90
91	qed_header_cpu_to_le(&s->header, &le);
92	ret = bdrv_pwrite(s->bs->file, `0`, &le, sizeof(le));
93	if (ret != sizeof(le)) {
94	return ret;
95	}
96	return `0`;
97	}
98
99	/**
100	* Update header in-place (does not rewrite backing filename or other strings)
101	*
102	* This function only updates known header fields in-place and does not affect
103	* extra data after the QED header.
104	*
105	* No new allocating reqs can start while this function runs.
106	*/
107	static int coroutine_fn qed_write_header(BDRVQEDState *s)
108	{
109	/ We must write full sectors for O_DIRECT but cannot necessarily generate*
110	* the data following the header if an unrecognized compat feature is
111	* active. Therefore, first read the sectors containing the header, update
112	* them, and write back.
113	*/
114
115	int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
116	size_t len = nsectors * BDRV_SECTOR_SIZE;
117	uint8_t *buf;
118	int ret;
119
120	assert(s->allocating_acb \|\| s->allocating_write_reqs_plugged);
121
122	buf = qemu_blockalign(s->bs, len);
123
124	ret = bdrv_co_pread(s->bs->file, `0`, len, buf, `0`);
125	if (ret < `0`) {
126	goto out;
127	}
128
129	/ Update header /
130	qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
131
132	ret = bdrv_co_pwrite(s->bs->file, `0`, len, buf, `0`);
133	if (ret < `0`) {
134	goto out;
135	}
136
137	ret = `0`;
138	out:
139	qemu_vfree(buf);
140	return ret;
141	}
142
/**
 * Compute the largest image size addressable with the given geometry
 *
 * @cluster_size: Cluster size in bytes
 * @table_size: Table size in clusters
 *
 * A table holds (table_size * cluster_size) / sizeof(uint64_t) entries.  One
 * L2 table covers that many clusters, and the L1 table references that many
 * L2 tables.
 *
 * Returns the product of those quantities in bytes.
 */
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
    uint64_t table_entries;
    uint64_t l2_size;

    /* Widen before multiplying so the product cannot wrap at 32 bits */
    table_entries = ((uint64_t)table_size * cluster_size) / sizeof(uint64_t);
    l2_size = table_entries * cluster_size;

    return l2_size * table_entries;
}
153
154	static bool qed_is_cluster_size_valid(uint32_t cluster_size)
155	{
156	if (cluster_size < QED_MIN_CLUSTER_SIZE \|\|
157	cluster_size > QED_MAX_CLUSTER_SIZE) {
158	return false;
159	}
160	if (cluster_size & (cluster_size - `1`)) {
161	return false; / not power of 2 /
162	}
163	return true;
164	}
165
166	static bool qed_is_table_size_valid(uint32_t table_size)
167	{
168	if (table_size < QED_MIN_TABLE_SIZE \|\|
169	table_size > QED_MAX_TABLE_SIZE) {
170	return false;
171	}
172	if (table_size & (table_size - `1`)) {
173	return false; / not power of 2 /
174	}
175	return true;
176	}
177
178	static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
179	uint32_t table_size)
180	{
181	if (image_size % BDRV_SECTOR_SIZE != `0`) {
182	return false; / not multiple of sector size /
183	}
184	if (image_size > qed_max_image_size(cluster_size, table_size)) {
185	return false; / image is too large /
186	}
187	return true;
188	}
189
190	/**
191	* Read a string of known length from the image file
192	*
193	* @file: Image file
194	* @offset: File offset to start of string, in bytes
195	* @n: String length in bytes
196	* @buf: Destination buffer
197	* @buflen: Destination buffer length in bytes
198	* @ret: 0 on success, -errno on failure
199	*
200	* The string is NUL-terminated.
201	*/
202	static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
203	char *buf, size_t buflen)
204	{
205	int ret;
206	if (n >= buflen) {
207	return -EINVAL;
208	}
209	ret = bdrv_pread(file, offset, buf, n);
210	if (ret < `0`) {
211	return ret;
212	}
213	buf[n] = `'\0'`;
214	return `0`;
215	}
216
217	/**
218	* Allocate new clusters
219	*
220	* @s: QED state
221	* @n: Number of contiguous clusters to allocate
222	* @ret: Offset of first allocated cluster
223	*
224	* This function only produces the offset where the new clusters should be
225	* written. It updates BDRVQEDState but does not make any changes to the image
226	* file.
227	*
228	* Called with table_lock held.
229	*/
230	static uint64_t qed_alloc_clusters(BDRVQEDState s, unsigned* int n)
231	{
232	uint64_t offset = s->file_size;
233	s->file_size += n * s->header.cluster_size;
234	return offset;
235	}
236
237	QEDTable qed_alloc_table(BDRVQEDState s)
238	{
239	/ Honor O_DIRECT memory alignment requirements /
240	return qemu_blockalign(s->bs,
241	s->header.cluster_size * s->header.table_size);
242	}
243
244	/**
245	* Allocate a new zeroed L2 table
246	*
247	* Called with table_lock held.
248	*/
249	static CachedL2Table qed_new_l2_table(BDRVQEDState s)
250	{
251	CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
252
253	l2_table->table = qed_alloc_table(s);
254	l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
255
256	memset(l2_table->table->offsets, `0`,
257	s->header.cluster_size * s->header.table_size);
258	return l2_table;
259	}
260
261	static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
262	{
263	qemu_co_mutex_lock(&s->table_lock);
264
265	/ No reentrancy is allowed. /
266	assert(!s->allocating_write_reqs_plugged);
267	if (s->allocating_acb != NULL) {
268	/ Another allocating write came concurrently. This cannot happen*
269	* from bdrv_qed_co_drain_begin, but it can happen when the timer runs.
270	*/
271	qemu_co_mutex_unlock(&s->table_lock);
272	return false;
273	}
274
275	s->allocating_write_reqs_plugged = true;
276	qemu_co_mutex_unlock(&s->table_lock);
277	return true;
278	}
279
280	static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
281	{
282	qemu_co_mutex_lock(&s->table_lock);
283	assert(s->allocating_write_reqs_plugged);
284	s->allocating_write_reqs_plugged = false;
285	qemu_co_queue_next(&s->allocating_write_reqs);
286	qemu_co_mutex_unlock(&s->table_lock);
287	}
288
289	static void coroutine_fn qed_need_check_timer_entry(void *opaque)
290	{
291	BDRVQEDState *s = opaque;
292	int ret;
293
294	trace_qed_need_check_timer_cb(s);
295
296	if (!qed_plug_allocating_write_reqs(s)) {
297	return;
298	}
299
300	/ Ensure writes are on disk before clearing flag /
301	ret = bdrv_co_flush(s->bs->file->bs);
302	if (ret < `0`) {
303	qed_unplug_allocating_write_reqs(s);
304	return;
305	}
306
307	s->header.features &= ~QED_F_NEED_CHECK;
308	ret = qed_write_header(s);
309	(void) ret;
310
311	qed_unplug_allocating_write_reqs(s);
312
313	ret = bdrv_co_flush(s->bs);
314	(void) ret;
315	}
316
317	static void qed_need_check_timer_cb(void *opaque)
318	{
319	Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
320	qemu_coroutine_enter(co);
321	}
322
323	static void qed_start_need_check_timer(BDRVQEDState *s)
324	{
325	trace_qed_start_need_check_timer(s);
326
327	/ Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for*
328	* migration.
329	*/
330	timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
331	NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
332	}
333
334	/ It's okay to call this multiple times or when no timer is started /
335	static void qed_cancel_need_check_timer(BDRVQEDState *s)
336	{
337	trace_qed_cancel_need_check_timer(s);
338	timer_del(s->need_check_timer);
339	}
340
341	static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
342	{
343	BDRVQEDState *s = bs->opaque;
344
345	qed_cancel_need_check_timer(s);
346	timer_free(s->need_check_timer);
347	}
348
349	static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
350	AioContext *new_context)
351	{
352	BDRVQEDState *s = bs->opaque;
353
354	s->need_check_timer = aio_timer_new(new_context,
355	QEMU_CLOCK_VIRTUAL, SCALE_NS,
356	qed_need_check_timer_cb, s);
357	if (s->header.features & QED_F_NEED_CHECK) {
358	qed_start_need_check_timer(s);
359	}
360	}
361
362	static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
363	{
364	BDRVQEDState *s = bs->opaque;
365
366	/ Fire the timer immediately in order to start doing I/O as soon as the*
367	* header is flushed.
368	*/
369	if (s->need_check_timer && timer_pending(s->need_check_timer)) {
370	qed_cancel_need_check_timer(s);
371	qed_need_check_timer_entry(s);
372	}
373	}
374
375	static void bdrv_qed_init_state(BlockDriverState *bs)
376	{
377	BDRVQEDState *s = bs->opaque;
378
379	memset(s, `0`, sizeof(BDRVQEDState));
380	s->bs = bs;
381	qemu_co_mutex_init(&s->table_lock);
382	qemu_co_queue_init(&s->allocating_write_reqs);
383	}
384
385	/ Called with table_lock held. /
386	static int coroutine_fn bdrv_qed_do_open(BlockDriverState bs, QDict options,
387	int flags, Error **errp)
388	{
389	BDRVQEDState *s = bs->opaque;
390	QEDHeader le_header;
391	int64_t file_size;
392	int ret;
393
394	ret = bdrv_pread(bs->file, `0`, &le_header, sizeof(le_header));
395	if (ret < `0`) {
396	return ret;
397	}
398	qed_header_le_to_cpu(&le_header, &s->header);
399
400	if (s->header.magic != QED_MAGIC) {
401	error_setg(errp, "Image not in QED format");
402	return -EINVAL;
403	}
404	if (s->header.features & ~QED_FEATURE_MASK) {
405	/ image uses unsupported feature bits /
406	error_setg(errp, "Unsupported QED features: %" PRIx64,
407	s->header.features & ~QED_FEATURE_MASK);
408	return -ENOTSUP;
409	}
410	if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
411	return -EINVAL;
412	}
413
414	/ Round down file size to the last cluster /
415	file_size = bdrv_getlength(bs->file->bs);
416	if (file_size < `0`) {
417	return file_size;
418	}
419	s->file_size = qed_start_of_cluster(s, file_size);
420
421	if (!qed_is_table_size_valid(s->header.table_size)) {
422	return -EINVAL;
423	}
424	if (!qed_is_image_size_valid(s->header.image_size,
425	s->header.cluster_size,
426	s->header.table_size)) {
427	return -EINVAL;
428	}
429	if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
430	return -EINVAL;
431	}
432
433	s->table_nelems = (s->header.cluster_size * s->header.table_size) /
434	sizeof(uint64_t);
435	s->l2_shift = ctz32(s->header.cluster_size);
436	s->l2_mask = s->table_nelems - `1`;
437	s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
438
439	/ Header size calculation must not overflow uint32_t /
440	if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
441	return -EINVAL;
442	}
443
444	if ((s->header.features & QED_F_BACKING_FILE)) {
445	if ((uint64_t)s->header.backing_filename_offset +
446	s->header.backing_filename_size >
447	s->header.cluster_size * s->header.header_size) {
448	return -EINVAL;
449	}
450
451	ret = qed_read_string(bs->file, s->header.backing_filename_offset,
452	s->header.backing_filename_size,
453	bs->auto_backing_file,
454	sizeof(bs->auto_backing_file));
455	if (ret < `0`) {
456	return ret;
457	}
458	pstrcpy(bs->backing_file, sizeof(bs->backing_file),
459	bs->auto_backing_file);
460
461	if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
462	pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
463	}
464	}
465
466	/ Reset unknown autoclear feature bits. This is a backwards*
467	* compatibility mechanism that allows images to be opened by older
468	* programs, which "knock out" unknown feature bits. When an image is
469	* opened by a newer program again it can detect that the autoclear
470	* feature is no longer valid.
471	*/
472	if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != `0` &&
473	!bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
474	s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
475
476	ret = qed_write_header_sync(s);
477	if (ret) {
478	return ret;
479	}
480
481	/ From here on only known autoclear feature bits are valid /
482	bdrv_flush(bs->file->bs);
483	}
484
485	s->l1_table = qed_alloc_table(s);
486	qed_init_l2_cache(&s->l2_cache);
487
488	ret = qed_read_l1_table_sync(s);
489	if (ret) {
490	goto out;
491	}
492
493	/ If image was not closed cleanly, check consistency /
494	if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
495	/ Read-only images cannot be fixed. There is no risk of corruption*
496	* since write operations are not possible. Therefore, allow
497	* potentially inconsistent images to be opened read-only. This can
498	* aid data recovery from an otherwise inconsistent image.
499	*/
500	if (!bdrv_is_read_only(bs->file->bs) &&
501	!(flags & BDRV_O_INACTIVE)) {
502	BdrvCheckResult result = {`0`};
503
504	ret = qed_check(s, &result, true);
505	if (ret) {
506	goto out;
507	}
508	}
509	}
510
511	bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
512
513	out:
514	if (ret) {
515	qed_free_l2_cache(&s->l2_cache);
516	qemu_vfree(s->l1_table);
517	}
518	return ret;
519	}
520
521	typedef struct QEDOpenCo {
522	BlockDriverState *bs;
523	QDict *options;
524	int flags;
525	Error **errp;
526	int ret;
527	} QEDOpenCo;
528
529	static void coroutine_fn bdrv_qed_open_entry(void *opaque)
530	{
531	QEDOpenCo *qoc = opaque;
532	BDRVQEDState *s = qoc->bs->opaque;
533
534	qemu_co_mutex_lock(&s->table_lock);
535	qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
536	qemu_co_mutex_unlock(&s->table_lock);
537	}
538
539	static int bdrv_qed_open(BlockDriverState bs, QDict options, int flags,
540	Error **errp)
541	{
542	QEDOpenCo qoc = {
543	.bs = bs,
544	.options = options,
545	.flags = flags,
546	.errp = errp,
547	.ret = -EINPROGRESS
548	};
549
550	bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
551	false, errp);
552	if (!bs->file) {
553	return -EINVAL;
554	}
555
556	bdrv_qed_init_state(bs);
557	if (qemu_in_coroutine()) {
558	bdrv_qed_open_entry(&qoc);
559	} else {
560	assert(qemu_get_current_aio_context() == qemu_get_aio_context());
561	qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
562	BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
563	}
564	BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
565	return qoc.ret;
566	}
567
568	static void bdrv_qed_refresh_limits(BlockDriverState bs, Error *errp)
569	{
570	BDRVQEDState *s = bs->opaque;
571
572	bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
573	}
574
575	/ We have nothing to do for QED reopen, stubs just return*
576	* success */
577	static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
578	BlockReopenQueue queue, Error *errp)
579	{
580	return `0`;
581	}
582
583	static void bdrv_qed_close(BlockDriverState *bs)
584	{
585	BDRVQEDState *s = bs->opaque;
586
587	bdrv_qed_detach_aio_context(bs);
588
589	/ Ensure writes reach stable storage /
590	bdrv_flush(bs->file->bs);
591
592	/ Clean shutdown, no check required on next open /
593	if (s->header.features & QED_F_NEED_CHECK) {
594	s->header.features &= ~QED_F_NEED_CHECK;
595	qed_write_header_sync(s);
596	}
597
598	qed_free_l2_cache(&s->l2_cache);
599	qemu_vfree(s->l1_table);
600	}
601
602	static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
603	Error **errp)
604	{
605	BlockdevCreateOptionsQed *qed_opts;
606	BlockBackend *blk = NULL;
607	BlockDriverState *bs = NULL;
608
609	QEDHeader header;
610	QEDHeader le_header;
611	uint8_t *l1_table = NULL;
612	size_t l1_size;
613	int ret = `0`;
614
615	assert(opts->driver == BLOCKDEV_DRIVER_QED);
616	qed_opts = &opts->u.qed;
617
618	/ Validate options and set default values /
619	if (!qed_opts->has_cluster_size) {
620	qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
621	}
622	if (!qed_opts->has_table_size) {
623	qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
624	}
625
626	if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
627	error_setg(errp, "QED cluster size must be within range [%u, %u] "
628	"and power of 2",
629	QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
630	return -EINVAL;
631	}
632	if (!qed_is_table_size_valid(qed_opts->table_size)) {
633	error_setg(errp, "QED table size must be within range [%u, %u] "
634	"and power of 2",
635	QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
636	return -EINVAL;
637	}
638	if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
639	qed_opts->table_size))
640	{
641	error_setg(errp, "QED image size must be a non-zero multiple of "
642	"cluster size and less than %" PRIu64 " bytes",
643	qed_max_image_size(qed_opts->cluster_size,
644	qed_opts->table_size));
645	return -EINVAL;
646	}
647
648	/ Create BlockBackend to write to the image /
649	bs = bdrv_open_blockdev_ref(qed_opts->file, errp);
650	if (bs == NULL) {
651	return -EIO;
652	}
653
654	blk = blk_new(bdrv_get_aio_context(bs),
655	BLK_PERM_WRITE \| BLK_PERM_RESIZE, BLK_PERM_ALL);
656	ret = blk_insert_bs(blk, bs, errp);
657	if (ret < `0`) {
658	goto out;
659	}
660	blk_set_allow_write_beyond_eof(blk, true);
661
662	/ Prepare image format /
663	header = (QEDHeader) {
664	.magic = QED_MAGIC,
665	.cluster_size = qed_opts->cluster_size,
666	.table_size = qed_opts->table_size,
667	.header_size = `1`,
668	.features = `0`,
669	.compat_features = `0`,
670	.l1_table_offset = qed_opts->cluster_size,
671	.image_size = qed_opts->size,
672	};
673
674	l1_size = header.cluster_size * header.table_size;
675
676	/ File must start empty and grow, check truncate is supported /
677	ret = blk_truncate(blk, `0`, PREALLOC_MODE_OFF, errp);
678	if (ret < `0`) {
679	goto out;
680	}
681
682	if (qed_opts->has_backing_file) {
683	header.features \|= QED_F_BACKING_FILE;
684	header.backing_filename_offset = sizeof(le_header);
685	header.backing_filename_size = strlen(qed_opts->backing_file);
686
687	if (qed_opts->has_backing_fmt) {
688	const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
689	if (qed_fmt_is_raw(backing_fmt)) {
690	header.features \|= QED_F_BACKING_FORMAT_NO_PROBE;
691	}
692	}
693	}
694
695	qed_header_cpu_to_le(&header, &le_header);
696	ret = blk_pwrite(blk, `0`, &le_header, sizeof(le_header), `0`);
697	if (ret < `0`) {
698	goto out;
699	}
700	ret = blk_pwrite(blk, sizeof(le_header), qed_opts->backing_file,
701	header.backing_filename_size, `0`);
702	if (ret < `0`) {
703	goto out;
704	}
705
706	l1_table = g_malloc0(l1_size);
707	ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, `0`);
708	if (ret < `0`) {
709	goto out;
710	}
711
712	ret = `0`; / success /
713	out:
714	g_free(l1_table);
715	blk_unref(blk);
716	bdrv_unref(bs);
717	return ret;
718	}
719
720	static int coroutine_fn bdrv_qed_co_create_opts(const char *filename,
721	QemuOpts *opts,
722	Error **errp)
723	{
724	BlockdevCreateOptions *create_options = NULL;
725	QDict *qdict;
726	Visitor *v;
727	BlockDriverState *bs = NULL;
728	Error *local_err = NULL;
729	int ret;
730
731	static const QDictRenames opt_renames[] = {
732	{ BLOCK_OPT_BACKING_FILE, "backing-file" },
733	{ BLOCK_OPT_BACKING_FMT, "backing-fmt" },
734	{ BLOCK_OPT_CLUSTER_SIZE, "cluster-size" },
735	{ BLOCK_OPT_TABLE_SIZE, "table-size" },
736	{ NULL, NULL },
737	};
738
739	/ Parse options and convert legacy syntax /
740	qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
741
742	if (!qdict_rename_keys(qdict, opt_renames, errp)) {
743	ret = -EINVAL;
744	goto fail;
745	}
746
747	/ Create and open the file (protocol layer) /
748	ret = bdrv_create_file(filename, opts, &local_err);
749	if (ret < `0`) {
750	error_propagate(errp, local_err);
751	goto fail;
752	}
753
754	bs = bdrv_open(filename, NULL, NULL,
755	BDRV_O_RDWR \| BDRV_O_RESIZE \| BDRV_O_PROTOCOL, errp);
756	if (bs == NULL) {
757	ret = -EIO;
758	goto fail;
759	}
760
761	/ Now get the QAPI type BlockdevCreateOptions /
762	qdict_put_str(qdict, "driver", "qed");
763	qdict_put_str(qdict, "file", bs->node_name);
764
765	v = qobject_input_visitor_new_flat_confused(qdict, errp);
766	if (!v) {
767	ret = -EINVAL;
768	goto fail;
769	}
770
771	visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
772	visit_free(v);
773
774	if (local_err) {
775	error_propagate(errp, local_err);
776	ret = -EINVAL;
777	goto fail;
778	}
779
780	/ Silently round up size /
781	assert(create_options->driver == BLOCKDEV_DRIVER_QED);
782	create_options->u.qed.size =
783	ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
784
785	/ Create the qed image (format layer) /
786	ret = bdrv_qed_co_create(create_options, errp);
787
788	fail:
789	qobject_unref(qdict);
790	bdrv_unref(bs);
791	qapi_free_BlockdevCreateOptions(create_options);
792	return ret;
793	}
794
795	static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
796	bool want_zero,
797	int64_t pos, int64_t bytes,
798	int64_t pnum, int64_t map,
799	BlockDriverState **file)
800	{
801	BDRVQEDState *s = bs->opaque;
802	size_t len = MIN(bytes, SIZE_MAX);
803	int status;
804	QEDRequest request = { .l2_table = NULL };
805	uint64_t offset;
806	int ret;
807
808	qemu_co_mutex_lock(&s->table_lock);
809	ret = qed_find_cluster(s, &request, pos, &len, &offset);
810
811	*pnum = len;
812	switch (ret) {
813	case QED_CLUSTER_FOUND:
814	*map = offset \| qed_offset_into_cluster(s, pos);
815	status = BDRV_BLOCK_DATA \| BDRV_BLOCK_OFFSET_VALID;
816	*file = bs->file->bs;
817	break;
818	case QED_CLUSTER_ZERO:
819	status = BDRV_BLOCK_ZERO;
820	break;
821	case QED_CLUSTER_L2:
822	case QED_CLUSTER_L1:
823	status = `0`;
824	break;
825	default:
826	assert(ret < `0`);
827	status = ret;
828	break;
829	}
830
831	qed_unref_l2_cache_entry(request.l2_table);
832	qemu_co_mutex_unlock(&s->table_lock);
833
834	return status;
835	}
836
837	static BDRVQEDState acb_to_s(QEDAIOCB acb)
838	{
839	return acb->bs->opaque;
840	}
841
842	/**
843	* Read from the backing file or zero-fill if no backing file
844	*
845	* @s: QED state
846	* @pos: Byte position in device
847	* @qiov: Destination I/O vector
848	* @backing_qiov: Possibly shortened copy of qiov, to be allocated here
849	* @cb: Completion function
850	* @opaque: User data for completion function
851	*
852	* This function reads qiov->size bytes starting at pos from the backing file.
853	* If there is no backing file then zeroes are read.
854	*/
855	static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
856	QEMUIOVector *qiov,
857	QEMUIOVector **backing_qiov)
858	{
859	uint64_t backing_length = `0`;
860	size_t size;
861	int ret;
862
863	/ If there is a backing file, get its length. Treat the absence of a*
864	* backing file like a zero length backing file.
865	*/
866	if (s->bs->backing) {
867	int64_t l = bdrv_getlength(s->bs->backing->bs);
868	if (l < `0`) {
869	return l;
870	}
871	backing_length = l;
872	}
873
874	/ Zero all sectors if reading beyond the end of the backing file /
875	if (pos >= backing_length \|\|
876	pos + qiov->size > backing_length) {
877	qemu_iovec_memset(qiov, `0`, `0`, qiov->size);
878	}
879
880	/ Complete now if there are no backing file sectors to read /
881	if (pos >= backing_length) {
882	return `0`;
883	}
884
885	/ If the read straddles the end of the backing file, shorten it /
886	size = MIN((uint64_t)backing_length - pos, qiov->size);
887
888	assert(*backing_qiov == NULL);
889	*backing_qiov = g_new(QEMUIOVector, `1`);
890	qemu_iovec_init(*backing_qiov, qiov->niov);
891	qemu_iovec_concat(*backing_qiov, qiov, `0`, size);
892
893	BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
894	ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, `0`);
895	if (ret < `0`) {
896	return ret;
897	}
898	return `0`;
899	}
900
901	/**
902	* Copy data from backing file into the image
903	*
904	* @s: QED state
905	* @pos: Byte position in device
906	* @len: Number of bytes
907	* @offset: Byte offset in image file
908	*/
909	static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
910	uint64_t pos, uint64_t len,
911	uint64_t offset)
912	{
913	QEMUIOVector qiov;
914	QEMUIOVector *backing_qiov = NULL;
915	int ret;
916
917	/ Skip copy entirely if there is no work to do /
918	if (len == `0`) {
919	return `0`;
920	}
921
922	qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
923
924	ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov);
925
926	if (backing_qiov) {
927	qemu_iovec_destroy(backing_qiov);
928	g_free(backing_qiov);
929	backing_qiov = NULL;
930	}
931
932	if (ret) {
933	goto out;
934	}
935
936	BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
937	ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, `0`);
938	if (ret < `0`) {
939	goto out;
940	}
941	ret = `0`;
942	out:
943	qemu_vfree(qemu_iovec_buf(&qiov));
944	return ret;
945	}
946
947	/**
948	* Link one or more contiguous clusters into a table
949	*
950	* @s: QED state
951	* @table: L2 table
952	* @index: First cluster index
953	* @n: Number of contiguous clusters
954	* @cluster: First cluster offset
955	*
956	* The cluster offset may be an allocated byte offset in the image file, the
957	* zero cluster marker, or the unallocated cluster marker.
958	*
959	* Called with table_lock held.
960	*/
961	static void coroutine_fn qed_update_l2_table(BDRVQEDState s, QEDTable table,
962	int index, unsigned int n,
963	uint64_t cluster)
964	{
965	int i;
966	for (i = index; i < index + n; i++) {
967	table->offsets[i] = cluster;
968	if (!qed_offset_is_unalloc_cluster(cluster) &&
969	!qed_offset_is_zero_cluster(cluster)) {
970	cluster += s->header.cluster_size;
971	}
972	}
973	}
974
975	/ Called with table_lock held. /
976	static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
977	{
978	BDRVQEDState *s = acb_to_s(acb);
979
980	/ Free resources /
981	qemu_iovec_destroy(&acb->cur_qiov);
982	qed_unref_l2_cache_entry(acb->request.l2_table);
983
984	/ Free the buffer we may have allocated for zero writes /
985	if (acb->flags & QED_AIOCB_ZERO) {
986	qemu_vfree(acb->qiov->iov[`0`].iov_base);
987	acb->qiov->iov[`0`].iov_base = NULL;
988	}
989
990	/ Start next allocating write request waiting behind this one. Note that*
991	* requests enqueue themselves when they first hit an unallocated cluster
992	* but they wait until the entire request is finished before waking up the
993	* next request in the queue. This ensures that we don't cycle through
994	* requests multiple times but rather finish one at a time completely.
995	*/
996	if (acb == s->allocating_acb) {
997	s->allocating_acb = NULL;
998	if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
999	qemu_co_queue_next(&s->allocating_write_reqs);
1000	} else if (s->header.features & QED_F_NEED_CHECK) {
1001	qed_start_need_check_timer(s);
1002	}
1003	}
1004	}
1005
1006	/**
1007	* Update L1 table with new L2 table offset and write it out
1008	*
1009	* Called with table_lock held.
1010	*/
1011	static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
1012	{
1013	BDRVQEDState *s = acb_to_s(acb);
1014	CachedL2Table *l2_table = acb->request.l2_table;
1015	uint64_t l2_offset = l2_table->offset;
1016	int index, ret;
1017
1018	index = qed_l1_index(s, acb->cur_pos);
1019	s->l1_table->offsets[index] = l2_table->offset;
1020
1021	ret = qed_write_l1_table(s, index, `1`);
1022
1023	/ Commit the current L2 table to the cache /
1024	qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
1025
1026	/ This is guaranteed to succeed because we just committed the entry to the*
1027	* cache.
1028	*/
1029	acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1030	assert(acb->request.l2_table != NULL);
1031
1032	return ret;
1033	}
1034
1035
1036	/**
1037	* Update L2 table with new cluster offsets and write them out
1038	*
1039	* Called with table_lock held.
1040	*/
1041	static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
1042	{
1043	BDRVQEDState *s = acb_to_s(acb);
1044	bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1045	int index, ret;
1046
1047	if (need_alloc) {
1048	qed_unref_l2_cache_entry(acb->request.l2_table);
1049	acb->request.l2_table = qed_new_l2_table(s);
1050	}
1051
1052	index = qed_l2_index(s, acb->cur_pos);
1053	qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1054	offset);
1055
1056	if (need_alloc) {
1057	/ Write out the whole new L2 table /
1058	ret = qed_write_l2_table(s, &acb->request, `0`, s->table_nelems, true);
1059	if (ret) {
1060	return ret;
1061	}
1062	return qed_aio_write_l1_update(acb);
1063	} else {
1064	/ Write out only the updated part of the L2 table /
1065	ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
1066	false);
1067	if (ret) {
1068	return ret;
1069	}
1070	}
1071	return `0`;
1072	}
1073
1074	/**
1075	* Write data to the image file
1076	*
1077	* Called with table_lock not held.
1078	*/
1079	static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
1080	{
1081	BDRVQEDState *s = acb_to_s(acb);
1082	uint64_t offset = acb->cur_cluster +
1083	qed_offset_into_cluster(s, acb->cur_pos);
1084
1085	trace_qed_aio_write_main(s, acb, `0`, offset, acb->cur_qiov.size);
1086
1087	BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1088	return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
1089	&acb->cur_qiov, `0`);
1090	}
1091
1092	/**
1093	* Populate untouched regions of new data cluster
1094	*
1095	* Called with table_lock held.
1096	*/
1097	static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
1098	{
1099	BDRVQEDState *s = acb_to_s(acb);
1100	uint64_t start, len, offset;
1101	int ret;
1102
1103	qemu_co_mutex_unlock(&s->table_lock);
1104
1105	/ Populate front untouched region of new data cluster /
1106	start = qed_start_of_cluster(s, acb->cur_pos);
1107	len = qed_offset_into_cluster(s, acb->cur_pos);
1108
1109	trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1110	ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1111	if (ret < `0`) {
1112	goto out;
1113	}
1114
1115	/ Populate back untouched region of new data cluster /
1116	start = acb->cur_pos + acb->cur_qiov.size;
1117	len = qed_start_of_cluster(s, start + s->header.cluster_size - `1`) - start;
1118	offset = acb->cur_cluster +
1119	qed_offset_into_cluster(s, acb->cur_pos) +
1120	acb->cur_qiov.size;
1121
1122	trace_qed_aio_write_postfill(s, acb, start, len, offset);
1123	ret = qed_copy_from_backing_file(s, start, len, offset);
1124	if (ret < `0`) {
1125	goto out;
1126	}
1127
1128	ret = qed_aio_write_main(acb);
1129	if (ret < `0`) {
1130	goto out;
1131	}
1132
1133	if (s->bs->backing) {
1134	/*
1135	* Flush new data clusters before updating the L2 table
1136	*
1137	* This flush is necessary when a backing file is in use. A crash
1138	* during an allocating write could result in empty clusters in the
1139	* image. If the write only touched a subregion of the cluster,
1140	* then backing image sectors have been lost in the untouched
1141	* region. The solution is to flush after writing a new data
1142	* cluster and before updating the L2 table.
1143	*/
1144	ret = bdrv_co_flush(s->bs->file->bs);
1145	}
1146
1147	out:
1148	qemu_co_mutex_lock(&s->table_lock);
1149	return ret;
1150	}
1151
1152	/**
1153	* Check if the QED_F_NEED_CHECK bit should be set during allocating write
1154	*/
1155	static bool qed_should_set_need_check(BDRVQEDState *s)
1156	{
1157	/ The flush before L2 update path ensures consistency /
1158	if (s->bs->backing) {
1159	return false;
1160	}
1161
1162	return !(s->header.features & QED_F_NEED_CHECK);
1163	}
1164
1165	/**
1166	* Write new data cluster
1167	*
1168	* @acb: Write request
1169	* @len: Length in bytes
1170	*
1171	* This path is taken when writing to previously unallocated clusters.
1172	*
1173	* Called with table_lock held.
1174	*/
1175	static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1176	{
1177	BDRVQEDState *s = acb_to_s(acb);
1178	int ret;
1179
1180	/ Cancel timer when the first allocating request comes in /
1181	if (s->allocating_acb == NULL) {
1182	qed_cancel_need_check_timer(s);
1183	}
1184
1185	/ Freeze this request if another allocating write is in progress /
1186	if (s->allocating_acb != acb \|\| s->allocating_write_reqs_plugged) {
1187	if (s->allocating_acb != NULL) {
1188	qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1189	assert(s->allocating_acb == NULL);
1190	}
1191	s->allocating_acb = acb;
1192	return -EAGAIN; / start over with looking up table entries /
1193	}
1194
1195	acb->cur_nclusters = qed_bytes_to_clusters(s,
1196	qed_offset_into_cluster(s, acb->cur_pos) + len);
1197	qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1198
1199	if (acb->flags & QED_AIOCB_ZERO) {
1200	/ Skip ahead if the clusters are already zero /
1201	if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1202	return `0`;
1203	}
1204	acb->cur_cluster = `1`;
1205	} else {
1206	acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1207	}
1208
1209	if (qed_should_set_need_check(s)) {
1210	s->header.features \|= QED_F_NEED_CHECK;
1211	ret = qed_write_header(s);
1212	if (ret < `0`) {
1213	return ret;
1214	}
1215	}
1216
1217	if (!(acb->flags & QED_AIOCB_ZERO)) {
1218	ret = qed_aio_write_cow(acb);
1219	if (ret < `0`) {
1220	return ret;
1221	}
1222	}
1223
1224	return qed_aio_write_l2_update(acb, acb->cur_cluster);
1225	}
1226
1227	/**
1228	* Write data cluster in place
1229	*
1230	* @acb: Write request
1231	* @offset: Cluster offset in bytes
1232	* @len: Length in bytes
1233	*
1234	* This path is taken when writing to already allocated clusters.
1235	*
1236	* Called with table_lock held.
1237	*/
1238	static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
1239	size_t len)
1240	{
1241	BDRVQEDState *s = acb_to_s(acb);
1242	int r;
1243
1244	qemu_co_mutex_unlock(&s->table_lock);
1245
1246	/ Allocate buffer for zero writes /
1247	if (acb->flags & QED_AIOCB_ZERO) {
1248	struct iovec *iov = acb->qiov->iov;
1249
1250	if (!iov->iov_base) {
1251	iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1252	if (iov->iov_base == NULL) {
1253	r = -ENOMEM;
1254	goto out;
1255	}
1256	memset(iov->iov_base, `0`, iov->iov_len);
1257	}
1258	}
1259
1260	/ Calculate the I/O vector /
1261	acb->cur_cluster = offset;
1262	qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1263
1264	/ Do the actual write. /
1265	r = qed_aio_write_main(acb);
1266	out:
1267	qemu_co_mutex_lock(&s->table_lock);
1268	return r;
1269	}
1270
1271	/**
1272	* Write data cluster
1273	*
1274	* @opaque: Write request
1275	* @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1276	* @offset: Cluster offset in bytes
1277	* @len: Length in bytes
1278	*
1279	* Called with table_lock held.
1280	*/
1281	static int coroutine_fn qed_aio_write_data(void opaque, int* ret,
1282	uint64_t offset, size_t len)
1283	{
1284	QEDAIOCB *acb = opaque;
1285
1286	trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1287
1288	acb->find_cluster_ret = ret;
1289
1290	switch (ret) {
1291	case QED_CLUSTER_FOUND:
1292	return qed_aio_write_inplace(acb, offset, len);
1293
1294	case QED_CLUSTER_L2:
1295	case QED_CLUSTER_L1:
1296	case QED_CLUSTER_ZERO:
1297	return qed_aio_write_alloc(acb, len);
1298
1299	default:
1300	g_assert_not_reached();
1301	}
1302	}
1303
1304	/**
1305	* Read data cluster
1306	*
1307	* @opaque: Read request
1308	* @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1309	* @offset: Cluster offset in bytes
1310	* @len: Length in bytes
1311	*
1312	* Called with table_lock held.
1313	*/
1314	static int coroutine_fn qed_aio_read_data(void opaque, int* ret,
1315	uint64_t offset, size_t len)
1316	{
1317	QEDAIOCB *acb = opaque;
1318	BDRVQEDState *s = acb_to_s(acb);
1319	BlockDriverState *bs = acb->bs;
1320	int r;
1321
1322	qemu_co_mutex_unlock(&s->table_lock);
1323
1324	/ Adjust offset into cluster /
1325	offset += qed_offset_into_cluster(s, acb->cur_pos);
1326
1327	trace_qed_aio_read_data(s, acb, ret, offset, len);
1328
1329	qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1330
1331	/ Handle zero cluster and backing file reads, otherwise read*
1332	* data cluster directly.
1333	*/
1334	if (ret == QED_CLUSTER_ZERO) {
1335	qemu_iovec_memset(&acb->cur_qiov, `0`, `0`, acb->cur_qiov.size);
1336	r = `0`;
1337	} else if (ret != QED_CLUSTER_FOUND) {
1338	r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
1339	&acb->backing_qiov);
1340	} else {
1341	BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1342	r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
1343	&acb->cur_qiov, `0`);
1344	}
1345
1346	qemu_co_mutex_lock(&s->table_lock);
1347	return r;
1348	}
1349
1350	/**
1351	* Begin next I/O or complete the request
1352	*/
1353	static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
1354	{
1355	BDRVQEDState *s = acb_to_s(acb);
1356	uint64_t offset;
1357	size_t len;
1358	int ret;
1359
1360	qemu_co_mutex_lock(&s->table_lock);
1361	while (`1`) {
1362	trace_qed_aio_next_io(s, acb, `0`, acb->cur_pos + acb->cur_qiov.size);
1363
1364	if (acb->backing_qiov) {
1365	qemu_iovec_destroy(acb->backing_qiov);
1366	g_free(acb->backing_qiov);
1367	acb->backing_qiov = NULL;
1368	}
1369
1370	acb->qiov_offset += acb->cur_qiov.size;
1371	acb->cur_pos += acb->cur_qiov.size;
1372	qemu_iovec_reset(&acb->cur_qiov);
1373
1374	/ Complete request /
1375	if (acb->cur_pos >= acb->end_pos) {
1376	ret = `0`;
1377	break;
1378	}
1379
1380	/ Find next cluster and start I/O /
1381	len = acb->end_pos - acb->cur_pos;
1382	ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
1383	if (ret < `0`) {
1384	break;
1385	}
1386
1387	if (acb->flags & QED_AIOCB_WRITE) {
1388	ret = qed_aio_write_data(acb, ret, offset, len);
1389	} else {
1390	ret = qed_aio_read_data(acb, ret, offset, len);
1391	}
1392
1393	if (ret < `0` && ret != -EAGAIN) {
1394	break;
1395	}
1396	}
1397
1398	trace_qed_aio_complete(s, acb, ret);
1399	qed_aio_complete(acb);
1400	qemu_co_mutex_unlock(&s->table_lock);
1401	return ret;
1402	}
1403
1404	static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
1405	QEMUIOVector qiov, int* nb_sectors,
1406	int flags)
1407	{
1408	QEDAIOCB acb = {
1409	.bs = bs,
1410	.cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
1411	.end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
1412	.qiov = qiov,
1413	.flags = flags,
1414	};
1415	qemu_iovec_init(&acb.cur_qiov, qiov->niov);
1416
1417	trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
1418
1419	/ Start request /
1420	return qed_aio_next_io(&acb);
1421	}
1422
1423	static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
1424	int64_t sector_num, int nb_sectors,
1425	QEMUIOVector *qiov)
1426	{
1427	return qed_co_request(bs, sector_num, qiov, nb_sectors, `0`);
1428	}
1429
1430	static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
1431	int64_t sector_num, int nb_sectors,
1432	QEMUIOVector qiov, int* flags)
1433	{
1434	assert(!flags);
1435	return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1436	}
1437
1438	static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
1439	int64_t offset,
1440	int bytes,
1441	BdrvRequestFlags flags)
1442	{
1443	BDRVQEDState *s = bs->opaque;
1444
1445	/*
1446	* Zero writes start without an I/O buffer. If a buffer becomes necessary
1447	* then it will be allocated during request processing.
1448	*/
1449	QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1450
1451	/ Fall back if the request is not aligned /
1452	if (qed_offset_into_cluster(s, offset) \|\|
1453	qed_offset_into_cluster(s, bytes)) {
1454	return -ENOTSUP;
1455	}
1456
1457	return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1458	bytes >> BDRV_SECTOR_BITS,
1459	QED_AIOCB_WRITE \| QED_AIOCB_ZERO);
1460	}
1461
1462	static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
1463	int64_t offset,
1464	PreallocMode prealloc,
1465	Error **errp)
1466	{
1467	BDRVQEDState *s = bs->opaque;
1468	uint64_t old_image_size;
1469	int ret;
1470
1471	if (prealloc != PREALLOC_MODE_OFF) {
1472	error_setg(errp, "Unsupported preallocation mode '%s'",
1473	PreallocMode_str(prealloc));
1474	return -ENOTSUP;
1475	}
1476
1477	if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1478	s->header.table_size)) {
1479	error_setg(errp, "Invalid image size specified");
1480	return -EINVAL;
1481	}
1482
1483	if ((uint64_t)offset < s->header.image_size) {
1484	error_setg(errp, "Shrinking images is currently not supported");
1485	return -ENOTSUP;
1486	}
1487
1488	old_image_size = s->header.image_size;
1489	s->header.image_size = offset;
1490	ret = qed_write_header_sync(s);
1491	if (ret < `0`) {
1492	s->header.image_size = old_image_size;
1493	error_setg_errno(errp, -ret, "Failed to update the image size");
1494	}
1495	return ret;
1496	}
1497
1498	static int64_t bdrv_qed_getlength(BlockDriverState *bs)
1499	{
1500	BDRVQEDState *s = bs->opaque;
1501	return s->header.image_size;
1502	}
1503
1504	static int bdrv_qed_get_info(BlockDriverState bs, BlockDriverInfo bdi)
1505	{
1506	BDRVQEDState *s = bs->opaque;
1507
1508	memset(bdi, `0`, sizeof(*bdi));
1509	bdi->cluster_size = s->header.cluster_size;
1510	bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1511	bdi->unallocated_blocks_are_zero = true;
1512	return `0`;
1513	}
1514
1515	static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1516	const char *backing_file,
1517	const char *backing_fmt)
1518	{
1519	BDRVQEDState *s = bs->opaque;
1520	QEDHeader new_header, le_header;
1521	void *buffer;
1522	size_t buffer_len, backing_file_len;
1523	int ret;
1524
1525	/ Refuse to set backing filename if unknown compat feature bits are*
1526	* active. If the image uses an unknown compat feature then we may not
1527	* know the layout of data following the header structure and cannot safely
1528	* add a new string.
1529	*/
1530	if (backing_file && (s->header.compat_features &
1531	~QED_COMPAT_FEATURE_MASK)) {
1532	return -ENOTSUP;
1533	}
1534
1535	memcpy(&new_header, &s->header, sizeof(new_header));
1536
1537	new_header.features &= ~(QED_F_BACKING_FILE \|
1538	QED_F_BACKING_FORMAT_NO_PROBE);
1539
1540	/ Adjust feature flags /
1541	if (backing_file) {
1542	new_header.features \|= QED_F_BACKING_FILE;
1543
1544	if (qed_fmt_is_raw(backing_fmt)) {
1545	new_header.features \|= QED_F_BACKING_FORMAT_NO_PROBE;
1546	}
1547	}
1548
1549	/ Calculate new header size /
1550	backing_file_len = `0`;
1551
1552	if (backing_file) {
1553	backing_file_len = strlen(backing_file);
1554	}
1555
1556	buffer_len = sizeof(new_header);
1557	new_header.backing_filename_offset = buffer_len;
1558	new_header.backing_filename_size = backing_file_len;
1559	buffer_len += backing_file_len;
1560
1561	/ Make sure we can rewrite header without failing /
1562	if (buffer_len > new_header.header_size * new_header.cluster_size) {
1563	return -ENOSPC;
1564	}
1565
1566	/ Prepare new header /
1567	buffer = g_malloc(buffer_len);
1568
1569	qed_header_cpu_to_le(&new_header, &le_header);
1570	memcpy(buffer, &le_header, sizeof(le_header));
1571	buffer_len = sizeof(le_header);
1572
1573	if (backing_file) {
1574	memcpy(buffer + buffer_len, backing_file, backing_file_len);
1575	buffer_len += backing_file_len;
1576	}
1577
1578	/ Write new header /
1579	ret = bdrv_pwrite_sync(bs->file, `0`, buffer, buffer_len);
1580	g_free(buffer);
1581	if (ret == `0`) {
1582	memcpy(&s->header, &new_header, sizeof(new_header));
1583	}
1584	return ret;
1585	}
1586
1587	static void coroutine_fn bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
1588	Error **errp)
1589	{
1590	BDRVQEDState *s = bs->opaque;
1591	Error *local_err = NULL;
1592	int ret;
1593
1594	bdrv_qed_close(bs);
1595
1596	bdrv_qed_init_state(bs);
1597	qemu_co_mutex_lock(&s->table_lock);
1598	ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
1599	qemu_co_mutex_unlock(&s->table_lock);
1600	if (local_err) {
1601	error_propagate_prepend(errp, local_err,
1602	"Could not reopen qed layer: ");
1603	return;
1604	} else if (ret < `0`) {
1605	error_setg_errno(errp, -ret, "Could not reopen qed layer");
1606	return;
1607	}
1608	}
1609
1610	static int coroutine_fn bdrv_qed_co_check(BlockDriverState *bs,
1611	BdrvCheckResult *result,
1612	BdrvCheckMode fix)
1613	{
1614	BDRVQEDState *s = bs->opaque;
1615	int ret;
1616
1617	qemu_co_mutex_lock(&s->table_lock);
1618	ret = qed_check(s, result, !!fix);
1619	qemu_co_mutex_unlock(&s->table_lock);
1620
1621	return ret;
1622	}
1623
1624	static QemuOptsList qed_create_opts = {
1625	.name = "qed-create-opts",
1626	.head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1627	.desc = {
1628	{
1629	.name = BLOCK_OPT_SIZE,
1630	.type = QEMU_OPT_SIZE,
1631	.help = "Virtual disk size"
1632	},
1633	{
1634	.name = BLOCK_OPT_BACKING_FILE,
1635	.type = QEMU_OPT_STRING,
1636	.help = "File name of a base image"
1637	},
1638	{
1639	.name = BLOCK_OPT_BACKING_FMT,
1640	.type = QEMU_OPT_STRING,
1641	.help = "Image format of the base image"
1642	},
1643	{
1644	.name = BLOCK_OPT_CLUSTER_SIZE,
1645	.type = QEMU_OPT_SIZE,
1646	.help = "Cluster size (in bytes)",
1647	.def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1648	},
1649	{
1650	.name = BLOCK_OPT_TABLE_SIZE,
1651	.type = QEMU_OPT_SIZE,
1652	.help = "L1/L2 table size (in clusters)"
1653	},
1654	{ / end of list / }
1655	}
1656	};
1657
1658	static BlockDriver bdrv_qed = {
1659	.format_name = "qed",
1660	.instance_size = sizeof(BDRVQEDState),
1661	.create_opts = &qed_create_opts,
1662	.supports_backing = true,
1663
1664	.bdrv_probe = bdrv_qed_probe,
1665	.bdrv_open = bdrv_qed_open,
1666	.bdrv_close = bdrv_qed_close,
1667	.bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
1668	.bdrv_child_perm = bdrv_format_default_perms,
1669	.bdrv_co_create = bdrv_qed_co_create,
1670	.bdrv_co_create_opts = bdrv_qed_co_create_opts,
1671	.bdrv_has_zero_init = bdrv_has_zero_init_1,
1672	.bdrv_has_zero_init_truncate = bdrv_has_zero_init_1,
1673	.bdrv_co_block_status = bdrv_qed_co_block_status,
1674	.bdrv_co_readv = bdrv_qed_co_readv,
1675	.bdrv_co_writev = bdrv_qed_co_writev,
1676	.bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes,
1677	.bdrv_co_truncate = bdrv_qed_co_truncate,
1678	.bdrv_getlength = bdrv_qed_getlength,
1679	.bdrv_get_info = bdrv_qed_get_info,
1680	.bdrv_refresh_limits = bdrv_qed_refresh_limits,
1681	.bdrv_change_backing_file = bdrv_qed_change_backing_file,
1682	.bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
1683	.bdrv_co_check = bdrv_qed_co_check,
1684	.bdrv_detach_aio_context = bdrv_qed_detach_aio_context,
1685	.bdrv_attach_aio_context = bdrv_qed_attach_aio_context,
1686	.bdrv_co_drain_begin = bdrv_qed_co_drain_begin,
1687	};
1688
1689	static void bdrv_qed_init(void)
1690	{
1691	bdrv_register(&bdrv_qed);
1692	}
1693
1694	block_init(bdrv_qed_init);
1695

Browse the source code of qemu/block/qed.c