1 | /* |
2 | * vhost support |
3 | * |
4 | * Copyright Red Hat, Inc. 2010 |
5 | * |
6 | * Authors: |
7 | * Michael S. Tsirkin <mst@redhat.com> |
8 | * |
9 | * This work is licensed under the terms of the GNU GPL, version 2. See |
10 | * the COPYING file in the top-level directory. |
11 | * |
12 | * Contributions after 2012-01-13 are licensed under the terms of the |
13 | * GNU GPL, version 2 or (at your option) any later version. |
14 | */ |
15 | |
16 | #include "qemu/osdep.h" |
17 | #include "qapi/error.h" |
18 | #include "hw/virtio/vhost.h" |
19 | #include "qemu/atomic.h" |
20 | #include "qemu/range.h" |
21 | #include "qemu/error-report.h" |
22 | #include "qemu/memfd.h" |
23 | #include "standard-headers/linux/vhost_types.h" |
24 | #include "exec/address-spaces.h" |
25 | #include "hw/virtio/virtio-bus.h" |
26 | #include "hw/virtio/virtio-access.h" |
27 | #include "migration/blocker.h" |
28 | #include "migration/qemu-file-types.h" |
29 | #include "sysemu/dma.h" |
30 | #include "trace.h" |
31 | |
32 | /* enabled until disconnected backend stabilizes */ |
33 | #define _VHOST_DEBUG 1 |
34 | |
35 | #ifdef _VHOST_DEBUG |
36 | #define VHOST_OPS_DEBUG(fmt, ...) \ |
37 | do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \ |
38 | strerror(errno), errno); } while (0) |
39 | #else |
40 | #define VHOST_OPS_DEBUG(fmt, ...) \ |
41 | do { } while (0) |
42 | #endif |
43 | |
44 | static struct vhost_log *vhost_log; |
45 | static struct vhost_log *vhost_log_shm; |
46 | |
47 | static unsigned int used_memslots; |
48 | static QLIST_HEAD(, vhost_dev) vhost_devices = |
49 | QLIST_HEAD_INITIALIZER(vhost_devices); |
50 | |
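/* Returns true if every registered backend can still accommodate one
 * more memory slot: the number of slots currently in use must be below
 * the most restrictive backend limit.
 */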
51 | bool vhost_has_free_slot(void) |
52 | { |
53 | unsigned int slots_limit = ~0U; |
54 | struct vhost_dev *hdev; |
55 | |
56 | QLIST_FOREACH(hdev, &vhost_devices, entry) { |
57 | unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); |
58 | slots_limit = MIN(slots_limit, r); |
59 | } |
60 | return slots_limit > used_memslots; |
61 | } |
62 | |
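/* Sync the dirty log for the part of @section that intersects both the
 * [mfirst, mlast] and [rfirst, rlast] guest-physical ranges: atomically
 * fetch and clear each log chunk, then mark the corresponding pages
 * dirty in the section's memory region.
 */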
63 | static void vhost_dev_sync_region(struct vhost_dev *dev, |
64 | MemoryRegionSection *section, |
65 | uint64_t mfirst, uint64_t mlast, |
66 | uint64_t rfirst, uint64_t rlast) |
67 | { |
68 | vhost_log_chunk_t *log = dev->log->log; |
69 | |
70 | uint64_t start = MAX(mfirst, rfirst); |
71 | uint64_t end = MIN(mlast, rlast); |
72 | vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK; |
73 | vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1; |
74 | uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK); |
75 | |
76 | if (end < start) { |
77 | return; |
78 | } |
79 | assert(end / VHOST_LOG_CHUNK < dev->log_size); |
80 | assert(start / VHOST_LOG_CHUNK < dev->log_size); |
81 | |
    for (; from < to; ++from) {
83 | vhost_log_chunk_t log; |
84 | /* We first check with non-atomic: much cheaper, |
85 | * and we expect non-dirty to be the common case. */ |
86 | if (!*from) { |
87 | addr += VHOST_LOG_CHUNK; |
88 | continue; |
89 | } |
90 | /* Data must be read atomically. We don't really need barrier semantics |
91 | * but it's easier to use atomic_* than roll our own. */ |
92 | log = atomic_xchg(from, 0); |
93 | while (log) { |
94 | int bit = ctzl(log); |
95 | hwaddr page_addr; |
96 | hwaddr section_offset; |
97 | hwaddr mr_offset; |
98 | page_addr = addr + bit * VHOST_LOG_PAGE; |
99 | section_offset = page_addr - section->offset_within_address_space; |
100 | mr_offset = section_offset + section->offset_within_region; |
101 | memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); |
102 | log &= ~(0x1ull << bit); |
103 | } |
104 | addr += VHOST_LOG_CHUNK; |
105 | } |
106 | } |
107 | |
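/* Sync dirty pages in [first, last] for everything the device may have
 * logged: the regions of the memory table as well as the used rings,
 * which vhost writes to directly.
 */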
108 | static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, |
109 | MemoryRegionSection *section, |
110 | hwaddr first, |
111 | hwaddr last) |
112 | { |
113 | int i; |
114 | hwaddr start_addr; |
115 | hwaddr end_addr; |
116 | |
117 | if (!dev->log_enabled || !dev->started) { |
118 | return 0; |
119 | } |
120 | start_addr = section->offset_within_address_space; |
121 | end_addr = range_get_last(start_addr, int128_get64(section->size)); |
122 | start_addr = MAX(first, start_addr); |
123 | end_addr = MIN(last, end_addr); |
124 | |
125 | for (i = 0; i < dev->mem->nregions; ++i) { |
126 | struct vhost_memory_region *reg = dev->mem->regions + i; |
127 | vhost_dev_sync_region(dev, section, start_addr, end_addr, |
128 | reg->guest_phys_addr, |
129 | range_get_last(reg->guest_phys_addr, |
130 | reg->memory_size)); |
131 | } |
132 | for (i = 0; i < dev->nvqs; ++i) { |
133 | struct vhost_virtqueue *vq = dev->vqs + i; |
134 | |
135 | if (!vq->used_phys && !vq->used_size) { |
136 | continue; |
137 | } |
138 | |
139 | vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys, |
140 | range_get_last(vq->used_phys, vq->used_size)); |
141 | } |
142 | return 0; |
143 | } |
144 | |
145 | static void vhost_log_sync(MemoryListener *listener, |
146 | MemoryRegionSection *section) |
147 | { |
148 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
149 | memory_listener); |
150 | vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); |
151 | } |
152 | |
153 | static void vhost_log_sync_range(struct vhost_dev *dev, |
154 | hwaddr first, hwaddr last) |
155 | { |
156 | int i; |
157 | /* FIXME: this is N^2 in number of sections */ |
158 | for (i = 0; i < dev->n_mem_sections; ++i) { |
159 | MemoryRegionSection *section = &dev->mem_sections[i]; |
160 | vhost_sync_dirty_bitmap(dev, section, first, last); |
161 | } |
162 | } |
163 | |
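/* Compute the log size (in chunks) needed to cover the highest
 * guest-physical address of any memory region or used ring.
 */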
164 | static uint64_t vhost_get_log_size(struct vhost_dev *dev) |
165 | { |
166 | uint64_t log_size = 0; |
167 | int i; |
168 | for (i = 0; i < dev->mem->nregions; ++i) { |
169 | struct vhost_memory_region *reg = dev->mem->regions + i; |
170 | uint64_t last = range_get_last(reg->guest_phys_addr, |
171 | reg->memory_size); |
172 | log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); |
173 | } |
174 | for (i = 0; i < dev->nvqs; ++i) { |
175 | struct vhost_virtqueue *vq = dev->vqs + i; |
176 | |
177 | if (!vq->used_phys && !vq->used_size) { |
178 | continue; |
179 | } |
180 | |
181 | uint64_t last = vq->used_phys + vq->used_size - 1; |
182 | log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); |
183 | } |
184 | return log_size; |
185 | } |
186 | |
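/* Allocate a dirty log of @size chunks.  A shared log is backed by a
 * sealed memfd whose fd can be handed to a backend in another process
 * (e.g. vhost-user); a private log is plain heap memory.
 */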
187 | static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) |
188 | { |
189 | Error *err = NULL; |
190 | struct vhost_log *log; |
191 | uint64_t logsize = size * sizeof(*(log->log)); |
192 | int fd = -1; |
193 | |
194 | log = g_new0(struct vhost_log, 1); |
195 | if (share) { |
        log->log = qemu_memfd_alloc("vhost-log", logsize,
197 | F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, |
198 | &fd, &err); |
199 | if (err) { |
200 | error_report_err(err); |
201 | g_free(log); |
202 | return NULL; |
203 | } |
204 | memset(log->log, 0, logsize); |
205 | } else { |
206 | log->log = g_malloc0(logsize); |
207 | } |
208 | |
209 | log->size = size; |
210 | log->refcnt = 1; |
211 | log->fd = fd; |
212 | |
213 | return log; |
214 | } |
215 | |
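/* Return the cached log if it matches @size, taking a reference;
 * otherwise allocate a replacement and cache it.  Shared and private
 * logs are cached separately.
 */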
216 | static struct vhost_log *vhost_log_get(uint64_t size, bool share) |
217 | { |
218 | struct vhost_log *log = share ? vhost_log_shm : vhost_log; |
219 | |
220 | if (!log || log->size != size) { |
221 | log = vhost_log_alloc(size, share); |
222 | if (share) { |
223 | vhost_log_shm = log; |
224 | } else { |
225 | vhost_log = log; |
226 | } |
227 | } else { |
228 | ++log->refcnt; |
229 | } |
230 | |
231 | return log; |
232 | } |
233 | |
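/* Drop the device's reference to its log.  When the last reference is
 * gone, optionally sync the range covered by the old log before
 * freeing it.
 */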
234 | static void vhost_log_put(struct vhost_dev *dev, bool sync) |
235 | { |
236 | struct vhost_log *log = dev->log; |
237 | |
238 | if (!log) { |
239 | return; |
240 | } |
241 | |
242 | --log->refcnt; |
243 | if (log->refcnt == 0) { |
244 | /* Sync only the range covered by the old log */ |
245 | if (dev->log_size && sync) { |
246 | vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); |
247 | } |
248 | |
249 | if (vhost_log == log) { |
250 | g_free(log->log); |
251 | vhost_log = NULL; |
252 | } else if (vhost_log_shm == log) { |
253 | qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), |
254 | log->fd); |
255 | vhost_log_shm = NULL; |
256 | } |
257 | |
258 | g_free(log); |
259 | } |
260 | |
261 | dev->log = NULL; |
262 | dev->log_size = 0; |
263 | } |
264 | |
265 | static bool vhost_dev_log_is_shared(struct vhost_dev *dev) |
266 | { |
267 | return dev->vhost_ops->vhost_requires_shm_log && |
268 | dev->vhost_ops->vhost_requires_shm_log(dev); |
269 | } |
270 | |
271 | static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) |
272 | { |
273 | struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); |
274 | uint64_t log_base = (uintptr_t)log->log; |
275 | int r; |
276 | |
277 | /* inform backend of log switching, this must be done before |
278 | releasing the current log, to ensure no logging is lost */ |
279 | r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); |
280 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_log_base failed");
282 | } |
283 | |
284 | vhost_log_put(dev, true); |
285 | dev->log = log; |
286 | dev->log_size = size; |
287 | } |
288 | |
289 | static int vhost_dev_has_iommu(struct vhost_dev *dev) |
290 | { |
291 | VirtIODevice *vdev = dev->vdev; |
292 | |
293 | return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); |
294 | } |
295 | |
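/* When the device sits behind an IOMMU, ring addresses are IOVAs that
 * the backend translates through the IOTLB, so they are passed through
 * unchanged; otherwise they are guest-physical addresses that must be
 * mapped here.
 */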
296 | static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, |
297 | hwaddr *plen, int is_write) |
298 | { |
299 | if (!vhost_dev_has_iommu(dev)) { |
300 | return cpu_physical_memory_map(addr, plen, is_write); |
301 | } else { |
302 | return (void *)(uintptr_t)addr; |
303 | } |
304 | } |
305 | |
306 | static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer, |
307 | hwaddr len, int is_write, |
308 | hwaddr access_len) |
309 | { |
310 | if (!vhost_dev_has_iommu(dev)) { |
311 | cpu_physical_memory_unmap(buffer, len, is_write, access_len); |
312 | } |
313 | } |
314 | |
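/* For a ring part overlapping the region [reg_gpa, reg_gpa + reg_size),
 * check that it is still fully contained in the region (-ENOMEM if not)
 * and still mapped at the same host address (-EBUSY if not).
 */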
315 | static int vhost_verify_ring_part_mapping(void *ring_hva, |
316 | uint64_t ring_gpa, |
317 | uint64_t ring_size, |
318 | void *reg_hva, |
319 | uint64_t reg_gpa, |
320 | uint64_t reg_size) |
321 | { |
322 | uint64_t hva_ring_offset; |
323 | uint64_t ring_last = range_get_last(ring_gpa, ring_size); |
324 | uint64_t reg_last = range_get_last(reg_gpa, reg_size); |
325 | |
326 | if (ring_last < reg_gpa || ring_gpa > reg_last) { |
327 | return 0; |
328 | } |
    /* check that the whole ring is mapped */
330 | if (ring_last > reg_last) { |
331 | return -ENOMEM; |
332 | } |
333 | /* check that ring's MemoryRegion wasn't replaced */ |
334 | hva_ring_offset = ring_gpa - reg_gpa; |
335 | if (ring_hva != reg_hva + hva_ring_offset) { |
336 | return -EBUSY; |
337 | } |
338 | |
339 | return 0; |
340 | } |
341 | |
342 | static int vhost_verify_ring_mappings(struct vhost_dev *dev, |
343 | void *reg_hva, |
344 | uint64_t reg_gpa, |
345 | uint64_t reg_size) |
346 | { |
347 | int i, j; |
348 | int r = 0; |
    const char *part_name[] = {
        "descriptor table",
        "available ring",
        "used ring"
    };
354 | |
355 | if (vhost_dev_has_iommu(dev)) { |
356 | return 0; |
357 | } |
358 | |
359 | for (i = 0; i < dev->nvqs; ++i) { |
360 | struct vhost_virtqueue *vq = dev->vqs + i; |
361 | |
362 | if (vq->desc_phys == 0) { |
363 | continue; |
364 | } |
365 | |
366 | j = 0; |
367 | r = vhost_verify_ring_part_mapping( |
368 | vq->desc, vq->desc_phys, vq->desc_size, |
369 | reg_hva, reg_gpa, reg_size); |
370 | if (r) { |
371 | break; |
372 | } |
373 | |
374 | j++; |
375 | r = vhost_verify_ring_part_mapping( |
376 | vq->avail, vq->avail_phys, vq->avail_size, |
377 | reg_hva, reg_gpa, reg_size); |
378 | if (r) { |
379 | break; |
380 | } |
381 | |
382 | j++; |
383 | r = vhost_verify_ring_part_mapping( |
384 | vq->used, vq->used_phys, vq->used_size, |
385 | reg_hva, reg_gpa, reg_size); |
386 | if (r) { |
387 | break; |
388 | } |
389 | } |
390 | |
391 | if (r == -ENOMEM) { |
        error_report("Unable to map %s for ring %d", part_name[j], i);
393 | } else if (r == -EBUSY) { |
        error_report("%s relocated for ring %d", part_name[j], i);
395 | } |
396 | return r; |
397 | } |
398 | |
399 | static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) |
400 | { |
401 | bool result; |
402 | bool log_dirty = memory_region_get_dirty_log_mask(section->mr) & |
403 | ~(1 << DIRTY_MEMORY_MIGRATION); |
404 | result = memory_region_is_ram(section->mr) && |
405 | !memory_region_is_rom(section->mr); |
406 | |
407 | /* Vhost doesn't handle any block which is doing dirty-tracking other |
408 | * than migration; this typically fires on VGA areas. |
409 | */ |
410 | result &= !log_dirty; |
411 | |
412 | if (result && dev->vhost_ops->vhost_backend_mem_section_filter) { |
413 | result &= |
414 | dev->vhost_ops->vhost_backend_mem_section_filter(dev, section); |
415 | } |
416 | |
417 | trace_vhost_section(section->mr->name, result); |
418 | return result; |
419 | } |
420 | |
421 | static void vhost_begin(MemoryListener *listener) |
422 | { |
423 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
424 | memory_listener); |
425 | dev->tmp_sections = NULL; |
426 | dev->n_tmp_sections = 0; |
427 | } |
428 | |
429 | static void vhost_commit(MemoryListener *listener) |
430 | { |
431 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
432 | memory_listener); |
433 | MemoryRegionSection *old_sections; |
434 | int n_old_sections; |
435 | uint64_t log_size; |
436 | size_t regions_size; |
437 | int r; |
438 | int i; |
439 | bool changed = false; |
440 | |
441 | /* Note we can be called before the device is started, but then |
442 | * starting the device calls set_mem_table, so we need to have |
443 | * built the data structures. |
444 | */ |
445 | old_sections = dev->mem_sections; |
446 | n_old_sections = dev->n_mem_sections; |
447 | dev->mem_sections = dev->tmp_sections; |
448 | dev->n_mem_sections = dev->n_tmp_sections; |
449 | |
450 | if (dev->n_mem_sections != n_old_sections) { |
451 | changed = true; |
452 | } else { |
        /* Same size, let's check the contents */
454 | changed = n_old_sections && memcmp(dev->mem_sections, old_sections, |
455 | n_old_sections * sizeof(old_sections[0])) != 0; |
456 | } |
457 | |
458 | trace_vhost_commit(dev->started, changed); |
459 | if (!changed) { |
460 | goto out; |
461 | } |
462 | |
463 | /* Rebuild the regions list from the new sections list */ |
464 | regions_size = offsetof(struct vhost_memory, regions) + |
465 | dev->n_mem_sections * sizeof dev->mem->regions[0]; |
466 | dev->mem = g_realloc(dev->mem, regions_size); |
467 | dev->mem->nregions = dev->n_mem_sections; |
468 | used_memslots = dev->mem->nregions; |
469 | for (i = 0; i < dev->n_mem_sections; i++) { |
470 | struct vhost_memory_region *cur_vmr = dev->mem->regions + i; |
471 | struct MemoryRegionSection *mrs = dev->mem_sections + i; |
472 | |
473 | cur_vmr->guest_phys_addr = mrs->offset_within_address_space; |
474 | cur_vmr->memory_size = int128_get64(mrs->size); |
475 | cur_vmr->userspace_addr = |
476 | (uintptr_t)memory_region_get_ram_ptr(mrs->mr) + |
477 | mrs->offset_within_region; |
478 | cur_vmr->flags_padding = 0; |
479 | } |
480 | |
481 | if (!dev->started) { |
482 | goto out; |
483 | } |
484 | |
485 | for (i = 0; i < dev->mem->nregions; i++) { |
486 | if (vhost_verify_ring_mappings(dev, |
487 | (void *)(uintptr_t)dev->mem->regions[i].userspace_addr, |
488 | dev->mem->regions[i].guest_phys_addr, |
489 | dev->mem->regions[i].memory_size)) { |
            error_report("Verify ring failure on region %d", i);
491 | abort(); |
492 | } |
493 | } |
494 | |
495 | if (!dev->log_enabled) { |
496 | r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); |
497 | if (r < 0) { |
            VHOST_OPS_DEBUG("vhost_set_mem_table failed");
499 | } |
500 | goto out; |
501 | } |
502 | log_size = vhost_get_log_size(dev); |
    /* We allocate an extra 4K bytes to log,
     * to reduce the number of reallocations. */
505 | #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) |
506 | /* To log more, must increase log size before table update. */ |
507 | if (dev->log_size < log_size) { |
508 | vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); |
509 | } |
510 | r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); |
511 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
513 | } |
514 | /* To log less, can only decrease log size after table update. */ |
515 | if (dev->log_size > log_size + VHOST_LOG_BUFFER) { |
516 | vhost_dev_log_resize(dev, log_size); |
517 | } |
518 | |
519 | out: |
520 | /* Deref the old list of sections, this must happen _after_ the |
521 | * vhost_set_mem_table to ensure the client isn't still using the |
522 | * section we're about to unref. |
523 | */ |
524 | while (n_old_sections--) { |
525 | memory_region_unref(old_sections[n_old_sections].mr); |
526 | } |
527 | g_free(old_sections); |
528 | return; |
529 | } |
530 | |
/* Adds the section data to the tmp_sections array.
532 | * It relies on the listener calling us in memory address order |
533 | * and for each region (via the _add and _nop methods) to |
534 | * join neighbours. |
535 | */ |
536 | static void vhost_region_add_section(struct vhost_dev *dev, |
537 | MemoryRegionSection *section) |
538 | { |
539 | bool need_add = true; |
540 | uint64_t mrs_size = int128_get64(section->size); |
541 | uint64_t mrs_gpa = section->offset_within_address_space; |
542 | uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + |
543 | section->offset_within_region; |
544 | RAMBlock *mrs_rb = section->mr->ram_block; |
545 | size_t mrs_page = qemu_ram_pagesize(mrs_rb); |
546 | |
547 | trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, |
548 | mrs_host); |
549 | |
    /* Round the section to its page size */
551 | /* First align the start down to a page boundary */ |
552 | uint64_t alignage = mrs_host & (mrs_page - 1); |
553 | if (alignage) { |
554 | mrs_host -= alignage; |
555 | mrs_size += alignage; |
556 | mrs_gpa -= alignage; |
557 | } |
558 | /* Now align the size up to a page boundary */ |
559 | alignage = mrs_size & (mrs_page - 1); |
560 | if (alignage) { |
561 | mrs_size += mrs_page - alignage; |
562 | } |
563 | trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, |
564 | mrs_host); |
565 | |
566 | if (dev->n_tmp_sections) { |
        /* Since we already have at least one section, let's see if
568 | * this extends it; since we're scanning in order, we only |
569 | * have to look at the last one, and the FlatView that calls |
570 | * us shouldn't have overlaps. |
571 | */ |
572 | MemoryRegionSection *prev_sec = dev->tmp_sections + |
573 | (dev->n_tmp_sections - 1); |
574 | uint64_t prev_gpa_start = prev_sec->offset_within_address_space; |
575 | uint64_t prev_size = int128_get64(prev_sec->size); |
576 | uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); |
577 | uint64_t prev_host_start = |
578 | (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + |
579 | prev_sec->offset_within_region; |
580 | uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); |
581 | |
582 | if (mrs_gpa <= (prev_gpa_end + 1)) { |
583 | /* OK, looks like overlapping/intersecting - it's possible that |
584 | * the rounding to page sizes has made them overlap, but they should |
585 | * match up in the same RAMBlock if they do. |
586 | */ |
587 | if (mrs_gpa < prev_gpa_start) { |
            error_report("%s: Section rounded to %" PRIx64
589 | " prior to previous %" PRIx64, |
590 | __func__, mrs_gpa, prev_gpa_start); |
591 | /* A way to cleanly fail here would be better */ |
592 | return; |
593 | } |
594 | /* Offset from the start of the previous GPA to this GPA */ |
595 | size_t offset = mrs_gpa - prev_gpa_start; |
596 | |
597 | if (prev_host_start + offset == mrs_host && |
598 | section->mr == prev_sec->mr && |
599 | (!dev->vhost_ops->vhost_backend_can_merge || |
600 | dev->vhost_ops->vhost_backend_can_merge(dev, |
601 | mrs_host, mrs_size, |
602 | prev_host_start, prev_size))) { |
603 | uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); |
604 | need_add = false; |
605 | prev_sec->offset_within_address_space = |
606 | MIN(prev_gpa_start, mrs_gpa); |
607 | prev_sec->offset_within_region = |
608 | MIN(prev_host_start, mrs_host) - |
609 | (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); |
610 | prev_sec->size = int128_make64(max_end - MIN(prev_host_start, |
611 | mrs_host)); |
612 | trace_vhost_region_add_section_merge(section->mr->name, |
613 | int128_get64(prev_sec->size), |
614 | prev_sec->offset_within_address_space, |
615 | prev_sec->offset_within_region); |
616 | } else { |
617 | /* adjoining regions are fine, but overlapping ones with |
618 | * different blocks/offsets shouldn't happen |
619 | */ |
620 | if (mrs_gpa != prev_gpa_end + 1) { |
621 | error_report("%s: Overlapping but not coherent sections " |
622 | "at %" PRIx64, |
623 | __func__, mrs_gpa); |
624 | return; |
625 | } |
626 | } |
627 | } |
628 | } |
629 | |
630 | if (need_add) { |
631 | ++dev->n_tmp_sections; |
632 | dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, |
633 | dev->n_tmp_sections); |
634 | dev->tmp_sections[dev->n_tmp_sections - 1] = *section; |
        /* The flatview isn't stable and we don't use it; setting it to
         * NULL means we can memcmp the list.
637 | */ |
638 | dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; |
639 | memory_region_ref(section->mr); |
640 | } |
641 | } |
642 | |
643 | /* Used for both add and nop callbacks */ |
644 | static void vhost_region_addnop(MemoryListener *listener, |
645 | MemoryRegionSection *section) |
646 | { |
647 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
648 | memory_listener); |
649 | |
650 | if (!vhost_section(dev, section)) { |
651 | return; |
652 | } |
653 | vhost_region_add_section(dev, section); |
654 | } |
655 | |
656 | static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) |
657 | { |
658 | struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); |
659 | struct vhost_dev *hdev = iommu->hdev; |
660 | hwaddr iova = iotlb->iova + iommu->iommu_offset; |
661 | |
662 | if (vhost_backend_invalidate_device_iotlb(hdev, iova, |
663 | iotlb->addr_mask + 1)) { |
        error_report("Failed to invalidate device iotlb");
665 | } |
666 | } |
667 | |
668 | static void vhost_iommu_region_add(MemoryListener *listener, |
669 | MemoryRegionSection *section) |
670 | { |
671 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
672 | iommu_listener); |
673 | struct vhost_iommu *iommu; |
674 | Int128 end; |
675 | int iommu_idx; |
676 | IOMMUMemoryRegion *iommu_mr; |
677 | |
678 | if (!memory_region_is_iommu(section->mr)) { |
679 | return; |
680 | } |
681 | |
682 | iommu_mr = IOMMU_MEMORY_REGION(section->mr); |
683 | |
684 | iommu = g_malloc0(sizeof(*iommu)); |
685 | end = int128_add(int128_make64(section->offset_within_region), |
686 | section->size); |
687 | end = int128_sub(end, int128_one()); |
688 | iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, |
689 | MEMTXATTRS_UNSPECIFIED); |
690 | iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, |
691 | IOMMU_NOTIFIER_UNMAP, |
692 | section->offset_within_region, |
693 | int128_get64(end), |
694 | iommu_idx); |
695 | iommu->mr = section->mr; |
696 | iommu->iommu_offset = section->offset_within_address_space - |
697 | section->offset_within_region; |
698 | iommu->hdev = dev; |
699 | memory_region_register_iommu_notifier(section->mr, &iommu->n); |
700 | QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next); |
701 | /* TODO: can replay help performance here? */ |
702 | } |
703 | |
704 | static void vhost_iommu_region_del(MemoryListener *listener, |
705 | MemoryRegionSection *section) |
706 | { |
707 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
708 | iommu_listener); |
709 | struct vhost_iommu *iommu; |
710 | |
711 | if (!memory_region_is_iommu(section->mr)) { |
712 | return; |
713 | } |
714 | |
715 | QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { |
716 | if (iommu->mr == section->mr && |
717 | iommu->n.start == section->offset_within_region) { |
718 | memory_region_unregister_iommu_notifier(iommu->mr, |
719 | &iommu->n); |
720 | QLIST_REMOVE(iommu, iommu_next); |
721 | g_free(iommu); |
722 | break; |
723 | } |
724 | } |
725 | } |
726 | |
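/* Tell the backend the user-space addresses of the descriptor table,
 * avail ring and used ring, and whether writes to the used ring must
 * be logged.
 */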
727 | static int vhost_virtqueue_set_addr(struct vhost_dev *dev, |
728 | struct vhost_virtqueue *vq, |
729 | unsigned idx, bool enable_log) |
730 | { |
731 | struct vhost_vring_addr addr = { |
732 | .index = idx, |
733 | .desc_user_addr = (uint64_t)(unsigned long)vq->desc, |
734 | .avail_user_addr = (uint64_t)(unsigned long)vq->avail, |
735 | .used_user_addr = (uint64_t)(unsigned long)vq->used, |
736 | .log_guest_addr = vq->used_phys, |
737 | .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, |
738 | }; |
739 | int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); |
740 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
742 | return -errno; |
743 | } |
744 | return 0; |
745 | } |
746 | |
747 | static int vhost_dev_set_features(struct vhost_dev *dev, |
748 | bool enable_log) |
749 | { |
750 | uint64_t features = dev->acked_features; |
751 | int r; |
752 | if (enable_log) { |
753 | features |= 0x1ULL << VHOST_F_LOG_ALL; |
754 | } |
755 | r = dev->vhost_ops->vhost_set_features(dev, features); |
756 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_features failed");
758 | } |
759 | return r < 0 ? -errno : 0; |
760 | } |
761 | |
762 | static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) |
763 | { |
764 | int r, i, idx; |
765 | r = vhost_dev_set_features(dev, enable_log); |
766 | if (r < 0) { |
767 | goto err_features; |
768 | } |
769 | for (i = 0; i < dev->nvqs; ++i) { |
770 | idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); |
771 | r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, |
772 | enable_log); |
773 | if (r < 0) { |
774 | goto err_vq; |
775 | } |
776 | } |
777 | return 0; |
778 | err_vq: |
779 | for (; i >= 0; --i) { |
780 | idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); |
781 | vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, |
782 | dev->log_enabled); |
783 | } |
784 | vhost_dev_set_features(dev, dev->log_enabled); |
785 | err_features: |
786 | return r; |
787 | } |
788 | |
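/* Enable or disable dirty logging on a running device.  When enabling,
 * the log must be sized and registered before logging is turned on so
 * that no write is missed; when disabling, logging must be turned off
 * before the log is released.
 */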
789 | static int vhost_migration_log(MemoryListener *listener, int enable) |
790 | { |
791 | struct vhost_dev *dev = container_of(listener, struct vhost_dev, |
792 | memory_listener); |
793 | int r; |
794 | if (!!enable == dev->log_enabled) { |
795 | return 0; |
796 | } |
797 | if (!dev->started) { |
798 | dev->log_enabled = enable; |
799 | return 0; |
800 | } |
801 | if (!enable) { |
802 | r = vhost_dev_set_log(dev, false); |
803 | if (r < 0) { |
804 | return r; |
805 | } |
806 | vhost_log_put(dev, false); |
807 | } else { |
808 | vhost_dev_log_resize(dev, vhost_get_log_size(dev)); |
809 | r = vhost_dev_set_log(dev, true); |
810 | if (r < 0) { |
811 | return r; |
812 | } |
813 | } |
814 | dev->log_enabled = enable; |
815 | return 0; |
816 | } |
817 | |
818 | static void vhost_log_global_start(MemoryListener *listener) |
819 | { |
820 | int r; |
821 | |
822 | r = vhost_migration_log(listener, true); |
823 | if (r < 0) { |
824 | abort(); |
825 | } |
826 | } |
827 | |
828 | static void vhost_log_global_stop(MemoryListener *listener) |
829 | { |
830 | int r; |
831 | |
832 | r = vhost_migration_log(listener, false); |
833 | if (r < 0) { |
834 | abort(); |
835 | } |
836 | } |
837 | |
838 | static void vhost_log_start(MemoryListener *listener, |
839 | MemoryRegionSection *section, |
840 | int old, int new) |
841 | { |
842 | /* FIXME: implement */ |
843 | } |
844 | |
845 | static void vhost_log_stop(MemoryListener *listener, |
846 | MemoryRegionSection *section, |
847 | int old, int new) |
848 | { |
849 | /* FIXME: implement */ |
850 | } |
851 | |
852 | /* The vhost driver natively knows how to handle the vrings of non |
853 | * cross-endian legacy devices and modern devices. Only legacy devices |
854 | * exposed to a bi-endian guest may require the vhost driver to use a |
855 | * specific endianness. |
856 | */ |
857 | static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) |
858 | { |
859 | if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { |
860 | return false; |
861 | } |
862 | #ifdef HOST_WORDS_BIGENDIAN |
863 | return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; |
864 | #else |
865 | return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; |
866 | #endif |
867 | } |
868 | |
869 | static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, |
870 | bool is_big_endian, |
871 | int vhost_vq_index) |
872 | { |
873 | struct vhost_vring_state s = { |
874 | .index = vhost_vq_index, |
875 | .num = is_big_endian |
876 | }; |
877 | |
878 | if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) { |
879 | return 0; |
880 | } |
881 | |
    VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
883 | if (errno == ENOTTY) { |
        error_report("vhost does not support cross-endian");
885 | return -ENOSYS; |
886 | } |
887 | |
888 | return -errno; |
889 | } |
890 | |
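/* Translate a guest-physical address to the backend's user address
 * space via the memory table; *len is set to the number of bytes left
 * in the containing region.
 */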
891 | static int vhost_memory_region_lookup(struct vhost_dev *hdev, |
892 | uint64_t gpa, uint64_t *uaddr, |
893 | uint64_t *len) |
894 | { |
895 | int i; |
896 | |
897 | for (i = 0; i < hdev->mem->nregions; i++) { |
898 | struct vhost_memory_region *reg = hdev->mem->regions + i; |
899 | |
900 | if (gpa >= reg->guest_phys_addr && |
901 | reg->guest_phys_addr + reg->memory_size > gpa) { |
902 | *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr; |
903 | *len = reg->guest_phys_addr + reg->memory_size - gpa; |
904 | return 0; |
905 | } |
906 | } |
907 | |
908 | return -EFAULT; |
909 | } |
910 | |
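/* Handle an IOTLB miss reported by the backend: translate the IOVA
 * through the device's DMA address space, look up the corresponding
 * user address in the memory table and push the resulting entry back
 * to the backend.
 */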
911 | int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) |
912 | { |
913 | IOMMUTLBEntry iotlb; |
914 | uint64_t uaddr, len; |
915 | int ret = -EFAULT; |
916 | |
917 | rcu_read_lock(); |
918 | |
919 | trace_vhost_iotlb_miss(dev, 1); |
920 | |
921 | iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, |
922 | iova, write, |
923 | MEMTXATTRS_UNSPECIFIED); |
924 | if (iotlb.target_as != NULL) { |
925 | ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, |
926 | &uaddr, &len); |
927 | if (ret) { |
928 | trace_vhost_iotlb_miss(dev, 3); |
            error_report("Failed to look up the translated address "
930 | "%" PRIx64, iotlb.translated_addr); |
931 | goto out; |
932 | } |
933 | |
934 | len = MIN(iotlb.addr_mask + 1, len); |
935 | iova = iova & ~iotlb.addr_mask; |
936 | |
937 | ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, |
938 | len, iotlb.perm); |
939 | if (ret) { |
940 | trace_vhost_iotlb_miss(dev, 4); |
            error_report("Failed to update device iotlb");
942 | goto out; |
943 | } |
944 | } |
945 | |
946 | trace_vhost_iotlb_miss(dev, 2); |
947 | |
948 | out: |
949 | rcu_read_unlock(); |
950 | |
951 | return ret; |
952 | } |
953 | |
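/* Set up one virtqueue in the backend: ring size and base index,
 * legacy cross-endian handling if needed, the ring mappings and
 * addresses, and the kick/call notifiers.
 */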
954 | static int vhost_virtqueue_start(struct vhost_dev *dev, |
955 | struct VirtIODevice *vdev, |
956 | struct vhost_virtqueue *vq, |
957 | unsigned idx) |
958 | { |
959 | BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); |
960 | VirtioBusState *vbus = VIRTIO_BUS(qbus); |
961 | VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); |
962 | hwaddr s, l, a; |
963 | int r; |
964 | int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); |
965 | struct vhost_vring_file file = { |
966 | .index = vhost_vq_index |
967 | }; |
968 | struct vhost_vring_state state = { |
969 | .index = vhost_vq_index |
970 | }; |
971 | struct VirtQueue *vvq = virtio_get_queue(vdev, idx); |
972 | |
973 | a = virtio_queue_get_desc_addr(vdev, idx); |
974 | if (a == 0) { |
975 | /* Queue might not be ready for start */ |
976 | return 0; |
977 | } |
978 | |
979 | vq->num = state.num = virtio_queue_get_num(vdev, idx); |
980 | r = dev->vhost_ops->vhost_set_vring_num(dev, &state); |
981 | if (r) { |
        VHOST_OPS_DEBUG("vhost_set_vring_num failed");
983 | return -errno; |
984 | } |
985 | |
986 | state.num = virtio_queue_get_last_avail_idx(vdev, idx); |
987 | r = dev->vhost_ops->vhost_set_vring_base(dev, &state); |
988 | if (r) { |
        VHOST_OPS_DEBUG("vhost_set_vring_base failed");
990 | return -errno; |
991 | } |
992 | |
993 | if (vhost_needs_vring_endian(vdev)) { |
994 | r = vhost_virtqueue_set_vring_endian_legacy(dev, |
995 | virtio_is_big_endian(vdev), |
996 | vhost_vq_index); |
997 | if (r) { |
998 | return -errno; |
999 | } |
1000 | } |
1001 | |
1002 | vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); |
1003 | vq->desc_phys = a; |
1004 | vq->desc = vhost_memory_map(dev, a, &l, 0); |
1005 | if (!vq->desc || l != s) { |
1006 | r = -ENOMEM; |
1007 | goto fail_alloc_desc; |
1008 | } |
1009 | vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); |
1010 | vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); |
1011 | vq->avail = vhost_memory_map(dev, a, &l, 0); |
1012 | if (!vq->avail || l != s) { |
1013 | r = -ENOMEM; |
1014 | goto fail_alloc_avail; |
1015 | } |
1016 | vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); |
1017 | vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); |
1018 | vq->used = vhost_memory_map(dev, a, &l, 1); |
1019 | if (!vq->used || l != s) { |
1020 | r = -ENOMEM; |
1021 | goto fail_alloc_used; |
1022 | } |
1023 | |
1024 | r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); |
1025 | if (r < 0) { |
1026 | r = -errno; |
1027 | goto fail_alloc; |
1028 | } |
1029 | |
1030 | file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); |
1031 | r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); |
1032 | if (r) { |
        VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
1034 | r = -errno; |
1035 | goto fail_kick; |
1036 | } |
1037 | |
1038 | /* Clear and discard previous events if any. */ |
1039 | event_notifier_test_and_clear(&vq->masked_notifier); |
1040 | |
1041 | /* Init vring in unmasked state, unless guest_notifier_mask |
1042 | * will do it later. |
1043 | */ |
1044 | if (!vdev->use_guest_notifier_mask) { |
1045 | /* TODO: check and handle errors. */ |
1046 | vhost_virtqueue_mask(dev, vdev, idx, false); |
1047 | } |
1048 | |
1049 | if (k->query_guest_notifiers && |
1050 | k->query_guest_notifiers(qbus->parent) && |
1051 | virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) { |
1052 | file.fd = -1; |
1053 | r = dev->vhost_ops->vhost_set_vring_call(dev, &file); |
1054 | if (r) { |
1055 | goto fail_vector; |
1056 | } |
1057 | } |
1058 | |
1059 | return 0; |
1060 | |
1061 | fail_vector: |
1062 | fail_kick: |
1063 | fail_alloc: |
1064 | vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), |
1065 | 0, 0); |
1066 | fail_alloc_used: |
1067 | vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), |
1068 | 0, 0); |
1069 | fail_alloc_avail: |
1070 | vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), |
1071 | 0, 0); |
1072 | fail_alloc_desc: |
1073 | return r; |
1074 | } |
1075 | |
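/* Tear down one virtqueue: fetch the last avail index from the backend
 * (falling back to the device's used index if the backend is gone),
 * restore native vring endianness for legacy cross-endian setups, and
 * unmap the rings.
 */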
1076 | static void vhost_virtqueue_stop(struct vhost_dev *dev, |
1077 | struct VirtIODevice *vdev, |
1078 | struct vhost_virtqueue *vq, |
1079 | unsigned idx) |
1080 | { |
1081 | int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); |
1082 | struct vhost_vring_state state = { |
1083 | .index = vhost_vq_index, |
1084 | }; |
1085 | int r; |
1086 | |
1087 | if (virtio_queue_get_desc_addr(vdev, idx) == 0) { |
        /* Don't stop the virtqueue which might not have been started */
1089 | return; |
1090 | } |
1091 | |
1092 | r = dev->vhost_ops->vhost_get_vring_base(dev, &state); |
1093 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost VQ %u ring restore failed: %d", idx, r);
1095 | /* Connection to the backend is broken, so let's sync internal |
1096 | * last avail idx to the device used idx. |
1097 | */ |
1098 | virtio_queue_restore_last_avail_idx(vdev, idx); |
1099 | } else { |
1100 | virtio_queue_set_last_avail_idx(vdev, idx, state.num); |
1101 | } |
1102 | virtio_queue_invalidate_signalled_used(vdev, idx); |
1103 | virtio_queue_update_used_idx(vdev, idx); |
1104 | |
1105 | /* In the cross-endian case, we need to reset the vring endianness to |
     * native, as legacy devices expect it by default.
1107 | */ |
1108 | if (vhost_needs_vring_endian(vdev)) { |
1109 | vhost_virtqueue_set_vring_endian_legacy(dev, |
1110 | !virtio_is_big_endian(vdev), |
1111 | vhost_vq_index); |
1112 | } |
1113 | |
1114 | vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), |
1115 | 1, virtio_queue_get_used_size(vdev, idx)); |
1116 | vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), |
1117 | 0, virtio_queue_get_avail_size(vdev, idx)); |
1118 | vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), |
1119 | 0, virtio_queue_get_desc_size(vdev, idx)); |
1120 | } |
1121 | |
1122 | static void vhost_eventfd_add(MemoryListener *listener, |
1123 | MemoryRegionSection *section, |
1124 | bool match_data, uint64_t data, EventNotifier *e) |
1125 | { |
1126 | } |
1127 | |
1128 | static void vhost_eventfd_del(MemoryListener *listener, |
1129 | MemoryRegionSection *section, |
1130 | bool match_data, uint64_t data, EventNotifier *e) |
1131 | { |
1132 | } |
1133 | |
1134 | static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, |
1135 | int n, uint32_t timeout) |
1136 | { |
1137 | int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); |
1138 | struct vhost_vring_state state = { |
1139 | .index = vhost_vq_index, |
1140 | .num = timeout, |
1141 | }; |
1142 | int r; |
1143 | |
1144 | if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) { |
1145 | return -EINVAL; |
1146 | } |
1147 | |
1148 | r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state); |
1149 | if (r) { |
        VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
1151 | return r; |
1152 | } |
1153 | |
1154 | return 0; |
1155 | } |
1156 | |
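/* One-time virtqueue setup: create the masked notifier and point the
 * backend's call eventfd at it; vhost_virtqueue_mask() later switches
 * between this notifier and the guest notifier.
 */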
1157 | static int vhost_virtqueue_init(struct vhost_dev *dev, |
1158 | struct vhost_virtqueue *vq, int n) |
1159 | { |
1160 | int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); |
1161 | struct vhost_vring_file file = { |
1162 | .index = vhost_vq_index, |
1163 | }; |
1164 | int r = event_notifier_init(&vq->masked_notifier, 0); |
1165 | if (r < 0) { |
1166 | return r; |
1167 | } |
1168 | |
1169 | file.fd = event_notifier_get_fd(&vq->masked_notifier); |
1170 | r = dev->vhost_ops->vhost_set_vring_call(dev, &file); |
1171 | if (r) { |
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1173 | r = -errno; |
1174 | goto fail_call; |
1175 | } |
1176 | |
1177 | vq->dev = dev; |
1178 | |
1179 | return 0; |
1180 | fail_call: |
1181 | event_notifier_cleanup(&vq->masked_notifier); |
1182 | return r; |
1183 | } |
1184 | |
1185 | static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) |
1186 | { |
1187 | event_notifier_cleanup(&vq->masked_notifier); |
1188 | } |
1189 | |
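/* Initialize a vhost device: connect to the backend, query its
 * features, set up the virtqueues and memory listeners, and install a
 * migration blocker when the backend cannot log dirty memory or a
 * shared log cannot be allocated.
 */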
1190 | int vhost_dev_init(struct vhost_dev *hdev, void *opaque, |
1191 | VhostBackendType backend_type, uint32_t busyloop_timeout) |
1192 | { |
1193 | uint64_t features; |
1194 | int i, r, n_initialized_vqs = 0; |
1195 | Error *local_err = NULL; |
1196 | |
1197 | hdev->vdev = NULL; |
1198 | hdev->migration_blocker = NULL; |
1199 | |
1200 | r = vhost_set_backend_type(hdev, backend_type); |
1201 | assert(r >= 0); |
1202 | |
1203 | r = hdev->vhost_ops->vhost_backend_init(hdev, opaque); |
1204 | if (r < 0) { |
1205 | goto fail; |
1206 | } |
1207 | |
1208 | r = hdev->vhost_ops->vhost_set_owner(hdev); |
1209 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_owner failed");
1211 | goto fail; |
1212 | } |
1213 | |
1214 | r = hdev->vhost_ops->vhost_get_features(hdev, &features); |
1215 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_get_features failed");
1217 | goto fail; |
1218 | } |
1219 | |
1220 | for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { |
1221 | r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); |
1222 | if (r < 0) { |
1223 | goto fail; |
1224 | } |
1225 | } |
1226 | |
1227 | if (busyloop_timeout) { |
1228 | for (i = 0; i < hdev->nvqs; ++i) { |
1229 | r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, |
1230 | busyloop_timeout); |
1231 | if (r < 0) { |
1232 | goto fail_busyloop; |
1233 | } |
1234 | } |
1235 | } |
1236 | |
1237 | hdev->features = features; |
1238 | |
1239 | hdev->memory_listener = (MemoryListener) { |
1240 | .begin = vhost_begin, |
1241 | .commit = vhost_commit, |
1242 | .region_add = vhost_region_addnop, |
1243 | .region_nop = vhost_region_addnop, |
1244 | .log_start = vhost_log_start, |
1245 | .log_stop = vhost_log_stop, |
1246 | .log_sync = vhost_log_sync, |
1247 | .log_global_start = vhost_log_global_start, |
1248 | .log_global_stop = vhost_log_global_stop, |
1249 | .eventfd_add = vhost_eventfd_add, |
1250 | .eventfd_del = vhost_eventfd_del, |
1251 | .priority = 10 |
1252 | }; |
1253 | |
1254 | hdev->iommu_listener = (MemoryListener) { |
1255 | .region_add = vhost_iommu_region_add, |
1256 | .region_del = vhost_iommu_region_del, |
1257 | }; |
1258 | |
1259 | if (hdev->migration_blocker == NULL) { |
1260 | if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { |
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1263 | } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { |
            error_setg(&hdev->migration_blocker,
                       "Migration disabled: failed to allocate shared memory");
1266 | } |
1267 | } |
1268 | |
1269 | if (hdev->migration_blocker != NULL) { |
1270 | r = migrate_add_blocker(hdev->migration_blocker, &local_err); |
1271 | if (local_err) { |
1272 | error_report_err(local_err); |
1273 | error_free(hdev->migration_blocker); |
1274 | goto fail_busyloop; |
1275 | } |
1276 | } |
1277 | |
1278 | hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); |
1279 | hdev->n_mem_sections = 0; |
1280 | hdev->mem_sections = NULL; |
1281 | hdev->log = NULL; |
1282 | hdev->log_size = 0; |
1283 | hdev->log_enabled = false; |
1284 | hdev->started = false; |
1285 | memory_listener_register(&hdev->memory_listener, &address_space_memory); |
1286 | QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); |
1287 | |
1288 | if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { |
        error_report("vhost backend memory slots limit is less"
                     " than current number of present memory slots");
1291 | r = -1; |
1292 | if (busyloop_timeout) { |
1293 | goto fail_busyloop; |
1294 | } else { |
1295 | goto fail; |
1296 | } |
1297 | } |
1298 | |
1299 | return 0; |
1300 | |
1301 | fail_busyloop: |
1302 | while (--i >= 0) { |
1303 | vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0); |
1304 | } |
1305 | fail: |
1306 | hdev->nvqs = n_initialized_vqs; |
1307 | vhost_dev_cleanup(hdev); |
1308 | return r; |
1309 | } |
1310 | |
1311 | void vhost_dev_cleanup(struct vhost_dev *hdev) |
1312 | { |
1313 | int i; |
1314 | |
1315 | for (i = 0; i < hdev->nvqs; ++i) { |
1316 | vhost_virtqueue_cleanup(hdev->vqs + i); |
1317 | } |
1318 | if (hdev->mem) { |
1319 | /* those are only safe after successful init */ |
1320 | memory_listener_unregister(&hdev->memory_listener); |
1321 | QLIST_REMOVE(hdev, entry); |
1322 | } |
1323 | if (hdev->migration_blocker) { |
1324 | migrate_del_blocker(hdev->migration_blocker); |
1325 | error_free(hdev->migration_blocker); |
1326 | } |
1327 | g_free(hdev->mem); |
1328 | g_free(hdev->mem_sections); |
1329 | if (hdev->vhost_ops) { |
1330 | hdev->vhost_ops->vhost_backend_cleanup(hdev); |
1331 | } |
1332 | assert(!hdev->log); |
1333 | |
1334 | memset(hdev, 0, sizeof(struct vhost_dev)); |
1335 | } |
1336 | |
1337 | /* Stop processing guest IO notifications in qemu. |
1338 | * Start processing them in vhost in kernel. |
1339 | */ |
1340 | int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) |
1341 | { |
1342 | BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); |
1343 | int i, r, e; |
1344 | |
1345 | /* We will pass the notifiers to the kernel, make sure that QEMU |
1346 | * doesn't interfere. |
1347 | */ |
1348 | r = virtio_device_grab_ioeventfd(vdev); |
1349 | if (r < 0) { |
        error_report("binding does not support host notifiers");
1351 | goto fail; |
1352 | } |
1353 | |
1354 | for (i = 0; i < hdev->nvqs; ++i) { |
1355 | r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, |
1356 | true); |
1357 | if (r < 0) { |
            error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1359 | goto fail_vq; |
1360 | } |
1361 | } |
1362 | |
1363 | return 0; |
1364 | fail_vq: |
1365 | while (--i >= 0) { |
1366 | e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, |
1367 | false); |
1368 | if (e < 0) { |
            error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
1370 | } |
        assert(e >= 0);
1372 | virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); |
1373 | } |
1374 | virtio_device_release_ioeventfd(vdev); |
1375 | fail: |
1376 | return r; |
1377 | } |
1378 | |
1379 | /* Stop processing guest IO notifications in vhost. |
1380 | * Start processing them in qemu. |
1381 | * This might actually run the qemu handlers right away, |
1382 | * so virtio in qemu must be completely setup when this is called. |
1383 | */ |
1384 | void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) |
1385 | { |
1386 | BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); |
1387 | int i, r; |
1388 | |
1389 | for (i = 0; i < hdev->nvqs; ++i) { |
1390 | r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, |
1391 | false); |
1392 | if (r < 0) { |
            error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1394 | } |
        assert(r >= 0);
1396 | virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); |
1397 | } |
1398 | virtio_device_release_ioeventfd(vdev); |
1399 | } |
1400 | |
1401 | /* Test and clear event pending status. |
1402 | * Should be called after unmask to avoid losing events. |
1403 | */ |
1404 | bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) |
1405 | { |
1406 | struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; |
1407 | assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); |
1408 | return event_notifier_test_and_clear(&vq->masked_notifier); |
1409 | } |
1410 | |
1411 | /* Mask/unmask events from this vq. */ |
1412 | void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, |
1413 | bool mask) |
1414 | { |
1415 | struct VirtQueue *vvq = virtio_get_queue(vdev, n); |
1416 | int r, index = n - hdev->vq_index; |
1417 | struct vhost_vring_file file; |
1418 | |
1419 | /* should only be called after backend is connected */ |
1420 | assert(hdev->vhost_ops); |
1421 | |
1422 | if (mask) { |
1423 | assert(vdev->use_guest_notifier_mask); |
1424 | file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier); |
1425 | } else { |
1426 | file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)); |
1427 | } |
1428 | |
1429 | file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); |
1430 | r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); |
1431 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1433 | } |
1434 | } |
1435 | |
1436 | uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, |
1437 | uint64_t features) |
1438 | { |
1439 | const int *bit = feature_bits; |
1440 | while (*bit != VHOST_INVALID_FEATURE_BIT) { |
1441 | uint64_t bit_mask = (1ULL << *bit); |
1442 | if (!(hdev->features & bit_mask)) { |
1443 | features &= ~bit_mask; |
1444 | } |
1445 | bit++; |
1446 | } |
1447 | return features; |
1448 | } |
1449 | |
1450 | void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, |
1451 | uint64_t features) |
1452 | { |
1453 | const int *bit = feature_bits; |
1454 | while (*bit != VHOST_INVALID_FEATURE_BIT) { |
1455 | uint64_t bit_mask = (1ULL << *bit); |
1456 | if (features & bit_mask) { |
1457 | hdev->acked_features |= bit_mask; |
1458 | } |
1459 | bit++; |
1460 | } |
1461 | } |
1462 | |
1463 | int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, |
1464 | uint32_t config_len) |
1465 | { |
1466 | assert(hdev->vhost_ops); |
1467 | |
1468 | if (hdev->vhost_ops->vhost_get_config) { |
1469 | return hdev->vhost_ops->vhost_get_config(hdev, config, config_len); |
1470 | } |
1471 | |
1472 | return -1; |
1473 | } |
1474 | |
1475 | int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data, |
1476 | uint32_t offset, uint32_t size, uint32_t flags) |
1477 | { |
1478 | assert(hdev->vhost_ops); |
1479 | |
1480 | if (hdev->vhost_ops->vhost_set_config) { |
1481 | return hdev->vhost_ops->vhost_set_config(hdev, data, offset, |
1482 | size, flags); |
1483 | } |
1484 | |
1485 | return -1; |
1486 | } |
1487 | |
1488 | void vhost_dev_set_config_notifier(struct vhost_dev *hdev, |
1489 | const VhostDevConfigOps *ops) |
1490 | { |
1491 | hdev->config_ops = ops; |
1492 | } |
1493 | |
1494 | void vhost_dev_free_inflight(struct vhost_inflight *inflight) |
1495 | { |
1496 | if (inflight->addr) { |
1497 | qemu_memfd_free(inflight->addr, inflight->size, inflight->fd); |
1498 | inflight->addr = NULL; |
1499 | inflight->fd = -1; |
1500 | } |
1501 | } |
1502 | |
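/* Replace the inflight buffer with a freshly allocated sealed memfd of
 * @new_size bytes; used when an incoming migration stream carries a
 * differently sized inflight region.
 */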
1503 | static int vhost_dev_resize_inflight(struct vhost_inflight *inflight, |
1504 | uint64_t new_size) |
1505 | { |
1506 | Error *err = NULL; |
1507 | int fd = -1; |
    void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
1509 | F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, |
1510 | &fd, &err); |
1511 | |
1512 | if (err) { |
1513 | error_report_err(err); |
1514 | return -1; |
1515 | } |
1516 | |
1517 | vhost_dev_free_inflight(inflight); |
1518 | inflight->offset = 0; |
1519 | inflight->addr = addr; |
1520 | inflight->fd = fd; |
1521 | inflight->size = new_size; |
1522 | |
1523 | return 0; |
1524 | } |
1525 | |
1526 | void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f) |
1527 | { |
1528 | if (inflight->addr) { |
1529 | qemu_put_be64(f, inflight->size); |
1530 | qemu_put_be16(f, inflight->queue_size); |
1531 | qemu_put_buffer(f, inflight->addr, inflight->size); |
1532 | } else { |
1533 | qemu_put_be64(f, 0); |
1534 | } |
1535 | } |
1536 | |
1537 | int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f) |
1538 | { |
1539 | uint64_t size; |
1540 | |
1541 | size = qemu_get_be64(f); |
1542 | if (!size) { |
1543 | return 0; |
1544 | } |
1545 | |
1546 | if (inflight->size != size) { |
1547 | if (vhost_dev_resize_inflight(inflight, size)) { |
1548 | return -1; |
1549 | } |
1550 | } |
1551 | inflight->queue_size = qemu_get_be16(f); |
1552 | |
1553 | qemu_get_buffer(f, inflight->addr, size); |
1554 | |
1555 | return 0; |
1556 | } |
1557 | |
1558 | int vhost_dev_set_inflight(struct vhost_dev *dev, |
1559 | struct vhost_inflight *inflight) |
1560 | { |
1561 | int r; |
1562 | |
1563 | if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) { |
1564 | r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight); |
1565 | if (r) { |
            VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
1567 | return -errno; |
1568 | } |
1569 | } |
1570 | |
1571 | return 0; |
1572 | } |
1573 | |
1574 | int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, |
1575 | struct vhost_inflight *inflight) |
1576 | { |
1577 | int r; |
1578 | |
1579 | if (dev->vhost_ops->vhost_get_inflight_fd) { |
1580 | r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight); |
1581 | if (r) { |
            VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
1583 | return -errno; |
1584 | } |
1585 | } |
1586 | |
1587 | return 0; |
1588 | } |
1589 | |
1590 | /* Host notifiers must be enabled at this point. */ |
1591 | int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) |
1592 | { |
1593 | int i, r; |
1594 | |
1595 | /* should only be called after backend is connected */ |
1596 | assert(hdev->vhost_ops); |
1597 | |
1598 | hdev->started = true; |
1599 | hdev->vdev = vdev; |
1600 | |
1601 | r = vhost_dev_set_features(hdev, hdev->log_enabled); |
1602 | if (r < 0) { |
1603 | goto fail_features; |
1604 | } |
1605 | |
1606 | if (vhost_dev_has_iommu(hdev)) { |
1607 | memory_listener_register(&hdev->iommu_listener, vdev->dma_as); |
1608 | } |
1609 | |
1610 | r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); |
1611 | if (r < 0) { |
        VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1613 | r = -errno; |
1614 | goto fail_mem; |
1615 | } |
1616 | for (i = 0; i < hdev->nvqs; ++i) { |
1617 | r = vhost_virtqueue_start(hdev, |
1618 | vdev, |
1619 | hdev->vqs + i, |
1620 | hdev->vq_index + i); |
1621 | if (r < 0) { |
1622 | goto fail_vq; |
1623 | } |
1624 | } |
1625 | |
1626 | if (hdev->log_enabled) { |
1627 | uint64_t log_base; |
1628 | |
1629 | hdev->log_size = vhost_get_log_size(hdev); |
1630 | hdev->log = vhost_log_get(hdev->log_size, |
1631 | vhost_dev_log_is_shared(hdev)); |
1632 | log_base = (uintptr_t)hdev->log->log; |
1633 | r = hdev->vhost_ops->vhost_set_log_base(hdev, |
1634 | hdev->log_size ? log_base : 0, |
1635 | hdev->log); |
1636 | if (r < 0) { |
            VHOST_OPS_DEBUG("vhost_set_log_base failed");
1638 | r = -errno; |
1639 | goto fail_log; |
1640 | } |
1641 | } |
1642 | |
1643 | if (vhost_dev_has_iommu(hdev)) { |
1644 | hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); |
1645 | |
        /* Update used ring information for IOTLB to work correctly;
         * the vhost kernel code requires this. */
1648 | for (i = 0; i < hdev->nvqs; ++i) { |
1649 | struct vhost_virtqueue *vq = hdev->vqs + i; |
1650 | vhost_device_iotlb_miss(hdev, vq->used_phys, true); |
1651 | } |
1652 | } |
1653 | return 0; |
1654 | fail_log: |
1655 | vhost_log_put(hdev, false); |
1656 | fail_vq: |
1657 | while (--i >= 0) { |
1658 | vhost_virtqueue_stop(hdev, |
1659 | vdev, |
1660 | hdev->vqs + i, |
1661 | hdev->vq_index + i); |
1662 | } |
1663 | |
1664 | fail_mem: |
1665 | fail_features: |
1666 | |
1667 | hdev->started = false; |
1668 | return r; |
1669 | } |
1670 | |
1671 | /* Host notifiers must be enabled at this point. */ |
1672 | void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev) |
1673 | { |
1674 | int i; |
1675 | |
1676 | /* should only be called after backend is connected */ |
1677 | assert(hdev->vhost_ops); |
1678 | |
1679 | for (i = 0; i < hdev->nvqs; ++i) { |
1680 | vhost_virtqueue_stop(hdev, |
1681 | vdev, |
1682 | hdev->vqs + i, |
1683 | hdev->vq_index + i); |
1684 | } |
1685 | |
1686 | if (vhost_dev_has_iommu(hdev)) { |
1687 | hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); |
1688 | memory_listener_unregister(&hdev->iommu_listener); |
1689 | } |
1690 | vhost_log_put(hdev, true); |
1691 | hdev->started = false; |
1692 | hdev->vdev = NULL; |
1693 | } |
1694 | |
1695 | int vhost_net_set_backend(struct vhost_dev *hdev, |
1696 | struct vhost_vring_file *file) |
1697 | { |
1698 | if (hdev->vhost_ops->vhost_net_set_backend) { |
1699 | return hdev->vhost_ops->vhost_net_set_backend(hdev, file); |
1700 | } |
1701 | |
1702 | return -1; |
1703 | } |
1704 | |