1 | /* |
2 | * QEMU KVM support |
3 | * |
4 | * Copyright IBM, Corp. 2008 |
5 | * Red Hat, Inc. 2008 |
6 | * |
7 | * Authors: |
8 | * Anthony Liguori <aliguori@us.ibm.com> |
9 | * Glauber Costa <gcosta@redhat.com> |
10 | * |
11 | * This work is licensed under the terms of the GNU GPL, version 2 or later. |
12 | * See the COPYING file in the top-level directory. |
13 | * |
14 | */ |
15 | |
16 | #include "qemu/osdep.h" |
17 | #include <sys/ioctl.h> |
18 | |
19 | #include <linux/kvm.h> |
20 | |
21 | #include "qemu/atomic.h" |
22 | #include "qemu/option.h" |
23 | #include "qemu/config-file.h" |
24 | #include "qemu/error-report.h" |
25 | #include "qapi/error.h" |
26 | #include "hw/pci/msi.h" |
27 | #include "hw/pci/msix.h" |
28 | #include "hw/s390x/adapter.h" |
29 | #include "exec/gdbstub.h" |
30 | #include "sysemu/kvm_int.h" |
31 | #include "sysemu/runstate.h" |
32 | #include "sysemu/cpus.h" |
33 | #include "sysemu/sysemu.h" |
34 | #include "qemu/bswap.h" |
35 | #include "exec/memory.h" |
36 | #include "exec/ram_addr.h" |
37 | #include "exec/address-spaces.h" |
38 | #include "qemu/event_notifier.h" |
39 | #include "qemu/main-loop.h" |
40 | #include "trace.h" |
41 | #include "hw/irq.h" |
42 | #include "sysemu/sev.h" |
43 | #include "sysemu/balloon.h" |
44 | |
45 | #include "hw/boards.h" |
46 | |
47 | /* This check must be after config-host.h is included */ |
48 | #ifdef CONFIG_EVENTFD |
49 | #include <sys/eventfd.h> |
50 | #endif |
51 | |
52 | /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We |
53 | * need to use the real host PAGE_SIZE, as that's what KVM will use. |
54 | */ |
55 | #define PAGE_SIZE getpagesize() |
56 | |
57 | //#define DEBUG_KVM |
58 | |
59 | #ifdef DEBUG_KVM |
60 | #define DPRINTF(fmt, ...) \ |
61 | do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) |
62 | #else |
63 | #define DPRINTF(fmt, ...) \ |
64 | do { } while (0) |
65 | #endif |
66 | |
67 | #define KVM_MSI_HASHTAB_SIZE 256 |
68 | |
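/*
 * KVM provides no way to destroy a vCPU file descriptor, so when a vCPU
 * is unplugged its fd is "parked" on this list and reused if a vCPU with
 * the same id is created again (see kvm_destroy_vcpu() and
 * kvm_get_vcpu()).
 */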
69 | struct KVMParkedVcpu { |
70 | unsigned long vcpu_id; |
71 | int kvm_fd; |
72 | QLIST_ENTRY(KVMParkedVcpu) node; |
73 | }; |
74 | |
75 | struct KVMState |
76 | { |
77 | AccelState parent_obj; |
78 | |
79 | int nr_slots; |
80 | int fd; |
81 | int vmfd; |
82 | int coalesced_mmio; |
83 | int coalesced_pio; |
84 | struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; |
85 | bool coalesced_flush_in_progress; |
86 | int vcpu_events; |
87 | int robust_singlestep; |
88 | int debugregs; |
89 | #ifdef KVM_CAP_SET_GUEST_DEBUG |
90 | QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; |
91 | #endif |
92 | int max_nested_state_len; |
93 | int many_ioeventfds; |
94 | int intx_set_mask; |
95 | bool sync_mmu; |
96 | bool manual_dirty_log_protect; |
    /* The man page (and POSIX) say ioctl numbers are signed int, but
     * they're not. Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things. */
100 | unsigned irq_set_ioctl; |
101 | unsigned int sigmask_len; |
102 | GHashTable *gsimap; |
103 | #ifdef KVM_CAP_IRQ_ROUTING |
104 | struct kvm_irq_routing *irq_routes; |
105 | int nr_allocated_irq_routes; |
106 | unsigned long *used_gsi_bitmap; |
107 | unsigned int gsi_count; |
108 | QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; |
109 | #endif |
110 | KVMMemoryListener memory_listener; |
111 | QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; |
112 | |
113 | /* memory encryption */ |
114 | void *memcrypt_handle; |
115 | int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); |
116 | |
117 | /* For "info mtree -f" to tell if an MR is registered in KVM */ |
118 | int nr_as; |
119 | struct KVMAs { |
120 | KVMMemoryListener *ml; |
121 | AddressSpace *as; |
122 | } *as; |
123 | }; |
124 | |
125 | KVMState *kvm_state; |
126 | bool kvm_kernel_irqchip; |
127 | bool kvm_split_irqchip; |
128 | bool kvm_async_interrupts_allowed; |
129 | bool kvm_halt_in_kernel_allowed; |
130 | bool kvm_eventfds_allowed; |
131 | bool kvm_irqfds_allowed; |
132 | bool kvm_resamplefds_allowed; |
133 | bool kvm_msi_via_irqfd_allowed; |
134 | bool kvm_gsi_routing_allowed; |
135 | bool kvm_gsi_direct_mapping; |
136 | bool kvm_allowed; |
137 | bool kvm_readonly_mem_allowed; |
138 | bool kvm_vm_attributes_allowed; |
139 | bool kvm_direct_msi_allowed; |
140 | bool kvm_ioeventfd_any_length_allowed; |
141 | bool kvm_msi_use_devid; |
142 | static bool kvm_immediate_exit; |
143 | |
static const KVMCapabilityInfo kvm_required_capabilities[] = {
145 | KVM_CAP_INFO(USER_MEMORY), |
146 | KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), |
147 | KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), |
148 | KVM_CAP_LAST_INFO |
149 | }; |
150 | |
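/* Protects the slot array of a KVMMemoryListener */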
151 | #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock) |
152 | #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock) |
153 | |
154 | int kvm_get_max_memslots(void) |
155 | { |
156 | KVMState *s = KVM_STATE(current_machine->accelerator); |
157 | |
158 | return s->nr_slots; |
159 | } |
160 | |
161 | bool kvm_memcrypt_enabled(void) |
162 | { |
163 | if (kvm_state && kvm_state->memcrypt_handle) { |
164 | return true; |
165 | } |
166 | |
167 | return false; |
168 | } |
169 | |
170 | int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) |
171 | { |
172 | if (kvm_state->memcrypt_handle && |
173 | kvm_state->memcrypt_encrypt_data) { |
174 | return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, |
175 | ptr, len); |
176 | } |
177 | |
178 | return 1; |
179 | } |
180 | |
181 | /* Called with KVMMemoryListener.slots_lock held */ |
182 | static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) |
183 | { |
184 | KVMState *s = kvm_state; |
185 | int i; |
186 | |
187 | for (i = 0; i < s->nr_slots; i++) { |
188 | if (kml->slots[i].memory_size == 0) { |
189 | return &kml->slots[i]; |
190 | } |
191 | } |
192 | |
193 | return NULL; |
194 | } |
195 | |
196 | bool kvm_has_free_slot(MachineState *ms) |
197 | { |
198 | KVMState *s = KVM_STATE(ms->accelerator); |
199 | bool result; |
200 | KVMMemoryListener *kml = &s->memory_listener; |
201 | |
202 | kvm_slots_lock(kml); |
203 | result = !!kvm_get_free_slot(kml); |
204 | kvm_slots_unlock(kml); |
205 | |
206 | return result; |
207 | } |
208 | |
209 | /* Called with KVMMemoryListener.slots_lock held */ |
210 | static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) |
211 | { |
212 | KVMSlot *slot = kvm_get_free_slot(kml); |
213 | |
214 | if (slot) { |
215 | return slot; |
216 | } |
217 | |
    fprintf(stderr, "%s: no free slot available\n", __func__);
219 | abort(); |
220 | } |
221 | |
222 | static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, |
223 | hwaddr start_addr, |
224 | hwaddr size) |
225 | { |
226 | KVMState *s = kvm_state; |
227 | int i; |
228 | |
229 | for (i = 0; i < s->nr_slots; i++) { |
230 | KVMSlot *mem = &kml->slots[i]; |
231 | |
232 | if (start_addr == mem->start_addr && size == mem->memory_size) { |
233 | return mem; |
234 | } |
235 | } |
236 | |
237 | return NULL; |
238 | } |
239 | |
240 | /* |
241 | * Calculate and align the start address and the size of the section. |
242 | * Return the size. If the size is 0, the aligned section is empty. |
243 | */ |
244 | static hwaddr kvm_align_section(MemoryRegionSection *section, |
245 | hwaddr *start) |
246 | { |
247 | hwaddr size = int128_get64(section->size); |
248 | hwaddr delta, aligned; |
249 | |
    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to the next page boundary and truncate the size to the
       previous page boundary. */
253 | aligned = ROUND_UP(section->offset_within_address_space, |
254 | qemu_real_host_page_size); |
255 | delta = aligned - section->offset_within_address_space; |
256 | *start = aligned; |
257 | if (delta > size) { |
258 | return 0; |
259 | } |
260 | |
261 | return (size - delta) & qemu_real_host_page_mask; |
262 | } |
263 | |
264 | int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, |
265 | hwaddr *phys_addr) |
266 | { |
267 | KVMMemoryListener *kml = &s->memory_listener; |
268 | int i, ret = 0; |
269 | |
270 | kvm_slots_lock(kml); |
271 | for (i = 0; i < s->nr_slots; i++) { |
272 | KVMSlot *mem = &kml->slots[i]; |
273 | |
274 | if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { |
275 | *phys_addr = mem->start_addr + (ram - mem->ram); |
276 | ret = 1; |
277 | break; |
278 | } |
279 | } |
280 | kvm_slots_unlock(kml); |
281 | |
282 | return ret; |
283 | } |
284 | |
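/*
 * Update the kernel's view of @slot with KVM_SET_USER_MEMORY_REGION.
 * Toggling KVM_MEM_READONLY on an existing slot requires deleting the
 * slot (size 0) and recreating it, which is what the !@new path below
 * does.
 */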
285 | static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) |
286 | { |
287 | KVMState *s = kvm_state; |
288 | struct kvm_userspace_memory_region mem; |
289 | int ret; |
290 | |
291 | mem.slot = slot->slot | (kml->as_id << 16); |
292 | mem.guest_phys_addr = slot->start_addr; |
293 | mem.userspace_addr = (unsigned long)slot->ram; |
294 | mem.flags = slot->flags; |
295 | |
296 | if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { |
297 | /* Set the slot size to 0 before setting the slot to the desired |
298 | * value. This is needed based on KVM commit 75d61fbc. */ |
299 | mem.memory_size = 0; |
300 | kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); |
301 | } |
302 | mem.memory_size = slot->memory_size; |
303 | ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); |
304 | slot->old_flags = mem.flags; |
305 | trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, |
306 | mem.memory_size, mem.userspace_addr, ret); |
307 | return ret; |
308 | } |
309 | |
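/*
 * "Destroy" a vCPU: unmap its kvm_run area and park its fd on
 * kvm_parked_vcpus for later reuse.
 */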
310 | int kvm_destroy_vcpu(CPUState *cpu) |
311 | { |
312 | KVMState *s = kvm_state; |
313 | long mmap_size; |
314 | struct KVMParkedVcpu *vcpu = NULL; |
315 | int ret = 0; |
316 | |
    DPRINTF("kvm_destroy_vcpu\n");
318 | |
319 | ret = kvm_arch_destroy_vcpu(cpu); |
320 | if (ret < 0) { |
321 | goto err; |
322 | } |
323 | |
324 | mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); |
325 | if (mmap_size < 0) { |
326 | ret = mmap_size; |
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
328 | goto err; |
329 | } |
330 | |
331 | ret = munmap(cpu->kvm_run, mmap_size); |
332 | if (ret < 0) { |
333 | goto err; |
334 | } |
335 | |
336 | vcpu = g_malloc0(sizeof(*vcpu)); |
337 | vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); |
338 | vcpu->kvm_fd = cpu->kvm_fd; |
339 | QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); |
340 | err: |
341 | return ret; |
342 | } |
343 | |
344 | static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) |
345 | { |
346 | struct KVMParkedVcpu *cpu; |
347 | |
348 | QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { |
349 | if (cpu->vcpu_id == vcpu_id) { |
350 | int kvm_fd; |
351 | |
352 | QLIST_REMOVE(cpu, node); |
353 | kvm_fd = cpu->kvm_fd; |
354 | g_free(cpu); |
355 | return kvm_fd; |
356 | } |
357 | } |
358 | |
359 | return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); |
360 | } |
361 | |
362 | int kvm_init_vcpu(CPUState *cpu) |
363 | { |
364 | KVMState *s = kvm_state; |
365 | long mmap_size; |
366 | int ret; |
367 | |
    DPRINTF("kvm_init_vcpu\n");
369 | |
370 | ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); |
371 | if (ret < 0) { |
        DPRINTF("kvm_create_vcpu failed\n");
373 | goto err; |
374 | } |
375 | |
376 | cpu->kvm_fd = ret; |
377 | cpu->kvm_state = s; |
378 | cpu->vcpu_dirty = true; |
379 | |
380 | mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); |
381 | if (mmap_size < 0) { |
382 | ret = mmap_size; |
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
384 | goto err; |
385 | } |
386 | |
387 | cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, |
388 | cpu->kvm_fd, 0); |
389 | if (cpu->kvm_run == MAP_FAILED) { |
390 | ret = -errno; |
        DPRINTF("mmap'ing vcpu state failed\n");
392 | goto err; |
393 | } |
394 | |
395 | if (s->coalesced_mmio && !s->coalesced_mmio_ring) { |
396 | s->coalesced_mmio_ring = |
397 | (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; |
398 | } |
399 | |
400 | ret = kvm_arch_init_vcpu(cpu); |
401 | err: |
402 | return ret; |
403 | } |
404 | |
405 | /* |
406 | * dirty pages logging control |
407 | */ |
408 | |
409 | static int kvm_mem_flags(MemoryRegion *mr) |
410 | { |
411 | bool readonly = mr->readonly || memory_region_is_romd(mr); |
412 | int flags = 0; |
413 | |
414 | if (memory_region_get_dirty_log_mask(mr) != 0) { |
415 | flags |= KVM_MEM_LOG_DIRTY_PAGES; |
416 | } |
417 | if (readonly && kvm_readonly_mem_allowed) { |
418 | flags |= KVM_MEM_READONLY; |
419 | } |
420 | return flags; |
421 | } |
422 | |
423 | /* Called with KVMMemoryListener.slots_lock held */ |
424 | static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, |
425 | MemoryRegion *mr) |
426 | { |
427 | mem->flags = kvm_mem_flags(mr); |
428 | |
    /* If the flags have not changed, there is no need to issue the ioctl */
430 | if (mem->flags == mem->old_flags) { |
431 | return 0; |
432 | } |
433 | |
434 | return kvm_set_user_memory_region(kml, mem, false); |
435 | } |
436 | |
437 | static int kvm_section_update_flags(KVMMemoryListener *kml, |
438 | MemoryRegionSection *section) |
439 | { |
440 | hwaddr start_addr, size; |
441 | KVMSlot *mem; |
442 | int ret = 0; |
443 | |
444 | size = kvm_align_section(section, &start_addr); |
445 | if (!size) { |
446 | return 0; |
447 | } |
448 | |
449 | kvm_slots_lock(kml); |
450 | |
451 | mem = kvm_lookup_matching_slot(kml, start_addr, size); |
452 | if (!mem) { |
453 | /* We don't have a slot if we want to trap every access. */ |
454 | goto out; |
455 | } |
456 | |
457 | ret = kvm_slot_update_flags(kml, mem, section->mr); |
458 | |
459 | out: |
460 | kvm_slots_unlock(kml); |
461 | return ret; |
462 | } |
463 | |
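/*
 * MemoryListener callbacks: dirty logging starts when the first logging
 * client appears on a section (old == 0) and stops when the last one
 * goes away (new == 0).
 */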
464 | static void kvm_log_start(MemoryListener *listener, |
465 | MemoryRegionSection *section, |
466 | int old, int new) |
467 | { |
468 | KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); |
469 | int r; |
470 | |
471 | if (old != 0) { |
472 | return; |
473 | } |
474 | |
475 | r = kvm_section_update_flags(kml, section); |
476 | if (r < 0) { |
477 | abort(); |
478 | } |
479 | } |
480 | |
481 | static void kvm_log_stop(MemoryListener *listener, |
482 | MemoryRegionSection *section, |
483 | int old, int new) |
484 | { |
485 | KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); |
486 | int r; |
487 | |
488 | if (new != 0) { |
489 | return; |
490 | } |
491 | |
492 | r = kvm_section_update_flags(kml, section); |
493 | if (r < 0) { |
494 | abort(); |
495 | } |
496 | } |
497 | |
/* Get KVM's dirty pages bitmap and update QEMU's */
499 | static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, |
500 | unsigned long *bitmap) |
501 | { |
502 | ram_addr_t start = section->offset_within_region + |
503 | memory_region_get_ram_addr(section->mr); |
504 | ram_addr_t pages = int128_get64(section->size) / getpagesize(); |
505 | |
506 | cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); |
507 | return 0; |
508 | } |
509 | |
510 | #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) |
511 | |
512 | /** |
513 | * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space |
514 | * |
 * This function first fetches the dirty bitmap from the kernel and
 * then updates QEMU's dirty bitmap.
 *
 * NOTE: the caller must hold kml->slots_lock.
519 | * |
520 | * @kml: the KVM memory listener object |
521 | * @section: the memory section to sync the dirty bitmap with |
522 | */ |
523 | static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, |
524 | MemoryRegionSection *section) |
525 | { |
526 | KVMState *s = kvm_state; |
527 | struct kvm_dirty_log d = {}; |
528 | KVMSlot *mem; |
529 | hwaddr start_addr, size; |
530 | int ret = 0; |
531 | |
532 | size = kvm_align_section(section, &start_addr); |
533 | if (size) { |
534 | mem = kvm_lookup_matching_slot(kml, start_addr, size); |
535 | if (!mem) { |
536 | /* We don't have a slot if we want to trap every access. */ |
537 | goto out; |
538 | } |
539 | |
        /* XXX bad kernel interface alert
         * For the dirty bitmap, the kernel allocates an array whose
         * size is aligned to bits-per-long.  But when the kernel is
         * 64-bit and userspace is 32-bit, userspace can't align to the
         * same bits-per-long, since sizeof(long) differs between kernel
         * and user space.  Userspace would then provide a buffer that
         * may be 4 bytes smaller than the kernel uses, resulting in
         * userspace memory corruption (which is not detectable by
         * valgrind either, in most cases).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here,
         * in the hope that sizeof(long) won't become >8 any time soon.
         */
552 | size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), |
553 | /*HOST_LONG_BITS*/ 64) / 8; |
554 | if (!mem->dirty_bmap) { |
555 | /* Allocate on the first log_sync, once and for all */ |
556 | mem->dirty_bmap = g_malloc0(size); |
557 | } |
558 | |
559 | d.dirty_bitmap = mem->dirty_bmap; |
560 | d.slot = mem->slot | (kml->as_id << 16); |
561 | if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { |
            DPRINTF("ioctl failed %d\n", errno);
563 | ret = -1; |
564 | goto out; |
565 | } |
566 | |
567 | kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); |
568 | } |
569 | out: |
570 | return ret; |
571 | } |
572 | |
573 | /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */ |
574 | #define KVM_CLEAR_LOG_SHIFT 6 |
575 | #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT) |
576 | #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN) |
577 | |
578 | /** |
579 | * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range |
580 | * |
581 | * NOTE: this will be a no-op if we haven't enabled manual dirty log |
582 | * protection in the host kernel because in that case this operation |
583 | * will be done within log_sync(). |
584 | * |
585 | * @kml: the kvm memory listener |
586 | * @section: the memory range to clear dirty bitmap |
587 | */ |
588 | static int kvm_physical_log_clear(KVMMemoryListener *kml, |
589 | MemoryRegionSection *section) |
590 | { |
591 | KVMState *s = kvm_state; |
592 | struct kvm_clear_dirty_log d; |
593 | uint64_t start, end, bmap_start, start_delta, bmap_npages, size; |
594 | unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; |
595 | KVMSlot *mem = NULL; |
596 | int ret, i; |
597 | |
598 | if (!s->manual_dirty_log_protect) { |
599 | /* No need to do explicit clear */ |
600 | return 0; |
601 | } |
602 | |
603 | start = section->offset_within_address_space; |
604 | size = int128_get64(section->size); |
605 | |
606 | if (!size) { |
607 | /* Nothing more we can do... */ |
608 | return 0; |
609 | } |
610 | |
611 | kvm_slots_lock(kml); |
612 | |
613 | /* Find any possible slot that covers the section */ |
614 | for (i = 0; i < s->nr_slots; i++) { |
615 | mem = &kml->slots[i]; |
616 | if (mem->start_addr <= start && |
617 | start + size <= mem->start_addr + mem->memory_size) { |
618 | break; |
619 | } |
620 | } |
621 | |
622 | /* |
     * We should always find exactly one memslot by this point;
     * otherwise something is wrong in the layers above us.
625 | */ |
626 | assert(mem && i != s->nr_slots); |
627 | |
628 | /* |
629 | * We need to extend either the start or the size or both to |
     * satisfy the KVM interface requirement.  First, align the start
     * address down to a boundary of 64 host pages.
632 | */ |
633 | bmap_start = (start - mem->start_addr) & KVM_CLEAR_LOG_MASK; |
634 | start_delta = start - mem->start_addr - bmap_start; |
635 | bmap_start /= psize; |
636 | |
637 | /* |
638 | * The kernel interface has restriction on the size too, that either: |
639 | * |
640 | * (1) the size is 64 host pages aligned (just like the start), or |
641 | * (2) the size fills up until the end of the KVM memslot. |
642 | */ |
643 | bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN) |
644 | << KVM_CLEAR_LOG_SHIFT; |
645 | end = mem->memory_size / psize; |
646 | if (bmap_npages > end - bmap_start) { |
647 | bmap_npages = end - bmap_start; |
648 | } |
649 | start_delta /= psize; |
650 | |
651 | /* |
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits, otherwise we might
     * accidentally clear bits that have not yet been synced from the
     * kernel into QEMU's bitmap and thereby lose track of guest
     * modifications to those pages (which can directly lead to guest
     * data loss or panic after migration).
658 | * |
659 | * Layout of the KVMSlot.dirty_bmap: |
660 | * |
661 | * |<-------- bmap_npages -----------..>| |
662 | * [1] |
663 | * start_delta size |
664 | * |----------------|-------------|------------------|------------| |
665 | * ^ ^ ^ ^ |
666 | * | | | | |
667 | * start bmap_start (start) end |
668 | * of memslot of memslot |
669 | * |
670 | * [1] bmap_npages can be aligned to either 64 pages or the end of slot |
671 | */ |
672 | |
673 | assert(bmap_start % BITS_PER_LONG == 0); |
674 | /* We should never do log_clear before log_sync */ |
675 | assert(mem->dirty_bmap); |
676 | if (start_delta) { |
677 | /* Slow path - we need to manipulate a temp bitmap */ |
678 | bmap_clear = bitmap_new(bmap_npages); |
679 | bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap, |
680 | bmap_start, start_delta + size / psize); |
681 | /* |
         * Clear the bits at the start that the caller did not ask for;
         * they exist only because we extended the range for 64-page
         * alignment.
685 | */ |
686 | bitmap_clear(bmap_clear, 0, start_delta); |
687 | d.dirty_bitmap = bmap_clear; |
688 | } else { |
689 | /* Fast path - start address aligns well with BITS_PER_LONG */ |
690 | d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start); |
691 | } |
692 | |
693 | d.first_page = bmap_start; |
694 | /* It should never overflow. If it happens, say something */ |
695 | assert(bmap_npages <= UINT32_MAX); |
696 | d.num_pages = bmap_npages; |
697 | d.slot = mem->slot | (kml->as_id << 16); |
698 | |
699 | if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) { |
700 | ret = -errno; |
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%" PRIx64 ", size=0x%" PRIx32 ", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
705 | } else { |
706 | ret = 0; |
707 | trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages); |
708 | } |
709 | |
710 | /* |
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap for the memslot as well; then, if another user
     * clears the same region, we know not to clear it again on the
     * remote side, since that would cause data loss too.
715 | */ |
716 | bitmap_clear(mem->dirty_bmap, bmap_start + start_delta, |
717 | size / psize); |
718 | /* This handles the NULL case well */ |
719 | g_free(bmap_clear); |
720 | |
721 | kvm_slots_unlock(kml); |
722 | |
723 | return ret; |
724 | } |
725 | |
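/*
 * Coalesced MMIO/PIO: writes to registered zones are queued by KVM in a
 * per-VM ring buffer instead of causing an exit for every access; QEMU
 * replays the queued writes when it next flushes the ring.
 */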
726 | static void kvm_coalesce_mmio_region(MemoryListener *listener, |
                                     MemoryRegionSection *section,
728 | hwaddr start, hwaddr size) |
729 | { |
730 | KVMState *s = kvm_state; |
731 | |
732 | if (s->coalesced_mmio) { |
733 | struct kvm_coalesced_mmio_zone zone; |
734 | |
735 | zone.addr = start; |
736 | zone.size = size; |
737 | zone.pad = 0; |
738 | |
739 | (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); |
740 | } |
741 | } |
742 | |
743 | static void kvm_uncoalesce_mmio_region(MemoryListener *listener, |
                                       MemoryRegionSection *section,
745 | hwaddr start, hwaddr size) |
746 | { |
747 | KVMState *s = kvm_state; |
748 | |
749 | if (s->coalesced_mmio) { |
750 | struct kvm_coalesced_mmio_zone zone; |
751 | |
752 | zone.addr = start; |
753 | zone.size = size; |
754 | zone.pad = 0; |
755 | |
756 | (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); |
757 | } |
758 | } |
759 | |
760 | static void kvm_coalesce_pio_add(MemoryListener *listener, |
761 | MemoryRegionSection *section, |
762 | hwaddr start, hwaddr size) |
763 | { |
764 | KVMState *s = kvm_state; |
765 | |
766 | if (s->coalesced_pio) { |
767 | struct kvm_coalesced_mmio_zone zone; |
768 | |
769 | zone.addr = start; |
770 | zone.size = size; |
771 | zone.pio = 1; |
772 | |
773 | (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); |
774 | } |
775 | } |
776 | |
777 | static void kvm_coalesce_pio_del(MemoryListener *listener, |
778 | MemoryRegionSection *section, |
779 | hwaddr start, hwaddr size) |
780 | { |
781 | KVMState *s = kvm_state; |
782 | |
783 | if (s->coalesced_pio) { |
784 | struct kvm_coalesced_mmio_zone zone; |
785 | |
786 | zone.addr = start; |
787 | zone.size = size; |
788 | zone.pio = 1; |
789 | |
790 | (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); |
791 | } |
792 | } |
793 | |
794 | static MemoryListener kvm_coalesced_pio_listener = { |
795 | .coalesced_io_add = kvm_coalesce_pio_add, |
796 | .coalesced_io_del = kvm_coalesce_pio_del, |
797 | }; |
798 | |
799 | int kvm_check_extension(KVMState *s, unsigned int extension) |
800 | { |
801 | int ret; |
802 | |
803 | ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); |
804 | if (ret < 0) { |
805 | ret = 0; |
806 | } |
807 | |
808 | return ret; |
809 | } |
810 | |
811 | int kvm_vm_check_extension(KVMState *s, unsigned int extension) |
812 | { |
813 | int ret; |
814 | |
815 | ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); |
816 | if (ret < 0) { |
817 | /* VM wide version not implemented, use global one instead */ |
818 | ret = kvm_check_extension(s, extension); |
819 | } |
820 | |
821 | return ret; |
822 | } |
823 | |
824 | static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) |
825 | { |
826 | #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) |
827 | /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN |
828 | * endianness, but the memory core hands them in target endianness. |
829 | * For example, PPC is always treated as big-endian even if running |
830 | * on KVM and on PPC64LE. Correct here. |
831 | */ |
832 | switch (size) { |
833 | case 2: |
834 | val = bswap16(val); |
835 | break; |
836 | case 4: |
837 | val = bswap32(val); |
838 | break; |
839 | } |
840 | #endif |
841 | return val; |
842 | } |
843 | |
844 | static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, |
845 | bool assign, uint32_t size, bool datamatch) |
846 | { |
847 | int ret; |
848 | struct kvm_ioeventfd iofd = { |
849 | .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, |
850 | .addr = addr, |
851 | .len = size, |
852 | .flags = 0, |
853 | .fd = fd, |
854 | }; |
855 | |
856 | trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, |
857 | datamatch); |
858 | if (!kvm_enabled()) { |
859 | return -ENOSYS; |
860 | } |
861 | |
862 | if (datamatch) { |
863 | iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; |
864 | } |
865 | if (!assign) { |
866 | iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; |
867 | } |
868 | |
869 | ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); |
870 | |
871 | if (ret < 0) { |
872 | return -errno; |
873 | } |
874 | |
875 | return 0; |
876 | } |
877 | |
878 | static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, |
879 | bool assign, uint32_t size, bool datamatch) |
880 | { |
881 | struct kvm_ioeventfd kick = { |
882 | .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, |
883 | .addr = addr, |
884 | .flags = KVM_IOEVENTFD_FLAG_PIO, |
885 | .len = size, |
886 | .fd = fd, |
887 | }; |
888 | int r; |
889 | trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); |
890 | if (!kvm_enabled()) { |
891 | return -ENOSYS; |
892 | } |
893 | if (datamatch) { |
894 | kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; |
895 | } |
896 | if (!assign) { |
897 | kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; |
898 | } |
899 | r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); |
900 | if (r < 0) { |
901 | return r; |
902 | } |
903 | return 0; |
904 | } |
905 | |
906 | |
907 | static int kvm_check_many_ioeventfds(void) |
908 | { |
909 | /* Userspace can use ioeventfd for io notification. This requires a host |
910 | * that supports eventfd(2) and an I/O thread; since eventfd does not |
911 | * support SIGIO it cannot interrupt the vcpu. |
912 | * |
913 | * Older kernels have a 6 device limit on the KVM io bus. Find out so we |
914 | * can avoid creating too many ioeventfds. |
915 | */ |
916 | #if defined(CONFIG_EVENTFD) |
917 | int ioeventfds[7]; |
918 | int i, ret = 0; |
919 | for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { |
920 | ioeventfds[i] = eventfd(0, EFD_CLOEXEC); |
921 | if (ioeventfds[i] < 0) { |
922 | break; |
923 | } |
924 | ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); |
925 | if (ret < 0) { |
926 | close(ioeventfds[i]); |
927 | break; |
928 | } |
929 | } |
930 | |
931 | /* Decide whether many devices are supported or not */ |
932 | ret = i == ARRAY_SIZE(ioeventfds); |
933 | |
934 | while (i-- > 0) { |
935 | kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); |
936 | close(ioeventfds[i]); |
937 | } |
938 | return ret; |
939 | #else |
940 | return 0; |
941 | #endif |
942 | } |
943 | |
944 | static const KVMCapabilityInfo * |
945 | kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) |
946 | { |
947 | while (list->name) { |
948 | if (!kvm_check_extension(s, list->value)) { |
949 | return list; |
950 | } |
951 | list++; |
952 | } |
953 | return NULL; |
954 | } |
955 | |
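/*
 * Register (@add = true) or unregister the KVM memory slot that backs
 * @section.  Non-RAM sections are skipped unless they can be mapped
 * read-only; a ROM device that has left romd_mode gets its slot removed
 * so that all accesses trap to userspace.
 */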
956 | static void kvm_set_phys_mem(KVMMemoryListener *kml, |
957 | MemoryRegionSection *section, bool add) |
958 | { |
959 | KVMSlot *mem; |
960 | int err; |
961 | MemoryRegion *mr = section->mr; |
962 | bool writeable = !mr->readonly && !mr->rom_device; |
963 | hwaddr start_addr, size; |
964 | void *ram; |
965 | |
966 | if (!memory_region_is_ram(mr)) { |
967 | if (writeable || !kvm_readonly_mem_allowed) { |
968 | return; |
969 | } else if (!mr->romd_mode) { |
970 | /* If the memory device is not in romd_mode, then we actually want |
971 | * to remove the kvm memory slot so all accesses will trap. */ |
972 | add = false; |
973 | } |
974 | } |
975 | |
976 | size = kvm_align_section(section, &start_addr); |
977 | if (!size) { |
978 | return; |
979 | } |
980 | |
981 | /* use aligned delta to align the ram address */ |
982 | ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + |
983 | (start_addr - section->offset_within_address_space); |
984 | |
985 | kvm_slots_lock(kml); |
986 | |
987 | if (!add) { |
988 | mem = kvm_lookup_matching_slot(kml, start_addr, size); |
989 | if (!mem) { |
990 | goto out; |
991 | } |
992 | if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { |
993 | kvm_physical_sync_dirty_bitmap(kml, section); |
994 | } |
995 | |
996 | /* unregister the slot */ |
997 | g_free(mem->dirty_bmap); |
998 | mem->dirty_bmap = NULL; |
999 | mem->memory_size = 0; |
1000 | mem->flags = 0; |
1001 | err = kvm_set_user_memory_region(kml, mem, false); |
1002 | if (err) { |
            fprintf(stderr, "%s: error unregistering slot: %s\n",
                    __func__, strerror(-err));
1005 | abort(); |
1006 | } |
1007 | goto out; |
1008 | } |
1009 | |
1010 | /* register the new slot */ |
1011 | mem = kvm_alloc_slot(kml); |
1012 | mem->memory_size = size; |
1013 | mem->start_addr = start_addr; |
1014 | mem->ram = ram; |
1015 | mem->flags = kvm_mem_flags(mr); |
1016 | |
1017 | err = kvm_set_user_memory_region(kml, mem, true); |
1018 | if (err) { |
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
1021 | abort(); |
1022 | } |
1023 | |
1024 | out: |
1025 | kvm_slots_unlock(kml); |
1026 | } |
1027 | |
1028 | static void kvm_region_add(MemoryListener *listener, |
1029 | MemoryRegionSection *section) |
1030 | { |
1031 | KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); |
1032 | |
1033 | memory_region_ref(section->mr); |
1034 | kvm_set_phys_mem(kml, section, true); |
1035 | } |
1036 | |
1037 | static void kvm_region_del(MemoryListener *listener, |
1038 | MemoryRegionSection *section) |
1039 | { |
1040 | KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); |
1041 | |
1042 | kvm_set_phys_mem(kml, section, false); |
1043 | memory_region_unref(section->mr); |
1044 | } |
1045 | |
1046 | static void kvm_log_sync(MemoryListener *listener, |
1047 | MemoryRegionSection *section) |
1048 | { |
1049 | KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); |
1050 | int r; |
1051 | |
1052 | kvm_slots_lock(kml); |
1053 | r = kvm_physical_sync_dirty_bitmap(kml, section); |
1054 | kvm_slots_unlock(kml); |
1055 | if (r < 0) { |
1056 | abort(); |
1057 | } |
1058 | } |
1059 | |
1060 | static void kvm_log_clear(MemoryListener *listener, |
1061 | MemoryRegionSection *section) |
1062 | { |
1063 | KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); |
1064 | int r; |
1065 | |
1066 | r = kvm_physical_log_clear(kml, section); |
1067 | if (r < 0) { |
        error_report_once("%s: kvm log clear failed: mr=%s "
                          "offset=%" HWADDR_PRIx " size=%" PRIx64, __func__,
                          section->mr->name, section->offset_within_region,
                          int128_get64(section->size));
1072 | abort(); |
1073 | } |
1074 | } |
1075 | |
1076 | static void kvm_mem_ioeventfd_add(MemoryListener *listener, |
1077 | MemoryRegionSection *section, |
1078 | bool match_data, uint64_t data, |
1079 | EventNotifier *e) |
1080 | { |
1081 | int fd = event_notifier_get_fd(e); |
1082 | int r; |
1083 | |
1084 | r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, |
1085 | data, true, int128_get64(section->size), |
1086 | match_data); |
1087 | if (r < 0) { |
        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
1090 | abort(); |
1091 | } |
1092 | } |
1093 | |
1094 | static void kvm_mem_ioeventfd_del(MemoryListener *listener, |
1095 | MemoryRegionSection *section, |
1096 | bool match_data, uint64_t data, |
1097 | EventNotifier *e) |
1098 | { |
1099 | int fd = event_notifier_get_fd(e); |
1100 | int r; |
1101 | |
1102 | r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, |
1103 | data, false, int128_get64(section->size), |
1104 | match_data); |
1105 | if (r < 0) { |
        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
1108 | abort(); |
1109 | } |
1110 | } |
1111 | |
1112 | static void kvm_io_ioeventfd_add(MemoryListener *listener, |
1113 | MemoryRegionSection *section, |
1114 | bool match_data, uint64_t data, |
1115 | EventNotifier *e) |
1116 | { |
1117 | int fd = event_notifier_get_fd(e); |
1118 | int r; |
1119 | |
1120 | r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, |
1121 | data, true, int128_get64(section->size), |
1122 | match_data); |
1123 | if (r < 0) { |
        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
1126 | abort(); |
1127 | } |
1128 | } |
1129 | |
1130 | static void kvm_io_ioeventfd_del(MemoryListener *listener, |
1131 | MemoryRegionSection *section, |
1132 | bool match_data, uint64_t data, |
1133 | EventNotifier *e) |
1135 | { |
1136 | int fd = event_notifier_get_fd(e); |
1137 | int r; |
1138 | |
1139 | r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, |
1140 | data, false, int128_get64(section->size), |
1141 | match_data); |
1142 | if (r < 0) { |
        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
1145 | abort(); |
1146 | } |
1147 | } |
1148 | |
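/*
 * Initialize @kml's slot array and register it as a listener on @as.
 * @as_id is folded into the upper 16 bits of every slot number passed
 * to the kernel (see kvm_set_user_memory_region()).
 */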
1149 | void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, |
1150 | AddressSpace *as, int as_id) |
1151 | { |
1152 | int i; |
1153 | |
1154 | qemu_mutex_init(&kml->slots_lock); |
1155 | kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); |
1156 | kml->as_id = as_id; |
1157 | |
1158 | for (i = 0; i < s->nr_slots; i++) { |
1159 | kml->slots[i].slot = i; |
1160 | } |
1161 | |
1162 | kml->listener.region_add = kvm_region_add; |
1163 | kml->listener.region_del = kvm_region_del; |
1164 | kml->listener.log_start = kvm_log_start; |
1165 | kml->listener.log_stop = kvm_log_stop; |
1166 | kml->listener.log_sync = kvm_log_sync; |
1167 | kml->listener.log_clear = kvm_log_clear; |
1168 | kml->listener.priority = 10; |
1169 | |
1170 | memory_listener_register(&kml->listener, as); |
1171 | |
1172 | for (i = 0; i < s->nr_as; ++i) { |
1173 | if (!s->as[i].as) { |
1174 | s->as[i].as = as; |
1175 | s->as[i].ml = kml; |
1176 | break; |
1177 | } |
1178 | } |
1179 | } |
1180 | |
1181 | static MemoryListener kvm_io_listener = { |
1182 | .eventfd_add = kvm_io_ioeventfd_add, |
1183 | .eventfd_del = kvm_io_ioeventfd_del, |
1184 | .priority = 10, |
1185 | }; |
1186 | |
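/*
 * Set the level of an in-kernel irqchip IRQ line.  Uses
 * KVM_IRQ_LINE_STATUS when available, so the kernel reports whether the
 * interrupt was actually delivered.
 */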
1187 | int kvm_set_irq(KVMState *s, int irq, int level) |
1188 | { |
1189 | struct kvm_irq_level event; |
1190 | int ret; |
1191 | |
1192 | assert(kvm_async_interrupts_enabled()); |
1193 | |
1194 | event.level = level; |
1195 | event.irq = irq; |
1196 | ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); |
1197 | if (ret < 0) { |
        perror("kvm_set_irq");
1199 | abort(); |
1200 | } |
1201 | |
1202 | return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status; |
1203 | } |
1204 | |
1205 | #ifdef KVM_CAP_IRQ_ROUTING |
1206 | typedef struct KVMMSIRoute { |
1207 | struct kvm_irq_routing_entry kroute; |
1208 | QTAILQ_ENTRY(KVMMSIRoute) entry; |
1209 | } KVMMSIRoute; |
1210 | |
1211 | static void set_gsi(KVMState *s, unsigned int gsi) |
1212 | { |
1213 | set_bit(gsi, s->used_gsi_bitmap); |
1214 | } |
1215 | |
1216 | static void clear_gsi(KVMState *s, unsigned int gsi) |
1217 | { |
1218 | clear_bit(gsi, s->used_gsi_bitmap); |
1219 | } |
1220 | |
1221 | void kvm_init_irq_routing(KVMState *s) |
1222 | { |
1223 | int gsi_count, i; |
1224 | |
1225 | gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; |
1226 | if (gsi_count > 0) { |
1227 | /* Round up so we can search ints using ffs */ |
1228 | s->used_gsi_bitmap = bitmap_new(gsi_count); |
1229 | s->gsi_count = gsi_count; |
1230 | } |
1231 | |
1232 | s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); |
1233 | s->nr_allocated_irq_routes = 0; |
1234 | |
1235 | if (!kvm_direct_msi_allowed) { |
1236 | for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { |
1237 | QTAILQ_INIT(&s->msi_hashtab[i]); |
1238 | } |
1239 | } |
1240 | |
1241 | kvm_arch_init_irq_routing(s); |
1242 | } |
1243 | |
1244 | void kvm_irqchip_commit_routes(KVMState *s) |
1245 | { |
1246 | int ret; |
1247 | |
1248 | if (kvm_gsi_direct_mapping()) { |
1249 | return; |
1250 | } |
1251 | |
1252 | if (!kvm_gsi_routing_enabled()) { |
1253 | return; |
1254 | } |
1255 | |
1256 | s->irq_routes->flags = 0; |
1257 | trace_kvm_irqchip_commit_routes(); |
1258 | ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); |
1259 | assert(ret == 0); |
1260 | } |
1261 | |
1262 | static void kvm_add_routing_entry(KVMState *s, |
1263 | struct kvm_irq_routing_entry *entry) |
1264 | { |
1265 | struct kvm_irq_routing_entry *new; |
1266 | int n, size; |
1267 | |
1268 | if (s->irq_routes->nr == s->nr_allocated_irq_routes) { |
1269 | n = s->nr_allocated_irq_routes * 2; |
1270 | if (n < 64) { |
1271 | n = 64; |
1272 | } |
1273 | size = sizeof(struct kvm_irq_routing); |
1274 | size += n * sizeof(*new); |
1275 | s->irq_routes = g_realloc(s->irq_routes, size); |
1276 | s->nr_allocated_irq_routes = n; |
1277 | } |
1278 | n = s->irq_routes->nr++; |
1279 | new = &s->irq_routes->entries[n]; |
1280 | |
1281 | *new = *entry; |
1282 | |
1283 | set_gsi(s, entry->gsi); |
1284 | } |
1285 | |
1286 | static int kvm_update_routing_entry(KVMState *s, |
1287 | struct kvm_irq_routing_entry *new_entry) |
1288 | { |
1289 | struct kvm_irq_routing_entry *entry; |
1290 | int n; |
1291 | |
1292 | for (n = 0; n < s->irq_routes->nr; n++) { |
1293 | entry = &s->irq_routes->entries[n]; |
1294 | if (entry->gsi != new_entry->gsi) { |
1295 | continue; |
1296 | } |
1297 | |
        if (!memcmp(entry, new_entry, sizeof *entry)) {
1299 | return 0; |
1300 | } |
1301 | |
1302 | *entry = *new_entry; |
1303 | |
1304 | return 0; |
1305 | } |
1306 | |
1307 | return -ESRCH; |
1308 | } |
1309 | |
1310 | void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) |
1311 | { |
1312 | struct kvm_irq_routing_entry e = {}; |
1313 | |
1314 | assert(pin < s->gsi_count); |
1315 | |
1316 | e.gsi = irq; |
1317 | e.type = KVM_IRQ_ROUTING_IRQCHIP; |
1318 | e.flags = 0; |
1319 | e.u.irqchip.irqchip = irqchip; |
1320 | e.u.irqchip.pin = pin; |
1321 | kvm_add_routing_entry(s, &e); |
1322 | } |
1323 | |
1324 | void kvm_irqchip_release_virq(KVMState *s, int virq) |
1325 | { |
1326 | struct kvm_irq_routing_entry *e; |
1327 | int i; |
1328 | |
1329 | if (kvm_gsi_direct_mapping()) { |
1330 | return; |
1331 | } |
1332 | |
1333 | for (i = 0; i < s->irq_routes->nr; i++) { |
1334 | e = &s->irq_routes->entries[i]; |
1335 | if (e->gsi == virq) { |
1336 | s->irq_routes->nr--; |
1337 | *e = s->irq_routes->entries[s->irq_routes->nr]; |
1338 | } |
1339 | } |
1340 | clear_gsi(s, virq); |
1341 | kvm_arch_release_virq_post(virq); |
1342 | trace_kvm_irqchip_release_virq(virq); |
1343 | } |
1344 | |
1345 | static unsigned int kvm_hash_msi(uint32_t data) |
1346 | { |
1347 | /* This is optimized for IA32 MSI layout. However, no other arch shall |
1348 | * repeat the mistake of not providing a direct MSI injection API. */ |
1349 | return data & 0xff; |
1350 | } |
1351 | |
1352 | static void kvm_flush_dynamic_msi_routes(KVMState *s) |
1353 | { |
1354 | KVMMSIRoute *route, *next; |
1355 | unsigned int hash; |
1356 | |
1357 | for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { |
1358 | QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { |
1359 | kvm_irqchip_release_virq(s, route->kroute.gsi); |
1360 | QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); |
1361 | g_free(route); |
1362 | } |
1363 | } |
1364 | } |
1365 | |
1366 | static int kvm_irqchip_get_virq(KVMState *s) |
1367 | { |
1368 | int next_virq; |
1369 | |
1370 | /* |
     * PIC and IOAPIC share the first 16 GSI numbers, so there are more
     * available GSI numbers than IRQ route entries.  Allocating a GSI
     * number can therefore succeed even though a new route entry cannot
     * be added.  When this happens, flush the dynamic MSI entries to
     * free up IRQ route entries.
1375 | */ |
1376 | if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { |
1377 | kvm_flush_dynamic_msi_routes(s); |
1378 | } |
1379 | |
1380 | /* Return the lowest unused GSI in the bitmap */ |
1381 | next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); |
1382 | if (next_virq >= s->gsi_count) { |
1383 | return -ENOSPC; |
1384 | } else { |
1385 | return next_virq; |
1386 | } |
1387 | } |
1388 | |
1389 | static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) |
1390 | { |
1391 | unsigned int hash = kvm_hash_msi(msg.data); |
1392 | KVMMSIRoute *route; |
1393 | |
1394 | QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { |
1395 | if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && |
1396 | route->kroute.u.msi.address_hi == (msg.address >> 32) && |
1397 | route->kroute.u.msi.data == le32_to_cpu(msg.data)) { |
1398 | return route; |
1399 | } |
1400 | } |
1401 | return NULL; |
1402 | } |
1403 | |
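/*
 * Inject an MSI.  When the kernel supports direct injection
 * (KVM_CAP_SIGNAL_MSI), use KVM_SIGNAL_MSI; otherwise allocate a
 * routing entry for the message, cache it in msi_hashtab, and raise it
 * as an ordinary GSI.
 */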
1404 | int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) |
1405 | { |
1406 | struct kvm_msi msi; |
1407 | KVMMSIRoute *route; |
1408 | |
1409 | if (kvm_direct_msi_allowed) { |
1410 | msi.address_lo = (uint32_t)msg.address; |
1411 | msi.address_hi = msg.address >> 32; |
1412 | msi.data = le32_to_cpu(msg.data); |
1413 | msi.flags = 0; |
1414 | memset(msi.pad, 0, sizeof(msi.pad)); |
1415 | |
1416 | return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); |
1417 | } |
1418 | |
1419 | route = kvm_lookup_msi_route(s, msg); |
1420 | if (!route) { |
1421 | int virq; |
1422 | |
1423 | virq = kvm_irqchip_get_virq(s); |
1424 | if (virq < 0) { |
1425 | return virq; |
1426 | } |
1427 | |
1428 | route = g_malloc0(sizeof(KVMMSIRoute)); |
1429 | route->kroute.gsi = virq; |
1430 | route->kroute.type = KVM_IRQ_ROUTING_MSI; |
1431 | route->kroute.flags = 0; |
1432 | route->kroute.u.msi.address_lo = (uint32_t)msg.address; |
1433 | route->kroute.u.msi.address_hi = msg.address >> 32; |
1434 | route->kroute.u.msi.data = le32_to_cpu(msg.data); |
1435 | |
1436 | kvm_add_routing_entry(s, &route->kroute); |
1437 | kvm_irqchip_commit_routes(s); |
1438 | |
1439 | QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, |
1440 | entry); |
1441 | } |
1442 | |
1443 | assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); |
1444 | |
1445 | return kvm_set_irq(s, route->kroute.gsi, 1); |
1446 | } |
1447 | |
1448 | int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) |
1449 | { |
1450 | struct kvm_irq_routing_entry kroute = {}; |
1451 | int virq; |
1452 | MSIMessage msg = {0, 0}; |
1453 | |
1454 | if (pci_available && dev) { |
1455 | msg = pci_get_msi_message(dev, vector); |
1456 | } |
1457 | |
1458 | if (kvm_gsi_direct_mapping()) { |
1459 | return kvm_arch_msi_data_to_gsi(msg.data); |
1460 | } |
1461 | |
1462 | if (!kvm_gsi_routing_enabled()) { |
1463 | return -ENOSYS; |
1464 | } |
1465 | |
1466 | virq = kvm_irqchip_get_virq(s); |
1467 | if (virq < 0) { |
1468 | return virq; |
1469 | } |
1470 | |
1471 | kroute.gsi = virq; |
1472 | kroute.type = KVM_IRQ_ROUTING_MSI; |
1473 | kroute.flags = 0; |
1474 | kroute.u.msi.address_lo = (uint32_t)msg.address; |
1475 | kroute.u.msi.address_hi = msg.address >> 32; |
1476 | kroute.u.msi.data = le32_to_cpu(msg.data); |
1477 | if (pci_available && kvm_msi_devid_required()) { |
1478 | kroute.flags = KVM_MSI_VALID_DEVID; |
1479 | kroute.u.msi.devid = pci_requester_id(dev); |
1480 | } |
1481 | if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { |
1482 | kvm_irqchip_release_virq(s, virq); |
1483 | return -EINVAL; |
1484 | } |
1485 | |
    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                    vector, virq);
1488 | |
1489 | kvm_add_routing_entry(s, &kroute); |
1490 | kvm_arch_add_msi_route_post(&kroute, vector, dev); |
1491 | kvm_irqchip_commit_routes(s); |
1492 | |
1493 | return virq; |
1494 | } |
1495 | |
1496 | int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, |
1497 | PCIDevice *dev) |
1498 | { |
1499 | struct kvm_irq_routing_entry kroute = {}; |
1500 | |
1501 | if (kvm_gsi_direct_mapping()) { |
1502 | return 0; |
1503 | } |
1504 | |
1505 | if (!kvm_irqchip_in_kernel()) { |
1506 | return -ENOSYS; |
1507 | } |
1508 | |
1509 | kroute.gsi = virq; |
1510 | kroute.type = KVM_IRQ_ROUTING_MSI; |
1511 | kroute.flags = 0; |
1512 | kroute.u.msi.address_lo = (uint32_t)msg.address; |
1513 | kroute.u.msi.address_hi = msg.address >> 32; |
1514 | kroute.u.msi.data = le32_to_cpu(msg.data); |
1515 | if (pci_available && kvm_msi_devid_required()) { |
1516 | kroute.flags = KVM_MSI_VALID_DEVID; |
1517 | kroute.u.msi.devid = pci_requester_id(dev); |
1518 | } |
1519 | if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { |
1520 | return -EINVAL; |
1521 | } |
1522 | |
1523 | trace_kvm_irqchip_update_msi_route(virq); |
1524 | |
1525 | return kvm_update_routing_entry(s, &kroute); |
1526 | } |
1527 | |
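/*
 * Bind (or, when !@assign, unbind) an eventfd to @virq via KVM_IRQFD,
 * so the kernel injects the interrupt whenever the eventfd is signalled.
 * An optional resample fd (@rfd) is notified on guest EOI of a
 * level-triggered interrupt.
 */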
1528 | static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, |
1529 | bool assign) |
1530 | { |
1531 | struct kvm_irqfd irqfd = { |
1532 | .fd = fd, |
1533 | .gsi = virq, |
1534 | .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, |
1535 | }; |
1536 | |
1537 | if (rfd != -1) { |
1538 | irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; |
1539 | irqfd.resamplefd = rfd; |
1540 | } |
1541 | |
1542 | if (!kvm_irqfds_enabled()) { |
1543 | return -ENOSYS; |
1544 | } |
1545 | |
1546 | return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); |
1547 | } |
1548 | |
1549 | int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) |
1550 | { |
1551 | struct kvm_irq_routing_entry kroute = {}; |
1552 | int virq; |
1553 | |
1554 | if (!kvm_gsi_routing_enabled()) { |
1555 | return -ENOSYS; |
1556 | } |
1557 | |
1558 | virq = kvm_irqchip_get_virq(s); |
1559 | if (virq < 0) { |
1560 | return virq; |
1561 | } |
1562 | |
1563 | kroute.gsi = virq; |
1564 | kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; |
1565 | kroute.flags = 0; |
1566 | kroute.u.adapter.summary_addr = adapter->summary_addr; |
1567 | kroute.u.adapter.ind_addr = adapter->ind_addr; |
1568 | kroute.u.adapter.summary_offset = adapter->summary_offset; |
1569 | kroute.u.adapter.ind_offset = adapter->ind_offset; |
1570 | kroute.u.adapter.adapter_id = adapter->adapter_id; |
1571 | |
1572 | kvm_add_routing_entry(s, &kroute); |
1573 | |
1574 | return virq; |
1575 | } |
1576 | |
1577 | int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) |
1578 | { |
1579 | struct kvm_irq_routing_entry kroute = {}; |
1580 | int virq; |
1581 | |
1582 | if (!kvm_gsi_routing_enabled()) { |
1583 | return -ENOSYS; |
1584 | } |
1585 | if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { |
1586 | return -ENOSYS; |
1587 | } |
1588 | virq = kvm_irqchip_get_virq(s); |
1589 | if (virq < 0) { |
1590 | return virq; |
1591 | } |
1592 | |
1593 | kroute.gsi = virq; |
1594 | kroute.type = KVM_IRQ_ROUTING_HV_SINT; |
1595 | kroute.flags = 0; |
1596 | kroute.u.hv_sint.vcpu = vcpu; |
1597 | kroute.u.hv_sint.sint = sint; |
1598 | |
1599 | kvm_add_routing_entry(s, &kroute); |
1600 | kvm_irqchip_commit_routes(s); |
1601 | |
1602 | return virq; |
1603 | } |
1604 | |
1605 | #else /* !KVM_CAP_IRQ_ROUTING */ |
1606 | |
1607 | void kvm_init_irq_routing(KVMState *s) |
1608 | { |
1609 | } |
1610 | |
1611 | void kvm_irqchip_release_virq(KVMState *s, int virq) |
1612 | { |
1613 | } |
1614 | |
1615 | int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) |
1616 | { |
1617 | abort(); |
1618 | } |
1619 | |
1620 | int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) |
1621 | { |
1622 | return -ENOSYS; |
1623 | } |
1624 | |
1625 | int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) |
1626 | { |
1627 | return -ENOSYS; |
1628 | } |
1629 | |
1630 | int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) |
1631 | { |
1632 | return -ENOSYS; |
1633 | } |
1634 | |
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
                                    bool assign)
1636 | { |
1637 | abort(); |
1638 | } |
1639 | |
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
1641 | { |
1642 | return -ENOSYS; |
1643 | } |
1644 | #endif /* !KVM_CAP_IRQ_ROUTING */ |
1645 | |
1646 | int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, |
1647 | EventNotifier *rn, int virq) |
1648 | { |
1649 | return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), |
1650 | rn ? event_notifier_get_fd(rn) : -1, virq, true); |
1651 | } |
1652 | |
1653 | int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, |
1654 | int virq) |
1655 | { |
1656 | return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, |
1657 | false); |
1658 | } |
1659 | |
1660 | int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, |
1661 | EventNotifier *rn, qemu_irq irq) |
1662 | { |
1663 | gpointer key, gsi; |
1664 | gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); |
1665 | |
1666 | if (!found) { |
1667 | return -ENXIO; |
1668 | } |
1669 | return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); |
1670 | } |
1671 | |
1672 | int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, |
1673 | qemu_irq irq) |
1674 | { |
1675 | gpointer key, gsi; |
1676 | gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); |
1677 | |
1678 | if (!found) { |
1679 | return -ENXIO; |
1680 | } |
1681 | return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); |
1682 | } |
1683 | |
1684 | void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) |
1685 | { |
1686 | g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); |
1687 | } |
1688 | |
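/*
 * Create the in-kernel irqchip if the host supports one, then set up
 * GSI routing and the qemu_irq-to-GSI hash table.
 */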
1689 | static void kvm_irqchip_create(MachineState *machine, KVMState *s) |
1690 | { |
1691 | int ret; |
1692 | |
1693 | if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { |
1694 | ; |
1695 | } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { |
1696 | ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); |
1697 | if (ret < 0) { |
            fprintf(stderr, "Enable kernel irqchip failed: %s\n",
                    strerror(-ret));
1699 | exit(1); |
1700 | } |
1701 | } else { |
1702 | return; |
1703 | } |
1704 | |
    /* First probe and see if there's an arch-specific hook to create the
1706 | * in-kernel irqchip for us */ |
1707 | ret = kvm_arch_irqchip_create(machine, s); |
1708 | if (ret == 0) { |
1709 | if (machine_kernel_irqchip_split(machine)) { |
            perror("Split IRQ chip mode not supported.");
1711 | exit(1); |
1712 | } else { |
1713 | ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); |
1714 | } |
1715 | } |
1716 | if (ret < 0) { |
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
1718 | exit(1); |
1719 | } |
1720 | |
1721 | kvm_kernel_irqchip = true; |
1722 | /* If we have an in-kernel IRQ chip then we must have asynchronous |
1723 | * interrupt delivery (though the reverse is not necessarily true) |
1724 | */ |
1725 | kvm_async_interrupts_allowed = true; |
1726 | kvm_halt_in_kernel_allowed = true; |
1727 | |
1728 | kvm_init_irq_routing(s); |
1729 | |
1730 | s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); |
1731 | } |
1732 | |
/* Find the number of supported CPUs using the recommended
1734 | * procedure from the kernel API documentation to cope with |
1735 | * older kernels that may be missing capabilities. |
1736 | */ |
1737 | static int kvm_recommended_vcpus(KVMState *s) |
1738 | { |
1739 | int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); |
1740 | return (ret) ? ret : 4; |
1741 | } |
1742 | |
1743 | static int kvm_max_vcpus(KVMState *s) |
1744 | { |
1745 | int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); |
1746 | return (ret) ? ret : kvm_recommended_vcpus(s); |
1747 | } |
1748 | |
1749 | static int kvm_max_vcpu_id(KVMState *s) |
1750 | { |
1751 | int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); |
1752 | return (ret) ? ret : kvm_max_vcpus(s); |
1753 | } |
1754 | |
1755 | bool kvm_vcpu_id_is_valid(int vcpu_id) |
1756 | { |
1757 | KVMState *s = KVM_STATE(current_machine->accelerator); |
1758 | return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); |
1759 | } |
1760 | |
1761 | static int kvm_init(MachineState *ms) |
1762 | { |
1763 | MachineClass *mc = MACHINE_GET_CLASS(ms); |
1764 | static const char upgrade_note[] = |
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
1767 | struct { |
1768 | const char *name; |
1769 | int num; |
1770 | } num_cpus[] = { |
        { "SMP", ms->smp.cpus },
        { "hotpluggable", ms->smp.max_cpus },
1773 | { NULL, } |
1774 | }, *nc = num_cpus; |
1775 | int soft_vcpus_limit, hard_vcpus_limit; |
1776 | KVMState *s; |
1777 | const KVMCapabilityInfo *missing_cap; |
1778 | int ret; |
1779 | int type = 0; |
1780 | const char *kvm_type; |
1781 | |
1782 | s = KVM_STATE(ms->accelerator); |
1783 | |
1784 | /* |
1785 | * On systems where the kernel can support different base page |
1786 | * sizes, host page size may be different from TARGET_PAGE_SIZE, |
1787 | * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum |
1788 | * page size for the system though. |
1789 | */ |
1790 | assert(TARGET_PAGE_SIZE <= getpagesize()); |
1791 | |
1792 | s->sigmask_len = 8; |
1793 | |
1794 | #ifdef KVM_CAP_SET_GUEST_DEBUG |
1795 | QTAILQ_INIT(&s->kvm_sw_breakpoints); |
1796 | #endif |
1797 | QLIST_INIT(&s->kvm_parked_vcpus); |
1798 | s->vmfd = -1; |
    s->fd = qemu_open("/dev/kvm", O_RDWR);
1800 | if (s->fd == -1) { |
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
1802 | ret = -errno; |
1803 | goto err; |
1804 | } |
1805 | |
1806 | ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); |
1807 | if (ret < KVM_API_VERSION) { |
1808 | if (ret >= 0) { |
1809 | ret = -EINVAL; |
1810 | } |
        fprintf(stderr, "kvm version too old\n");
1812 | goto err; |
1813 | } |
1814 | |
1815 | if (ret > KVM_API_VERSION) { |
1816 | ret = -EINVAL; |
        fprintf(stderr, "kvm version not supported\n");
1818 | goto err; |
1819 | } |
1820 | |
1821 | kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); |
1822 | s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); |
1823 | |
1824 | /* If unspecified, use the default value */ |
1825 | if (!s->nr_slots) { |
1826 | s->nr_slots = 32; |
1827 | } |
1828 | |
1829 | s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); |
1830 | if (s->nr_as <= 1) { |
1831 | s->nr_as = 1; |
1832 | } |
1833 | s->as = g_new0(struct KVMAs, s->nr_as); |
1834 | |
    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
1836 | if (mc->kvm_type) { |
1837 | type = mc->kvm_type(ms, kvm_type); |
1838 | } else if (kvm_type) { |
1839 | ret = -EINVAL; |
1840 | fprintf(stderr, "Invalid argument kvm-type=%s\n" , kvm_type); |
1841 | goto err; |
1842 | } |
1843 | |
1844 | do { |
1845 | ret = kvm_ioctl(s, KVM_CREATE_VM, type); |
1846 | } while (ret == -EINTR); |
1847 | |
1848 | if (ret < 0) { |
1849 | fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n" , -ret, |
1850 | strerror(-ret)); |
1851 | |
1852 | #ifdef TARGET_S390X |
1853 | if (ret == -EINVAL) { |
1854 | fprintf(stderr, |
1855 | "Host kernel setup problem detected. Please verify:\n" ); |
1856 | fprintf(stderr, "- for kernels supporting the switch_amode or" |
1857 | " user_mode parameters, whether\n" ); |
1858 | fprintf(stderr, |
1859 | " user space is running in primary address space\n" ); |
1860 | fprintf(stderr, |
1861 | "- for kernels supporting the vm.allocate_pgste sysctl, " |
1862 | "whether it is enabled\n" ); |
1863 | } |
1864 | #endif |
1865 | goto err; |
1866 | } |
1867 | |
1868 | s->vmfd = ret; |
1869 | |
1870 | /* check the vcpu limits */ |
1871 | soft_vcpus_limit = kvm_recommended_vcpus(s); |
1872 | hard_vcpus_limit = kvm_max_vcpus(s); |
1873 | |
1874 | while (nc->name) { |
1875 | if (nc->num > soft_vcpus_limit) { |
1876 | warn_report("Number of %s cpus requested (%d) exceeds " |
1877 | "the recommended cpus supported by KVM (%d)" , |
1878 | nc->name, nc->num, soft_vcpus_limit); |
1879 | |
1880 | if (nc->num > hard_vcpus_limit) { |
1881 | fprintf(stderr, "Number of %s cpus requested (%d) exceeds " |
1882 | "the maximum cpus supported by KVM (%d)\n" , |
1883 | nc->name, nc->num, hard_vcpus_limit); |
1884 | exit(1); |
1885 | } |
1886 | } |
1887 | nc++; |
1888 | } |
1889 | |
1890 | missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); |
1891 | if (!missing_cap) { |
1892 | missing_cap = |
1893 | kvm_check_extension_list(s, kvm_arch_required_capabilities); |
1894 | } |
1895 | if (missing_cap) { |
1896 | ret = -EINVAL; |
1897 | fprintf(stderr, "kvm does not support %s\n%s" , |
1898 | missing_cap->name, upgrade_note); |
1899 | goto err; |
1900 | } |
1901 | |
1902 | s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); |
1903 | s->coalesced_pio = s->coalesced_mmio && |
1904 | kvm_check_extension(s, KVM_CAP_COALESCED_PIO); |
1905 | |
1906 | s->manual_dirty_log_protect = |
1907 | kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); |
1908 | if (s->manual_dirty_log_protect) { |
1909 | ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1); |
1910 | if (ret) { |
1911 | warn_report("Trying to enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 " |
1912 | "but failed. Falling back to the legacy mode. " ); |
1913 | s->manual_dirty_log_protect = false; |
1914 | } |
1915 | } |
1916 | |
1917 | #ifdef KVM_CAP_VCPU_EVENTS |
1918 | s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); |
1919 | #endif |
1920 | |
1921 | s->robust_singlestep = |
1922 | kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); |
1923 | |
1924 | #ifdef KVM_CAP_DEBUGREGS |
1925 | s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); |
1926 | #endif |
1927 | |
1928 | s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); |
1929 | |
1930 | #ifdef KVM_CAP_IRQ_ROUTING |
1931 | kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); |
1932 | #endif |
1933 | |
1934 | s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); |
1935 | |
1936 | s->irq_set_ioctl = KVM_IRQ_LINE; |
1937 | if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { |
1938 | s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; |
1939 | } |
1940 | |
1941 | kvm_readonly_mem_allowed = |
1942 | (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); |
1943 | |
1944 | kvm_eventfds_allowed = |
1945 | (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); |
1946 | |
1947 | kvm_irqfds_allowed = |
1948 | (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); |
1949 | |
1950 | kvm_resamplefds_allowed = |
1951 | (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); |
1952 | |
1953 | kvm_vm_attributes_allowed = |
1954 | (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); |
1955 | |
1956 | kvm_ioeventfd_any_length_allowed = |
1957 | (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); |
1958 | |
1959 | kvm_state = s; |
1960 | |
1961 | /* |
1962 | * if memory encryption object is specified then initialize the memory |
1963 | * encryption context. |
1964 | */ |
1965 | if (ms->memory_encryption) { |
1966 | kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); |
1967 | if (!kvm_state->memcrypt_handle) { |
1968 | ret = -1; |
1969 | goto err; |
1970 | } |
1971 | |
1972 | kvm_state->memcrypt_encrypt_data = sev_encrypt_data; |
1973 | } |
1974 | |
1975 | ret = kvm_arch_init(ms, s); |
1976 | if (ret < 0) { |
1977 | goto err; |
1978 | } |
1979 | |
1980 | if (machine_kernel_irqchip_allowed(ms)) { |
1981 | kvm_irqchip_create(ms, s); |
1982 | } |
1983 | |
1984 | if (kvm_eventfds_allowed) { |
1985 | s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; |
1986 | s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; |
1987 | } |
1988 | s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; |
1989 | s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; |
1990 | |
1991 | kvm_memory_listener_register(s, &s->memory_listener, |
1992 | &address_space_memory, 0); |
1993 | memory_listener_register(&kvm_io_listener, |
1994 | &address_space_io); |
1995 | memory_listener_register(&kvm_coalesced_pio_listener, |
1996 | &address_space_io); |
1997 | |
1998 | s->many_ioeventfds = kvm_check_many_ioeventfds(); |
1999 | |
2000 | s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); |
2001 | if (!s->sync_mmu) { |
2002 | qemu_balloon_inhibit(true); |
2003 | } |
2004 | |
2005 | return 0; |
2006 | |
2007 | err: |
2008 | assert(ret < 0); |
2009 | if (s->vmfd >= 0) { |
2010 | close(s->vmfd); |
2011 | } |
2012 | if (s->fd != -1) { |
2013 | close(s->fd); |
2014 | } |
2015 | g_free(s->memory_listener.slots); |
2016 | |
2017 | return ret; |
2018 | } |
2019 | |
2020 | void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) |
2021 | { |
2022 | s->sigmask_len = sigmask_len; |
2023 | } |
2024 | |
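/* Complete a KVM_EXIT_IO request: replay each repetition of the (possibly
 * string) I/O instruction against the I/O address space. Runs outside the
 * BQL.
 */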
2025 | static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, |
2026 | int size, uint32_t count) |
2027 | { |
2028 | int i; |
2029 | uint8_t *ptr = data; |
2030 | |
2031 | for (i = 0; i < count; i++) { |
2032 | address_space_rw(&address_space_io, port, attrs, |
2033 | ptr, size, |
2034 | direction == KVM_EXIT_IO_OUT); |
2035 | ptr += size; |
2036 | } |
2037 | } |
2038 | |
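/* Report a KVM_EXIT_INTERNAL_ERROR, dumping any extra data the kernel
 * provides. For emulation failures, the vCPU loop is merely interrupted
 * unless the architecture code insists on stopping; all other cases stop
 * the VM.
 */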
2039 | static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) |
2040 | { |
2041 | fprintf(stderr, "KVM internal error. Suberror: %d\n" , |
2042 | run->internal.suberror); |
2043 | |
2044 | if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { |
2045 | int i; |
2046 | |
2047 | for (i = 0; i < run->internal.ndata; ++i) { |
2048 | fprintf(stderr, "extra data[%d]: %" PRIx64"\n" , |
2049 | i, (uint64_t)run->internal.data[i]); |
2050 | } |
2051 | } |
2052 | if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { |
2053 | fprintf(stderr, "emulation failure\n" ); |
2054 | if (!kvm_arch_stop_on_emulation_error(cpu)) { |
2055 | cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); |
2056 | return EXCP_INTERRUPT; |
2057 | } |
2058 | } |
    /* FIXME: Should trigger a QMP message to let management know
     * something went wrong.
     */
2062 | return -1; |
2063 | } |
2064 | |
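/*
 * Drain the coalesced MMIO ring shared with the kernel, replaying each
 * entry against the I/O or memory address space. The in-progress flag
 * prevents recursion, since replayed accesses can trigger another flush.
 */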
2065 | void kvm_flush_coalesced_mmio_buffer(void) |
2066 | { |
2067 | KVMState *s = kvm_state; |
2068 | |
2069 | if (s->coalesced_flush_in_progress) { |
2070 | return; |
2071 | } |
2072 | |
2073 | s->coalesced_flush_in_progress = true; |
2074 | |
2075 | if (s->coalesced_mmio_ring) { |
2076 | struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; |
2077 | while (ring->first != ring->last) { |
2078 | struct kvm_coalesced_mmio *ent; |
2079 | |
2080 | ent = &ring->coalesced_mmio[ring->first]; |
2081 | |
2082 | if (ent->pio == 1) { |
2083 | address_space_rw(&address_space_io, ent->phys_addr, |
2084 | MEMTXATTRS_UNSPECIFIED, ent->data, |
2085 | ent->len, true); |
2086 | } else { |
2087 | cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); |
2088 | } |
2089 | smp_wmb(); |
2090 | ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; |
2091 | } |
2092 | } |
2093 | |
2094 | s->coalesced_flush_in_progress = false; |
2095 | } |
2096 | |
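/*
 * Register synchronization: cpu->vcpu_dirty tracks whether QEMU's copy of
 * the vCPU state is newer than the kernel's. The helpers below run on the
 * vCPU thread via run_on_cpu() and move state in the appropriate
 * direction; kvm_cpu_exec() writes dirty state back before KVM_RUN.
 */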
2097 | static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) |
2098 | { |
2099 | if (!cpu->vcpu_dirty) { |
2100 | kvm_arch_get_registers(cpu); |
2101 | cpu->vcpu_dirty = true; |
2102 | } |
2103 | } |
2104 | |
2105 | void kvm_cpu_synchronize_state(CPUState *cpu) |
2106 | { |
2107 | if (!cpu->vcpu_dirty) { |
2108 | run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); |
2109 | } |
2110 | } |
2111 | |
2112 | static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) |
2113 | { |
2114 | kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); |
2115 | cpu->vcpu_dirty = false; |
2116 | } |
2117 | |
2118 | void kvm_cpu_synchronize_post_reset(CPUState *cpu) |
2119 | { |
2120 | run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); |
2121 | } |
2122 | |
2123 | static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) |
2124 | { |
2125 | kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); |
2126 | cpu->vcpu_dirty = false; |
2127 | } |
2128 | |
2129 | void kvm_cpu_synchronize_post_init(CPUState *cpu) |
2130 | { |
2131 | run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); |
2132 | } |
2133 | |
2134 | static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) |
2135 | { |
2136 | cpu->vcpu_dirty = true; |
2137 | } |
2138 | |
2139 | void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) |
2140 | { |
2141 | run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); |
2142 | } |
2143 | |
2144 | #ifdef KVM_HAVE_MCE_INJECTION |
2145 | static __thread void *pending_sigbus_addr; |
2146 | static __thread int pending_sigbus_code; |
2147 | static __thread bool have_sigbus_pending; |
2148 | #endif |
2149 | |
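/* Ask the kernel to leave KVM_RUN as soon as possible by setting the
 * immediate_exit flag in the shared kvm_run structure.
 */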
2150 | static void kvm_cpu_kick(CPUState *cpu) |
2151 | { |
2152 | atomic_set(&cpu->kvm_run->immediate_exit, 1); |
2153 | } |
2154 | |
2155 | static void kvm_cpu_kick_self(void) |
2156 | { |
2157 | if (kvm_immediate_exit) { |
2158 | kvm_cpu_kick(current_cpu); |
2159 | } else { |
2160 | qemu_cpu_kick_self(); |
2161 | } |
2162 | } |
2163 | |
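/* Consume any pending SIG_IPI kick. With KVM_CAP_IMMEDIATE_EXIT there is
 * no signal to eat; just clear immediate_exit and publish the write.
 */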
2164 | static void kvm_eat_signals(CPUState *cpu) |
2165 | { |
2166 | struct timespec ts = { 0, 0 }; |
2167 | siginfo_t siginfo; |
2168 | sigset_t waitset; |
2169 | sigset_t chkset; |
2170 | int r; |
2171 | |
2172 | if (kvm_immediate_exit) { |
2173 | atomic_set(&cpu->kvm_run->immediate_exit, 0); |
2174 | /* Write kvm_run->immediate_exit before the cpu->exit_request |
2175 | * write in kvm_cpu_exec. |
2176 | */ |
2177 | smp_wmb(); |
2178 | return; |
2179 | } |
2180 | |
2181 | sigemptyset(&waitset); |
2182 | sigaddset(&waitset, SIG_IPI); |
2183 | |
2184 | do { |
2185 | r = sigtimedwait(&waitset, &siginfo, &ts); |
2186 | if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { |
2187 | perror("sigtimedwait" ); |
2188 | exit(1); |
2189 | } |
2190 | |
2191 | r = sigpending(&chkset); |
2192 | if (r == -1) { |
2193 | perror("sigpending" ); |
2194 | exit(1); |
2195 | } |
2196 | } while (sigismember(&chkset, SIG_IPI)); |
2197 | } |
2198 | |
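/*
 * Main vCPU execution loop: flush dirty register state, enter KVM_RUN
 * outside the BQL, and dispatch on run->exit_reason until an EXCP_* code
 * or an error terminates the loop.
 */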
2199 | int kvm_cpu_exec(CPUState *cpu) |
2200 | { |
2201 | struct kvm_run *run = cpu->kvm_run; |
2202 | int ret, run_ret; |
2203 | |
2204 | DPRINTF("kvm_cpu_exec()\n" ); |
2205 | |
2206 | if (kvm_arch_process_async_events(cpu)) { |
2207 | atomic_set(&cpu->exit_request, 0); |
2208 | return EXCP_HLT; |
2209 | } |
2210 | |
2211 | qemu_mutex_unlock_iothread(); |
2212 | cpu_exec_start(cpu); |
2213 | |
2214 | do { |
2215 | MemTxAttrs attrs; |
2216 | |
2217 | if (cpu->vcpu_dirty) { |
2218 | kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); |
2219 | cpu->vcpu_dirty = false; |
2220 | } |
2221 | |
2222 | kvm_arch_pre_run(cpu, run); |
2223 | if (atomic_read(&cpu->exit_request)) { |
2224 | DPRINTF("interrupt exit requested\n" ); |
2225 | /* |
2226 | * KVM requires us to reenter the kernel after IO exits to complete |
2227 | * instruction emulation. This self-signal will ensure that we |
2228 | * leave ASAP again. |
2229 | */ |
2230 | kvm_cpu_kick_self(); |
2231 | } |
2232 | |
2233 | /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. |
2234 | * Matching barrier in kvm_eat_signals. |
2235 | */ |
2236 | smp_rmb(); |
2237 | |
2238 | run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); |
2239 | |
2240 | attrs = kvm_arch_post_run(cpu, run); |
2241 | |
2242 | #ifdef KVM_HAVE_MCE_INJECTION |
2243 | if (unlikely(have_sigbus_pending)) { |
2244 | qemu_mutex_lock_iothread(); |
2245 | kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, |
2246 | pending_sigbus_addr); |
2247 | have_sigbus_pending = false; |
2248 | qemu_mutex_unlock_iothread(); |
2249 | } |
2250 | #endif |
2251 | |
2252 | if (run_ret < 0) { |
2253 | if (run_ret == -EINTR || run_ret == -EAGAIN) { |
2254 | DPRINTF("io window exit\n" ); |
2255 | kvm_eat_signals(cpu); |
2256 | ret = EXCP_INTERRUPT; |
2257 | break; |
2258 | } |
2259 | fprintf(stderr, "error: kvm run failed %s\n" , |
2260 | strerror(-run_ret)); |
2261 | #ifdef TARGET_PPC |
2262 | if (run_ret == -EBUSY) { |
2263 | fprintf(stderr, |
2264 | "This is probably because your SMT is enabled.\n" |
2265 | "VCPU can only run on primary threads with all " |
2266 | "secondary threads offline.\n" ); |
2267 | } |
2268 | #endif |
2269 | ret = -1; |
2270 | break; |
2271 | } |
2272 | |
2273 | trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); |
2274 | switch (run->exit_reason) { |
2275 | case KVM_EXIT_IO: |
2276 | DPRINTF("handle_io\n" ); |
2277 | /* Called outside BQL */ |
2278 | kvm_handle_io(run->io.port, attrs, |
2279 | (uint8_t *)run + run->io.data_offset, |
2280 | run->io.direction, |
2281 | run->io.size, |
2282 | run->io.count); |
2283 | ret = 0; |
2284 | break; |
2285 | case KVM_EXIT_MMIO: |
2286 | DPRINTF("handle_mmio\n" ); |
2287 | /* Called outside BQL */ |
2288 | address_space_rw(&address_space_memory, |
2289 | run->mmio.phys_addr, attrs, |
2290 | run->mmio.data, |
2291 | run->mmio.len, |
2292 | run->mmio.is_write); |
2293 | ret = 0; |
2294 | break; |
2295 | case KVM_EXIT_IRQ_WINDOW_OPEN: |
2296 | DPRINTF("irq_window_open\n" ); |
2297 | ret = EXCP_INTERRUPT; |
2298 | break; |
2299 | case KVM_EXIT_SHUTDOWN: |
2300 | DPRINTF("shutdown\n" ); |
2301 | qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); |
2302 | ret = EXCP_INTERRUPT; |
2303 | break; |
2304 | case KVM_EXIT_UNKNOWN: |
2305 | fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n" , |
2306 | (uint64_t)run->hw.hardware_exit_reason); |
2307 | ret = -1; |
2308 | break; |
2309 | case KVM_EXIT_INTERNAL_ERROR: |
2310 | ret = kvm_handle_internal_error(cpu, run); |
2311 | break; |
2312 | case KVM_EXIT_SYSTEM_EVENT: |
2313 | switch (run->system_event.type) { |
2314 | case KVM_SYSTEM_EVENT_SHUTDOWN: |
2315 | qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); |
2316 | ret = EXCP_INTERRUPT; |
2317 | break; |
2318 | case KVM_SYSTEM_EVENT_RESET: |
2319 | qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); |
2320 | ret = EXCP_INTERRUPT; |
2321 | break; |
2322 | case KVM_SYSTEM_EVENT_CRASH: |
2323 | kvm_cpu_synchronize_state(cpu); |
2324 | qemu_mutex_lock_iothread(); |
2325 | qemu_system_guest_panicked(cpu_get_crash_info(cpu)); |
2326 | qemu_mutex_unlock_iothread(); |
2327 | ret = 0; |
2328 | break; |
2329 | default: |
2330 | DPRINTF("kvm_arch_handle_exit\n" ); |
2331 | ret = kvm_arch_handle_exit(cpu, run); |
2332 | break; |
2333 | } |
2334 | break; |
2335 | default: |
2336 | DPRINTF("kvm_arch_handle_exit\n" ); |
2337 | ret = kvm_arch_handle_exit(cpu, run); |
2338 | break; |
2339 | } |
2340 | } while (ret == 0); |
2341 | |
2342 | cpu_exec_end(cpu); |
2343 | qemu_mutex_lock_iothread(); |
2344 | |
2345 | if (ret < 0) { |
2346 | cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); |
2347 | vm_stop(RUN_STATE_INTERNAL_ERROR); |
2348 | } |
2349 | |
2350 | atomic_set(&cpu->exit_request, 0); |
2351 | return ret; |
2352 | } |
2353 | |
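/*
 * Thin wrappers around ioctl() on the system, VM, vCPU and device file
 * descriptors: each traces the call and converts the -1/errno failure
 * convention into a negative errno return value.
 */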
2354 | int kvm_ioctl(KVMState *s, int type, ...) |
2355 | { |
2356 | int ret; |
2357 | void *arg; |
2358 | va_list ap; |
2359 | |
2360 | va_start(ap, type); |
2361 | arg = va_arg(ap, void *); |
2362 | va_end(ap); |
2363 | |
2364 | trace_kvm_ioctl(type, arg); |
2365 | ret = ioctl(s->fd, type, arg); |
2366 | if (ret == -1) { |
2367 | ret = -errno; |
2368 | } |
2369 | return ret; |
2370 | } |
2371 | |
2372 | int kvm_vm_ioctl(KVMState *s, int type, ...) |
2373 | { |
2374 | int ret; |
2375 | void *arg; |
2376 | va_list ap; |
2377 | |
2378 | va_start(ap, type); |
2379 | arg = va_arg(ap, void *); |
2380 | va_end(ap); |
2381 | |
2382 | trace_kvm_vm_ioctl(type, arg); |
2383 | ret = ioctl(s->vmfd, type, arg); |
2384 | if (ret == -1) { |
2385 | ret = -errno; |
2386 | } |
2387 | return ret; |
2388 | } |
2389 | |
2390 | int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) |
2391 | { |
2392 | int ret; |
2393 | void *arg; |
2394 | va_list ap; |
2395 | |
2396 | va_start(ap, type); |
2397 | arg = va_arg(ap, void *); |
2398 | va_end(ap); |
2399 | |
2400 | trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); |
2401 | ret = ioctl(cpu->kvm_fd, type, arg); |
2402 | if (ret == -1) { |
2403 | ret = -errno; |
2404 | } |
2405 | return ret; |
2406 | } |
2407 | |
2408 | int kvm_device_ioctl(int fd, int type, ...) |
2409 | { |
2410 | int ret; |
2411 | void *arg; |
2412 | va_list ap; |
2413 | |
2414 | va_start(ap, type); |
2415 | arg = va_arg(ap, void *); |
2416 | va_end(ap); |
2417 | |
2418 | trace_kvm_device_ioctl(fd, type, arg); |
2419 | ret = ioctl(fd, type, arg); |
2420 | if (ret == -1) { |
2421 | ret = -errno; |
2422 | } |
2423 | return ret; |
2424 | } |
2425 | |
2426 | int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) |
2427 | { |
2428 | int ret; |
2429 | struct kvm_device_attr attribute = { |
2430 | .group = group, |
2431 | .attr = attr, |
2432 | }; |
2433 | |
2434 | if (!kvm_vm_attributes_allowed) { |
2435 | return 0; |
2436 | } |
2437 | |
2438 | ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); |
2439 | /* kvm returns 0 on success for HAS_DEVICE_ATTR */ |
2440 | return ret ? 0 : 1; |
2441 | } |
2442 | |
2443 | int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) |
2444 | { |
2445 | struct kvm_device_attr attribute = { |
2446 | .group = group, |
2447 | .attr = attr, |
2448 | .flags = 0, |
2449 | }; |
2450 | |
2451 | return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1; |
2452 | } |
2453 | |
2454 | int kvm_device_access(int fd, int group, uint64_t attr, |
2455 | void *val, bool write, Error **errp) |
2456 | { |
2457 | struct kvm_device_attr kvmattr; |
2458 | int err; |
2459 | |
2460 | kvmattr.flags = 0; |
2461 | kvmattr.group = group; |
2462 | kvmattr.attr = attr; |
2463 | kvmattr.addr = (uintptr_t)val; |
2464 | |
2465 | err = kvm_device_ioctl(fd, |
2466 | write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, |
2467 | &kvmattr); |
2468 | if (err < 0) { |
2469 | error_setg_errno(errp, -err, |
2470 | "KVM_%s_DEVICE_ATTR failed: Group %d " |
2471 | "attr 0x%016" PRIx64, |
2472 | write ? "SET" : "GET" , group, attr); |
2473 | } |
2474 | return err; |
2475 | } |
2476 | |
2477 | bool kvm_has_sync_mmu(void) |
2478 | { |
2479 | return kvm_state->sync_mmu; |
2480 | } |
2481 | |
2482 | int kvm_has_vcpu_events(void) |
2483 | { |
2484 | return kvm_state->vcpu_events; |
2485 | } |
2486 | |
2487 | int kvm_has_robust_singlestep(void) |
2488 | { |
2489 | return kvm_state->robust_singlestep; |
2490 | } |
2491 | |
2492 | int kvm_has_debugregs(void) |
2493 | { |
2494 | return kvm_state->debugregs; |
2495 | } |
2496 | |
2497 | int kvm_max_nested_state_length(void) |
2498 | { |
2499 | return kvm_state->max_nested_state_len; |
2500 | } |
2501 | |
2502 | int kvm_has_many_ioeventfds(void) |
2503 | { |
2504 | if (!kvm_enabled()) { |
2505 | return 0; |
2506 | } |
2507 | return kvm_state->many_ioeventfds; |
2508 | } |
2509 | |
2510 | int kvm_has_gsi_routing(void) |
2511 | { |
2512 | #ifdef KVM_CAP_IRQ_ROUTING |
2513 | return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); |
2514 | #else |
2515 | return false; |
2516 | #endif |
2517 | } |
2518 | |
2519 | int kvm_has_intx_set_mask(void) |
2520 | { |
2521 | return kvm_state->intx_set_mask; |
2522 | } |
2523 | |
2524 | bool kvm_arm_supports_user_irq(void) |
2525 | { |
2526 | return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); |
2527 | } |
2528 | |
2529 | #ifdef KVM_CAP_SET_GUEST_DEBUG |
2530 | struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, |
2531 | target_ulong pc) |
2532 | { |
2533 | struct kvm_sw_breakpoint *bp; |
2534 | |
2535 | QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { |
2536 | if (bp->pc == pc) { |
2537 | return bp; |
2538 | } |
2539 | } |
2540 | return NULL; |
2541 | } |
2542 | |
2543 | int kvm_sw_breakpoints_active(CPUState *cpu) |
2544 | { |
2545 | return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); |
2546 | } |
2547 | |
2548 | struct kvm_set_guest_debug_data { |
2549 | struct kvm_guest_debug dbg; |
2550 | int err; |
2551 | }; |
2552 | |
2553 | static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) |
2554 | { |
2555 | struct kvm_set_guest_debug_data *dbg_data = |
2556 | (struct kvm_set_guest_debug_data *) data.host_ptr; |
2557 | |
2558 | dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, |
2559 | &dbg_data->dbg); |
2560 | } |
2561 | |
2562 | int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) |
2563 | { |
2564 | struct kvm_set_guest_debug_data data; |
2565 | |
2566 | data.dbg.control = reinject_trap; |
2567 | |
2568 | if (cpu->singlestep_enabled) { |
2569 | data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; |
2570 | } |
2571 | kvm_arch_update_guest_debug(cpu, &data.dbg); |
2572 | |
2573 | run_on_cpu(cpu, kvm_invoke_set_guest_debug, |
2574 | RUN_ON_CPU_HOST_PTR(&data)); |
2575 | return data.err; |
2576 | } |
2577 | |
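/* Insert a breakpoint and propagate it to every vCPU. Software
 * breakpoints are refcounted per guest address; hardware breakpoints are
 * delegated to the architecture code.
 */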
2578 | int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, |
2579 | target_ulong len, int type) |
2580 | { |
2581 | struct kvm_sw_breakpoint *bp; |
2582 | int err; |
2583 | |
2584 | if (type == GDB_BREAKPOINT_SW) { |
2585 | bp = kvm_find_sw_breakpoint(cpu, addr); |
2586 | if (bp) { |
2587 | bp->use_count++; |
2588 | return 0; |
2589 | } |
2590 | |
2591 | bp = g_malloc(sizeof(struct kvm_sw_breakpoint)); |
2592 | bp->pc = addr; |
2593 | bp->use_count = 1; |
2594 | err = kvm_arch_insert_sw_breakpoint(cpu, bp); |
2595 | if (err) { |
2596 | g_free(bp); |
2597 | return err; |
2598 | } |
2599 | |
2600 | QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); |
2601 | } else { |
2602 | err = kvm_arch_insert_hw_breakpoint(addr, len, type); |
2603 | if (err) { |
2604 | return err; |
2605 | } |
2606 | } |
2607 | |
2608 | CPU_FOREACH(cpu) { |
2609 | err = kvm_update_guest_debug(cpu, 0); |
2610 | if (err) { |
2611 | return err; |
2612 | } |
2613 | } |
2614 | return 0; |
2615 | } |
2616 | |
2617 | int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, |
2618 | target_ulong len, int type) |
2619 | { |
2620 | struct kvm_sw_breakpoint *bp; |
2621 | int err; |
2622 | |
2623 | if (type == GDB_BREAKPOINT_SW) { |
2624 | bp = kvm_find_sw_breakpoint(cpu, addr); |
2625 | if (!bp) { |
2626 | return -ENOENT; |
2627 | } |
2628 | |
2629 | if (bp->use_count > 1) { |
2630 | bp->use_count--; |
2631 | return 0; |
2632 | } |
2633 | |
2634 | err = kvm_arch_remove_sw_breakpoint(cpu, bp); |
2635 | if (err) { |
2636 | return err; |
2637 | } |
2638 | |
2639 | QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); |
2640 | g_free(bp); |
2641 | } else { |
2642 | err = kvm_arch_remove_hw_breakpoint(addr, len, type); |
2643 | if (err) { |
2644 | return err; |
2645 | } |
2646 | } |
2647 | |
2648 | CPU_FOREACH(cpu) { |
2649 | err = kvm_update_guest_debug(cpu, 0); |
2650 | if (err) { |
2651 | return err; |
2652 | } |
2653 | } |
2654 | return 0; |
2655 | } |
2656 | |
2657 | void kvm_remove_all_breakpoints(CPUState *cpu) |
2658 | { |
2659 | struct kvm_sw_breakpoint *bp, *next; |
2660 | KVMState *s = cpu->kvm_state; |
2661 | CPUState *tmpcpu; |
2662 | |
2663 | QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) { |
2664 | if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) { |
2665 | /* Try harder to find a CPU that currently sees the breakpoint. */ |
2666 | CPU_FOREACH(tmpcpu) { |
2667 | if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) { |
2668 | break; |
2669 | } |
2670 | } |
2671 | } |
2672 | QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry); |
2673 | g_free(bp); |
2674 | } |
2675 | kvm_arch_remove_all_hw_breakpoints(); |
2676 | |
2677 | CPU_FOREACH(cpu) { |
2678 | kvm_update_guest_debug(cpu, 0); |
2679 | } |
2680 | } |
2681 | |
2682 | #else /* !KVM_CAP_SET_GUEST_DEBUG */ |
2683 | |
2684 | int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) |
2685 | { |
2686 | return -EINVAL; |
2687 | } |
2688 | |
2689 | int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, |
2690 | target_ulong len, int type) |
2691 | { |
2692 | return -EINVAL; |
2693 | } |
2694 | |
2695 | int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, |
2696 | target_ulong len, int type) |
2697 | { |
2698 | return -EINVAL; |
2699 | } |
2700 | |
2701 | void kvm_remove_all_breakpoints(CPUState *cpu) |
2702 | { |
2703 | } |
2704 | #endif /* !KVM_CAP_SET_GUEST_DEBUG */ |
2705 | |
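/* Set the signal mask the kernel applies atomically while this vCPU sits
 * in KVM_RUN, using the sigmask_len configured at init time.
 */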
2706 | static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) |
2707 | { |
2708 | KVMState *s = kvm_state; |
2709 | struct kvm_signal_mask *sigmask; |
2710 | int r; |
2711 | |
2712 | sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); |
2713 | |
2714 | sigmask->len = s->sigmask_len; |
2715 | memcpy(sigmask->sigset, sigset, sizeof(*sigset)); |
2716 | r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); |
2717 | g_free(sigmask); |
2718 | |
2719 | return r; |
2720 | } |
2721 | |
2722 | static void kvm_ipi_signal(int sig) |
2723 | { |
2724 | if (current_cpu) { |
2725 | assert(kvm_immediate_exit); |
2726 | kvm_cpu_kick(current_cpu); |
2727 | } |
2728 | } |
2729 | |
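/* Per-vCPU signal setup: install the SIG_IPI handler, unblock SIGBUS in
 * the thread mask when MCE injection is supported, and unblock SIG_IPI
 * either in the thread mask (with immediate_exit) or only inside KVM_RUN
 * via KVM_SET_SIGNAL_MASK (without it).
 */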
2730 | void kvm_init_cpu_signals(CPUState *cpu) |
2731 | { |
2732 | int r; |
2733 | sigset_t set; |
2734 | struct sigaction sigact; |
2735 | |
2736 | memset(&sigact, 0, sizeof(sigact)); |
2737 | sigact.sa_handler = kvm_ipi_signal; |
2738 | sigaction(SIG_IPI, &sigact, NULL); |
2739 | |
2740 | pthread_sigmask(SIG_BLOCK, NULL, &set); |
2741 | #if defined KVM_HAVE_MCE_INJECTION |
2742 | sigdelset(&set, SIGBUS); |
2743 | pthread_sigmask(SIG_SETMASK, &set, NULL); |
2744 | #endif |
2745 | sigdelset(&set, SIG_IPI); |
2746 | if (kvm_immediate_exit) { |
2747 | r = pthread_sigmask(SIG_SETMASK, &set, NULL); |
2748 | } else { |
2749 | r = kvm_set_signal_mask(cpu, &set); |
2750 | } |
2751 | if (r) { |
2752 | fprintf(stderr, "kvm_set_signal_mask: %s\n" , strerror(-r)); |
2753 | exit(1); |
2754 | } |
2755 | } |
2756 | |
2757 | /* Called asynchronously in VCPU thread. */ |
2758 | int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) |
2759 | { |
2760 | #ifdef KVM_HAVE_MCE_INJECTION |
2761 | if (have_sigbus_pending) { |
2762 | return 1; |
2763 | } |
2764 | have_sigbus_pending = true; |
2765 | pending_sigbus_addr = addr; |
2766 | pending_sigbus_code = code; |
2767 | atomic_set(&cpu->exit_request, 1); |
2768 | return 0; |
2769 | #else |
2770 | return 1; |
2771 | #endif |
2772 | } |
2773 | |
2774 | /* Called synchronously (via signalfd) in main thread. */ |
2775 | int kvm_on_sigbus(int code, void *addr) |
2776 | { |
2777 | #ifdef KVM_HAVE_MCE_INJECTION |
    /* An action-required MCE kills the process if SIGBUS is blocked, and
     * SIGBUS is blocked in the I/O thread, where we handle MCEs via
     * signalfd; hence only action-optional MCEs can arrive here.
     */
2782 | assert(code != BUS_MCEERR_AR); |
2783 | kvm_arch_on_sigbus_vcpu(first_cpu, code, addr); |
2784 | return 0; |
2785 | #else |
2786 | return 1; |
2787 | #endif |
2788 | } |
2789 | |
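/* Create an in-kernel device of the given type, or just probe for support
 * when 'test' is set. Returns the new device fd (0 for a successful
 * test), -ENOTSUP without KVM_CAP_DEVICE_CTRL, or a negative errno value.
 */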
2790 | int kvm_create_device(KVMState *s, uint64_t type, bool test) |
2791 | { |
2792 | int ret; |
2793 | struct kvm_create_device create_dev; |
2794 | |
2795 | create_dev.type = type; |
2796 | create_dev.fd = -1; |
2797 | create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; |
2798 | |
2799 | if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) { |
2800 | return -ENOTSUP; |
2801 | } |
2802 | |
2803 | ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev); |
2804 | if (ret) { |
2805 | return ret; |
2806 | } |
2807 | |
2808 | return test ? 0 : create_dev.fd; |
2809 | } |
2810 | |
2811 | bool kvm_device_supported(int vmfd, uint64_t type) |
2812 | { |
2813 | struct kvm_create_device create_dev = { |
2814 | .type = type, |
2815 | .fd = -1, |
2816 | .flags = KVM_CREATE_DEVICE_TEST, |
2817 | }; |
2818 | |
2819 | if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) { |
2820 | return false; |
2821 | } |
2822 | |
2823 | return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0); |
2824 | } |
2825 | |
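/* Copy a single architecture register, identified by its 64-bit
 * KVM_REG_* id, from 'source' into the vCPU. kvm_get_one_reg() below is
 * the reverse operation.
 */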
2826 | int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source) |
2827 | { |
2828 | struct kvm_one_reg reg; |
2829 | int r; |
2830 | |
2831 | reg.id = id; |
2832 | reg.addr = (uintptr_t) source; |
2833 | r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); |
2834 | if (r) { |
2835 | trace_kvm_failed_reg_set(id, strerror(-r)); |
2836 | } |
2837 | return r; |
2838 | } |
2839 | |
2840 | int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) |
2841 | { |
2842 | struct kvm_one_reg reg; |
2843 | int r; |
2844 | |
2845 | reg.id = id; |
2846 | reg.addr = (uintptr_t) target; |
2847 | r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); |
2848 | if (r) { |
2849 | trace_kvm_failed_reg_get(id, strerror(-r)); |
2850 | } |
2851 | return r; |
2852 | } |
2853 | |
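/* AccelClass::has_memory callback: report whether a KVM memslot matching
 * the given range exists in the given address space.
 */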
2854 | static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, |
2855 | hwaddr start_addr, hwaddr size) |
2856 | { |
2857 | KVMState *kvm = KVM_STATE(ms->accelerator); |
2858 | int i; |
2859 | |
2860 | for (i = 0; i < kvm->nr_as; ++i) { |
2861 | if (kvm->as[i].as == as && kvm->as[i].ml) { |
2862 | return NULL != kvm_lookup_matching_slot(kvm->as[i].ml, |
2863 | start_addr, size); |
2864 | } |
2865 | } |
2866 | |
2867 | return false; |
2868 | } |
2869 | |
2870 | static void kvm_accel_class_init(ObjectClass *oc, void *data) |
2871 | { |
2872 | AccelClass *ac = ACCEL_CLASS(oc); |
2873 | ac->name = "KVM" ; |
2874 | ac->init_machine = kvm_init; |
2875 | ac->has_memory = kvm_accel_has_memory; |
2876 | ac->allowed = &kvm_allowed; |
2877 | } |
2878 | |
2879 | static const TypeInfo kvm_accel_type = { |
2880 | .name = TYPE_KVM_ACCEL, |
2881 | .parent = TYPE_ACCEL, |
2882 | .class_init = kvm_accel_class_init, |
2883 | .instance_size = sizeof(KVMState), |
2884 | }; |
2885 | |
2886 | static void kvm_type_init(void) |
2887 | { |
2888 | type_register_static(&kvm_accel_type); |
2889 | } |
2890 | |
2891 | type_init(kvm_type_init); |
2892 | |