1 | /* |
2 | * VFIO utility |
3 | * |
4 | * Copyright 2016 - 2018 Red Hat, Inc. |
5 | * |
6 | * Authors: |
7 | * Fam Zheng <famz@redhat.com> |
8 | * |
9 | * This work is licensed under the terms of the GNU GPL, version 2 or later. |
10 | * See the COPYING file in the top-level directory. |
11 | */ |
12 | |
13 | #include "qemu/osdep.h" |
14 | #include <sys/ioctl.h> |
15 | #include <linux/vfio.h> |
16 | #include "qapi/error.h" |
17 | #include "exec/ramlist.h" |
18 | #include "exec/cpu-common.h" |
19 | #include "trace.h" |
20 | #include "qemu/error-report.h" |
21 | #include "standard-headers/linux/pci_regs.h" |
22 | #include "qemu/event_notifier.h" |
23 | #include "qemu/vfio-helpers.h" |
24 | #include "trace.h" |
25 | |
26 | #define QEMU_VFIO_DEBUG 0 |
27 | |
28 | #define QEMU_VFIO_IOVA_MIN 0x10000ULL |
29 | /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, |
30 | * we can use a runtime limit; alternatively it's also possible to do platform |
31 | * specific detection by reading sysfs entries. Until then, 39 is a safe bet. |
32 | **/ |
33 | #define QEMU_VFIO_IOVA_MAX (1ULL << 39) |
34 | |
35 | typedef struct { |
36 | /* Page aligned addr. */ |
37 | void *host; |
38 | size_t size; |
39 | uint64_t iova; |
40 | } IOVAMapping; |
41 | |
42 | struct QEMUVFIOState { |
43 | QemuMutex lock; |
44 | |
45 | /* These fields are protected by BQL */ |
46 | int container; |
47 | int group; |
48 | int device; |
49 | RAMBlockNotifier ram_notifier; |
50 | struct vfio_region_info config_region_info, bar_region_info[6]; |
51 | |
52 | /* These fields are protected by @lock */ |
53 | /* VFIO's IO virtual address space is managed by splitting into a few |
54 | * sections: |
55 | * |
56 | * --------------- <= 0 |
57 | * |xxxxxxxxxxxxx| |
58 | * |-------------| <= QEMU_VFIO_IOVA_MIN |
59 | * | | |
60 | * | Fixed | |
61 | * | | |
62 | * |-------------| <= low_water_mark |
63 | * | | |
64 | * | Free | |
65 | * | | |
66 | * |-------------| <= high_water_mark |
67 | * | | |
68 | * | Temp | |
69 | * | | |
70 | * |-------------| <= QEMU_VFIO_IOVA_MAX |
71 | * |xxxxxxxxxxxxx| |
72 | * |xxxxxxxxxxxxx| |
73 | * --------------- |
74 | * |
75 | * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; |
76 | * |
77 | * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of |
78 | * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be |
79 | * reclaimed - low_water_mark never shrinks; |
80 | * |
81 | * - IOVAs in range [low_water_mark, high_water_mark) are free; |
82 | * |
83 | * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile |
84 | * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area |
 * is recycled. The caller should make sure I/Os depending on these
 * mappings have completed before calling.
87 | **/ |
88 | uint64_t low_water_mark; |
89 | uint64_t high_water_mark; |
90 | IOVAMapping *mappings; |
91 | int nr_mappings; |
92 | }; |
93 | |
94 | /** |
95 | * Find group file by PCI device address as specified @device, and return the |
96 | * path. The returned string is owned by caller and should be g_free'ed later. |
97 | */ |
98 | static char *sysfs_find_group_file(const char *device, Error **errp) |
99 | { |
100 | char *sysfs_link; |
101 | char *sysfs_group; |
102 | char *p; |
103 | char *path = NULL; |
104 | |
    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group",
                                 device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
118 | out: |
119 | g_free(sysfs_link); |
120 | g_free(sysfs_group); |
121 | return path; |
122 | } |
123 | |
124 | static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) |
125 | { |
126 | assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); |
127 | } |
128 | |
129 | static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) |
130 | { |
131 | assert_bar_index_valid(s, index); |
132 | s->bar_region_info[index] = (struct vfio_region_info) { |
133 | .index = VFIO_PCI_BAR0_REGION_INDEX + index, |
134 | .argsz = sizeof(struct vfio_region_info), |
135 | }; |
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO,
              &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
138 | return -errno; |
139 | } |
140 | |
141 | return 0; |
142 | } |
143 | |
144 | /** |
145 | * Map a PCI bar area. |
146 | */ |
147 | void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, |
148 | uint64_t offset, uint64_t size, |
149 | Error **errp) |
150 | { |
151 | void *p; |
152 | assert_bar_index_valid(s, index); |
153 | p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), |
154 | PROT_READ | PROT_WRITE, MAP_SHARED, |
155 | s->device, s->bar_region_info[index].offset + offset); |
156 | if (p == MAP_FAILED) { |
157 | error_setg_errno(errp, errno, "Failed to map BAR region" ); |
158 | p = NULL; |
159 | } |
160 | return p; |
161 | } |
162 | |
163 | /** |
164 | * Unmap a PCI bar area. |
165 | */ |
166 | void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, |
167 | uint64_t offset, uint64_t size) |
168 | { |
169 | if (bar) { |
170 | munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); |
171 | } |
172 | } |
173 | |
174 | /** |
 * Initialize device IRQ with @irq_type and register an event notifier.
176 | */ |
177 | int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, |
178 | int irq_type, Error **errp) |
179 | { |
180 | int r; |
181 | struct vfio_irq_set *irq_set; |
182 | size_t irq_set_size; |
183 | struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; |
184 | |
185 | irq_info.index = irq_type; |
186 | if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { |
187 | error_setg_errno(errp, errno, "Failed to get device interrupt info" ); |
188 | return -errno; |
189 | } |
190 | if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { |
191 | error_setg(errp, "Device interrupt doesn't support eventfd" ); |
192 | return -EINVAL; |
193 | } |
194 | |
195 | irq_set_size = sizeof(*irq_set) + sizeof(int); |
196 | irq_set = g_malloc0(irq_set_size); |
197 | |
198 | /* Get to a known IRQ state */ |
199 | *irq_set = (struct vfio_irq_set) { |
200 | .argsz = irq_set_size, |
201 | .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, |
202 | .index = irq_info.index, |
203 | .start = 0, |
204 | .count = 1, |
205 | }; |
206 | |
207 | *(int *)&irq_set->data = event_notifier_get_fd(e); |
208 | r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); |
209 | g_free(irq_set); |
210 | if (r) { |
211 | error_setg_errno(errp, errno, "Failed to setup device interrupt" ); |
212 | return -errno; |
213 | } |
214 | return 0; |
215 | } |
216 | |
217 | static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, |
218 | int size, int ofs) |
219 | { |
220 | int ret; |
221 | |
222 | do { |
223 | ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); |
224 | } while (ret == -1 && errno == EINTR); |
225 | return ret == size ? 0 : -errno; |
226 | } |
227 | |
static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf,
                                      int size, int ofs)
229 | { |
230 | int ret; |
231 | |
232 | do { |
233 | ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); |
234 | } while (ret == -1 && errno == EINTR); |
235 | return ret == size ? 0 : -errno; |
236 | } |
237 | |
238 | static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, |
239 | Error **errp) |
240 | { |
241 | int ret; |
242 | int i; |
243 | uint16_t pci_cmd; |
244 | struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; |
245 | struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; |
246 | struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; |
247 | char *group_file = NULL; |
248 | |
249 | /* Create a new container */ |
250 | s->container = open("/dev/vfio/vfio" , O_RDWR); |
251 | |
252 | if (s->container == -1) { |
253 | error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio" ); |
254 | return -errno; |
255 | } |
256 | if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { |
257 | error_setg(errp, "Invalid VFIO version" ); |
258 | ret = -EINVAL; |
259 | goto fail_container; |
260 | } |
261 | |
262 | if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { |
263 | error_setg_errno(errp, errno, "VFIO IOMMU check failed" ); |
264 | ret = -EINVAL; |
265 | goto fail_container; |
266 | } |
267 | |
268 | /* Open the group */ |
269 | group_file = sysfs_find_group_file(device, errp); |
270 | if (!group_file) { |
271 | ret = -EINVAL; |
272 | goto fail_container; |
273 | } |
274 | |
275 | s->group = open(group_file, O_RDWR); |
276 | if (s->group == -1) { |
277 | error_setg_errno(errp, errno, "Failed to open VFIO group file: %s" , |
278 | group_file); |
279 | g_free(group_file); |
280 | ret = -errno; |
281 | goto fail_container; |
282 | } |
283 | g_free(group_file); |
284 | |
285 | /* Test the group is viable and available */ |
286 | if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { |
287 | error_setg_errno(errp, errno, "Failed to get VFIO group status" ); |
288 | ret = -errno; |
289 | goto fail; |
290 | } |
291 | |
292 | if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { |
293 | error_setg(errp, "VFIO group is not viable" ); |
294 | ret = -EINVAL; |
295 | goto fail; |
296 | } |
297 | |
298 | /* Add the group to the container */ |
299 | if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { |
300 | error_setg_errno(errp, errno, "Failed to add group to VFIO container" ); |
301 | ret = -errno; |
302 | goto fail; |
303 | } |
304 | |
305 | /* Enable the IOMMU model we want */ |
306 | if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { |
307 | error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type" ); |
308 | ret = -errno; |
309 | goto fail; |
310 | } |
311 | |
312 | /* Get additional IOMMU info */ |
313 | if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { |
314 | error_setg_errno(errp, errno, "Failed to get IOMMU info" ); |
315 | ret = -errno; |
316 | goto fail; |
317 | } |
318 | |
319 | s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); |
320 | |
321 | if (s->device < 0) { |
322 | error_setg_errno(errp, errno, "Failed to get device fd" ); |
323 | ret = -errno; |
324 | goto fail; |
325 | } |
326 | |
327 | /* Test and setup the device */ |
328 | if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { |
329 | error_setg_errno(errp, errno, "Failed to get device info" ); |
330 | ret = -errno; |
331 | goto fail; |
332 | } |
333 | |
334 | if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { |
335 | error_setg(errp, "Invalid device regions" ); |
336 | ret = -EINVAL; |
337 | goto fail; |
338 | } |
339 | |
340 | s->config_region_info = (struct vfio_region_info) { |
341 | .index = VFIO_PCI_CONFIG_REGION_INDEX, |
342 | .argsz = sizeof(struct vfio_region_info), |
343 | }; |
344 | if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { |
345 | error_setg_errno(errp, errno, "Failed to get config region info" ); |
346 | ret = -errno; |
347 | goto fail; |
348 | } |
349 | |
350 | for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) { |
351 | ret = qemu_vfio_pci_init_bar(s, i, errp); |
352 | if (ret) { |
353 | goto fail; |
354 | } |
355 | } |
356 | |
357 | /* Enable bus master */ |
358 | ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); |
359 | if (ret) { |
360 | goto fail; |
361 | } |
362 | pci_cmd |= PCI_COMMAND_MASTER; |
363 | ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); |
364 | if (ret) { |
365 | goto fail; |
366 | } |
367 | return 0; |
368 | fail: |
369 | close(s->group); |
370 | fail_container: |
371 | close(s->container); |
372 | return ret; |
373 | } |
374 | |
375 | static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, |
376 | void *host, size_t size) |
377 | { |
378 | QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); |
379 | trace_qemu_vfio_ram_block_added(s, host, size); |
380 | qemu_vfio_dma_map(s, host, size, false, NULL); |
381 | } |
382 | |
383 | static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, |
384 | void *host, size_t size) |
385 | { |
386 | QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); |
387 | if (host) { |
388 | trace_qemu_vfio_ram_block_removed(s, host, size); |
389 | qemu_vfio_dma_unmap(s, host); |
390 | } |
391 | } |
392 | |
393 | static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque) |
394 | { |
395 | void *host_addr = qemu_ram_get_host_addr(rb); |
396 | ram_addr_t length = qemu_ram_get_used_length(rb); |
397 | int ret; |
398 | QEMUVFIOState *s = opaque; |
399 | |
400 | if (!host_addr) { |
401 | return 0; |
402 | } |
403 | ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL); |
404 | if (ret) { |
405 | fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n" , |
406 | host_addr, (uint64_t)length); |
407 | } |
408 | return 0; |
409 | } |
410 | |
411 | static void qemu_vfio_open_common(QEMUVFIOState *s) |
412 | { |
413 | qemu_mutex_init(&s->lock); |
414 | s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; |
415 | s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; |
416 | ram_block_notifier_add(&s->ram_notifier); |
417 | s->low_water_mark = QEMU_VFIO_IOVA_MIN; |
418 | s->high_water_mark = QEMU_VFIO_IOVA_MAX; |
419 | qemu_ram_foreach_block(qemu_vfio_init_ramblock, s); |
420 | } |
421 | |
422 | /** |
423 | * Open a PCI device, e.g. "0000:00:01.0". |
424 | */ |
425 | QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) |
426 | { |
427 | int r; |
428 | QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); |
429 | |
430 | r = qemu_vfio_init_pci(s, device, errp); |
431 | if (r) { |
432 | g_free(s); |
433 | return NULL; |
434 | } |
435 | qemu_vfio_open_common(s); |
436 | return s; |
437 | } |
438 | |
439 | static void qemu_vfio_dump_mapping(IOVAMapping *m) |
440 | { |
441 | if (QEMU_VFIO_DEBUG) { |
442 | printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n" , m->host, |
443 | (uint64_t)m->size, (uint64_t)m->iova); |
444 | } |
445 | } |
446 | |
447 | static void qemu_vfio_dump_mappings(QEMUVFIOState *s) |
448 | { |
449 | int i; |
450 | |
451 | if (QEMU_VFIO_DEBUG) { |
452 | printf("vfio mappings\n" ); |
453 | for (i = 0; i < s->nr_mappings; ++i) { |
454 | qemu_vfio_dump_mapping(&s->mappings[i]); |
455 | } |
456 | } |
457 | } |
458 | |
459 | /** |
 * Find the mapping entry that contains [host, host + size) and set @index to
 * its position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. In other words, it is the index of the largest
 * element that is smaller than @host, or -1 if there is no such entry.
464 | */ |
465 | static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, |
466 | int *index) |
467 | { |
468 | IOVAMapping *p = s->mappings; |
469 | IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; |
470 | IOVAMapping *mid; |
471 | trace_qemu_vfio_find_mapping(s, host); |
472 | if (!p) { |
473 | *index = -1; |
474 | return NULL; |
475 | } |
476 | while (true) { |
477 | mid = p + (q - p) / 2; |
478 | if (mid == p) { |
479 | break; |
480 | } |
481 | if (mid->host > host) { |
482 | q = mid; |
483 | } else if (mid->host < host) { |
484 | p = mid; |
485 | } else { |
486 | break; |
487 | } |
488 | } |
489 | if (mid->host > host) { |
490 | mid--; |
491 | } else if (mid < &s->mappings[s->nr_mappings - 1] |
492 | && (mid + 1)->host <= host) { |
493 | mid++; |
494 | } |
495 | *index = mid - &s->mappings[0]; |
496 | if (mid >= &s->mappings[0] && |
497 | mid->host <= host && mid->host + mid->size > host) { |
498 | assert(mid < &s->mappings[s->nr_mappings]); |
499 | return mid; |
500 | } |
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
503 | return NULL; |
504 | } |
505 | |
506 | /** |
 * Allocate an IOVA and create a new mapping record, then insert it into @s.
508 | */ |
509 | static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, |
510 | void *host, size_t size, |
511 | int index, uint64_t iova) |
512 | { |
513 | int shift; |
514 | IOVAMapping m = {.host = host, .size = size, .iova = iova}; |
515 | IOVAMapping *insert; |
516 | |
517 | assert(QEMU_IS_ALIGNED(size, getpagesize())); |
518 | assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize())); |
519 | assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize())); |
520 | trace_qemu_vfio_new_mapping(s, host, size, index, iova); |
521 | |
522 | assert(index >= 0); |
523 | s->nr_mappings++; |
524 | s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings); |
525 | insert = &s->mappings[index]; |
526 | shift = s->nr_mappings - index - 1; |
527 | if (shift) { |
528 | memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); |
529 | } |
530 | *insert = m; |
531 | return insert; |
532 | } |
533 | |
534 | /* Do the DMA mapping with VFIO. */ |
535 | static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, |
536 | uint64_t iova) |
537 | { |
538 | struct vfio_iommu_type1_dma_map dma_map = { |
539 | .argsz = sizeof(dma_map), |
540 | .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, |
541 | .iova = iova, |
542 | .vaddr = (uintptr_t)host, |
543 | .size = size, |
544 | }; |
545 | trace_qemu_vfio_do_mapping(s, host, size, iova); |
546 | |
547 | if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { |
548 | error_report("VFIO_MAP_DMA: %d" , -errno); |
549 | return -errno; |
550 | } |
551 | return 0; |
552 | } |
553 | |
554 | /** |
555 | * Undo the DMA mapping from @s with VFIO, and remove from mapping list. |
556 | */ |
557 | static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping, |
558 | Error **errp) |
559 | { |
560 | int index; |
561 | struct vfio_iommu_type1_dma_unmap unmap = { |
562 | .argsz = sizeof(unmap), |
563 | .flags = 0, |
564 | .iova = mapping->iova, |
565 | .size = mapping->size, |
566 | }; |
567 | |
568 | index = mapping - s->mappings; |
569 | assert(mapping->size > 0); |
570 | assert(QEMU_IS_ALIGNED(mapping->size, getpagesize())); |
571 | assert(index >= 0 && index < s->nr_mappings); |
572 | if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { |
573 | error_setg(errp, "VFIO_UNMAP_DMA failed: %d" , -errno); |
574 | } |
575 | memmove(mapping, &s->mappings[index + 1], |
576 | sizeof(s->mappings[0]) * (s->nr_mappings - index - 1)); |
577 | s->nr_mappings--; |
578 | s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings); |
579 | } |
580 | |
581 | /* Check if the mapping list is (ascending) ordered. */ |
582 | static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) |
583 | { |
584 | int i; |
585 | if (QEMU_VFIO_DEBUG) { |
586 | for (i = 0; i < s->nr_mappings - 1; ++i) { |
587 | if (!(s->mappings[i].host < s->mappings[i + 1].host)) { |
588 | fprintf(stderr, "item %d not sorted!\n" , i); |
589 | qemu_vfio_dump_mappings(s); |
590 | return false; |
591 | } |
592 | if (!(s->mappings[i].host + s->mappings[i].size <= |
593 | s->mappings[i + 1].host)) { |
594 | fprintf(stderr, "item %d overlap with next!\n" , i); |
595 | qemu_vfio_dump_mappings(s); |
596 | return false; |
597 | } |
598 | } |
599 | } |
600 | return true; |
601 | } |
602 | |
/* Map the [host, host + size) area into a contiguous IOVA address space, and
 * store the result in @iova if not NULL. The caller needs to make sure the
 * area is aligned to page size and must not partially overlap with existing
 * mappings: it must either fall entirely within one existing mapping or not
 * be mapped at all.
 */
608 | int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, |
609 | bool temporary, uint64_t *iova) |
610 | { |
611 | int ret = 0; |
612 | int index; |
613 | IOVAMapping *mapping; |
614 | uint64_t iova0; |
615 | |
616 | assert(QEMU_PTR_IS_ALIGNED(host, getpagesize())); |
617 | assert(QEMU_IS_ALIGNED(size, getpagesize())); |
618 | trace_qemu_vfio_dma_map(s, host, size, temporary, iova); |
619 | qemu_mutex_lock(&s->lock); |
620 | mapping = qemu_vfio_find_mapping(s, host, &index); |
621 | if (mapping) { |
622 | iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); |
623 | } else { |
624 | if (s->high_water_mark - s->low_water_mark + 1 < size) { |
625 | ret = -ENOMEM; |
626 | goto out; |
627 | } |
628 | if (!temporary) { |
629 | iova0 = s->low_water_mark; |
630 | mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); |
631 | if (!mapping) { |
632 | ret = -ENOMEM; |
633 | goto out; |
634 | } |
635 | assert(qemu_vfio_verify_mappings(s)); |
636 | ret = qemu_vfio_do_mapping(s, host, size, iova0); |
637 | if (ret) { |
638 | qemu_vfio_undo_mapping(s, mapping, NULL); |
639 | goto out; |
640 | } |
641 | s->low_water_mark += size; |
642 | qemu_vfio_dump_mappings(s); |
643 | } else { |
644 | iova0 = s->high_water_mark - size; |
645 | ret = qemu_vfio_do_mapping(s, host, size, iova0); |
646 | if (ret) { |
647 | goto out; |
648 | } |
649 | s->high_water_mark -= size; |
650 | } |
651 | } |
652 | if (iova) { |
653 | *iova = iova0; |
654 | } |
655 | out: |
656 | qemu_mutex_unlock(&s->lock); |
657 | return ret; |
658 | } |
659 | |
660 | /* Reset the high watermark and free all "temporary" mappings. */ |
661 | int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) |
662 | { |
663 | struct vfio_iommu_type1_dma_unmap unmap = { |
664 | .argsz = sizeof(unmap), |
665 | .flags = 0, |
666 | .iova = s->high_water_mark, |
667 | .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, |
668 | }; |
669 | trace_qemu_vfio_dma_reset_temporary(s); |
670 | qemu_mutex_lock(&s->lock); |
671 | if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { |
672 | error_report("VFIO_UNMAP_DMA: %d" , -errno); |
673 | qemu_mutex_unlock(&s->lock); |
674 | return -errno; |
675 | } |
676 | s->high_water_mark = QEMU_VFIO_IOVA_MAX; |
677 | qemu_mutex_unlock(&s->lock); |
678 | return 0; |
679 | } |
680 | |
/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
683 | void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) |
684 | { |
685 | int index = 0; |
686 | IOVAMapping *m; |
687 | |
688 | if (!host) { |
689 | return; |
690 | } |
691 | |
692 | trace_qemu_vfio_dma_unmap(s, host); |
693 | qemu_mutex_lock(&s->lock); |
694 | m = qemu_vfio_find_mapping(s, host, &index); |
695 | if (!m) { |
696 | goto out; |
697 | } |
698 | qemu_vfio_undo_mapping(s, m, NULL); |
699 | out: |
700 | qemu_mutex_unlock(&s->lock); |
701 | } |
702 | |
703 | static void qemu_vfio_reset(QEMUVFIOState *s) |
704 | { |
705 | ioctl(s->device, VFIO_DEVICE_RESET); |
706 | } |
707 | |
708 | /* Close and free the VFIO resources. */ |
709 | void qemu_vfio_close(QEMUVFIOState *s) |
710 | { |
711 | int i; |
712 | |
713 | if (!s) { |
714 | return; |
715 | } |
716 | for (i = 0; i < s->nr_mappings; ++i) { |
717 | qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); |
718 | } |
719 | ram_block_notifier_remove(&s->ram_notifier); |
720 | qemu_vfio_reset(s); |
721 | close(s->device); |
722 | close(s->group); |
723 | close(s->container); |
724 | } |
725 | |