summaryrefslogtreecommitdiff
path: root/util
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2018-02-08 14:31:51 +0000
committerPeter Maydell <peter.maydell@linaro.org>2018-02-08 14:31:51 +0000
commit008a51bbb343972dd8cf09126da8c3b87f4e1c96 (patch)
tree6e5f7c697a796ef35198a65623e4dbdc76d3e6da /util
parentb256b89c63c12e902645dd55e7b5362f60574742 (diff)
parent4eb995603479f0f7aff14b518f8ada16fe694ca7 (diff)
downloadqemu-008a51bbb343972dd8cf09126da8c3b87f4e1c96.tar.gz
Merge remote-tracking branch 'remotes/famz/tags/staging-pull-request' into staging
# gpg: Signature made Thu 08 Feb 2018 01:29:22 GMT # gpg: using RSA key CA35624C6A9171C6 # gpg: Good signature from "Fam Zheng <famz@redhat.com>" # Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6 * remotes/famz/tags/staging-pull-request: docs: Add docs/devel/testing.rst qapi: Add NVMe driver options to the schema docs: Add section for NVMe VFIO driver block: Move NVMe constants to a separate header qemu-img: Map bench buffer block/nvme: Implement .bdrv_(un)register_buf block: Introduce buf register API block: Add VFIO based NVMe driver util: Introduce vfio helpers stubs: Add stubs for ram block API curl: convert to CoQueue coroutine-lock: make qemu_co_enter_next thread-safe coroutine-lock: convert CoQueue to use QemuLockable lockable: add QemuLockable test-coroutine: add simple CoMutex test docker: change Fedora base image to fedora:27 Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'util')
-rw-r--r--util/Makefile.objs1
-rw-r--r--util/qemu-coroutine-lock.c22
-rw-r--r--util/trace-events11
-rw-r--r--util/vfio-helpers.c727
4 files changed, 754 insertions, 7 deletions
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 2973b0a323..3fb611631f 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -46,3 +46,4 @@ util-obj-y += qht.o
util-obj-y += range.o
util-obj-y += stats64.o
util-obj-y += systemd.o
+util-obj-$(CONFIG_LINUX) += vfio-helpers.o
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index 846ff9167f..78fb79acf8 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -40,13 +40,13 @@ void qemu_co_queue_init(CoQueue *queue)
QSIMPLEQ_INIT(&queue->entries);
}
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
+void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock)
{
Coroutine *self = qemu_coroutine_self();
QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
- if (mutex) {
- qemu_co_mutex_unlock(mutex);
+ if (lock) {
+ qemu_lockable_unlock(lock);
}
/* There is no race condition here. Other threads will call
@@ -60,9 +60,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
/* TODO: OSv implements wait morphing here, where the wakeup
* primitive automatically places the woken coroutine on the
* mutex's queue. This avoids the thundering herd effect.
+ * This could be implemented for CoMutexes, but not really for
+ * other cases of QemuLockable.
*/
- if (mutex) {
- qemu_co_mutex_lock(mutex);
+ if (lock) {
+ qemu_lockable_lock(lock);
}
}
@@ -130,7 +132,7 @@ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue)
qemu_co_queue_do_restart(queue, false);
}
-bool qemu_co_enter_next(CoQueue *queue)
+bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock)
{
Coroutine *next;
@@ -140,7 +142,13 @@ bool qemu_co_enter_next(CoQueue *queue)
}
QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
- qemu_coroutine_enter(next);
+ if (lock) {
+ qemu_lockable_unlock(lock);
+ }
+ aio_co_wake(next);
+ if (lock) {
+ qemu_lockable_lock(lock);
+ }
return true;
}
diff --git a/util/trace-events b/util/trace-events
index 515e6257fb..4822434c89 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -60,3 +60,14 @@ lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter"
qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)"
qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)"
qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)"
+
+# util/vfio-helpers.c
+qemu_vfio_dma_reset_temporary(void *s) "s %p"
+qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64
+qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size %zu iova 0x%"PRIx64
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size %zu temporary %d iova %p"
+qemu_vfio_dma_map_invalid(void *s, void *mapping_host, size_t mapping_size, void *host, size_t size) "s %p mapping %p %zu requested %p %zu"
+qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
new file mode 100644
index 0000000000..f478b68400
--- /dev/null
+++ b/util/vfio-helpers.c
@@ -0,0 +1,727 @@
+/*
+ * VFIO utility
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/cpu-common.h"
+#include "trace.h"
+#include "qemu/queue.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/pci_regs.h"
+#include "qemu/event_notifier.h"
+#include "qemu/vfio-helpers.h"
+#include "trace.h"
+
+#define QEMU_VFIO_DEBUG 0
+
+#define QEMU_VFIO_IOVA_MIN 0x10000ULL
+/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
+ * we can use a runtime limit; alternatively it's also possible to do platform
+ * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
+ **/
+#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
+
+/* One record of a fixed host-range to IOVA association kept in
+ * QEMUVFIOState.mappings. */
+typedef struct {
+ /* Page aligned addr. */
+ void *host;
+ /* Length of the host range in bytes; qemu_vfio_add_mapping() asserts it
+ * is a multiple of the page size. */
+ size_t size;
+ /* I/O virtual address that corresponds to @host. */
+ uint64_t iova;
+} IOVAMapping;
+
+/* State of one opened VFIO device together with the bookkeeping of its IOVA
+ * address space. */
+struct QEMUVFIOState {
+ QemuMutex lock;
+
+ /* These fields are protected by BQL */
+ /* File descriptors: @container is opened from /dev/vfio/vfio, @group from
+ * the /dev/vfio/<group> node and @device is obtained from the group with
+ * VFIO_GROUP_GET_DEVICE_FD. */
+ int container;
+ int group;
+ int device;
+ RAMBlockNotifier ram_notifier;
+ /* Region descriptors filled in with VFIO_DEVICE_GET_REGION_INFO. */
+ struct vfio_region_info config_region_info, bar_region_info[6];
+
+ /* These fields are protected by @lock */
+ /* VFIO's IO virtual address space is managed by splitting into a few
+ * sections:
+ *
+ * --------------- <= 0
+ * |xxxxxxxxxxxxx|
+ * |-------------| <= QEMU_VFIO_IOVA_MIN
+ * | |
+ * | Fixed |
+ * | |
+ * |-------------| <= low_water_mark
+ * | |
+ * | Free |
+ * | |
+ * |-------------| <= high_water_mark
+ * | |
+ * | Temp |
+ * | |
+ * |-------------| <= QEMU_VFIO_IOVA_MAX
+ * |xxxxxxxxxxxxx|
+ * |xxxxxxxxxxxxx|
+ * ---------------
+ *
+ * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+ *
+ * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
+ * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
+ * reclaimed - low_water_mark never shrinks;
+ *
+ * - IOVAs in range [low_water_mark, high_water_mark) are free;
+ *
+ * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+ * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
+ * is recycled. The caller should make sure I/O's depending on these
+ * mappings are completed before calling.
+ **/
+ uint64_t low_water_mark;
+ uint64_t high_water_mark;
+ /* Array of @nr_mappings fixed mapping records. The lookup code bisects it
+ * by host address, so it is expected to stay sorted by ascending @host
+ * (see qemu_vfio_verify_mappings()). */
+ IOVAMapping *mappings;
+ int nr_mappings;
+};
+
+/**
+ * Find group file by PCI device address as specified @device, and return the
+ * path. The returned string is owned by caller and should be g_free'ed later.
+ *
+ * Returns NULL and sets @errp when the iommu_group symlink cannot be read or
+ * its target contains no '/' separator.
+ */
+static char *sysfs_find_group_file(const char *device, Error **errp)
+{
+    char *sysfs_link;
+    char *sysfs_group;
+    char *p;
+    char *path = NULL;
+    ssize_t len;
+
+    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
+    sysfs_group = g_malloc(PATH_MAX);
+    /* readlink() does not NUL-terminate what it writes, so keep the length
+     * and terminate the buffer before using string functions on it. One byte
+     * is left spare by passing PATH_MAX - 1. */
+    len = readlink(sysfs_link, sysfs_group, PATH_MAX - 1);
+    if (len == -1) {
+        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
+        goto out;
+    }
+    sysfs_group[len] = '\0';
+
+    /* The group number is the last path component of the link target. */
+    p = strrchr(sysfs_group, '/');
+    if (!p) {
+        error_setg(errp, "Failed to find iommu group number");
+        goto out;
+    }
+
+    path = g_strdup_printf("/dev/vfio/%s", p + 1);
+out:
+    g_free(sysfs_link);
+    g_free(sysfs_group);
+    return path;
+}
+
+/* Abort unless @index addresses an element of s->bar_region_info[]. */
+static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
+{
+    assert(index >= 0);
+    assert(index < ARRAY_SIZE(s->bar_region_info));
+}
+
+/* Query the region info of BAR @index from the device and cache it in
+ * s->bar_region_info[index]. Returns 0 on success; on failure sets @errp and
+ * returns a negative errno. */
+static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
+{
+    struct vfio_region_info *region;
+
+    assert_bar_index_valid(s, index);
+    region = &s->bar_region_info[index];
+    *region = (struct vfio_region_info) {
+        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
+        .argsz = sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, region) == 0) {
+        return 0;
+    }
+
+    error_setg_errno(errp, errno, "Failed to get BAR region info");
+    return -errno;
+}
+
+/**
+ * Map a PCI bar area.
+ *
+ * Maps at most @size bytes of BAR @index starting at @offset, clamped to what
+ * remains of the region after @offset. Returns the mapped address, or NULL
+ * with @errp set if mmap() fails.
+ */
+void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
+                            uint64_t offset, uint64_t size,
+                            Error **errp)
+{
+    const struct vfio_region_info *region;
+    void *p;
+
+    assert_bar_index_valid(s, index);
+    region = &s->bar_region_info[index];
+    p = mmap(NULL, MIN(size, region->size - offset),
+             PROT_READ | PROT_WRITE, MAP_SHARED,
+             s->device, region->offset + offset);
+    if (p != MAP_FAILED) {
+        return p;
+    }
+
+    error_setg_errno(errp, errno, "Failed to map BAR region");
+    return NULL;
+}
+
+/**
+ * Unmap a PCI bar area.
+ *
+ * @bar is the address returned by qemu_vfio_pci_map_bar(); @index, @offset
+ * and @size are used to recompute the mapped length the same way the map
+ * call does. A NULL @bar is ignored.
+ */
+void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
+                             uint64_t offset, uint64_t size)
+{
+    if (bar) {
+        /* Validate @index before it is used as an array subscript, as
+         * qemu_vfio_pci_map_bar() does. */
+        assert_bar_index_valid(s, index);
+        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
+    }
+}
+
+/**
+ * Initialize device IRQ with @irq_type and register an event notifier.
+ *
+ * The file descriptor of @e is installed as the eventfd trigger for
+ * sub-index 0 of the @irq_type interrupt index. Returns 0 on success; on
+ * failure sets @errp and returns a negative errno.
+ */
+int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
+                           int irq_type, Error **errp)
+{
+    int r;
+    int saved_errno;
+    struct vfio_irq_set *irq_set;
+    size_t irq_set_size;
+    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+
+    irq_info.index = irq_type;
+    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
+        error_setg_errno(errp, errno, "Failed to get device interrupt info");
+        return -errno;
+    }
+    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+        error_setg(errp, "Device interrupt doesn't support eventfd");
+        return -EINVAL;
+    }
+
+    /* Header followed by one int of payload: the eventfd. */
+    irq_set_size = sizeof(*irq_set) + sizeof(int);
+    irq_set = g_malloc0(irq_set_size);
+
+    /* Get to a known IRQ state */
+    *irq_set = (struct vfio_irq_set) {
+        .argsz = irq_set_size,
+        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+        .index = irq_info.index,
+        .start = 0,
+        .count = 1,
+    };
+
+    *(int *)&irq_set->data = event_notifier_get_fd(e);
+    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
+    /* Capture errno right away: g_free() below may overwrite it. */
+    saved_errno = errno;
+    g_free(irq_set);
+    if (r) {
+        error_setg_errno(errp, saved_errno, "Failed to setup device interrupt");
+        return -saved_errno;
+    }
+    return 0;
+}
+
+/* Read @size bytes at offset @ofs of the PCI config region into @buf,
+ * retrying when interrupted by a signal. Returns 0 when exactly @size bytes
+ * were read, -errno when pread() failed, and -EIO on a short read (errno is
+ * not meaningful in that case). */
+static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
+                                     int size, int ofs)
+{
+    int ret;
+
+    do {
+        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
+    } while (ret == -1 && errno == EINTR);
+    if (ret == size) {
+        return 0;
+    }
+    return ret < 0 ? -errno : -EIO;
+}
+
+/* Write @size bytes from @buf at offset @ofs of the PCI config region,
+ * retrying when interrupted by a signal. Returns 0 when exactly @size bytes
+ * were written, -errno when pwrite() failed, and -EIO on a short write
+ * (errno is not meaningful in that case). */
+static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
+{
+    int ret;
+
+    do {
+        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
+    } while (ret == -1 && errno == EINTR);
+    if (ret == size) {
+        return 0;
+    }
+    return ret < 0 ? -errno : -EIO;
+}
+
+/* Open the VFIO container, group and device for the PCI device named
+ * @device, cache its config and BAR region info in @s, and enable bus
+ * mastering.
+ *
+ * Returns 0 on success. On failure every file descriptor opened here is
+ * closed again, @errp is set and a negative errno is returned. */
+static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+                              Error **errp)
+{
+    int ret;
+    int i;
+    uint16_t pci_cmd;
+    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+    char *group_file = NULL;
+
+    /* Create a new container */
+    s->container = open("/dev/vfio/vfio", O_RDWR);
+
+    if (s->container == -1) {
+        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
+        return -errno;
+    }
+    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+        error_setg(errp, "Invalid VFIO version");
+        ret = -EINVAL;
+        goto fail_container;
+    }
+
+    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
+        ret = -EINVAL;
+        goto fail_container;
+    }
+
+    /* Open the group */
+    group_file = sysfs_find_group_file(device, errp);
+    if (!group_file) {
+        ret = -EINVAL;
+        goto fail_container;
+    }
+
+    s->group = open(group_file, O_RDWR);
+    if (s->group == -1) {
+        /* Record the error code before g_free() can disturb errno. */
+        ret = -errno;
+        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
+                         group_file);
+        g_free(group_file);
+        goto fail_container;
+    }
+    g_free(group_file);
+
+    /* Test the group is viable and available */
+    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
+        error_setg_errno(errp, errno, "Failed to get VFIO group status");
+        ret = -errno;
+        goto fail;
+    }
+
+    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_setg(errp, "VFIO group is not viable");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Add the group to the container */
+    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
+        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* Enable the IOMMU model we want */
+    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* Get additional IOMMU info */
+    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+        error_setg_errno(errp, errno, "Failed to get IOMMU info");
+        ret = -errno;
+        goto fail;
+    }
+
+    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
+
+    if (s->device < 0) {
+        error_setg_errno(errp, errno, "Failed to get device fd");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* From here on the device fd must be released on failure as well. */
+
+    /* Test and setup the device */
+    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
+        error_setg_errno(errp, errno, "Failed to get device info");
+        ret = -errno;
+        goto fail_device;
+    }
+
+    /* Region indexes are zero based, so the config region only exists when
+     * num_regions is strictly greater than its index. */
+    if (device_info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX) {
+        error_setg(errp, "Invalid device regions");
+        ret = -EINVAL;
+        goto fail_device;
+    }
+
+    s->config_region_info = (struct vfio_region_info) {
+        .index = VFIO_PCI_CONFIG_REGION_INDEX,
+        .argsz = sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
+        error_setg_errno(errp, errno, "Failed to get config region info");
+        ret = -errno;
+        goto fail_device;
+    }
+
+    for (i = 0; i < 6; i++) {
+        ret = qemu_vfio_pci_init_bar(s, i, errp);
+        if (ret) {
+            goto fail_device;
+        }
+    }
+
+    /* Enable bus master */
+    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+    if (ret) {
+        /* The config helpers do not take an Error, so report here. */
+        error_setg_errno(errp, -ret, "Failed to read PCI command register");
+        goto fail_device;
+    }
+    pci_cmd |= PCI_COMMAND_MASTER;
+    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+    if (ret) {
+        error_setg_errno(errp, -ret, "Failed to write PCI command register");
+        goto fail_device;
+    }
+    return 0;
+fail_device:
+    close(s->device);
+fail:
+    close(s->group);
+fail_container:
+    close(s->container);
+    return ret;
+}
+
+/* RAMBlockNotifier callback: create a fixed DMA mapping for the RAM block
+ * [host, host + size). The result of the mapping call is not checked. */
+static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
+                                      void *host, size_t size)
+{
+    QEMUVFIOState *state = container_of(n, QEMUVFIOState, ram_notifier);
+
+    trace_qemu_vfio_ram_block_added(state, host, size);
+    qemu_vfio_dma_map(state, host, size, false, NULL);
+}
+
+/* RAMBlockNotifier callback: drop the DMA mapping that starts at @host.
+ * Blocks without a host address are ignored. */
+static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
+                                        void *host, size_t size)
+{
+    QEMUVFIOState *state = container_of(n, QEMUVFIOState, ram_notifier);
+
+    if (!host) {
+        return;
+    }
+    trace_qemu_vfio_ram_block_removed(state, host, size);
+    qemu_vfio_dma_unmap(state, host);
+}
+
+/* qemu_ram_foreach_block() callback: create a fixed DMA mapping for one RAM
+ * block. @opaque is the QEMUVFIOState. Blocks without a host address are
+ * skipped. A mapping failure is only logged; 0 is always returned. */
+static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr,
+                                   ram_addr_t offset, ram_addr_t length,
+                                   void *opaque)
+{
+    int ret;
+    QEMUVFIOState *s = opaque;
+
+    if (!host_addr) {
+        return 0;
+    }
+    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
+    if (ret) {
+        /* The length is cast to uint64_t, so print it as unsigned. */
+        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRIu64 "\n",
+                host_addr, (uint64_t)length);
+    }
+    return 0;
+}
+
+/* Bus-independent part of opening a device: set up the IOVA allocator state
+ * and the lock, register the RAM block notifier and map the RAM blocks that
+ * already exist. */
+static void qemu_vfio_open_common(QEMUVFIOState *s)
+{
+    /* The lock and the water marks must be ready before anything can reach
+     * qemu_vfio_dma_map(), which both the notifier callbacks and the
+     * per-block walk below do. */
+    qemu_mutex_init(&s->lock);
+    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
+    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+
+    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
+    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
+    ram_block_notifier_add(&s->ram_notifier);
+    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
+}
+
+/**
+ * Open a PCI device, e.g. "0000:00:01.0".
+ *
+ * Returns a newly allocated state on success, or NULL if the PCI specific
+ * initialization fails (the state is freed again in that case).
+ */
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+{
+    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
+
+    if (qemu_vfio_init_pci(s, device, errp) != 0) {
+        g_free(s);
+        return NULL;
+    }
+
+    qemu_vfio_open_common(s);
+    return s;
+}
+
+/* Print one mapping record to stdout; does nothing unless QEMU_VFIO_DEBUG is
+ * non-zero. */
+static void qemu_vfio_dump_mapping(IOVAMapping *m)
+{
+    if (!QEMU_VFIO_DEBUG) {
+        return;
+    }
+    printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
+           (uint64_t)m->size, (uint64_t)m->iova);
+}
+
+/* Print every fixed mapping record of @s to stdout; does nothing unless
+ * QEMU_VFIO_DEBUG is non-zero. */
+static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
+{
+    int idx = 0;
+
+    if (!QEMU_VFIO_DEBUG) {
+        return;
+    }
+    printf("vfio mappings\n");
+    while (idx < s->nr_mappings) {
+        qemu_vfio_dump_mapping(&s->mappings[idx]);
+        ++idx;
+    }
+}
+
+/**
+ * Find the mapping entry that contains [host, host + size) and set @index to
+ * the position. If no entry contains it, @index is the position _after_ which
+ * to insert the new mapping. IOW, it is the index of the largest element that
+ * is smaller than @host, or -1 if no entry is.
+ *
+ * Only the start address @host is tested against each entry here; the
+ * function takes no size argument. The lookup bisects s->mappings and so
+ * relies on the array being sorted by ascending host address.
+ */
+static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
+ int *index)
+{
+ /* p and q are the inclusive lower and upper bounds of the search window. */
+ IOVAMapping *p = s->mappings;
+ IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
+ IOVAMapping *mid;
+ trace_qemu_vfio_find_mapping(s, host);
+ if (!p) {
+ /* No array at all: nothing precedes @host. */
+ *index = -1;
+ return NULL;
+ }
+ while (true) {
+ mid = p + (q - p) / 2;
+ /* The midpoint rounds down, so mid == p means the window has shrunk to
+ * one or two entries; the fix-up below settles the final position. */
+ if (mid == p) {
+ break;
+ }
+ if (mid->host > host) {
+ q = mid;
+ } else if (mid->host < host) {
+ p = mid;
+ } else {
+ break;
+ }
+ }
+ /* Adjust mid so that it is the last entry whose host is <= @host. This may
+ * step to one element before the array when every entry is above @host. */
+ if (mid->host > host) {
+ mid--;
+ } else if (mid < &s->mappings[s->nr_mappings - 1]
+ && (mid + 1)->host <= host) {
+ mid++;
+ }
+ *index = mid - &s->mappings[0];
+ /* A hit requires mid to be a real entry whose range covers @host. */
+ if (mid >= &s->mappings[0] &&
+ mid->host <= host && mid->host + mid->size > host) {
+ assert(mid < &s->mappings[s->nr_mappings]);
+ return mid;
+ }
+ /* At this point *index + 1 is the right position to insert the new
+ * mapping.*/
+ return NULL;
+}
+
+/**
+ * Create a new mapping record for [@host, @host + @size) at @iova and insert
+ * it in @s at position @index, shifting later records up by one.
+ *
+ * The IOVA is chosen by the caller; this function only does the bookkeeping.
+ * Returns a pointer to the inserted record inside s->mappings.
+ */
+static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
+                                          void *host, size_t size,
+                                          int index, uint64_t iova)
+{
+    int shift;
+    IOVAMapping m = {.host = host, .size = size, .iova = iova};
+    IOVAMapping *insert;
+
+    assert(QEMU_IS_ALIGNED(size, getpagesize()));
+    assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize()));
+    assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize()));
+    trace_qemu_vfio_new_mapping(s, host, size, index, iova);
+
+    assert(index >= 0);
+    s->nr_mappings++;
+    /* g_realloc_n() takes the element count first, then the element size. */
+    s->mappings = g_realloc_n(s->mappings, s->nr_mappings,
+                              sizeof(s->mappings[0]));
+    insert = &s->mappings[index];
+    /* Number of existing records that sit at or after the insert position. */
+    shift = s->nr_mappings - index - 1;
+    if (shift) {
+        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
+    }
+    *insert = m;
+    return insert;
+}
+
+/* Do the DMA mapping with VFIO: map [host, host + size) read/write at @iova
+ * in the container. Returns 0 on success, or a negative errno after reporting
+ * the failure. */
+static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
+                                uint64_t iova)
+{
+    struct vfio_iommu_type1_dma_map dma_map = {
+        .argsz = sizeof(dma_map),
+        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+        .iova = iova,
+        .vaddr = (uintptr_t)host,
+        .size = size,
+    };
+    trace_qemu_vfio_do_mapping(s, host, size, iova);
+
+    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+        /* Save the code first; reporting the error may change errno. */
+        int ret = -errno;
+
+        error_report("VFIO_MAP_DMA: %d", ret);
+        return ret;
+    }
+    return 0;
+}
+
+/**
+ * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
+ *
+ * @mapping must point into s->mappings. The record is removed from the list
+ * even if the VFIO unmap ioctl fails; in that case @errp is set. Pointers to
+ * records at or after @mapping are invalid afterwards because the array is
+ * compacted and reallocated.
+ */
+static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
+                                   Error **errp)
+{
+    int index;
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = mapping->iova,
+        .size = mapping->size,
+    };
+
+    index = mapping - s->mappings;
+    assert(mapping->size > 0);
+    assert(QEMU_IS_ALIGNED(mapping->size, getpagesize()));
+    assert(index >= 0 && index < s->nr_mappings);
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        int unmap_errno = errno;
+
+        error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -unmap_errno);
+    }
+    /* Close the gap left by the removed record. */
+    memmove(mapping, &s->mappings[index + 1],
+            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
+    s->nr_mappings--;
+    /* g_realloc_n() takes the element count first, then the element size. */
+    s->mappings = g_realloc_n(s->mappings, s->nr_mappings,
+                              sizeof(s->mappings[0]));
+}
+
+/* Check if the mapping list is (ascending) ordered and free of overlaps
+ * between neighbours. The check only runs when QEMU_VFIO_DEBUG is non-zero;
+ * otherwise true is returned unconditionally. */
+static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
+{
+    int i;
+
+    if (!QEMU_VFIO_DEBUG) {
+        return true;
+    }
+    for (i = 0; i < s->nr_mappings - 1; ++i) {
+        const IOVAMapping *cur = &s->mappings[i];
+        const IOVAMapping *next = &s->mappings[i + 1];
+
+        if (cur->host >= next->host) {
+            fprintf(stderr, "item %d not sorted!\n", i);
+            qemu_vfio_dump_mappings(s);
+            return false;
+        }
+        if (cur->host + cur->size > next->host) {
+            fprintf(stderr, "item %d overlap with next!\n", i);
+            qemu_vfio_dump_mappings(s);
+            return false;
+        }
+    }
+    return true;
+}
+
+/* Map [host, host + size) area into a contiguous IOVA address space, and store
+ * the result in @iova if not NULL. The caller need to make sure the area is
+ * aligned to page size, and mustn't overlap with existing mapping areas (split
+ * mapping status within this area is not allowed).
+ *
+ * If @host already lies inside a recorded fixed mapping, no new mapping is
+ * made and the corresponding IOVA inside that mapping is returned. Otherwise
+ * a fixed mapping (@temporary false) is allocated upwards from
+ * low_water_mark and recorded in s->mappings, while a temporary mapping is
+ * allocated downwards from high_water_mark and is not recorded.
+ *
+ * Returns 0 on success, -ENOMEM when the free IOVA range is too small, or the
+ * negative errno from the VFIO map ioctl. Takes s->lock internally.
+ */
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+ bool temporary, uint64_t *iova)
+{
+ int ret = 0;
+ int index;
+ IOVAMapping *mapping;
+ uint64_t iova0;
+
+ assert(QEMU_PTR_IS_ALIGNED(host, getpagesize()));
+ assert(QEMU_IS_ALIGNED(size, getpagesize()));
+ trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
+ qemu_mutex_lock(&s->lock);
+ mapping = qemu_vfio_find_mapping(s, host, &index);
+ if (mapping) {
+ /* Reuse the existing mapping: same offset into it on the IOVA side.
+ * NOTE(review): @size is not checked against the end of the found
+ * mapping here - confirm callers never pass a range that runs past it. */
+ iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
+ } else {
+ /* NOTE(review): the free range is [low_water_mark, high_water_mark), so
+ * the "+ 1" looks one byte too generous; with page-aligned values it
+ * should not change the outcome - confirm. */
+ if (s->high_water_mark - s->low_water_mark + 1 < size) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!temporary) {
+ iova0 = s->low_water_mark;
+ /* index is the entry preceding @host, so the new record goes right
+ * after it. */
+ mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
+ if (!mapping) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ assert(qemu_vfio_verify_mappings(s));
+ ret = qemu_vfio_do_mapping(s, host, size, iova0);
+ if (ret) {
+ /* Drop the record again; low_water_mark has not been advanced. */
+ qemu_vfio_undo_mapping(s, mapping, NULL);
+ goto out;
+ }
+ s->low_water_mark += size;
+ qemu_vfio_dump_mappings(s);
+ } else {
+ iova0 = s->high_water_mark - size;
+ ret = qemu_vfio_do_mapping(s, host, size, iova0);
+ if (ret) {
+ goto out;
+ }
+ s->high_water_mark -= size;
+ }
+ }
+ if (iova) {
+ *iova = iova0;
+ }
+out:
+ qemu_mutex_unlock(&s->lock);
+ return ret;
+}
+
+/* Reset the high watermark and free all "temporary" mappings.
+ *
+ * Unmaps the whole [high_water_mark, QEMU_VFIO_IOVA_MAX) range with VFIO and,
+ * on success, moves high_water_mark back to QEMU_VFIO_IOVA_MAX. Returns 0 on
+ * success or a negative errno after reporting the failure. */
+int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+    };
+    int ret = 0;
+
+    trace_qemu_vfio_dma_reset_temporary(s);
+    qemu_mutex_lock(&s->lock);
+    /* high_water_mark is protected by s->lock, so only sample it once the
+     * lock is held. */
+    unmap.iova = s->high_water_mark;
+    unmap.size = QEMU_VFIO_IOVA_MAX - s->high_water_mark;
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        /* Save the code first; reporting the error may change errno. */
+        ret = -errno;
+        error_report("VFIO_UNMAP_DMA: %d", ret);
+    } else {
+        s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    }
+    qemu_mutex_unlock(&s->lock);
+    return ret;
+}
+
+/* Unmapping the whole area that was previously mapped with
+ * qemu_vfio_dma_map(). A NULL @host, or a @host that is not covered by any
+ * recorded mapping, is silently ignored. */
+void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
+{
+    int pos = 0;
+    IOVAMapping *found;
+
+    if (!host) {
+        return;
+    }
+
+    trace_qemu_vfio_dma_unmap(s, host);
+    qemu_mutex_lock(&s->lock);
+    found = qemu_vfio_find_mapping(s, host, &pos);
+    if (found) {
+        qemu_vfio_undo_mapping(s, found, NULL);
+    }
+    qemu_mutex_unlock(&s->lock);
+}
+
+/* Issue VFIO_DEVICE_RESET on the device fd; the ioctl result is ignored. */
+static void qemu_vfio_reset(QEMUVFIOState *s)
+{
+    int device_fd = s->device;
+
+    ioctl(device_fd, VFIO_DEVICE_RESET);
+}
+
+/* Close the VFIO resources held by @s: undo every recorded mapping, remove
+ * the RAM block notifier, reset the device and close the device, group and
+ * container file descriptors. A NULL @s is ignored. */
+void qemu_vfio_close(QEMUVFIOState *s)
+{
+    if (!s) {
+        return;
+    }
+    /* qemu_vfio_undo_mapping() removes the record and decrements
+     * nr_mappings, so an index-incrementing loop would skip every other
+     * record. Drain from the tail until the list is empty instead. */
+    while (s->nr_mappings) {
+        qemu_vfio_undo_mapping(s, &s->mappings[s->nr_mappings - 1], NULL);
+    }
+    ram_block_notifier_remove(&s->ram_notifier);
+    qemu_vfio_reset(s);
+    close(s->device);
+    close(s->group);
+    close(s->container);
+}