From 9732baf67850dac57dfc7dc8980bf408889a8973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Fri, 27 Nov 2015 15:41:18 +0100 Subject: vhost-user-test: fix chardriver race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vhost-user-tests uses a helper thread to dispatch the vhost-user servers sources. However the CharDriverState is not thread-safe. Therefore, when it's given to the thread, it shouldn't be manipulated concurrently. We dispatch cleaning the server in an idle source. By the end of the test, we ensure not to leave anything behind by joining the thread and finishing the sources dispatch. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-test.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c index e4c36afbda..261f4b711a 100644 --- a/tests/vhost-user-test.c +++ b/tests/vhost-user-test.c @@ -216,8 +216,7 @@ static void read_guest_mem(TestServer *s) static void *thread_function(void *data) { - GMainLoop *loop; - loop = g_main_loop_new(NULL, FALSE); + GMainLoop *loop = data; g_main_loop_run(loop); return NULL; } @@ -389,7 +388,7 @@ static TestServer *test_server_new(const gchar *name) g_strdup_printf(QEMU_CMD extra, (mem), (mem), (root), (s)->chr_name, \ (s)->socket_path, (s)->chr_name, ##__VA_ARGS__) -static void test_server_free(TestServer *server) +static gboolean _test_server_free(TestServer *server) { int i; @@ -406,9 +405,15 @@ static void test_server_free(TestServer *server) unlink(server->socket_path); g_free(server->socket_path); - g_free(server->chr_name); g_free(server); + + return FALSE; +} + +static void test_server_free(TestServer *server) +{ + g_idle_add((GSourceFunc)_test_server_free, server); } static void wait_for_log_fd(TestServer *s) @@ -590,6 +595,8 @@ int main(int argc, char 
**argv) char *qemu_cmd = NULL; int ret; char template[] = "/tmp/vhost-test-XXXXXX"; + GMainLoop *loop; + GThread *thread; g_test_init(&argc, &argv, NULL); @@ -612,8 +619,9 @@ int main(int argc, char **argv) server = test_server_new("test"); + loop = g_main_loop_new(NULL, FALSE); /* run the main loop thread so the chardev may operate */ - g_thread_new(NULL, thread_function, NULL); + thread = g_thread_new(NULL, thread_function, loop); qemu_cmd = GET_QEMU_CMD(server); @@ -632,6 +640,14 @@ int main(int argc, char **argv) /* cleanup */ test_server_free(server); + /* finish the helper thread and dispatch pending sources */ + g_main_loop_quit(loop); + g_thread_join(thread); + while (g_main_context_pending(NULL)) { + g_main_context_iteration (NULL, TRUE); + } + g_main_loop_unref(loop); + ret = rmdir(tmpfs); if (ret != 0) { g_test_message("unable to rmdir: path (%s): %s\n", -- cgit v1.2.1 From a899b1ea2a6d6baa18f1c12da566aad35cb0d807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Fri, 27 Nov 2015 15:41:19 +0100 Subject: vhost-user-test: use unix port for migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCP port 1234 may be used by another process concurrently. Instead use a temporary unix socket. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- tests/vhost-user-test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c index 261f4b711a..29205edc81 100644 --- a/tests/vhost-user-test.c +++ b/tests/vhost-user-test.c @@ -123,6 +123,7 @@ static VhostUserMsg m __attribute__ ((unused)); typedef struct TestServer { gchar *socket_path; + gchar *mig_path; gchar *chr_name; CharDriverState *chr; int fds_num; @@ -364,6 +365,7 @@ static TestServer *test_server_new(const gchar *name) gchar *chr_path; server->socket_path = g_strdup_printf("%s/%s.sock", tmpfs, name); + server->mig_path = g_strdup_printf("%s/%s.mig", tmpfs, name); chr_path = g_strdup_printf("unix:%s,server,nowait", server->socket_path); server->chr_name = g_strdup_printf("chr-%s", name); @@ -405,6 +407,9 @@ static gboolean _test_server_free(TestServer *server) unlink(server->socket_path); g_free(server->socket_path); + unlink(server->mig_path); + g_free(server->mig_path); + g_free(server->chr_name); g_free(server); @@ -512,7 +517,7 @@ static void test_migrate(void) { TestServer *s = test_server_new("src"); TestServer *dest = test_server_new("dest"); - const char *uri = "tcp:127.0.0.1:1234"; + char *uri = g_strdup_printf("%s%s", "unix:", dest->mig_path); QTestState *global = global_qtest, *from, *to; GSource *source; gchar *cmd; @@ -583,6 +588,7 @@ static void test_migrate(void) test_server_free(dest); qtest_quit(from); test_server_free(s); + g_free(uri); global_qtest = global; } -- cgit v1.2.1 From 45ce512670f34d10be34448e621fd1484bea0ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 30 Nov 2015 17:44:49 +0100 Subject: vhost-user-test: fix crash with glib < 2.36 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prepare callback needs to be implemented with glib < 2.36, quoting glib documentation: "Since 2.36 this may be NULL, in which case the effect is as if the function always returns FALSE with a 
timeout of -1." Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-test.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c index 29205edc81..29de739ce5 100644 --- a/tests/vhost-user-test.c +++ b/tests/vhost-user-test.c @@ -506,11 +506,22 @@ test_migrate_source_check(GSource *source) return FALSE; } +#if !GLIB_CHECK_VERSION(2,36,0) +/* this callback is unnecessary with glib >= 2.36, the default + * prepare for the source does the same */ +static gboolean +test_migrate_source_prepare(GSource *source, gint *timeout) +{ + *timeout = -1; + return FALSE; +} +#endif + GSourceFuncs test_migrate_source_funcs = { - NULL, - test_migrate_source_check, - NULL, - NULL +#if !GLIB_CHECK_VERSION(2,36,0) + .prepare = test_migrate_source_prepare, +#endif + .check = test_migrate_source_check, }; static void test_migrate(void) -- cgit v1.2.1 From 6f6f9512ea915009abeb6a4c5f204d4c25f090e0 Mon Sep 17 00:00:00 2001 From: Victor Kaplansky Date: Tue, 1 Dec 2015 15:32:26 +0200 Subject: vhost-user: verify that number of queues is non-zero Fix QEMU crash when -netdev type=vhost-user,queues=n is passed with zero number of queues. Signed-off-by: Victor Kaplansky Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- net/vhost-user.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/vhost-user.c b/net/vhost-user.c index 5071602e9b..b368a90219 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -316,6 +316,11 @@ int net_init_vhost_user(const NetClientOptions *opts, const char *name, } queues = vhost_user_opts->has_queues ?
vhost_user_opts->queues : 1; + if (queues < 1) { + error_setg(errp, + "vhost-user number of queues must be bigger than zero"); + return -1; + } return net_vhost_user_init(peer, "vhost_user", name, chr, queues); } -- cgit v1.2.1 From b0ae1536c5248bc5f0f4472b8020a7a9d2ef9293 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 2 Dec 2015 13:50:00 +0200 Subject: vhost: drop dead code commit 1e7398a1 ("vhost: enable vhost without without MSI-X") dropped the implementation of vhost_dev_query; drop it from the header file as well. Signed-off-by: Michael S. Tsirkin Reviewed-by: Yuanhan Liu --- include/hw/virtio/vhost.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 7437fd476a..b60d7585b4 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -66,7 +66,6 @@ struct vhost_dev { int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type); void vhost_dev_cleanup(struct vhost_dev *hdev); -bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev); int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev); void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev); int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev); -- cgit v1.2.1 From 6d0b908a628a7086fa855c68b217cc1e2a5c4c19 Mon Sep 17 00:00:00 2001 From: Victor Kaplansky Date: Tue, 1 Dec 2015 18:57:39 +0200 Subject: tests/vhost-user-bridge.c: fix fd leakage This fixes file descriptor leakage in vhost-user-bridge application. Whenever a new callfd or kickfd is set, the previous one should be explicitly closed. File descriptors used to map guest's memory are closed immediately after mmap call. Signed-off-by: Victor Kaplansky Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- tests/vhost-user-bridge.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 85c4c8a835..9fb09f1df4 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -113,7 +113,6 @@ dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb) return 0; } -#if 0 /* dispatcher_remove() is not currently in use but may be useful * in the future. */ static int @@ -127,9 +126,9 @@ dispatcher_remove(Dispatcher *dispr, int sock) } FD_CLR(sock, &dispr->fdset); + DPRINT("Sock %d removed from dispatcher watch.\n", sock); return 0; } -#endif /* timeout in us */ static int @@ -156,11 +155,16 @@ dispatcher_wait(Dispatcher *dispr, uint32_t timeout) /* Now call callback for every ready socket. */ int sock; - for (sock = 0; sock < dispr->max_sock + 1; sock++) - if (FD_ISSET(sock, &fdset)) { + for (sock = 0; sock < dispr->max_sock + 1; sock++) { + /* The callback on a socket can remove other sockets from the + * dispatcher, thus we have to check that the socket is + * still not removed from dispatcher's list + */ + if (FD_ISSET(sock, &fdset) && FD_ISSET(sock, &dispr->fdset)) { Event *e = &dispr->events[sock]; e->callback(sock, e->ctx); } + } return 0; } @@ -837,9 +841,10 @@ vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) if (mmap_addr == MAP_FAILED) { vubr_die("mmap"); } - dev_region->mmap_addr = (uint64_t) mmap_addr; DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr); + + close(vmsg->fds[i]); } return 0; @@ -950,6 +955,17 @@ vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) * we have to respect * VHOST_USER_SET_VRING_ENABLE request. 
*/ dev->ready = 0; + if (dev->vq[index].call_fd != -1) { + close(dev->vq[index].call_fd); + dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd); + dev->vq[index].call_fd = -1; + } + if (dev->vq[index].kick_fd != -1) { + close(dev->vq[index].kick_fd); + dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd); + dev->vq[index].kick_fd = -1; + } + /* Reply */ return 1; } @@ -965,6 +981,10 @@ vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); assert(vmsg->fd_num == 1); + if (dev->vq[index].kick_fd != -1) { + close(dev->vq[index].kick_fd); + dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd); + } dev->vq[index].kick_fd = vmsg->fds[0]; DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); @@ -999,6 +1019,10 @@ vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg) assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); assert(vmsg->fd_num == 1); + if (dev->vq[index].call_fd != -1) { + close(dev->vq[index].call_fd); + dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd); + } dev->vq[index].call_fd = vmsg->fds[0]; DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); -- cgit v1.2.1 From 11380b36196c483ff5c7f800b0f7af6aa53b5657 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 2 Dec 2015 18:31:57 +0100 Subject: virtio: handle non-virtio-1-capable backend for ccw If you run a qemu advertising VERSION_1 with an old kernel where vhost did not yet support VERSION_1, you'll end up with a device that is {modern pci|ccw revision 1} but does not advertise VERSION_1. This is not a sensible configuration and is rejected by the Linux guest drivers. To fix this, add a ->post_plugged() callback invoked after features have been queried that can handle the VERSION_1 bit being withdrawn and change ccw to fall back to revision 0 if VERSION_1 is gone. Note that pci is _not_ fixed; we'll need to rethink the approach for the next release but at least for pci it's not a regression. 
Signed-off-by: Cornelia Huck Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/s390x/virtio-ccw.c | 12 ++++++++++++ hw/virtio/virtio-bus.c | 3 +++ include/hw/virtio/virtio-bus.h | 5 +++++ 3 files changed, 20 insertions(+) diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index fb103b78ac..63da303864 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1555,6 +1555,17 @@ static void virtio_ccw_device_plugged(DeviceState *d, Error **errp) d->hotplugged, 1); } +static void virtio_ccw_post_plugged(DeviceState *d, Error **errp) +{ + VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d); + VirtIODevice *vdev = virtio_bus_get_device(&dev->bus); + + if (!virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1)) { + /* A backend didn't support modern virtio. */ + dev->max_rev = 0; + } +} + static void virtio_ccw_device_unplugged(DeviceState *d) { VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d); @@ -1891,6 +1902,7 @@ static void virtio_ccw_bus_class_init(ObjectClass *klass, void *data) k->save_config = virtio_ccw_save_config; k->load_config = virtio_ccw_load_config; k->device_plugged = virtio_ccw_device_plugged; + k->post_plugged = virtio_ccw_post_plugged; k->device_unplugged = virtio_ccw_device_unplugged; } diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index febda76b94..81c7cdd507 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -56,6 +56,9 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) assert(vdc->get_features != NULL); vdev->host_features = vdc->get_features(vdev, vdev->host_features, errp); + if (klass->post_plugged != NULL) { + klass->post_plugged(qbus->parent, errp); + } } /* Reset the virtio_bus */ diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h index 6c3d4cb19e..3f2c1363d0 100644 --- a/include/hw/virtio/virtio-bus.h +++ b/include/hw/virtio/virtio-bus.h @@ -59,6 +59,11 @@ typedef struct VirtioBusClass { * This is called by virtio-bus just after the device is 
plugged. */ void (*device_plugged)(DeviceState *d, Error **errp); + /* + * Re-evaluate setup after feature bits have been validated + * by the device backend. + */ + void (*post_plugged)(DeviceState *d, Error **errp); /* * transport independent exit function. * This is called by virtio-bus just before the device is unplugged. -- cgit v1.2.1 From 0560b0e97df3da43651158c799c6d889f27529c3 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 2 Dec 2015 19:49:07 +0200 Subject: virtio-pci: Set the QEMU_PCI_CAP_EXPRESS capability early in its DeviceClass realize method In 1811e64 'hw/virtio: Add PCIe capability to virtio devices', the QEMU_PCI_CAP_EXPRESS capability was added to virtio's pci_dev, within 'virtio_pci_realize' - the pci device object realization method. This occurs too late, as 'pci_qdev_realize' (DeviceClass.realize of TYPE_PCI_DEVICE) has already been called, without knowing that the device instance is indeed an "express" instance, thus allocating insufficient pci config space. As a result, the device may crash upon an attempt to write to the PCIE config space. Fix by arming the QEMU_PCI_CAP_EXPRESS capability early in virtio-pci's own DeviceClass realize method. This also makes the code cleaner, as 'virtio_pci_realize' may now access the 'pci_is_express' predicate when needed. Signed-off-by: Shmulik Ladkani Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin Reviewed-by: Marcel Apfelbaum Tested-by: Marcel Apfelbaum --- hw/virtio/virtio-pci.c | 30 +++++++++++++++++++++++++----- hw/virtio/virtio-pci.h | 1 + 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index dd485629da..94667e6256 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1814,13 +1814,10 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) address_space_init(&proxy->modern_as, &proxy->modern_cfg, "virtio-pci-cfg-as"); - if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) - && !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN) - && pci_bus_is_express(pci_dev->bus) - && !pci_bus_is_root(pci_dev->bus)) { + if (pci_is_express(pci_dev) && pci_bus_is_express(pci_dev->bus) && + !pci_bus_is_root(pci_dev->bus)) { int pos; - pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); @@ -1832,6 +1829,12 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) * PCI Power Management Interface Specification. */ pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3); + } else { + /* + * make future invocations of pci_is_express() return false + * and pci_config_size() return PCI_CONFIG_SPACE_SIZE. 
+ */ + pci_dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS; } virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy); @@ -1879,10 +1882,25 @@ static Property virtio_pci_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) +{ + VirtioPCIClass *vpciklass = VIRTIO_PCI_GET_CLASS(qdev); + VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev); + PCIDevice *pci_dev = &proxy->pci_dev; + + if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) && + !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN)) { + pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + } + + vpciklass->parent_dc_realize(qdev, errp); +} + static void virtio_pci_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass); dc->props = virtio_pci_properties; k->realize = virtio_pci_realize; @@ -1890,6 +1908,8 @@ static void virtio_pci_class_init(ObjectClass *klass, void *data) k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; k->revision = VIRTIO_PCI_ABI_VERSION; k->class_id = PCI_CLASS_OTHERS; + vpciklass->parent_dc_realize = dc->realize; + dc->realize = virtio_pci_dc_realize; dc->reset = virtio_pci_reset; } diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h index ffb74bb908..a104ff2072 100644 --- a/hw/virtio/virtio-pci.h +++ b/hw/virtio/virtio-pci.h @@ -105,6 +105,7 @@ typedef struct { typedef struct VirtioPCIClass { PCIDeviceClass parent_class; + DeviceRealize parent_dc_realize; void (*realize)(VirtIOPCIProxy *vpci_dev, Error **errp); } VirtioPCIClass; -- cgit v1.2.1 From 7197fb4058bcb68986bae2bb2c04d6370f3e7218 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Wed, 2 Dec 2015 21:14:12 +0200 Subject: util/mmap-alloc: fix hugetlb support on ppc64 Since commit 8561c9244ddf1122d "exec: allocate PROT_NONE pages on top of RAM", it is no longer possible to back guest RAM with hugepages on ppc64 hosts: mmap(NULL, 285212672, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x3fff57000000 mmap(0x3fff57000000, 268435456, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 19, 0) = -1 EBUSY (Device or resource busy) This is because on ppc64, Linux fixes a page size for a virtual address at mmap time, so we can't switch a range of memory from anonymous small pages to hugetlbs with MAP_FIXED. See commit d0f13e3c20b6fb73ccb467bdca97fa7cf5a574cd ("[POWERPC] Introduce address space "slices"") in Linux history for the details. Detect this and create the PROT_NONE mapping using the same fd. Naturally, this makes the guard page bigger with hugetlbfs. Based on patch by Greg Kurz. Acked-by: Rik van Riel Reviewed-by: Greg Kurz Tested-by: Greg Kurz Signed-off-by: Michael S. 
Tsirkin --- include/qemu/mmap-alloc.h | 2 ++ util/mmap-alloc.c | 39 +++++++++++++++++++++++++++++++++++++++ util/oslib-posix.c | 24 +----------------------- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h index 56388e689b..0899b2f01e 100644 --- a/include/qemu/mmap-alloc.h +++ b/include/qemu/mmap-alloc.h @@ -3,6 +3,8 @@ #include "qemu-common.h" +size_t qemu_fd_getpagesize(int fd); + void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared); void qemu_ram_munmap(void *ptr, size_t size); diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index c37acbe58e..54793a5dcf 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -14,6 +14,32 @@ #include #include +#define HUGETLBFS_MAGIC 0x958458f6 + +#ifdef CONFIG_LINUX +#include +#endif + +size_t qemu_fd_getpagesize(int fd) +{ +#ifdef CONFIG_LINUX + struct statfs fs; + int ret; + + if (fd != -1) { + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) { + return fs.f_bsize; + } + } +#endif + + return getpagesize(); +} + void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) { /* @@ -21,7 +47,20 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) * space, even if size is already aligned. */ size_t total = size + align; +#if defined(__powerpc64__) && defined(__linux__) + /* On ppc64 mappings in the same segment (aka slice) must share the same + * page size. Since we will be re-allocating part of this segment + * from the supplied fd, we should make sure to use the same page size, + * unless we are using the system page size, in which case anonymous memory + * is OK. Use align as a hint for the page size. + * In this case, set MAP_NORESERVE to avoid allocating backing store memory. + */ + int anonfd = fd == -1 || qemu_fd_getpagesize(fd) == getpagesize() ? -1 : fd; + int flags = anonfd == -1 ? 
MAP_ANONYMOUS : MAP_NORESERVE; + void *ptr = mmap(0, total, PROT_NONE, flags | MAP_PRIVATE, anonfd, 0); +#else void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +#endif size_t offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; void *ptr1; diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 914cef5c2c..d25f6715c7 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -46,7 +46,6 @@ extern int daemon(int, int); #else # define QEMU_VMALLOC_ALIGN getpagesize() #endif -#define HUGETLBFS_MAGIC 0x958458f6 #include #include @@ -65,7 +64,6 @@ extern int daemon(int, int); #ifdef CONFIG_LINUX #include -#include #endif #ifdef __FreeBSD__ @@ -340,26 +338,6 @@ static void sigbus_handler(int signal) siglongjmp(sigjump, 1); } -static size_t fd_getpagesize(int fd) -{ -#ifdef CONFIG_LINUX - struct statfs fs; - int ret; - - if (fd != -1) { - do { - ret = fstatfs(fd, &fs); - } while (ret != 0 && errno == EINTR); - - if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) { - return fs.f_bsize; - } - } -#endif - - return getpagesize(); -} - void os_mem_prealloc(int fd, char *area, size_t memory) { int ret; @@ -387,7 +365,7 @@ void os_mem_prealloc(int fd, char *area, size_t memory) exit(1); } else { int i; - size_t hpagesize = fd_getpagesize(fd); + size_t hpagesize = qemu_fd_getpagesize(fd); size_t numpages = DIV_ROUND_UP(memory, hpagesize); /* MAP_POPULATE silently ignores failures */ -- cgit v1.2.1