author    Peter Maydell <peter.maydell@linaro.org>  2016-07-05 11:14:27 +0100
committer Peter Maydell <peter.maydell@linaro.org>  2016-07-05 11:14:27 +0100
commit    8662d7db392f906c7808014051b278ad1542db93 (patch)
tree      ff0bced1fc218f98a01813144f43ce6de905d486
parent    11659423113d2fbcf055085b5e8285d590addfaa (diff)
parent    2c7ad80443e9747eb85b508be01cded958191bad (diff)
Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.7-20160705' into staging
ppc patch queue for 2016-07-05

Here's the current ppc, sPAPR and related drivers patch queue.

 * The big addition is dynamic DMA window support (this includes
   some core VFIO changes)
 * There are also several fixes to the MMU emulation for bugs
   introduced with the HV mode patches
 * Several other bugfixes and cleanups

Changes in v2: I messed up and forgot to make a fix in the last patch
which BenH pointed out (introduced by my rebasing).  That's fixed in
this version, and I'm replacing the tag in place with the revised
version.

# gpg: Signature made Tue 05 Jul 2016 06:28:58 BST
# gpg:                using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.7-20160705:
  ppc/hash64: Fix support for LPCR:ISL
  ppc/hash64: Add proper real mode translation support
  target-ppc: Return page shift from PTEG search
  target-ppc: Simplify HPTE matching
  target-ppc: Correct page size decoding in ppc_hash64_pteg_search()
  ppc: simplify ppc_hash64_hpte_page_shift_noslb()
  spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)
  vfio/spapr: Create DMA window dynamically (SPAPR IOMMU v2)
  vfio: Add host side DMA window capabilities
  vfio: spapr: Add DMA memory preregistering (SPAPR IOMMU v2)
  spapr_iommu: Realloc guest visible TCE table when starting/stopping listening
  ppc: simplify max_smt initialization in ppc_cpu_realizefn()
  spapr: Ensure thread0 of CPU core is always realized first
  ppc: Fix xsrdpi, xvrdpi and xvrspi rounding

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--  hw/ppc/Makefile.objs           |   1
-rw-r--r--  hw/ppc/spapr.c                 |  14
-rw-r--r--  hw/ppc/spapr_cpu_core.c        |  29
-rw-r--r--  hw/ppc/spapr_hcall.c           |   4
-rw-r--r--  hw/ppc/spapr_iommu.c           |  12
-rw-r--r--  hw/ppc/spapr_pci.c             |  81
-rw-r--r--  hw/ppc/spapr_rtas_ddw.c        | 295
-rw-r--r--  hw/ppc/trace-events            |   4
-rw-r--r--  hw/vfio/Makefile.objs          |   1
-rw-r--r--  hw/vfio/common.c               | 170
-rw-r--r--  hw/vfio/spapr.c                | 210
-rw-r--r--  hw/vfio/trace-events           |   8
-rw-r--r--  include/hw/pci-host/spapr.h    |   8
-rw-r--r--  include/hw/ppc/spapr.h         |  16
-rw-r--r--  include/hw/vfio/vfio-common.h  |  20
-rw-r--r--  target-ppc/cpu.h               |   2
-rw-r--r--  target-ppc/fpu_helper.c        |   6
-rw-r--r--  target-ppc/mmu-hash64.c        | 321
-rw-r--r--  target-ppc/mmu-hash64.h        |   8
-rw-r--r--  target-ppc/translate_init.c    |  16
20 files changed, 1043 insertions, 183 deletions
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 5cc6608e50..91a3420f47 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -8,6 +8,7 @@ obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
obj-y += spapr_pci_vfio.o
endif
+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
# PowerPC 4xx boards
obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 78ebd9ee38..7f33a1b2b5 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1771,6 +1771,13 @@ static void ppc_spapr_init(MachineState *machine)
spapr->vrma_adjust = 1;
spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
}
+
+    /* Actually we don't support unbounded RMA anymore since we
+     * added proper emulation of HV mode. The max we can get is
+     * 16G, which also happens to be what we configure for PAPR
+     * mode, so make sure we don't do anything bigger than that.
+     */
+ spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull);
}
if (spapr->rma_size > node0_size) {
@@ -2489,7 +2496,12 @@ DEFINE_SPAPR_MACHINE(2_7, "2.7", true);
* pseries-2.6
*/
#define SPAPR_COMPAT_2_6 \
- HW_COMPAT_2_6
+ HW_COMPAT_2_6 \
+ { \
+ .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\
+ .property = "ddw",\
+ .value = stringify(off),\
+ },
static void spapr_machine_2_6_instance_options(MachineState *machine)
{
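
(Annotation: the SPAPR_COMPAT_2_6 entry above pins the new "ddw" PHB property
to "off" for the pseries-2.6 machine type, preserving the pre-DDW guest ABI,
while pseries-2.7 and newer enable DDW by default. Since this goes through the
ordinary global-property machinery, DDW can presumably also be turned off by
hand on a newer machine type, e.g.:

    qemu-system-ppc64 -machine pseries-2.7 -global spapr-pci-host-bridge.ddw=off

This invocation is illustrative only; it is not part of the patch.)
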
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index a384db5204..70b6b0b5ee 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -259,9 +259,9 @@ out:
error_propagate(errp, local_err);
}
-static int spapr_cpu_core_realize_child(Object *child, void *opaque)
+static void spapr_cpu_core_realize_child(Object *child, Error **errp)
{
- Error **errp = opaque, *local_err = NULL;
+ Error *local_err = NULL;
sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
CPUState *cs = CPU(child);
PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -269,15 +269,14 @@ static int spapr_cpu_core_realize_child(Object *child, void *opaque)
object_property_set_bool(child, true, "realized", &local_err);
if (local_err) {
error_propagate(errp, local_err);
- return 1;
+ return;
}
spapr_cpu_init(spapr, cpu, &local_err);
if (local_err) {
error_propagate(errp, local_err);
- return 1;
+ return;
}
- return 0;
}
static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
@@ -287,13 +286,13 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
const char *typename = object_class_get_name(sc->cpu_class);
size_t size = object_type_get_instance_size(typename);
Error *local_err = NULL;
- Object *obj;
- int i;
+ void *obj;
+ int i, j;
sc->threads = g_malloc0(size * cc->nr_threads);
for (i = 0; i < cc->nr_threads; i++) {
char id[32];
- void *obj = sc->threads + i * size;
+ obj = sc->threads + i * size;
object_initialize(obj, size, typename);
snprintf(id, sizeof(id), "thread[%d]", i);
@@ -303,12 +302,16 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
}
object_unref(obj);
}
- object_child_foreach(OBJECT(dev), spapr_cpu_core_realize_child, &local_err);
- if (local_err) {
- goto err;
- } else {
- return;
+
+ for (j = 0; j < cc->nr_threads; j++) {
+ obj = sc->threads + j * size;
+
+ spapr_cpu_core_realize_child(obj, &local_err);
+ if (local_err) {
+ goto err;
+ }
}
+ return;
err:
while (--i >= 0) {
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index e011ed4b66..73af112e1d 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -83,12 +83,12 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong pte_index = args[1];
target_ulong pteh = args[2];
target_ulong ptel = args[3];
- unsigned apshift, spshift;
+ unsigned apshift;
target_ulong raddr;
target_ulong index;
uint64_t token;
- apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel, &spshift);
+ apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel);
if (!apshift) {
/* Bad page size encoding */
return H_PARAMETER;
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index e230bacae1..d57b05d5c0 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -156,6 +156,16 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu)
return 1ULL << tcet->page_shift;
}
+static void spapr_tce_notify_started(MemoryRegion *iommu)
+{
+ spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true);
+}
+
+static void spapr_tce_notify_stopped(MemoryRegion *iommu)
+{
+ spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false);
+}
+
static int spapr_tce_table_post_load(void *opaque, int version_id)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -236,6 +246,8 @@ static const VMStateDescription vmstate_spapr_tce_table = {
static MemoryRegionIOMMUOps spapr_iommu_ops = {
.translate = spapr_tce_translate_iommu,
.get_min_page_size = spapr_tce_get_min_page_size,
+ .notify_started = spapr_tce_notify_started,
+ .notify_stopped = spapr_tce_notify_stopped,
};
static int spapr_tce_table_realize(DeviceState *dev)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 8c1e6b17c3..949c44fec8 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -35,6 +35,7 @@
#include "hw/ppc/spapr.h"
#include "hw/pci-host/spapr.h"
#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
#include <libfdt.h>
#include "trace.h"
#include "qemu/error-report.h"
@@ -45,6 +46,7 @@
#include "hw/ppc/spapr_drc.h"
#include "sysemu/device_tree.h"
#include "sysemu/kvm.h"
+#include "sysemu/hostmem.h"
#include "hw/vfio/vfio.h"
@@ -1087,12 +1089,6 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
void *fdt = NULL;
int fdt_start_offset = 0, fdt_size;
- if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) {
- sPAPRTCETable *tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
-
- spapr_tce_set_need_vfio(tcet, true);
- }
-
fdt = create_device_tree(&fdt_size);
fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0);
if (!fdt_start_offset) {
@@ -1310,11 +1306,14 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
PCIBus *bus;
uint64_t msi_window_size = 4096;
sPAPRTCETable *tcet;
+ const unsigned windows_supported =
+ sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;
if (sphb->index != (uint32_t)-1) {
hwaddr windows_base;
- if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn != (uint32_t)-1)
+ if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != (uint32_t)-1)
+ || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2)
|| (sphb->mem_win_addr != (hwaddr)-1)
|| (sphb->io_win_addr != (hwaddr)-1)) {
error_setg(errp, "Either \"index\" or other parameters must"
@@ -1329,7 +1328,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index;
- sphb->dma_liobn = SPAPR_PCI_LIOBN(sphb->index, 0);
+ for (i = 0; i < windows_supported; ++i) {
+ sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i);
+ }
windows_base = SPAPR_PCI_WINDOW_BASE
+ sphb->index * SPAPR_PCI_WINDOW_SPACING;
@@ -1342,8 +1343,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
return;
}
- if (sphb->dma_liobn == (uint32_t)-1) {
- error_setg(errp, "LIOBN not specified for PHB");
+ if ((sphb->dma_liobn[0] == (uint32_t)-1) ||
+ ((sphb->dma_liobn[1] == (uint32_t)-1) && (windows_supported > 1))) {
+ error_setg(errp, "LIOBN(s) not specified for PHB");
return;
}
@@ -1462,16 +1464,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
}
- tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
- if (!tcet) {
- error_setg(errp, "Unable to create TCE table for %s",
- sphb->dtbusname);
- return;
+ /* DMA setup */
+ for (i = 0; i < windows_supported; ++i) {
+ tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]);
+ if (!tcet) {
+ error_setg(errp, "Creating window#%d failed for %s",
+ i, sphb->dtbusname);
+ return;
+ }
+ memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
+ spapr_tce_get_iommu(tcet), 0);
}
- memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
- spapr_tce_get_iommu(tcet), 0);
-
sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
}
@@ -1488,13 +1492,19 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
void spapr_phb_dma_reset(sPAPRPHBState *sphb)
{
- sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
+ int i;
+ sPAPRTCETable *tcet;
- if (tcet && tcet->nb_table) {
- spapr_tce_table_disable(tcet);
+ for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+ tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[i]);
+
+ if (tcet && tcet->nb_table) {
+ spapr_tce_table_disable(tcet);
+ }
}
/* Register default 32bit DMA window */
+ tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[0]);
spapr_tce_table_enable(tcet, SPAPR_TCE_PAGE_SHIFT, sphb->dma_win_addr,
sphb->dma_win_size >> SPAPR_TCE_PAGE_SHIFT);
}
@@ -1516,7 +1526,8 @@ static void spapr_phb_reset(DeviceState *qdev)
static Property spapr_phb_properties[] = {
DEFINE_PROP_UINT32("index", sPAPRPHBState, index, -1),
DEFINE_PROP_UINT64("buid", sPAPRPHBState, buid, -1),
- DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn, -1),
+ DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn[0], -1),
+ DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1),
DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1),
DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size,
SPAPR_PCI_MMIO_WIN_SIZE),
@@ -1528,6 +1539,11 @@ static Property spapr_phb_properties[] = {
/* Default DMA window is 0..1GB */
DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
+ DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_win_addr,
+ 0x800000000000000ULL),
+ DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
+ DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
+ (1ULL << 12) | (1ULL << 16)),
DEFINE_PROP_END_OF_LIST(),
};
@@ -1604,7 +1620,7 @@ static const VMStateDescription vmstate_spapr_pci = {
.post_load = spapr_pci_post_load,
.fields = (VMStateField[]) {
VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState),
- VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState),
+ VMSTATE_UINT32_EQUAL(dma_liobn[0], sPAPRPHBState),
VMSTATE_UINT64_EQUAL(mem_win_addr, sPAPRPHBState),
VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
@@ -1780,6 +1796,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
uint32_t interrupt_map_mask[] = {
cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
+ uint32_t ddw_applicable[] = {
+ cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
+ cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
+ cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
+ };
+ uint32_t ddw_extensions[] = {
+ cpu_to_be32(1),
+ cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
+ };
sPAPRTCETable *tcet;
PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
sPAPRFDT s_fdt;
@@ -1804,6 +1829,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR));
+ /* Dynamic DMA window */
+ if (phb->ddw_enabled) {
+ _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
+ sizeof(ddw_applicable)));
+ _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
+ &ddw_extensions, sizeof(ddw_extensions)));
+ }
+
/* Build the interrupt-map, this must matches what is done
* in pci_spapr_map_irq
*/
@@ -1827,7 +1860,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
_FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
sizeof(interrupt_map)));
- tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
+ tcet = spapr_tce_find_by_liobn(phb->dma_liobn[0]);
if (!tcet) {
return -1;
}
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
new file mode 100644
index 0000000000..177dcffc9b
--- /dev/null
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -0,0 +1,295 @@
+/*
+ * QEMU sPAPR Dynamic DMA windows support
+ *
+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "qemu/error-report.h"
+#include "hw/ppc/spapr.h"
+#include "hw/pci-host/spapr.h"
+#include "trace.h"
+
+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
+{
+ sPAPRTCETable *tcet;
+
+ tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+ if (tcet && tcet->nb_table) {
+ ++*(unsigned *)opaque;
+ }
+ return 0;
+}
+
+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
+{
+ unsigned ret = 0;
+
+ object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
+
+ return ret;
+}
+
+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
+{
+ sPAPRTCETable *tcet;
+
+ tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+ if (tcet && !tcet->nb_table) {
+ *(uint32_t *)opaque = tcet->liobn;
+ return 1;
+ }
+ return 0;
+}
+
+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
+{
+ uint32_t liobn = 0;
+
+ object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
+
+ return liobn;
+}
+
+static uint32_t spapr_page_mask_to_query_mask(uint64_t page_mask)
+{
+ int i;
+ uint32_t mask = 0;
+ const struct { int shift; uint32_t mask; } masks[] = {
+ { 12, RTAS_DDW_PGSIZE_4K },
+ { 16, RTAS_DDW_PGSIZE_64K },
+ { 24, RTAS_DDW_PGSIZE_16M },
+ { 25, RTAS_DDW_PGSIZE_32M },
+ { 26, RTAS_DDW_PGSIZE_64M },
+ { 27, RTAS_DDW_PGSIZE_128M },
+ { 28, RTAS_DDW_PGSIZE_256M },
+ { 34, RTAS_DDW_PGSIZE_16G },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(masks); ++i) {
+ if (page_mask & (1ULL << masks[i].shift)) {
+ mask |= masks[i].mask;
+ }
+ }
+
+ return mask;
+}
+
+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ uint64_t buid, max_window_size;
+ uint32_t avail, addr, pgmask = 0;
+ MachineState *machine = MACHINE(spapr);
+
+ if ((nargs != 3) || (nret != 5)) {
+ goto param_error_exit;
+ }
+
+ buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+ addr = rtas_ld(args, 0);
+ sphb = spapr_pci_find_phb(spapr, buid);
+ if (!sphb || !sphb->ddw_enabled) {
+ goto param_error_exit;
+ }
+
+ /* Translate page mask to LoPAPR format */
+ pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask);
+
+ /*
+ * This is "Largest contiguous block of TCEs allocated specifically
+ * for (that is, are reserved for) this PE".
+     * Return the largest supported value, i.e. the maximum window size
+     * counted in 4K (SPAPR_TCE_PAGE_SHIFT) pages.
+ */
+ if (machine->ram_size == machine->maxram_size) {
+ max_window_size = machine->ram_size;
+ } else {
+ MemoryHotplugState *hpms = &spapr->hotplug_memory;
+
+ max_window_size = hpms->base + memory_region_size(&hpms->mr);
+ }
+
+ avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+ rtas_st(rets, 1, avail);
+ rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT);
+ rtas_st(rets, 3, pgmask);
+ rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
+
+ trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ sPAPRTCETable *tcet = NULL;
+ uint32_t addr, page_shift, window_shift, liobn;
+ uint64_t buid, win_addr;
+ int windows;
+
+ if ((nargs != 5) || (nret != 4)) {
+ goto param_error_exit;
+ }
+
+ buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+ addr = rtas_ld(args, 0);
+ sphb = spapr_pci_find_phb(spapr, buid);
+ if (!sphb || !sphb->ddw_enabled) {
+ goto param_error_exit;
+ }
+
+ page_shift = rtas_ld(args, 3);
+ window_shift = rtas_ld(args, 4);
+ liobn = spapr_phb_get_free_liobn(sphb);
+ windows = spapr_phb_get_active_win_num(sphb);
+
+ if (!(sphb->page_size_mask & (1ULL << page_shift)) ||
+ (window_shift < page_shift)) {
+ goto param_error_exit;
+ }
+
+ if (!liobn || !sphb->ddw_enabled || windows == SPAPR_PCI_DMA_MAX_WINDOWS) {
+ goto hw_error_exit;
+ }
+
+ tcet = spapr_tce_find_by_liobn(liobn);
+ if (!tcet) {
+ goto hw_error_exit;
+ }
+
+ win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr;
+ spapr_tce_table_enable(tcet, page_shift, win_addr,
+ 1ULL << (window_shift - page_shift));
+ if (!tcet->nb_table) {
+ goto hw_error_exit;
+ }
+
+ trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
+ 1ULL << window_shift, tcet->bus_offset, liobn);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+ rtas_st(rets, 1, liobn);
+ rtas_st(rets, 2, tcet->bus_offset >> 32);
+ rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
+
+ return;
+
+hw_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ sPAPRTCETable *tcet;
+ uint32_t liobn;
+
+ if ((nargs != 1) || (nret != 1)) {
+ goto param_error_exit;
+ }
+
+ liobn = rtas_ld(args, 0);
+ tcet = spapr_tce_find_by_liobn(liobn);
+ if (!tcet) {
+ goto param_error_exit;
+ }
+
+ sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
+ if (!sphb || !sphb->ddw_enabled || !tcet->nb_table) {
+ goto param_error_exit;
+ }
+
+ spapr_tce_table_disable(tcet);
+ trace_spapr_iommu_ddw_remove(liobn);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ uint64_t buid;
+ uint32_t addr;
+
+ if ((nargs != 3) || (nret != 1)) {
+ goto param_error_exit;
+ }
+
+ buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+ addr = rtas_ld(args, 0);
+ sphb = spapr_pci_find_phb(spapr, buid);
+ if (!sphb || !sphb->ddw_enabled) {
+ goto param_error_exit;
+ }
+
+ spapr_phb_dma_reset(sphb);
+ trace_spapr_iommu_ddw_reset(buid, addr);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void spapr_rtas_ddw_init(void)
+{
+ spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
+ "ibm,query-pe-dma-window",
+ rtas_ibm_query_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
+ "ibm,create-pe-dma-window",
+ rtas_ibm_create_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
+ "ibm,remove-pe-dma-window",
+ rtas_ibm_remove_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
+ "ibm,reset-pe-dma-window",
+ rtas_ibm_reset_pe_dma_window);
+}
+
+type_init(spapr_rtas_ddw_init)
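
(Annotation: a minimal standalone sketch of the page-mask translation performed
by ibm,query-pe-dma-window above. It mirrors spapr_page_mask_to_query_mask()
and a subset of the RTAS_DDW_PGSIZE_* values added to include/hw/ppc/spapr.h
later in this series; the main() harness is illustrative, not part of QEMU.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* LoPAPR page-size bits, as defined later in this patch */
    #define RTAS_DDW_PGSIZE_4K   0x01
    #define RTAS_DDW_PGSIZE_64K  0x02
    #define RTAS_DDW_PGSIZE_16M  0x04

    /* Same shift -> query-bit mapping as spapr_page_mask_to_query_mask() */
    static uint32_t query_mask(uint64_t page_mask)
    {
        const struct { int shift; uint32_t mask; } masks[] = {
            { 12, RTAS_DDW_PGSIZE_4K },
            { 16, RTAS_DDW_PGSIZE_64K },
            { 24, RTAS_DDW_PGSIZE_16M },
        };
        uint32_t out = 0;

        for (unsigned i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
            if (page_mask & (1ULL << masks[i].shift)) {
                out |= masks[i].mask;
            }
        }
        return out;
    }

    int main(void)
    {
        /* Default "pgsz" PHB property from this patch: 4K | 64K */
        uint64_t pgsz = (1ULL << 12) | (1ULL << 16);

        assert(query_mask(pgsz) == (RTAS_DDW_PGSIZE_4K | RTAS_DDW_PGSIZE_64K));
        printf("ibm,query-pe-dma-window page mask = 0x%x\n", query_mask(pgsz));
        return 0;
    }

With the default property value the guest is offered 0x03, i.e. 4K and 64K
IOMMU pages.)
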
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index 6da713547f..900679bc9d 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -30,6 +30,10 @@ spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, un
spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
spapr_iommu_pre_save(uint64_t liobn, uint32_t nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32
spapr_iommu_post_load(uint64_t liobn, uint32_t pre_nb, uint32_t post_nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" => %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32
+spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32
+spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32
+spapr_iommu_ddw_remove(uint32_t liobn) "liobn=%"PRIx32
+spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32
# hw/ppc/ppc.c
ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index ceddbb8f99..c25e32b029 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -4,4 +4,5 @@ obj-$(CONFIG_PCI) += pci.o pci-quirks.o
obj-$(CONFIG_SOFTMMU) += platform.o
obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o
obj-$(CONFIG_SOFTMMU) += amd-xgbe.o
+obj-$(CONFIG_SOFTMMU) += spapr.o
endif
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7be638e0e3..f3c0522e7e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -28,6 +28,7 @@
#include "exec/memory.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
+#include "qemu/range.h"
#include "sysemu/kvm.h"
#ifdef CONFIG_KVM
#include "linux/kvm.h"
@@ -241,6 +242,44 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
return -errno;
}
+static void vfio_host_win_add(VFIOContainer *container,
+ hwaddr min_iova, hwaddr max_iova,
+ uint64_t iova_pgsizes)
+{
+ VFIOHostDMAWindow *hostwin;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (ranges_overlap(hostwin->min_iova,
+ hostwin->max_iova - hostwin->min_iova + 1,
+ min_iova,
+ max_iova - min_iova + 1)) {
+            hw_error("%s: Overlapping host DMA windows are not supported",
+                     __func__);
+ }
+ }
+
+ hostwin = g_malloc0(sizeof(*hostwin));
+
+ hostwin->min_iova = min_iova;
+ hostwin->max_iova = max_iova;
+ hostwin->iova_pgsizes = iova_pgsizes;
+ QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
+}
+
+static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
+ hwaddr max_iova)
+{
+ VFIOHostDMAWindow *hostwin;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
+ QLIST_REMOVE(hostwin, hostwin_next);
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
return (!memory_region_is_ram(section->mr) &&
@@ -329,6 +368,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
Int128 llend, llsize;
void *vaddr;
int ret;
+ VFIOHostDMAWindow *hostwin;
+ bool hostwin_found;
if (vfio_listener_skipped_section(section)) {
trace_vfio_listener_region_add_skip(
@@ -354,7 +395,40 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
end = int128_get64(int128_sub(llend, int128_one()));
- if ((iova < container->min_iova) || (end > container->max_iova)) {
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ VFIOHostDMAWindow *hostwin;
+ hwaddr pgsize = 0;
+
+ /* For now intersections are not allowed, we may relax this later */
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (ranges_overlap(hostwin->min_iova,
+ hostwin->max_iova - hostwin->min_iova + 1,
+ section->offset_within_address_space,
+ int128_get64(section->size))) {
+ ret = -1;
+ goto fail;
+ }
+ }
+
+ ret = vfio_spapr_create_window(container, section, &pgsize);
+ if (ret) {
+ goto fail;
+ }
+
+ vfio_host_win_add(container, section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1, pgsize);
+ }
+
+ hostwin_found = false;
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+ hostwin_found = true;
+ break;
+ }
+ }
+
+ if (!hostwin_found) {
error_report("vfio: IOMMU container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
container, iova, end);
@@ -369,10 +443,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
trace_vfio_listener_region_add_iommu(iova, end);
/*
- * FIXME: We should do some checking to see if the
- * capabilities of the host VFIO IOMMU are adequate to model
- * the guest IOMMU
- *
* FIXME: For VFIO iommu types which have KVM acceleration to
* avoid bouncing all map/unmaps through qemu this way, this
* would be the right place to wire that up (tell the KVM
@@ -493,6 +563,18 @@ static void vfio_listener_region_del(MemoryListener *listener,
"0x%"HWADDR_PRIx") = %d (%m)",
container, iova, int128_get64(llsize), ret);
}
+
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ vfio_spapr_remove_window(container,
+ section->offset_within_address_space);
+ if (vfio_host_win_del(container,
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1) < 0) {
+ hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
+ __func__, section->offset_within_address_space);
+ }
+ }
}
static const MemoryListener vfio_memory_listener = {
@@ -503,6 +585,9 @@ static const MemoryListener vfio_memory_listener = {
static void vfio_listener_release(VFIOContainer *container)
{
memory_listener_unregister(&container->listener);
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ memory_listener_unregister(&container->prereg_listener);
+ }
}
static struct vfio_info_cap_header *
@@ -861,8 +946,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
goto free_container_exit;
}
- ret = ioctl(fd, VFIO_SET_IOMMU,
- v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU);
+ container->iommu_type = v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU;
+ ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type);
if (ret) {
error_report("vfio: failed to set iommu for container: %m");
ret = -errno;
@@ -876,19 +961,18 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
* existing Type1 IOMMUs generally support any IOVA we're
* going to actually try in practice.
*/
- container->min_iova = 0;
- container->max_iova = (hwaddr)-1;
-
- /* Assume just 4K IOVA page size */
- container->iova_pgsizes = 0x1000;
info.argsz = sizeof(info);
ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
/* Ignore errors */
- if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
- container->iova_pgsizes = info.iova_pgsizes;
+ if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
+ /* Assume 4k IOVA page size */
+ info.iova_pgsizes = 4096;
}
- } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
+ vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes);
+ } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) ||
+ ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) {
struct vfio_iommu_spapr_tce_info info;
+ bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU);
ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
if (ret) {
@@ -896,7 +980,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
ret = -errno;
goto free_container_exit;
}
- ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
+ container->iommu_type =
+ v2 ? VFIO_SPAPR_TCE_v2_IOMMU : VFIO_SPAPR_TCE_IOMMU;
+ ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type);
if (ret) {
error_report("vfio: failed to set iommu for container: %m");
ret = -errno;
@@ -908,30 +994,54 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
* when container fd is closed so we do not call it explicitly
* in this file.
*/
- ret = ioctl(fd, VFIO_IOMMU_ENABLE);
- if (ret) {
- error_report("vfio: failed to enable container: %m");
- ret = -errno;
- goto free_container_exit;
+ if (!v2) {
+ ret = ioctl(fd, VFIO_IOMMU_ENABLE);
+ if (ret) {
+ error_report("vfio: failed to enable container: %m");
+ ret = -errno;
+ goto free_container_exit;
+ }
+ } else {
+ container->prereg_listener = vfio_prereg_listener;
+
+ memory_listener_register(&container->prereg_listener,
+ &address_space_memory);
+ if (container->error) {
+ memory_listener_unregister(&container->prereg_listener);
+ error_report("vfio: RAM memory listener initialization failed for container");
+ goto free_container_exit;
+ }
}
- /*
- * This only considers the host IOMMU's 32-bit window. At
- * some point we need to add support for the optional 64-bit
- * window and dynamic windows
- */
info.argsz = sizeof(info);
ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m");
ret = -errno;
+ if (v2) {
+ memory_listener_unregister(&container->prereg_listener);
+ }
goto free_container_exit;
}
- container->min_iova = info.dma32_window_start;
- container->max_iova = container->min_iova + info.dma32_window_size - 1;
- /* Assume just 4K IOVA pages for now */
- container->iova_pgsizes = 0x1000;
+ if (v2) {
+ /*
+         * A newly created container comes with a default DMA window.
+         * To keep region_add/del simple, remove it now and let the
+         * iommu_listener callbacks create/remove windows as needed.
+ */
+ ret = vfio_spapr_remove_window(container, info.dma32_window_start);
+ if (ret) {
+ goto free_container_exit;
+ }
+ } else {
+ /* The default table uses 4K pages */
+ vfio_host_win_add(container, info.dma32_window_start,
+ info.dma32_window_start +
+ info.dma32_window_size - 1,
+ 0x1000);
+ }
} else {
error_report("vfio: No available IOMMU models");
ret = -EINVAL;
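
(Annotation: vfio_host_win_add() above refuses to register overlapping host
DMA windows via ranges_overlap(). A self-contained sketch of that interval
test, using the same inclusive [min_iova, max_iova] convention as
VFIOHostDMAWindow; the window addresses below match this series' defaults and
are illustrative.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t hwaddr;

    /* Equivalent of ranges_overlap() from qemu/range.h: two (start, length)
     * ranges overlap unless one ends before the other begins. */
    static bool ranges_overlap(hwaddr first1, hwaddr len1,
                               hwaddr first2, hwaddr len2)
    {
        hwaddr last1 = first1 + len1 - 1;
        hwaddr last2 = first2 + len2 - 1;

        return !(last1 < first2 || last2 < first1);
    }

    int main(void)
    {
        /* Default 32-bit window: IOVA 0 .. 1GiB-1 */
        hwaddr min1 = 0, max1 = 0x40000000ULL - 1;
        /* 64-bit window at the dma64_win_addr default (1ULL << 59), 2GiB */
        hwaddr min2 = 0x800000000000000ULL, max2 = min2 + 0x80000000ULL - 1;

        /* Disjoint windows coexist... */
        assert(!ranges_overlap(min1, max1 - min1 + 1, min2, max2 - min2 + 1));
        /* ...but a window starting inside an existing one is rejected */
        assert(ranges_overlap(min1, max1 - min1 + 1, 0x1000, 0x1000));
        return 0;
    }
)
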
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
new file mode 100644
index 0000000000..0af342332c
--- /dev/null
+++ b/hw/vfio/spapr.c
@@ -0,0 +1,210 @@
+/*
+ * DMA memory preregistration
+ *
+ * Authors:
+ * Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+
+#include "hw/vfio/vfio-common.h"
+#include "hw/hw.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
+{
+ if (memory_region_is_iommu(section->mr)) {
+ hw_error("Cannot possibly preregister IOMMU memory");
+ }
+
+ return !memory_region_is_ram(section->mr) ||
+ memory_region_is_skip_dump(section->mr);
+}
+
+static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
+{
+ return memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region +
+ (gpa - section->offset_within_address_space);
+}
+
+static void vfio_prereg_listener_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIOContainer *container = container_of(listener, VFIOContainer,
+ prereg_listener);
+ const hwaddr gpa = section->offset_within_address_space;
+ hwaddr end;
+ int ret;
+ hwaddr page_mask = qemu_real_host_page_mask;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0,
+ };
+
+ if (vfio_prereg_listener_skipped_section(section)) {
+ trace_vfio_prereg_listener_region_add_skip(
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
+ return;
+ }
+
+ if (unlikely((section->offset_within_address_space & ~page_mask) ||
+ (section->offset_within_region & ~page_mask) ||
+ (int128_get64(section->size) & ~page_mask))) {
+ error_report("%s received unaligned region", __func__);
+ return;
+ }
+
+ end = section->offset_within_address_space + int128_get64(section->size);
+ if (gpa >= end) {
+ return;
+ }
+
+ memory_region_ref(section->mr);
+
+ reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
+ reg.size = end - gpa;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
+ if (ret) {
+ /*
+ * On the initfn path, store the first error in the container so we
+ * can gracefully fail. Runtime, there's not much we can do other
+ * than throw a hardware error.
+ */
+ if (!container->initialized) {
+ if (!container->error) {
+ container->error = ret;
+ }
+ } else {
+ hw_error("vfio: Memory registering failed, unable to continue");
+ }
+ }
+}
+
+static void vfio_prereg_listener_region_del(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIOContainer *container = container_of(listener, VFIOContainer,
+ prereg_listener);
+ const hwaddr gpa = section->offset_within_address_space;
+ hwaddr end;
+ int ret;
+ hwaddr page_mask = qemu_real_host_page_mask;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0,
+ };
+
+ if (vfio_prereg_listener_skipped_section(section)) {
+ trace_vfio_prereg_listener_region_del_skip(
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
+ return;
+ }
+
+ if (unlikely((section->offset_within_address_space & ~page_mask) ||
+ (section->offset_within_region & ~page_mask) ||
+ (int128_get64(section->size) & ~page_mask))) {
+ error_report("%s received unaligned region", __func__);
+ return;
+ }
+
+ end = section->offset_within_address_space + int128_get64(section->size);
+ if (gpa >= end) {
+ return;
+ }
+
+ reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
+ reg.size = end - gpa;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+ trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
+}
+
+const MemoryListener vfio_prereg_listener = {
+ .region_add = vfio_prereg_listener_region_add,
+ .region_del = vfio_prereg_listener_region_del,
+};
+
+int vfio_spapr_create_window(VFIOContainer *container,
+ MemoryRegionSection *section,
+ hwaddr *pgsize)
+{
+ int ret;
+ unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr);
+ unsigned entries, pages;
+ struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
+
+ /*
+ * FIXME: For VFIO iommu types which have KVM acceleration to
+ * avoid bouncing all map/unmaps through qemu this way, this
+ * would be the right place to wire that up (tell the KVM
+ * device emulation the VFIO iommu handles to use).
+ */
+ create.window_size = int128_get64(section->size);
+ create.page_shift = ctz64(pagesize);
+ /*
+ * SPAPR host supports multilevel TCE tables, there is some
+ * heuristic to decide how many levels we want for our table:
+ * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
+ */
+ entries = create.window_size >> create.page_shift;
+ pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
+ pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */
+ create.levels = ctz64(pages) / 6 + 1;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (ret) {
+ error_report("Failed to create a window, ret = %d (%m)", ret);
+ return -errno;
+ }
+
+ if (create.start_addr != section->offset_within_address_space) {
+ vfio_spapr_remove_window(container, create.start_addr);
+
+ error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
+ section->offset_within_address_space,
+ (uint64_t)create.start_addr);
+ return -EINVAL;
+ }
+ trace_vfio_spapr_create_window(create.page_shift,
+ create.window_size,
+ create.start_addr);
+ *pgsize = pagesize;
+
+ return 0;
+}
+
+int vfio_spapr_remove_window(VFIOContainer *container,
+ hwaddr offset_within_address_space)
+{
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ .start_addr = offset_within_address_space,
+ };
+ int ret;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ error_report("Failed to remove window at %"PRIx64,
+ (uint64_t)remove.start_addr);
+ return -errno;
+ }
+
+ trace_vfio_spapr_remove_window(offset_within_address_space);
+
+ return 0;
+}
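
(Annotation: a worked example of the geometry vfio_spapr_create_window() hands
to VFIO_IOMMU_SPAPR_TCE_CREATE, assuming a 1GiB guest window backed by 64KiB
IOMMU pages. The code only redoes the arithmetic shown above; it does not
issue the ioctl, and the numbers are illustrative.

    #include <assert.h>
    #include <stdint.h>

    /* ctz64() as provided by qemu/host-utils.h; __builtin_ctzll suffices
     * for the non-zero inputs used here. */
    static unsigned ctz64(uint64_t v)
    {
        return __builtin_ctzll(v);
    }

    int main(void)
    {
        uint64_t window_size = 1ULL << 30;  /* 1GiB DMA window */
        uint64_t pagesize    = 1ULL << 16;  /* 64KiB IOMMU page */

        unsigned page_shift  = ctz64(pagesize);            /* 16 */
        uint64_t entries     = window_size >> page_shift;  /* 16384 TCEs */
        uint64_t table_bytes = entries * sizeof(uint64_t); /* 128KiB */

        assert(page_shift == 16);
        assert(entries == 16384);
        assert(table_bytes == 128 * 1024);
        return 0;
    }
)
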
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index a768fb54ec..4bb7690c46 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -115,3 +115,11 @@ vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d
vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING"
vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d"
vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d"
+
+# hw/vfio/spapr.c
+vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64
+vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64
+vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
+vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
+vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
+vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 288b89c04a..193631d2dc 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -32,6 +32,8 @@
#define SPAPR_PCI_HOST_BRIDGE(obj) \
OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
+#define SPAPR_PCI_DMA_MAX_WINDOWS 2
+
typedef struct sPAPRPHBState sPAPRPHBState;
typedef struct spapr_pci_msi {
@@ -56,7 +58,7 @@ struct sPAPRPHBState {
hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
MemoryRegion memwindow, iowindow, msiwindow;
- uint32_t dma_liobn;
+ uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS];
hwaddr dma_win_addr, dma_win_size;
AddressSpace iommu_as;
MemoryRegion iommu_root;
@@ -71,6 +73,10 @@ struct sPAPRPHBState {
spapr_pci_msi_mig *msi_devs;
QLIST_ENTRY(sPAPRPHBState) list;
+
+ bool ddw_enabled;
+ uint64_t page_size_mask;
+ uint64_t dma64_win_addr;
};
#define SPAPR_PCI_MAX_INDEX 255
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 49325555d3..2e2dd14c30 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -416,6 +416,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
#define RTAS_OUT_NOT_AUTHORIZED -9002
#define RTAS_OUT_SYSPARM_PARAM_ERROR -9999
+/* DDW pagesize mask values from ibm,query-pe-dma-window */
+#define RTAS_DDW_PGSIZE_4K 0x01
+#define RTAS_DDW_PGSIZE_64K 0x02
+#define RTAS_DDW_PGSIZE_16M 0x04
+#define RTAS_DDW_PGSIZE_32M 0x08
+#define RTAS_DDW_PGSIZE_64M 0x10
+#define RTAS_DDW_PGSIZE_128M 0x20
+#define RTAS_DDW_PGSIZE_256M 0x40
+#define RTAS_DDW_PGSIZE_16G 0x80
+
/* RTAS tokens */
#define RTAS_TOKEN_BASE 0x2000
@@ -457,8 +467,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
#define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23)
#define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24)
#define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25)
+#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27)
+#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28)
+#define RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29)
-#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A)
/* RTAS ibm,get-system-parameter token values */
#define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 0610377789..07f7188df4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -73,6 +73,8 @@ typedef struct VFIOContainer {
VFIOAddressSpace *space;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
MemoryListener listener;
+ MemoryListener prereg_listener;
+ unsigned iommu_type;
int error;
bool initialized;
/*
@@ -80,9 +82,8 @@ typedef struct VFIOContainer {
* contiguous IOVA window. We may need to generalize that in
* future
*/
- hwaddr min_iova, max_iova;
- uint64_t iova_pgsizes;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
+ QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;
@@ -95,6 +96,13 @@ typedef struct VFIOGuestIOMMU {
QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;
+typedef struct VFIOHostDMAWindow {
+ hwaddr min_iova;
+ hwaddr max_iova;
+ uint64_t iova_pgsizes;
+ QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
+} VFIOHostDMAWindow;
+
typedef struct VFIODeviceOps VFIODeviceOps;
typedef struct VFIODevice {
@@ -158,4 +166,12 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
uint32_t subtype, struct vfio_region_info **info);
#endif
+extern const MemoryListener vfio_prereg_listener;
+
+int vfio_spapr_create_window(VFIOContainer *container,
+ MemoryRegionSection *section,
+ hwaddr *pgsize);
+int vfio_spapr_remove_window(VFIOContainer *container,
+ hwaddr offset_within_address_space);
+
#endif /* !HW_VFIO_VFIO_COMMON_H */
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index af73bced9f..2666a3f80d 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -1047,6 +1047,8 @@ struct CPUPPCState {
uint64_t insns_flags2;
#if defined(TARGET_PPC64)
struct ppc_segment_page_sizes sps;
+ ppc_slb_t vrma_slb;
+ target_ulong rmls;
bool ci_large_pages;
#endif
diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c
index 4ef893be2c..d9795d04d0 100644
--- a/target-ppc/fpu_helper.c
+++ b/target-ppc/fpu_helper.c
@@ -2689,19 +2689,19 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \
helper_float_check_status(env); \
}
-VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_nearest_even, 1)
+VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_ties_away, 1)
VSX_ROUND(xsrdpic, 1, float64, VsrD(0), FLOAT_ROUND_CURRENT, 1)
VSX_ROUND(xsrdpim, 1, float64, VsrD(0), float_round_down, 1)
VSX_ROUND(xsrdpip, 1, float64, VsrD(0), float_round_up, 1)
VSX_ROUND(xsrdpiz, 1, float64, VsrD(0), float_round_to_zero, 1)
-VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_nearest_even, 0)
+VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_ties_away, 0)
VSX_ROUND(xvrdpic, 2, float64, VsrD(i), FLOAT_ROUND_CURRENT, 0)
VSX_ROUND(xvrdpim, 2, float64, VsrD(i), float_round_down, 0)
VSX_ROUND(xvrdpip, 2, float64, VsrD(i), float_round_up, 0)
VSX_ROUND(xvrdpiz, 2, float64, VsrD(i), float_round_to_zero, 0)
-VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_nearest_even, 0)
+VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_ties_away, 0)
VSX_ROUND(xvrspic, 4, float32, VsrW(i), FLOAT_ROUND_CURRENT, 0)
VSX_ROUND(xvrspim, 4, float32, VsrW(i), float_round_down, 0)
VSX_ROUND(xvrspip, 4, float32, VsrW(i), float_round_up, 0)
diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
index 3b1357a648..82c2186bcf 100644
--- a/target-ppc/mmu-hash64.c
+++ b/target-ppc/mmu-hash64.c
@@ -450,31 +450,47 @@ void ppc_hash64_stop_access(PowerPCCPU *cpu, uint64_t token)
}
}
-/* Returns the effective page shift or 0. MPSS isn't supported yet so
- * this will always be the slb_pshift or 0
- */
-static uint32_t ppc_hash64_pte_size_decode(uint64_t pte1, uint32_t slb_pshift)
+static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps,
+ uint64_t pte0, uint64_t pte1)
{
- switch (slb_pshift) {
- case 12:
+ int i;
+
+ if (!(pte0 & HPTE64_V_LARGE)) {
+ if (sps->page_shift != 12) {
+ /* 4kiB page in a non 4kiB segment */
+ return 0;
+ }
+ /* Normal 4kiB page */
return 12;
- case 16:
- if ((pte1 & 0xf000) == 0x1000) {
- return 16;
+ }
+
+ for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
+ const struct ppc_one_page_size *ps = &sps->enc[i];
+ uint64_t mask;
+
+ if (!ps->page_shift) {
+ break;
}
- return 0;
- case 24:
- if ((pte1 & 0xff000) == 0) {
- return 24;
+
+ if (ps->page_shift == 12) {
+ /* L bit is set so this can't be a 4kiB page */
+ continue;
+ }
+
+ mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN;
+
+ if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) {
+ return ps->page_shift;
}
- return 0;
}
- return 0;
+
+ return 0; /* Bad page size encoding */
}
static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash,
- uint32_t slb_pshift, bool secondary,
- target_ulong ptem, ppc_hash_pte64_t *pte)
+ const struct ppc_one_seg_page_size *sps,
+ target_ulong ptem,
+ ppc_hash_pte64_t *pte, unsigned *pshift)
{
CPUPPCState *env = &cpu->env;
int i;
@@ -491,11 +507,17 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash,
pte0 = ppc_hash64_load_hpte0(cpu, token, i);
pte1 = ppc_hash64_load_hpte1(cpu, token, i);
- if ((pte0 & HPTE64_V_VALID)
- && (secondary == !!(pte0 & HPTE64_V_SECONDARY))
- && HPTE64_V_COMPARE(pte0, ptem)) {
- uint32_t pshift = ppc_hash64_pte_size_decode(pte1, slb_pshift);
- if (pshift == 0) {
+ /* This compares V, B, H (secondary) and the AVPN */
+ if (HPTE64_V_COMPARE(pte0, ptem)) {
+ *pshift = hpte_page_shift(sps, pte0, pte1);
+ /*
+ * If there is no match, ignore the PTE, it could simply
+ * be for a different segment size encoding and the
+ * architecture specifies we should not match. Linux will
+ * potentially leave behind PTEs for the wrong base page
+ * size when demoting segments.
+ */
+ if (*pshift == 0) {
continue;
}
/* We don't do anything with pshift yet as qemu TLB only deals
@@ -516,31 +538,40 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash,
static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu,
ppc_slb_t *slb, target_ulong eaddr,
- ppc_hash_pte64_t *pte)
+ ppc_hash_pte64_t *pte, unsigned *pshift)
{
CPUPPCState *env = &cpu->env;
hwaddr pte_offset;
hwaddr hash;
uint64_t vsid, epnmask, epn, ptem;
+ const struct ppc_one_seg_page_size *sps = slb->sps;
/* The SLB store path should prevent any bad page size encodings
* getting in there, so: */
- assert(slb->sps);
+ assert(sps);
+
+ /* If ISL is set in LPCR we need to clamp the page size to 4K */
+ if (env->spr[SPR_LPCR] & LPCR_ISL) {
+ /* We assume that when using TCG, 4k is first entry of SPS */
+ sps = &env->sps.sps[0];
+ assert(sps->page_shift == 12);
+ }
- epnmask = ~((1ULL << slb->sps->page_shift) - 1);
+ epnmask = ~((1ULL << sps->page_shift) - 1);
if (slb->vsid & SLB_VSID_B) {
/* 1TB segment */
vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT_1T;
epn = (eaddr & ~SEGMENT_MASK_1T) & epnmask;
- hash = vsid ^ (vsid << 25) ^ (epn >> slb->sps->page_shift);
+ hash = vsid ^ (vsid << 25) ^ (epn >> sps->page_shift);
} else {
/* 256M segment */
vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT;
epn = (eaddr & ~SEGMENT_MASK_256M) & epnmask;
- hash = vsid ^ (epn >> slb->sps->page_shift);
+ hash = vsid ^ (epn >> sps->page_shift);
}
ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN);
+ ptem |= HPTE64_V_VALID;
/* Page address translation */
qemu_log_mask(CPU_LOG_MMU,
@@ -554,70 +585,30 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu,
" vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx
" hash=" TARGET_FMT_plx "\n",
env->htab_base, env->htab_mask, vsid, ptem, hash);
- pte_offset = ppc_hash64_pteg_search(cpu, hash, slb->sps->page_shift,
- 0, ptem, pte);
+ pte_offset = ppc_hash64_pteg_search(cpu, hash, sps, ptem, pte, pshift);
if (pte_offset == -1) {
/* Secondary PTEG lookup */
+ ptem |= HPTE64_V_SECONDARY;
qemu_log_mask(CPU_LOG_MMU,
"1 htab=" TARGET_FMT_plx "/" TARGET_FMT_plx
" vsid=" TARGET_FMT_lx " api=" TARGET_FMT_lx
" hash=" TARGET_FMT_plx "\n", env->htab_base,
env->htab_mask, vsid, ptem, ~hash);
- pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb->sps->page_shift, 1,
- ptem, pte);
+ pte_offset = ppc_hash64_pteg_search(cpu, ~hash, sps, ptem, pte, pshift);
}
return pte_offset;
}
-static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps,
- uint64_t pte0, uint64_t pte1)
-{
- int i;
-
- if (!(pte0 & HPTE64_V_LARGE)) {
- if (sps->page_shift != 12) {
- /* 4kiB page in a non 4kiB segment */
- return 0;
- }
- /* Normal 4kiB page */
- return 12;
- }
-
- for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
- const struct ppc_one_page_size *ps = &sps->enc[i];
- uint64_t mask;
-
- if (!ps->page_shift) {
- break;
- }
-
- if (ps->page_shift == 12) {
- /* L bit is set so this can't be a 4kiB page */
- continue;
- }
-
- mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN;
-
- if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) {
- return ps->page_shift;
- }
- }
-
- return 0; /* Bad page size encoding */
-}
-
unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
- uint64_t pte0, uint64_t pte1,
- unsigned *seg_page_shift)
+ uint64_t pte0, uint64_t pte1)
{
CPUPPCState *env = &cpu->env;
int i;
if (!(pte0 & HPTE64_V_LARGE)) {
- *seg_page_shift = 12;
return 12;
}
@@ -635,12 +626,10 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
shift = hpte_page_shift(sps, pte0, pte1);
if (shift) {
- *seg_page_shift = sps->page_shift;
return shift;
}
}
- *seg_page_shift = 0;
return 0;
}
@@ -701,11 +690,52 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
assert((rwx == 0) || (rwx == 1) || (rwx == 2));
+ /* Note on LPCR usage: 970 uses HID4, but our special variant
+ * of store_spr copies relevant fields into env->spr[SPR_LPCR].
+ * Similarily we filter unimplemented bits when storing into
+ * LPCR depending on the MMU version. This code can thus just
+ * use the LPCR "as-is".
+ */
+
/* 1. Handle real mode accesses */
if (((rwx == 2) && (msr_ir == 0)) || ((rwx != 2) && (msr_dr == 0))) {
- /* Translation is off */
- /* In real mode the top 4 effective address bits are ignored */
+ /* Translation is supposedly "off" */
+ /* In real mode the top 4 effective address bits are (mostly) ignored */
raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL;
+
+ /* In HV mode, add HRMOR if top EA bit is clear */
+ if (msr_hv || !env->has_hv_mode) {
+ if (!(eaddr >> 63)) {
+ raddr |= env->spr[SPR_HRMOR];
+ }
+ } else {
+ /* Otherwise, check VPM for RMA vs VRMA */
+ if (env->spr[SPR_LPCR] & LPCR_VPM0) {
+ slb = &env->vrma_slb;
+ if (slb->sps) {
+ goto skip_slb_search;
+ }
+ /* Not much else to do here */
+ cs->exception_index = POWERPC_EXCP_MCHECK;
+ env->error_code = 0;
+ return 1;
+ } else if (raddr < env->rmls) {
+ /* RMA. Check bounds in RMLS */
+ raddr |= env->spr[SPR_RMOR];
+ } else {
+                /* The access failed, generate the appropriate interrupt */
+ if (rwx == 2) {
+ ppc_hash64_set_isi(cs, env, 0x08000000);
+ } else {
+ dsisr = 0x08000000;
+ if (rwx == 1) {
+ dsisr |= 0x02000000;
+ }
+ ppc_hash64_set_dsi(cs, env, eaddr, dsisr);
+ }
+ return 1;
+ }
+ }
tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK,
PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx,
TARGET_PAGE_SIZE);
@@ -714,7 +744,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
/* 2. Translation is on, so look up the SLB */
slb = slb_lookup(cpu, eaddr);
-
if (!slb) {
if (rwx == 2) {
cs->exception_index = POWERPC_EXCP_ISEG;
@@ -727,6 +756,8 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
return 1;
}
+skip_slb_search:
+
/* 3. Check for segment level no-execute violation */
if ((rwx == 2) && (slb->vsid & SLB_VSID_N)) {
ppc_hash64_set_isi(cs, env, 0x10000000);
@@ -734,7 +765,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
}
/* 4. Locate the PTE in the hash table */
- pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte);
+ pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte, &apshift);
if (pte_offset == -1) {
dsisr = 0x40000000;
if (rwx == 2) {
@@ -750,18 +781,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
qemu_log_mask(CPU_LOG_MMU,
"found PTE at offset %08" HWADDR_PRIx "\n", pte_offset);
- /* Validate page size encoding */
- apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1);
- if (!apshift) {
- error_report("Bad page size encoding in HPTE 0x%"PRIx64" - 0x%"PRIx64
- " @ 0x%"HWADDR_PRIx, pte.pte0, pte.pte1, pte_offset);
- /* Not entirely sure what the right action here, but machine
- * check seems reasonable */
- cs->exception_index = POWERPC_EXCP_MCHECK;
- env->error_code = 0;
- return 1;
- }
-
/* 5. Check access permissions */
pp_prot = ppc_hash64_pte_prot(cpu, slb, pte);
@@ -821,27 +840,41 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr)
{
CPUPPCState *env = &cpu->env;
ppc_slb_t *slb;
- hwaddr pte_offset;
+ hwaddr pte_offset, raddr;
ppc_hash_pte64_t pte;
unsigned apshift;
+ /* Handle real mode */
if (msr_dr == 0) {
/* In real mode the top 4 effective address bits are ignored */
- return addr & 0x0FFFFFFFFFFFFFFFULL;
- }
+ raddr = addr & 0x0FFFFFFFFFFFFFFFULL;
- slb = slb_lookup(cpu, addr);
- if (!slb) {
- return -1;
- }
+ /* In HV mode, add HRMOR if top EA bit is clear */
+ if ((msr_hv || !env->has_hv_mode) && !(addr >> 63)) {
+ return raddr | env->spr[SPR_HRMOR];
+ }
- pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte);
- if (pte_offset == -1) {
- return -1;
+ /* Otherwise, check VPM for RMA vs VRMA */
+ if (env->spr[SPR_LPCR] & LPCR_VPM0) {
+ slb = &env->vrma_slb;
+ if (!slb->sps) {
+ return -1;
+ }
+ } else if (raddr < env->rmls) {
+ /* RMA. Check bounds in RMLS */
+ return raddr | env->spr[SPR_RMOR];
+ } else {
+ return -1;
+ }
+ } else {
+ slb = slb_lookup(cpu, addr);
+ if (!slb) {
+ return -1;
+ }
}
- apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1);
- if (!apshift) {
+ pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift);
+ if (pte_offset == -1) {
return -1;
}
@@ -883,6 +916,90 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
tlb_flush(CPU(cpu), 1);
}
+void ppc_hash64_update_rmls(CPUPPCState *env)
+{
+ uint64_t lpcr = env->spr[SPR_LPCR];
+
+ /*
+ * This is the full 4 bits encoding of POWER8. Previous
+ * CPUs only support a subset of these but the filtering
+ * is done when writing LPCR
+ */
+ switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) {
+ case 0x8: /* 32MB */
+ env->rmls = 0x2000000ull;
+ break;
+ case 0x3: /* 64MB */
+ env->rmls = 0x4000000ull;
+ break;
+ case 0x7: /* 128MB */
+ env->rmls = 0x8000000ull;
+ break;
+ case 0x4: /* 256MB */
+ env->rmls = 0x10000000ull;
+ break;
+ case 0x2: /* 1GB */
+ env->rmls = 0x40000000ull;
+ break;
+ case 0x1: /* 16GB */
+ env->rmls = 0x400000000ull;
+ break;
+ default:
+ /* What to do here ??? */
+ env->rmls = 0;
+ }
+}
+
+void ppc_hash64_update_vrma(CPUPPCState *env)
+{
+ const struct ppc_one_seg_page_size *sps = NULL;
+ target_ulong esid, vsid, lpcr;
+ ppc_slb_t *slb = &env->vrma_slb;
+ uint32_t vrmasd;
+ int i;
+
+ /* First clear it */
+ slb->esid = slb->vsid = 0;
+ slb->sps = NULL;
+
+ /* Is VRMA enabled ? */
+ lpcr = env->spr[SPR_LPCR];
+ if (!(lpcr & LPCR_VPM0)) {
+ return;
+ }
+
+ /* Make one up. Mostly ignore the ESID which will not be
+ * needed for translation
+ */
+ vsid = SLB_VSID_VRMA;
+ vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT;
+ vsid |= (vrmasd << 4) & (SLB_VSID_L | SLB_VSID_LP);
+ esid = SLB_ESID_V;
+
+ for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
+ const struct ppc_one_seg_page_size *sps1 = &env->sps.sps[i];
+
+ if (!sps1->page_shift) {
+ break;
+ }
+
+ if ((vsid & SLB_VSID_LLP_MASK) == sps1->slb_enc) {
+ sps = sps1;
+ break;
+ }
+ }
+
+ if (!sps) {
+ error_report("Bad page size encoding esid 0x"TARGET_FMT_lx
+ " vsid 0x"TARGET_FMT_lx, esid, vsid);
+ return;
+ }
+
+ slb->vsid = vsid;
+ slb->esid = esid;
+ slb->sps = sps;
+}
+
void helper_store_lpcr(CPUPPCState *env, target_ulong val)
{
uint64_t lpcr = 0;
@@ -938,4 +1055,6 @@ void helper_store_lpcr(CPUPPCState *env, target_ulong val)
;
}
env->spr[SPR_LPCR] = lpcr;
+ ppc_hash64_update_rmls(env);
+ ppc_hash64_update_vrma(env);
}
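
(Annotation: a standalone sketch of the LPCR:RMLS decode in
ppc_hash64_update_rmls() above. LPCR_RMLS_SHIFT is defined in cpu.h, outside
this diff; the value 26 used here is an assumption for illustration.

    #include <assert.h>
    #include <stdint.h>

    /* Assumed field position of LPCR:RMLS (defined in cpu.h, not here) */
    #define LPCR_RMLS_SHIFT 26
    #define LPCR_RMLS       (0xFULL << LPCR_RMLS_SHIFT)

    /* Same POWER8 encoding table as ppc_hash64_update_rmls() */
    static uint64_t rmls_bytes(uint64_t lpcr)
    {
        switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) {
        case 0x8: return 0x2000000ULL;   /* 32MB  */
        case 0x3: return 0x4000000ULL;   /* 64MB  */
        case 0x7: return 0x8000000ULL;   /* 128MB */
        case 0x4: return 0x10000000ULL;  /* 256MB */
        case 0x2: return 0x40000000ULL;  /* 1GB   */
        case 0x1: return 0x400000000ULL; /* 16GB  */
        default:  return 0;              /* reserved encoding */
        }
    }

    int main(void)
    {
        /* cpu_ppc_set_papr() below sets RMLS = 1, the 16GB maximum */
        assert(rmls_bytes(1ULL << LPCR_RMLS_SHIFT) == 0x400000000ULL);
        return 0;
    }
)
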
diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
index 6423b9f791..3a7476b30a 100644
--- a/target-ppc/mmu-hash64.h
+++ b/target-ppc/mmu-hash64.h
@@ -17,8 +17,9 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
target_ulong pte_index,
target_ulong pte0, target_ulong pte1);
unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
- uint64_t pte0, uint64_t pte1,
- unsigned *seg_page_shift);
+ uint64_t pte0, uint64_t pte1);
+void ppc_hash64_update_vrma(CPUPPCState *env);
+void ppc_hash64_update_rmls(CPUPPCState *env);
#endif
/*
@@ -37,6 +38,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
#define SLB_VSID_B_256M 0x0000000000000000ULL
#define SLB_VSID_B_1T 0x4000000000000000ULL
#define SLB_VSID_VSID 0x3FFFFFFFFFFFF000ULL
+#define SLB_VSID_VRMA (0x0001FFFFFF000000ULL | SLB_VSID_B_1T)
#define SLB_VSID_PTEM (SLB_VSID_B | SLB_VSID_VSID)
#define SLB_VSID_KS 0x0000000000000800ULL
#define SLB_VSID_KP 0x0000000000000400ULL
@@ -63,7 +65,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
#define HPTE64_V_AVPN_SHIFT 7
#define HPTE64_V_AVPN 0x3fffffffffffff80ULL
#define HPTE64_V_AVPN_VAL(x) (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT)
-#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80ULL))
+#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL))
#define HPTE64_V_LARGE 0x0000000000000004ULL
#define HPTE64_V_SECONDARY 0x0000000000000002ULL
#define HPTE64_V_VALID 0x0000000000000001ULL
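
(Annotation: widening the HPTE64_V_COMPARE mask from ...ff80 to ...ff83 folds
the VALID and SECONDARY bit checks into the single AVPN compare;
correspondingly, ppc_hash64_htab_lookup() now ORs HPTE64_V_VALID (and, for the
secondary PTEG, HPTE64_V_SECONDARY) into ptem. A small sketch of the effect:

    #include <assert.h>
    #include <stdint.h>

    #define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL))
    #define HPTE64_V_SECONDARY     0x0000000000000002ULL
    #define HPTE64_V_VALID         0x0000000000000001ULL

    int main(void)
    {
        uint64_t avpn = 0x1234ULL << 7;        /* an arbitrary AVPN value */
        uint64_t ptem = avpn | HPTE64_V_VALID; /* what the lookup now builds */

        /* Valid primary PTE with matching AVPN: accepted */
        assert(HPTE64_V_COMPARE(avpn | HPTE64_V_VALID, ptem));
        /* Invalid PTE: V differs, so the compare now rejects it */
        assert(!HPTE64_V_COMPARE(avpn, ptem));
        /* Secondary-hash PTE: H differs, rejected until ptem gains
         * HPTE64_V_SECONDARY for the secondary PTEG pass */
        assert(!HPTE64_V_COMPARE(avpn | HPTE64_V_VALID | HPTE64_V_SECONDARY,
                                 ptem));
        return 0;
    }
)
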
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index 843f19b748..8f257fb74a 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -8791,11 +8791,19 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu)
/* Set emulated LPCR to not send interrupts to hypervisor. Note that
* under KVM, the actual HW LPCR will be set differently by KVM itself,
* the settings below ensure proper operations with TCG in absence of
- * a real hypervisor
+ * a real hypervisor.
+ *
+ * Clearing VPM0 will also cause us to use RMOR in mmu-hash64.c for
+ * real mode accesses, which thankfully defaults to 0 and isn't
+ * accessible in guest mode.
*/
lpcr->default_value &= ~(LPCR_VPM0 | LPCR_VPM1 | LPCR_ISL | LPCR_KBV);
lpcr->default_value |= LPCR_LPES0 | LPCR_LPES1;
+ /* Set RMLS to the max (ie, 16G) */
+ lpcr->default_value &= ~LPCR_RMLS;
+ lpcr->default_value |= 1ull << LPCR_RMLS_SHIFT;
+
/* P7 and P8 has slightly different PECE bits, mostly because P8 adds
* bit 47 and 48 which are reserved on P7. Here we set them all, which
* will work as expected for both implementations
@@ -8811,6 +8819,10 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu)
/* Set a full AMOR so guest can use the AMR as it sees fit */
env->spr[SPR_AMOR] = amor->default_value = 0xffffffffffffffffull;
+ /* Update some env bits based on new LPCR value */
+ ppc_hash64_update_rmls(env);
+ ppc_hash64_update_vrma(env);
+
/* Tell KVM that we're in PAPR mode */
if (kvm_enabled()) {
kvmppc_set_papr(cpu);
@@ -9516,7 +9528,7 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp)
PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
Error *local_err = NULL;
#if !defined(CONFIG_USER_ONLY)
- int max_smt = kvm_enabled() ? kvmppc_smt_threads() : 1;
+ int max_smt = kvmppc_smt_threads();
#endif
#if !defined(CONFIG_USER_ONLY)