author    Peter Maydell <peter.maydell@linaro.org>  2016-07-05 11:14:27 +0100
committer Peter Maydell <peter.maydell@linaro.org>  2016-07-05 11:14:27 +0100
commit    8662d7db392f906c7808014051b278ad1542db93 (patch)
tree      ff0bced1fc218f98a01813144f43ce6de905d486
parent    11659423113d2fbcf055085b5e8285d590addfaa (diff)
parent    2c7ad80443e9747eb85b508be01cded958191bad (diff)
Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-2.7-20160705' into staging
ppc patch queue for 2016-07-05

Here's the current ppc, sPAPR and related drivers patch queue.

 * The big addition is dynamic DMA window support (this includes
   some core VFIO changes)
 * There are also several fixes to the MMU emulation for bugs
   introduced with the HV mode patches
 * Several other bugfixes and cleanups

Changes in v2: I messed up and forgot to make a fix in the last patch
which BenH pointed out (introduced by my rebasing).  That's fixed in
this version, and I'm replacing the tag in place with the revised
version.

# gpg: Signature made Tue 05 Jul 2016 06:28:58 BST
# gpg:                using RSA key 0x6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>"
# gpg:                 aka "David Gibson (Red Hat) <dgibson@redhat.com>"
# gpg:                 aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392

* remotes/dgibson/tags/ppc-for-2.7-20160705:
  ppc/hash64: Fix support for LPCR:ISL
  ppc/hash64: Add proper real mode translation support
  target-ppc: Return page shift from PTEG search
  target-ppc: Simplify HPTE matching
  target-ppc: Correct page size decoding in ppc_hash64_pteg_search()
  ppc: simplify ppc_hash64_hpte_page_shift_noslb()
  spapr_pci/spapr_pci_vfio: Support Dynamic DMA Windows (DDW)
  vfio/spapr: Create DMA window dynamically (SPAPR IOMMU v2)
  vfio: Add host side DMA window capabilities
  vfio: spapr: Add DMA memory preregistering (SPAPR IOMMU v2)
  spapr_iommu: Realloc guest visible TCE table when starting/stopping listening
  ppc: simplify max_smt initialization in ppc_cpu_realizefn()
  spapr: Ensure thread0 of CPU core is always realized first
  ppc: Fix xsrdpi, xvrdpi and xvrspi rounding

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--  hw/ppc/Makefile.objs           |   1
-rw-r--r--  hw/ppc/spapr.c                 |  14
-rw-r--r--  hw/ppc/spapr_cpu_core.c        |  29
-rw-r--r--  hw/ppc/spapr_hcall.c           |   4
-rw-r--r--  hw/ppc/spapr_iommu.c           |  12
-rw-r--r--  hw/ppc/spapr_pci.c             |  81
-rw-r--r--  hw/ppc/spapr_rtas_ddw.c        | 295
-rw-r--r--  hw/ppc/trace-events            |   4
-rw-r--r--  hw/vfio/Makefile.objs          |   1
-rw-r--r--  hw/vfio/common.c               | 170
-rw-r--r--  hw/vfio/spapr.c                | 210
-rw-r--r--  hw/vfio/trace-events           |   8
-rw-r--r--  include/hw/pci-host/spapr.h    |   8
-rw-r--r--  include/hw/ppc/spapr.h         |  16
-rw-r--r--  include/hw/vfio/vfio-common.h  |  20
-rw-r--r--  target-ppc/cpu.h               |   2
-rw-r--r--  target-ppc/fpu_helper.c        |   6
-rw-r--r--  target-ppc/mmu-hash64.c        | 321
-rw-r--r--  target-ppc/mmu-hash64.h        |   8
-rw-r--r--  target-ppc/translate_init.c    |  16
20 files changed, 1043 insertions, 183 deletions
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 5cc6608e50..91a3420f47 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -8,6 +8,7 @@ obj-$(CONFIG_PSERIES) += spapr_cpu_core.o
ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
obj-y += spapr_pci_vfio.o
endif
+obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
# PowerPC 4xx boards
obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 78ebd9ee38..7f33a1b2b5 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1771,6 +1771,13 @@ static void ppc_spapr_init(MachineState *machine)
spapr->vrma_adjust = 1;
spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
}
+
+    /* Actually we don't support unbounded RMA anymore since we
+     * added proper emulation of HV mode. The max we can get is
+     * 16G, which also happens to be what we configure for PAPR
+     * mode, so make sure we don't do anything bigger than that.
+     */
+ spapr->rma_size = MIN(spapr->rma_size, 0x400000000ull);
}
if (spapr->rma_size > node0_size) {
@@ -2489,7 +2496,12 @@ DEFINE_SPAPR_MACHINE(2_7, "2.7", true);
* pseries-2.6
*/
#define SPAPR_COMPAT_2_6 \
- HW_COMPAT_2_6
+ HW_COMPAT_2_6 \
+ { \
+ .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\
+ .property = "ddw",\
+ .value = stringify(off),\
+ },
static void spapr_machine_2_6_instance_options(MachineState *machine)
{
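
(Annotation: the SPAPR_COMPAT_2_6 entry above pins the new "ddw" PHB property
to "off" for the pseries-2.6 machine type, preserving the pre-DDW guest ABI,
while pseries-2.7 and newer enable DDW by default. Since this goes through the
ordinary global-property machinery, DDW can presumably also be turned off by
hand on a newer machine type, e.g.:

    qemu-system-ppc64 -machine pseries-2.7 -global spapr-pci-host-bridge.ddw=off

This invocation is illustrative only; it is not part of the patch.)
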
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index a384db5204..70b6b0b5ee 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -259,9 +259,9 @@ out:
error_propagate(errp, local_err);
}
-static int spapr_cpu_core_realize_child(Object *child, void *opaque)
+static void spapr_cpu_core_realize_child(Object *child, Error **errp)
{
- Error **errp = opaque, *local_err = NULL;
+ Error *local_err = NULL;
sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
CPUState *cs = CPU(child);
PowerPCCPU *cpu = POWERPC_CPU(cs);
@@ -269,15 +269,14 @@ static int spapr_cpu_core_realize_child(Object *child, void *opaque)
object_property_set_bool(child, true, "realized", &local_err);
if (local_err) {
error_propagate(errp, local_err);
- return 1;
+ return;
}
spapr_cpu_init(spapr, cpu, &local_err);
if (local_err) {
error_propagate(errp, local_err);
- return 1;
+ return;
}
- return 0;
}
static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
@@ -287,13 +286,13 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
const char *typename = object_class_get_name(sc->cpu_class);
size_t size = object_type_get_instance_size(typename);
Error *local_err = NULL;
- Object *obj;
- int i;
+ void *obj;
+ int i, j;
sc->threads = g_malloc0(size * cc->nr_threads);
for (i = 0; i < cc->nr_threads; i++) {
char id[32];
- void *obj = sc->threads + i * size;
+ obj = sc->threads + i * size;
object_initialize(obj, size, typename);
snprintf(id, sizeof(id), "thread[%d]", i);
@@ -303,12 +302,16 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
}
object_unref(obj);
}
- object_child_foreach(OBJECT(dev), spapr_cpu_core_realize_child, &local_err);
- if (local_err) {
- goto err;
- } else {
- return;
+
+ for (j = 0; j < cc->nr_threads; j++) {
+ obj = sc->threads + j * size;
+
+ spapr_cpu_core_realize_child(obj, &local_err);
+ if (local_err) {
+ goto err;
+ }
}
+ return;
err:
while (--i >= 0) {
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index e011ed4b66..73af112e1d 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -83,12 +83,12 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong pte_index = args[1];
target_ulong pteh = args[2];
target_ulong ptel = args[3];
- unsigned apshift, spshift;
+ unsigned apshift;
target_ulong raddr;
target_ulong index;
uint64_t token;
- apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel, &spshift);
+ apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel);
if (!apshift) {
/* Bad page size encoding */
return H_PARAMETER;
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index e230bacae1..d57b05d5c0 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -156,6 +156,16 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu)
return 1ULL << tcet->page_shift;
}
+static void spapr_tce_notify_started(MemoryRegion *iommu)
+{
+ spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true);
+}
+
+static void spapr_tce_notify_stopped(MemoryRegion *iommu)
+{
+ spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false);
+}
+
static int spapr_tce_table_post_load(void *opaque, int version_id)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -236,6 +246,8 @@ static const VMStateDescription vmstate_spapr_tce_table = {
static MemoryRegionIOMMUOps spapr_iommu_ops = {
.translate = spapr_tce_translate_iommu,
.get_min_page_size = spapr_tce_get_min_page_size,
+ .notify_started = spapr_tce_notify_started,
+ .notify_stopped = spapr_tce_notify_stopped,
};
static int spapr_tce_table_realize(DeviceState *dev)
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 8c1e6b17c3..949c44fec8 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -35,6 +35,7 @@
#include "hw/ppc/spapr.h"
#include "hw/pci-host/spapr.h"
#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
#include <libfdt.h>
#include "trace.h"
#include "qemu/error-report.h"
@@ -45,6 +46,7 @@
#include "hw/ppc/spapr_drc.h"
#include "sysemu/device_tree.h"
#include "sysemu/kvm.h"
+#include "sysemu/hostmem.h"
#include "hw/vfio/vfio.h"
@@ -1087,12 +1089,6 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc,
void *fdt = NULL;
int fdt_start_offset = 0, fdt_size;
- if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) {
- sPAPRTCETable *tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
-
- spapr_tce_set_need_vfio(tcet, true);
- }
-
fdt = create_device_tree(&fdt_size);
fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0);
if (!fdt_start_offset) {
@@ -1310,11 +1306,14 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
PCIBus *bus;
uint64_t msi_window_size = 4096;
sPAPRTCETable *tcet;
+ const unsigned windows_supported =
+ sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;
if (sphb->index != (uint32_t)-1) {
hwaddr windows_base;
- if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn != (uint32_t)-1)
+ if ((sphb->buid != (uint64_t)-1) || (sphb->dma_liobn[0] != (uint32_t)-1)
+ || (sphb->dma_liobn[1] != (uint32_t)-1 && windows_supported == 2)
|| (sphb->mem_win_addr != (hwaddr)-1)
|| (sphb->io_win_addr != (hwaddr)-1)) {
error_setg(errp, "Either \"index\" or other parameters must"
@@ -1329,7 +1328,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
sphb->buid = SPAPR_PCI_BASE_BUID + sphb->index;
- sphb->dma_liobn = SPAPR_PCI_LIOBN(sphb->index, 0);
+ for (i = 0; i < windows_supported; ++i) {
+ sphb->dma_liobn[i] = SPAPR_PCI_LIOBN(sphb->index, i);
+ }
windows_base = SPAPR_PCI_WINDOW_BASE
+ sphb->index * SPAPR_PCI_WINDOW_SPACING;
@@ -1342,8 +1343,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
return;
}
- if (sphb->dma_liobn == (uint32_t)-1) {
- error_setg(errp, "LIOBN not specified for PHB");
+ if ((sphb->dma_liobn[0] == (uint32_t)-1) ||
+ ((sphb->dma_liobn[1] == (uint32_t)-1) && (windows_supported > 1))) {
+ error_setg(errp, "LIOBN(s) not specified for PHB");
return;
}
@@ -1462,16 +1464,18 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
}
}
- tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn);
- if (!tcet) {
- error_setg(errp, "Unable to create TCE table for %s",
- sphb->dtbusname);
- return;
+ /* DMA setup */
+ for (i = 0; i < windows_supported; ++i) {
+ tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn[i]);
+ if (!tcet) {
+ error_setg(errp, "Creating window#%d failed for %s",
+ i, sphb->dtbusname);
+ return;
+ }
+ memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
+ spapr_tce_get_iommu(tcet), 0);
}
- memory_region_add_subregion_overlap(&sphb->iommu_root, 0,
- spapr_tce_get_iommu(tcet), 0);
-
sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free);
}
@@ -1488,13 +1492,19 @@ static int spapr_phb_children_reset(Object *child, void *opaque)
void spapr_phb_dma_reset(sPAPRPHBState *sphb)
{
- sPAPRTCETable *tcet = spapr_tce_find_by_liobn(sphb->dma_liobn);
+ int i;
+ sPAPRTCETable *tcet;
- if (tcet && tcet->nb_table) {
- spapr_tce_table_disable(tcet);
+ for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) {
+ tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[i]);
+
+ if (tcet && tcet->nb_table) {
+ spapr_tce_table_disable(tcet);
+ }
}
/* Register default 32bit DMA window */
+ tcet = spapr_tce_find_by_liobn(sphb->dma_liobn[0]);
spapr_tce_table_enable(tcet, SPAPR_TCE_PAGE_SHIFT, sphb->dma_win_addr,
sphb->dma_win_size >> SPAPR_TCE_PAGE_SHIFT);
}
@@ -1516,7 +1526,8 @@ static void spapr_phb_reset(DeviceState *qdev)
static Property spapr_phb_properties[] = {
DEFINE_PROP_UINT32("index", sPAPRPHBState, index, -1),
DEFINE_PROP_UINT64("buid", sPAPRPHBState, buid, -1),
- DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn, -1),
+ DEFINE_PROP_UINT32("liobn", sPAPRPHBState, dma_liobn[0], -1),
+ DEFINE_PROP_UINT32("liobn64", sPAPRPHBState, dma_liobn[1], -1),
DEFINE_PROP_UINT64("mem_win_addr", sPAPRPHBState, mem_win_addr, -1),
DEFINE_PROP_UINT64("mem_win_size", sPAPRPHBState, mem_win_size,
SPAPR_PCI_MMIO_WIN_SIZE),
@@ -1528,6 +1539,11 @@ static Property spapr_phb_properties[] = {
/* Default DMA window is 0..1GB */
DEFINE_PROP_UINT64("dma_win_addr", sPAPRPHBState, dma_win_addr, 0),
DEFINE_PROP_UINT64("dma_win_size", sPAPRPHBState, dma_win_size, 0x40000000),
+ DEFINE_PROP_UINT64("dma64_win_addr", sPAPRPHBState, dma64_win_addr,
+ 0x800000000000000ULL),
+ DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true),
+ DEFINE_PROP_UINT64("pgsz", sPAPRPHBState, page_size_mask,
+ (1ULL << 12) | (1ULL << 16)),
DEFINE_PROP_END_OF_LIST(),
};
@@ -1604,7 +1620,7 @@ static const VMStateDescription vmstate_spapr_pci = {
.post_load = spapr_pci_post_load,
.fields = (VMStateField[]) {
VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState),
- VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState),
+ VMSTATE_UINT32_EQUAL(dma_liobn[0], sPAPRPHBState),
VMSTATE_UINT64_EQUAL(mem_win_addr, sPAPRPHBState),
VMSTATE_UINT64_EQUAL(mem_win_size, sPAPRPHBState),
VMSTATE_UINT64_EQUAL(io_win_addr, sPAPRPHBState),
@@ -1780,6 +1796,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
uint32_t interrupt_map_mask[] = {
cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)};
uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7];
+ uint32_t ddw_applicable[] = {
+ cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW),
+ cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW),
+ cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW)
+ };
+ uint32_t ddw_extensions[] = {
+ cpu_to_be32(1),
+ cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW)
+ };
sPAPRTCETable *tcet;
PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
sPAPRFDT s_fdt;
@@ -1804,6 +1829,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1));
_FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS_SPAPR));
+ /* Dynamic DMA window */
+ if (phb->ddw_enabled) {
+ _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable,
+ sizeof(ddw_applicable)));
+ _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions",
+ &ddw_extensions, sizeof(ddw_extensions)));
+ }
+
/* Build the interrupt-map, this must matches what is done
* in pci_spapr_map_irq
*/
@@ -1827,7 +1860,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
_FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
sizeof(interrupt_map)));
- tcet = spapr_tce_find_by_liobn(phb->dma_liobn);
+ tcet = spapr_tce_find_by_liobn(phb->dma_liobn[0]);
if (!tcet) {
return -1;
}
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
new file mode 100644
index 0000000000..177dcffc9b
--- /dev/null
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -0,0 +1,295 @@
+/*
+ * QEMU sPAPR Dynamic DMA windows support
+ *
+ * Copyright (c) 2015 Alexey Kardashevskiy, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "qemu/error-report.h"
+#include "hw/ppc/spapr.h"
+#include "hw/pci-host/spapr.h"
+#include "trace.h"
+
+static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque)
+{
+ sPAPRTCETable *tcet;
+
+ tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+ if (tcet && tcet->nb_table) {
+ ++*(unsigned *)opaque;
+ }
+ return 0;
+}
+
+static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb)
+{
+ unsigned ret = 0;
+
+ object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, &ret);
+
+ return ret;
+}
+
+static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque)
+{
+ sPAPRTCETable *tcet;
+
+ tcet = (sPAPRTCETable *) object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
+ if (tcet && !tcet->nb_table) {
+ *(uint32_t *)opaque = tcet->liobn;
+ return 1;
+ }
+ return 0;
+}
+
+static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb)
+{
+ uint32_t liobn = 0;
+
+ object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn);
+
+ return liobn;
+}
+
+static uint32_t spapr_page_mask_to_query_mask(uint64_t page_mask)
+{
+ int i;
+ uint32_t mask = 0;
+ const struct { int shift; uint32_t mask; } masks[] = {
+ { 12, RTAS_DDW_PGSIZE_4K },
+ { 16, RTAS_DDW_PGSIZE_64K },
+ { 24, RTAS_DDW_PGSIZE_16M },
+ { 25, RTAS_DDW_PGSIZE_32M },
+ { 26, RTAS_DDW_PGSIZE_64M },
+ { 27, RTAS_DDW_PGSIZE_128M },
+ { 28, RTAS_DDW_PGSIZE_256M },
+ { 34, RTAS_DDW_PGSIZE_16G },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(masks); ++i) {
+ if (page_mask & (1ULL << masks[i].shift)) {
+ mask |= masks[i].mask;
+ }
+ }
+
+ return mask;
+}
+
+static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ uint64_t buid, max_window_size;
+ uint32_t avail, addr, pgmask = 0;
+ MachineState *machine = MACHINE(spapr);
+
+ if ((nargs != 3) || (nret != 5)) {
+ goto param_error_exit;
+ }
+
+ buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+ addr = rtas_ld(args, 0);
+ sphb = spapr_pci_find_phb(spapr, buid);
+ if (!sphb || !sphb->ddw_enabled) {
+ goto param_error_exit;
+ }
+
+ /* Translate page mask to LoPAPR format */
+ pgmask = spapr_page_mask_to_query_mask(sphb->page_size_mask);
+
+ /*
+ * This is "Largest contiguous block of TCEs allocated specifically
+ * for (that is, are reserved for) this PE".
+     * Return the largest supported value, i.e. the maximum window size
+     * counted in 4K (SPAPR_TCE_PAGE_SHIFT) pages.
+ */
+ if (machine->ram_size == machine->maxram_size) {
+ max_window_size = machine->ram_size;
+ } else {
+ MemoryHotplugState *hpms = &spapr->hotplug_memory;
+
+ max_window_size = hpms->base + memory_region_size(&hpms->mr);
+ }
+
+ avail = SPAPR_PCI_DMA_MAX_WINDOWS - spapr_phb_get_active_win_num(sphb);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+ rtas_st(rets, 1, avail);
+ rtas_st(rets, 2, max_window_size >> SPAPR_TCE_PAGE_SHIFT);
+ rtas_st(rets, 3, pgmask);
+ rtas_st(rets, 4, 0); /* DMA migration mask, not supported */
+
+ trace_spapr_iommu_ddw_query(buid, addr, avail, max_window_size, pgmask);
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ sPAPRTCETable *tcet = NULL;
+ uint32_t addr, page_shift, window_shift, liobn;
+ uint64_t buid, win_addr;
+ int windows;
+
+ if ((nargs != 5) || (nret != 4)) {
+ goto param_error_exit;
+ }
+
+ buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+ addr = rtas_ld(args, 0);
+ sphb = spapr_pci_find_phb(spapr, buid);
+ if (!sphb || !sphb->ddw_enabled) {
+ goto param_error_exit;
+ }
+
+ page_shift = rtas_ld(args, 3);
+ window_shift = rtas_ld(args, 4);
+ liobn = spapr_phb_get_free_liobn(sphb);
+ windows = spapr_phb_get_active_win_num(sphb);
+
+ if (!(sphb->page_size_mask & (1ULL << page_shift)) ||
+ (window_shift < page_shift)) {
+ goto param_error_exit;
+ }
+
+ if (!liobn || !sphb->ddw_enabled || windows == SPAPR_PCI_DMA_MAX_WINDOWS) {
+ goto hw_error_exit;
+ }
+
+ tcet = spapr_tce_find_by_liobn(liobn);
+ if (!tcet) {
+ goto hw_error_exit;
+ }
+
+ win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr;
+ spapr_tce_table_enable(tcet, page_shift, win_addr,
+ 1ULL << (window_shift - page_shift));
+ if (!tcet->nb_table) {
+ goto hw_error_exit;
+ }
+
+ trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift,
+ 1ULL << window_shift, tcet->bus_offset, liobn);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+ rtas_st(rets, 1, liobn);
+ rtas_st(rets, 2, tcet->bus_offset >> 32);
+ rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1));
+
+ return;
+
+hw_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_HW_ERROR);
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ sPAPRTCETable *tcet;
+ uint32_t liobn;
+
+ if ((nargs != 1) || (nret != 1)) {
+ goto param_error_exit;
+ }
+
+ liobn = rtas_ld(args, 0);
+ tcet = spapr_tce_find_by_liobn(liobn);
+ if (!tcet) {
+ goto param_error_exit;
+ }
+
+ sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent);
+ if (!sphb || !sphb->ddw_enabled || !tcet->nb_table) {
+ goto param_error_exit;
+ }
+
+ spapr_tce_table_disable(tcet);
+ trace_spapr_iommu_ddw_remove(liobn);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ sPAPRPHBState *sphb;
+ uint64_t buid;
+ uint32_t addr;
+
+ if ((nargs != 3) || (nret != 1)) {
+ goto param_error_exit;
+ }
+
+ buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2);
+ addr = rtas_ld(args, 0);
+ sphb = spapr_pci_find_phb(spapr, buid);
+ if (!sphb || !sphb->ddw_enabled) {
+ goto param_error_exit;
+ }
+
+ spapr_phb_dma_reset(sphb);
+ trace_spapr_iommu_ddw_reset(buid, addr);
+
+ rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+
+ return;
+
+param_error_exit:
+ rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+}
+
+static void spapr_rtas_ddw_init(void)
+{
+ spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW,
+ "ibm,query-pe-dma-window",
+ rtas_ibm_query_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW,
+ "ibm,create-pe-dma-window",
+ rtas_ibm_create_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW,
+ "ibm,remove-pe-dma-window",
+ rtas_ibm_remove_pe_dma_window);
+ spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW,
+ "ibm,reset-pe-dma-window",
+ rtas_ibm_reset_pe_dma_window);
+}
+
+type_init(spapr_rtas_ddw_init)
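
(Annotation: a minimal standalone sketch of the page-mask translation performed
by ibm,query-pe-dma-window above. It mirrors spapr_page_mask_to_query_mask()
and a subset of the RTAS_DDW_PGSIZE_* values added to include/hw/ppc/spapr.h
later in this series; the main() harness is illustrative, not part of QEMU.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* LoPAPR page-size bits, as defined later in this patch */
    #define RTAS_DDW_PGSIZE_4K   0x01
    #define RTAS_DDW_PGSIZE_64K  0x02
    #define RTAS_DDW_PGSIZE_16M  0x04

    /* Same shift -> query-bit mapping as spapr_page_mask_to_query_mask() */
    static uint32_t query_mask(uint64_t page_mask)
    {
        const struct { int shift; uint32_t mask; } masks[] = {
            { 12, RTAS_DDW_PGSIZE_4K },
            { 16, RTAS_DDW_PGSIZE_64K },
            { 24, RTAS_DDW_PGSIZE_16M },
        };
        uint32_t out = 0;

        for (unsigned i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
            if (page_mask & (1ULL << masks[i].shift)) {
                out |= masks[i].mask;
            }
        }
        return out;
    }

    int main(void)
    {
        /* Default "pgsz" PHB property from this patch: 4K | 64K */
        uint64_t pgsz = (1ULL << 12) | (1ULL << 16);

        assert(query_mask(pgsz) == (RTAS_DDW_PGSIZE_4K | RTAS_DDW_PGSIZE_64K));
        printf("ibm,query-pe-dma-window page mask = 0x%x\n", query_mask(pgsz));
        return 0;
    }

With the default property value the guest is offered 0x03, i.e. 4K and 64K
IOMMU pages.)
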
diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events
index 6da713547f..900679bc9d 100644
--- a/hw/ppc/trace-events
+++ b/hw/ppc/trace-events
@@ -30,6 +30,10 @@ spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, un
spapr_iommu_new_table(uint64_t liobn, void *table, int fd) "liobn=%"PRIx64" table=%p fd=%d"
spapr_iommu_pre_save(uint64_t liobn, uint32_t nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32
spapr_iommu_post_load(uint64_t liobn, uint32_t pre_nb, uint32_t post_nb, uint64_t offs, uint32_t ps) "liobn=%"PRIx64" %"PRIx32" => %"PRIx32" bus_offset=%"PRIx64" ps=%"PRIu32
+spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows available, max window size=%"PRIx64", mask=%"PRIx32
+spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, uint64_t pg_size, uint64_t req_size, uint64_t start, uint32_t liobn) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%"PRIx64", requested=0x%"PRIx64", start addr=%"PRIx64", liobn=%"PRIx32
+spapr_iommu_ddw_remove(uint32_t liobn) "liobn=%"PRIx32
+spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr) "buid=%"PRIx64" addr=%"PRIx32
# hw/ppc/ppc.c
ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
index ceddbb8f99..c25e32b029 100644
--- a/hw/vfio/Makefile.objs
+++ b/hw/vfio/Makefile.objs
@@ -4,4 +4,5 @@ obj-$(CONFIG_PCI) += pci.o pci-quirks.o
obj-$(CONFIG_SOFTMMU) += platform.o
obj-$(CONFIG_SOFTMMU) += calxeda-xgmac.o
obj-$(CONFIG_SOFTMMU) += amd-xgbe.o
+obj-$(CONFIG_SOFTMMU) += spapr.o
endif
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 7be638e0e3..f3c0522e7e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -28,6 +28,7 @@
#include "exec/memory.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
+#include "qemu/range.h"
#include "sysemu/kvm.h"
#ifdef CONFIG_KVM
#include "linux/kvm.h"
@@ -241,6 +242,44 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
return -errno;
}
+static void vfio_host_win_add(VFIOContainer *container,
+ hwaddr min_iova, hwaddr max_iova,
+ uint64_t iova_pgsizes)
+{
+ VFIOHostDMAWindow *hostwin;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (ranges_overlap(hostwin->min_iova,
+ hostwin->max_iova - hostwin->min_iova + 1,
+ min_iova,
+ max_iova - min_iova + 1)) {
+            hw_error("%s: Overlapping host DMA windows are not supported",
+                     __func__);
+ }
+ }
+
+ hostwin = g_malloc0(sizeof(*hostwin));
+
+ hostwin->min_iova = min_iova;
+ hostwin->max_iova = max_iova;
+ hostwin->iova_pgsizes = iova_pgsizes;
+ QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
+}
+
+static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
+ hwaddr max_iova)
+{
+ VFIOHostDMAWindow *hostwin;
+
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
+ QLIST_REMOVE(hostwin, hostwin_next);
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
return (!memory_region_is_ram(section->mr) &&
@@ -329,6 +368,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
Int128 llend, llsize;
void *vaddr;
int ret;
+ VFIOHostDMAWindow *hostwin;
+ bool hostwin_found;
if (vfio_listener_skipped_section(section)) {
trace_vfio_listener_region_add_skip(
@@ -354,7 +395,40 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
end = int128_get64(int128_sub(llend, int128_one()));
- if ((iova < container->min_iova) || (end > container->max_iova)) {
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ VFIOHostDMAWindow *hostwin;
+ hwaddr pgsize = 0;
+
+ /* For now intersections are not allowed, we may relax this later */
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (ranges_overlap(hostwin->min_iova,
+ hostwin->max_iova - hostwin->min_iova + 1,
+ section->offset_within_address_space,
+ int128_get64(section->size))) {
+ ret = -1;
+ goto fail;
+ }
+ }
+
+ ret = vfio_spapr_create_window(container, section, &pgsize);
+ if (ret) {
+ goto fail;
+ }
+
+ vfio_host_win_add(container, section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1, pgsize);
+ }
+
+ hostwin_found = false;
+ QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
+ if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
+ hostwin_found = true;
+ break;
+ }
+ }
+
+ if (!hostwin_found) {
error_report("vfio: IOMMU container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
container, iova, end);
@@ -369,10 +443,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
trace_vfio_listener_region_add_iommu(iova, end);
/*
- * FIXME: We should do some checking to see if the
- * capabilities of the host VFIO IOMMU are adequate to model
- * the guest IOMMU
- *
* FIXME: For VFIO iommu types which have KVM acceleration to
* avoid bouncing all map/unmaps through qemu this way, this
* would be the right place to wire that up (tell the KVM
@@ -493,6 +563,18 @@ static void vfio_listener_region_del(MemoryListener *listener,
"0x%"HWADDR_PRIx") = %d (%m)",
container, iova, int128_get64(llsize), ret);
}
+
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ vfio_spapr_remove_window(container,
+ section->offset_within_address_space);
+ if (vfio_host_win_del(container,
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(section->size) - 1) < 0) {
+ hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
+ __func__, section->offset_within_address_space);
+ }
+ }
}
static const MemoryListener vfio_memory_listener = {
@@ -503,6 +585,9 @@ static const MemoryListener vfio_memory_listener = {
static void vfio_listener_release(VFIOContainer *container)
{
memory_listener_unregister(&container->listener);
+ if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+ memory_listener_unregister(&container->prereg_listener);
+ }
}
static struct vfio_info_cap_header *
@@ -861,8 +946,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
goto free_container_exit;
}
- ret = ioctl(fd, VFIO_SET_IOMMU,
- v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU);
+ container->iommu_type = v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU;
+ ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type);
if (ret) {
error_report("vfio: failed to set iommu for container: %m");
ret = -errno;
@@ -876,19 +961,18 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
* existing Type1 IOMMUs generally support any IOVA we're
* going to actually try in practice.
*/
- container->min_iova = 0;
- container->max_iova = (hwaddr)-1;
-
- /* Assume just 4K IOVA page size */
- container->iova_pgsizes = 0x1000;
info.argsz = sizeof(info);
ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
/* Ignore errors */
- if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
- container->iova_pgsizes = info.iova_pgsizes;
+ if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
+ /* Assume 4k IOVA page size */
+ info.iova_pgsizes = 4096;
}
- } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
+ vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes);
+ } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) ||
+ ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU)) {
struct vfio_iommu_spapr_tce_info info;
+ bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU);
ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
if (ret) {
@@ -896,7 +980,9 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
ret = -errno;
goto free_container_exit;
}
- ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
+ container->iommu_type =
+ v2 ? VFIO_SPAPR_TCE_v2_IOMMU : VFIO_SPAPR_TCE_IOMMU;
+ ret = ioctl(fd, VFIO_SET_IOMMU, container->iommu_type);
if (ret) {
error_report("vfio: failed to set iommu for container: %m");
ret = -errno;
@@ -908,30 +994,54 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
* when container fd is closed so we do not call it explicitly
* in this file.
*/
- ret = ioctl(fd, VFIO_IOMMU_ENABLE);
- if (ret) {
- error_report("vfio: failed to enable container: %m");
- ret = -errno;
- goto free_container_exit;
+ if (!v2) {
+ ret = ioctl(fd, VFIO_IOMMU_ENABLE);
+ if (ret) {
+ error_report("vfio: failed to enable container: %m");
+ ret = -errno;
+ goto free_container_exit;
+ }
+ } else {
+ container->prereg_listener = vfio_prereg_listener;
+
+ memory_listener_register(&container->prereg_listener,
+ &address_space_memory);
+ if (container->error) {
+ memory_listener_unregister(&container->prereg_listener);
+ error_report("vfio: RAM memory listener initialization failed for container");
+ goto free_container_exit;
+ }
}
- /*
- * This only considers the host IOMMU's 32-bit window. At
- * some point we need to add support for the optional 64-bit
- * window and dynamic windows
- */
info.argsz = sizeof(info);
ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
if (ret) {
error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m");
ret = -errno;
+ if (v2) {
+ memory_listener_unregister(&container->prereg_listener);
+ }
goto free_container_exit;
}
- container->min_iova = info.dma32_window_start;
- container->max_iova = container->min_iova + info.dma32_window_size - 1;
- /* Assume just 4K IOVA pages for now */
- container->iova_pgsizes = 0x1000;
+ if (v2) {
+ /*
+         * A newly created container comes with a default DMA window.
+         * To keep region_add/del simple, remove it now and let the
+         * iommu_listener callbacks create/remove windows as needed.
+ */
+ ret = vfio_spapr_remove_window(container, info.dma32_window_start);
+ if (ret) {
+ goto free_container_exit;
+ }
+ } else {
+ /* The default table uses 4K pages */
+ vfio_host_win_add(container, info.dma32_window_start,
+ info.dma32_window_start +
+ info.dma32_window_size - 1,
+ 0x1000);
+ }
} else {
error_report("vfio: No available IOMMU models");
ret = -EINVAL;
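
(Annotation: vfio_host_win_add() above refuses to register overlapping host
DMA windows via ranges_overlap(). A self-contained sketch of that interval
test, using the same inclusive [min_iova, max_iova] convention as
VFIOHostDMAWindow; the window addresses below match this series' defaults and
are illustrative.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t hwaddr;

    /* Equivalent of ranges_overlap() from qemu/range.h: two (start, length)
     * ranges overlap unless one ends before the other begins. */
    static bool ranges_overlap(hwaddr first1, hwaddr len1,
                               hwaddr first2, hwaddr len2)
    {
        hwaddr last1 = first1 + len1 - 1;
        hwaddr last2 = first2 + len2 - 1;

        return !(last1 < first2 || last2 < first1);
    }

    int main(void)
    {
        /* Default 32-bit window: IOVA 0 .. 1GiB-1 */
        hwaddr min1 = 0, max1 = 0x40000000ULL - 1;
        /* 64-bit window at the dma64_win_addr default (1ULL << 59), 2GiB */
        hwaddr min2 = 0x800000000000000ULL, max2 = min2 + 0x80000000ULL - 1;

        /* Disjoint windows coexist... */
        assert(!ranges_overlap(min1, max1 - min1 + 1, min2, max2 - min2 + 1));
        /* ...but a window starting inside an existing one is rejected */
        assert(ranges_overlap(min1, max1 - min1 + 1, 0x1000, 0x1000));
        return 0;
    }
)
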
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
new file mode 100644
index 0000000000..0af342332c
--- /dev/null
+++ b/hw/vfio/spapr.c
@@ -0,0 +1,210 @@
+/*
+ * DMA memory preregistration
+ *
+ * Authors:
+ * Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+
+#include "hw/vfio/vfio-common.h"
+#include "hw/hw.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
+{
+ if (memory_region_is_iommu(section->mr)) {
+ hw_error("Cannot possibly preregister IOMMU memory");
+ }
+
+ return !memory_region_is_ram(section->mr) ||
+ memory_region_is_skip_dump(section->mr);
+}
+
+static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
+{
+ return memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region +
+ (gpa - section->offset_within_address_space);
+}
+
+static void vfio_prereg_listener_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIOContainer *container = container_of(listener, VFIOContainer,
+ prereg_listener);
+ const hwaddr gpa = section->offset_within_address_space;
+ hwaddr end;
+ int ret;
+ hwaddr page_mask = qemu_real_host_page_mask;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0,
+ };
+
+ if (vfio_prereg_listener_skipped_section(section)) {
+ trace_vfio_prereg_listener_region_add_skip(
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
+ return;
+ }
+
+ if (unlikely((section->offset_within_address_space & ~page_mask) ||
+ (section->offset_within_region & ~page_mask) ||
+ (int128_get64(section->size) & ~page_mask))) {
+ error_report("%s received unaligned region", __func__);
+ return;
+ }
+
+ end = section->offset_within_address_space + int128_get64(section->size);
+ if (gpa >= end) {
+ return;
+ }
+
+ memory_region_ref(section->mr);
+
+ reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
+ reg.size = end - gpa;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
+ if (ret) {
+ /*
+ * On the initfn path, store the first error in the container so we
+ * can gracefully fail. Runtime, there's not much we can do other
+ * than throw a hardware error.
+ */
+ if (!container->initialized) {
+ if (!container->error) {
+ container->error = ret;
+ }
+ } else {
+ hw_error("vfio: Memory registering failed, unable to continue");
+ }
+ }
+}
+
+static void vfio_prereg_listener_region_del(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ VFIOContainer *container = container_of(listener, VFIOContainer,
+ prereg_listener);
+ const hwaddr gpa = section->offset_within_address_space;
+ hwaddr end;
+ int ret;
+ hwaddr page_mask = qemu_real_host_page_mask;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0,
+ };
+
+ if (vfio_prereg_listener_skipped_section(section)) {
+ trace_vfio_prereg_listener_region_del_skip(
+ section->offset_within_address_space,
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
+ return;
+ }
+
+ if (unlikely((section->offset_within_address_space & ~page_mask) ||
+ (section->offset_within_region & ~page_mask) ||
+ (int128_get64(section->size) & ~page_mask))) {
+ error_report("%s received unaligned region", __func__);
+ return;
+ }
+
+ end = section->offset_within_address_space + int128_get64(section->size);
+ if (gpa >= end) {
+ return;
+ }
+
+ reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
+ reg.size = end - gpa;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+ trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
+}
+
+const MemoryListener vfio_prereg_listener = {
+ .region_add = vfio_prereg_listener_region_add,
+ .region_del = vfio_prereg_listener_region_del,
+};
+
+int vfio_spapr_create_window(VFIOContainer *container,
+ MemoryRegionSection *section,
+ hwaddr *pgsize)
+{
+ int ret;
+ unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr);
+ unsigned entries, pages;
+ struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
+
+ /*
+ * FIXME: For VFIO iommu types which have KVM acceleration to
+ * avoid bouncing all map/unmaps through qemu this way, this
+ * would be the right place to wire that up (tell the KVM
+ * device emulation the VFIO iommu handles to use).
+ */
+ create.window_size = int128_get64(section->size);
+ create.page_shift = ctz64(pagesize);
+ /*
+ * SPAPR host supports multilevel TCE tables, there is some
+ * heuristic to decide how many levels we want for our table:
+ * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
+ */
+ entries = create.window_size >> create.page_shift;
+ pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
+ pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */
+ create.levels = ctz64(pages) / 6 + 1;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+ if (ret) {
+ error_report("Failed to create a window, ret = %d (%m)", ret);
+ return -errno;
+ }
+
+ if (create.start_addr != section->offset_within_address_space) {
+ vfio_spapr_remove_window(container, create.start_addr);
+
+ error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
+ section->offset_within_address_space,
+ (uint64_t)create.start_addr);
+ return -EINVAL;
+ }
+ trace_vfio_spapr_create_window(create.page_shift,
+ create.window_size,
+ create.start_addr);
+ *pgsize = pagesize;
+
+ return 0;
+}
+
+int vfio_spapr_remove_window(VFIOContainer *container,
+ hwaddr offset_within_address_space)
+{
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ .start_addr = offset_within_address_space,
+ };
+ int ret;
+
+ ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ error_report("Failed to remove window at %"PRIx64,
+ (uint64_t)remove.start_addr);
+ return -errno;
+ }
+
+ trace_vfio_spapr_remove_window(offset_within_address_space);
+
+ return 0;
+}
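
(Annotation: a worked example of the geometry vfio_spapr_create_window() hands
to VFIO_IOMMU_SPAPR_TCE_CREATE, assuming a 1GiB guest window backed by 64KiB
IOMMU pages. The code only redoes the arithmetic shown above; it does not
issue the ioctl, and the numbers are illustrative.

    #include <assert.h>
    #include <stdint.h>

    /* ctz64() as provided by qemu/host-utils.h; __builtin_ctzll suffices
     * for the non-zero inputs used here. */
    static unsigned ctz64(uint64_t v)
    {
        return __builtin_ctzll(v);
    }

    int main(void)
    {
        uint64_t window_size = 1ULL << 30;  /* 1GiB DMA window */
        uint64_t pagesize    = 1ULL << 16;  /* 64KiB IOMMU page */

        unsigned page_shift  = ctz64(pagesize);            /* 16 */
        uint64_t entries     = window_size >> page_shift;  /* 16384 TCEs */
        uint64_t table_bytes = entries * sizeof(uint64_t); /* 128KiB */

        assert(page_shift == 16);
        assert(entries == 16384);
        assert(table_bytes == 128 * 1024);
        return 0;
    }
)
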
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index a768fb54ec..4bb7690c46 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -115,3 +115,11 @@ vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d
vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING"
vfio_platform_start_level_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d"
vfio_platform_start_edge_irqfd_injection(int index, int fd) "IRQ index=%d, fd = %d"
+
+# hw/vfio/spapr.c
+vfio_prereg_listener_region_add_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64
+vfio_prereg_listener_region_del_skip(uint64_t start, uint64_t end) "%"PRIx64" - %"PRIx64
+vfio_prereg_register(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
+vfio_prereg_unregister(uint64_t va, uint64_t size, int ret) "va=%"PRIx64" size=%"PRIx64" ret=%d"
+vfio_spapr_create_window(int ps, uint64_t ws, uint64_t off) "pageshift=0x%x winsize=0x%"PRIx64" offset=0x%"PRIx64
+vfio_spapr_remove_window(uint64_t off) "offset=%"PRIx64
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index 288b89c04a..193631d2dc 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -32,6 +32,8 @@
#define SPAPR_PCI_HOST_BRIDGE(obj) \
OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE)
+#define SPAPR_PCI_DMA_MAX_WINDOWS 2
+
typedef struct sPAPRPHBState sPAPRPHBState;
typedef struct spapr_pci_msi {
@@ -56,7 +58,7 @@ struct sPAPRPHBState {
hwaddr mem_win_addr, mem_win_size, io_win_addr, io_win_size;
MemoryRegion memwindow, iowindow, msiwindow;
- uint32_t dma_liobn;
+ uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS];
hwaddr dma_win_addr, dma_win_size;
AddressSpace iommu_as;
MemoryRegion iommu_root;
@@ -71,6 +73,10 @@ struct sPAPRPHBState {
spapr_pci_msi_mig *msi_devs;
QLIST_ENTRY(sPAPRPHBState) list;
+
+ bool ddw_enabled;
+ uint64_t page_size_mask;
+ uint64_t dma64_win_addr;
};
#define SPAPR_PCI_MAX_INDEX 255
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 49325555d3..2e2dd14c30 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -416,6 +416,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
#define RTAS_OUT_NOT_AUTHORIZED -9002
#define RTAS_OUT_SYSPARM_PARAM_ERROR -9999
+/* DDW pagesize mask values from ibm,query-pe-dma-window */
+#define RTAS_DDW_PGSIZE_4K 0x01
+#define RTAS_DDW_PGSIZE_64K 0x02
+#define RTAS_DDW_PGSIZE_16M 0x04
+#define RTAS_DDW_PGSIZE_32M 0x08
+#define RTAS_DDW_PGSIZE_64M 0x10
+#define RTAS_DDW_PGSIZE_128M 0x20
+#define RTAS_DDW_PGSIZE_256M 0x40
+#define RTAS_DDW_PGSIZE_16G 0x80
+
/* RTAS tokens */
#define RTAS_TOKEN_BASE 0x2000
@@ -457,8 +467,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi);
#define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23)
#define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24)
#define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25)
+#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27)
+#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28)
+#define RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29)
-#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26)
+#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A)
/* RTAS ibm,get-system-parameter token values */
#define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 0610377789..07f7188df4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -73,6 +73,8 @@ typedef struct VFIOContainer {
VFIOAddressSpace *space;
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
MemoryListener listener;
+ MemoryListener prereg_listener;
+ unsigned iommu_type;
int error;
bool initialized;
/*
@@ -80,9 +82,8 @@ typedef struct VFIOContainer {
* contiguous IOVA window. We may need to generalize that in
* future
*/
- hwaddr min_iova, max_iova;
- uint64_t iova_pgsizes;
QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
+ QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_ENTRY(VFIOContainer) next;
} VFIOContainer;
@@ -95,6 +96,13 @@ typedef struct VFIOGuestIOMMU {
QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
} VFIOGuestIOMMU;
+typedef struct VFIOHostDMAWindow {
+ hwaddr min_iova;
+ hwaddr max_iova;
+ uint64_t iova_pgsizes;
+ QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
+} VFIOHostDMAWindow;
+
typedef struct VFIODeviceOps VFIODeviceOps;
typedef struct VFIODevice {
@@ -158,4 +166,12 @@ int vfio_get_region_info(VFIODevice *vbasedev, int index,
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
uint32_t subtype, struct vfio_region_info **info);
#endif
+extern const MemoryListener vfio_prereg_listener;
+
+int vfio_spapr_create_window(VFIOContainer *container,
+ MemoryRegionSection *section,
+ hwaddr *pgsize);
+int vfio_spapr_remove_window(VFIOContainer *container,
+ hwaddr offset_within_address_space);
+
#endif /* !HW_VFIO_VFIO_COMMON_H */
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index af73bced9f..2666a3f80d 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -1047,6 +1047,8 @@ struct CPUPPCState {
uint64_t insns_flags2;
#if defined(TARGET_PPC64)
struct ppc_segment_page_sizes sps;
+ ppc_slb_t vrma_slb;
+ target_ulong rmls;
bool ci_large_pages;
#endif
diff --git a/target-ppc/fpu_helper.c b/target-ppc/fpu_helper.c
index 4ef893be2c..d9795d04d0 100644
--- a/target-ppc/fpu_helper.c
+++ b/target-ppc/fpu_helper.c
@@ -2689,19 +2689,19 @@ void helper_##op(CPUPPCState *env, uint32_t opcode) \
helper_float_check_status(env); \
}
-VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_nearest_even, 1)
+VSX_ROUND(xsrdpi, 1, float64, VsrD(0), float_round_ties_away, 1)
VSX_ROUND(xsrdpic, 1, float64, VsrD(0), FLOAT_ROUND_CURRENT, 1)
VSX_ROUND(xsrdpim, 1, float64, VsrD(0), float_round_down, 1)
VSX_ROUND(xsrdpip, 1, float64, VsrD(0), float_round_up, 1)
VSX_ROUND(xsrdpiz, 1, float64, VsrD(0), float_round_to_zero, 1)
-VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_nearest_even, 0)
+VSX_ROUND(xvrdpi, 2, float64, VsrD(i), float_round_ties_away, 0)
VSX_ROUND(xvrdpic, 2, float64, VsrD(i), FLOAT_ROUND_CURRENT, 0)
VSX_ROUND(xvrdpim, 2, float64, VsrD(i), float_round_down, 0)
VSX_ROUND(xvrdpip, 2, float64, VsrD(i), float_round_up, 0)
VSX_ROUND(xvrdpiz, 2, float64, VsrD(i), float_round_to_zero, 0)
-VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_nearest_even, 0)
+VSX_ROUND(xvrspi, 4, float32, VsrW(i), float_round_ties_away, 0)
VSX_ROUND(xvrspic, 4, float32, VsrW(i), FLOAT_ROUND_CURRENT, 0)
VSX_ROUND(xvrspim, 4, float32, VsrW(i), float_round_down, 0)
VSX_ROUND(xvrspip, 4, float32, VsrW(i), float_round_up, 0)
diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
index 3b1357a648..82c2186bcf 100644
--- a/target-ppc/mmu-hash64.c
+++ b/target-ppc/mmu-hash64.c
@@ -450,31 +450,47 @@ void ppc_hash64_stop_access(PowerPCCPU *cpu, uint64_t token)
}
}
-/* Returns the effective page shift or 0. MPSS isn't supported yet so
- * this will always be the slb_pshift or 0
- */
-static uint32_t ppc_hash64_pte_size_decode(uint64_t pte1, uint32_t slb_pshift)
+static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps,
+ uint64_t pte0, uint64_t pte1)
{
- switch (slb_pshift) {
- case 12:
+ int i;
+
+ if (!(pte0 & HPTE64_V_LARGE)) {
+ if (sps->page_shift != 12) {
+ /* 4kiB page in a non 4kiB segment */
+ return 0;
+ }
+ /* Normal 4kiB page */
return 12;
- case 16:
- if ((pte1 & 0xf000) == 0x1000) {
- return 16;
+ }
+
+ for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
+ const struct ppc_one_page_size *ps = &sps->enc[i];
+ uint64_t mask;
+
+ if (!ps->page_shift) {
+ break;
}
- return 0;
- case 24:
- if ((pte1 & 0xff000) == 0) {
- return 24;
+
+ if (ps->page_shift == 12) {
+ /* L bit is set so this can't be a 4kiB page */
+ continue;
+ }
+
+ mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN;
+
+ if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) {
+ return ps->page_shift;
}
- return 0;
}
- return 0;
+
+ return 0; /* Bad page size encoding */
}
static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash,
- uint32_t slb_pshift, bool secondary,
- target_ulong ptem, ppc_hash_pte64_t *pte)
+ const struct ppc_one_seg_page_size *sps,
+ target_ulong ptem,
+ ppc_hash_pte64_t *pte, unsigned *pshift)
{
CPUPPCState *env = &cpu->env;
int i;
@@ -491,11 +507,17 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash,
pte0 = ppc_hash64_load_hpte0(cpu, token, i);
pte1 = ppc_hash64_load_hpte1(cpu, token, i);
- if ((pte0 & HPTE64_V_VALID)
- && (secondary == !!(pte0 & HPTE64_V_SECONDARY))
- && HPTE64_V_COMPARE(pte0, ptem)) {
- uint32_t pshift = ppc_hash64_pte_size_decode(pte1, slb_pshift);
- if (pshift == 0) {
+ /* This compares V, B, H (secondary) and the AVPN */
+ if (HPTE64_V_COMPARE(pte0, ptem)) {
+ *pshift = hpte_page_shift(sps, pte0, pte1);
+ /*
+ * If there is no match, ignore the PTE, it could simply
+ * be for a different segment size encoding and the
+ * architecture specifies we should not match. Linux will
+ * potentially leave behind PTEs for the wrong base page
+ * size when demoting segments.
+ */
+ if (*pshift == 0) {
continue;
}
/* We don't do anything with pshift yet as qemu TLB only deals
@@ -516,31 +538,40 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU *cpu, hwaddr hash,
static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu,
ppc_slb_t *slb, target_ulong eaddr,
- ppc_hash_pte64_t *pte)
+ ppc_hash_pte64_t *pte, unsigned *pshift)
{
CPUPPCState *env = &cpu->env;
hwaddr pte_offset;
hwaddr hash;
uint64_t vsid, epnmask, epn, ptem;
+ const struct ppc_one_seg_page_size *sps = slb->sps;
/* The SLB store path should prevent any bad page size encodings
* getting in there, so: */
- assert(slb->sps);
+ assert(sps);
+
+ /* If ISL is set in LPCR we need to clamp the page size to 4K */
+ if (env->spr[SPR_LPCR] & LPCR_ISL) {
+ /* We assume that when using TCG, 4k is first entry of SPS */
+ sps = &env->sps.sps[0];
+ assert(sps->page_shift == 12);
+ }
- epnmask = ~((1ULL << slb->sps->page_shift) - 1);
+ epnmask = ~((1ULL << sps->page_shift) - 1);
if (slb->vsid & SLB_VSID_B) {
/* 1TB segment */
vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT_1T;
epn = (eaddr & ~SEGMENT_MASK_1T) & epnmask;
- hash = vsid ^ (vsid << 25) ^ (epn >> slb->sps->page_shift);
+ hash = vsid ^ (vsid << 25) ^ (epn >> sps->page_shift);
} else {
/* 256M segment */
vsid = (slb->vsid & SLB_VSID_VSID) >> SLB_VSID_SHIFT;
epn = (eaddr & ~SEGMENT_MASK_256M) & epnmask;
- hash = vsid ^ (epn >> slb->sps->page_shift);
+ hash = vsid ^ (epn >> sps->page_shift);
}
ptem = (slb->vsid & SLB_VSID_PTEM) | ((epn >> 16) & HPTE64_V_AVPN);
+ ptem |= HPTE64_V_VALID;
/* Page address translation */
qemu_log_mask(CPU_LOG_MMU,
@@ -554,70 +585,30 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU *cpu,
" vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx
" hash=" TARGET_FMT_plx "\n",
env->htab_base, env->htab_mask, vsid, ptem, hash);
- pte_offset = ppc_hash64_pteg_search(cpu, hash, slb->sps->page_shift,
- 0, ptem, pte);
+ pte_offset = ppc_hash64_pteg_search(cpu, hash, sps, ptem, pte, pshift);
if (pte_offset == -1) {
/* Secondary PTEG lookup */
+ ptem |= HPTE64_V_SECONDARY;
qemu_log_mask(CPU_LOG_MMU,
"1 htab=" TARGET_FMT_plx "/" TARGET_FMT_plx
" vsid=" TARGET_FMT_lx " api=" TARGET_FMT_lx
" hash=" TARGET_FMT_plx "\n", env->htab_base,
env->htab_mask, vsid, ptem, ~hash);
- pte_offset = ppc_hash64_pteg_search(cpu, ~hash, slb->sps->page_shift, 1,
- ptem, pte);
+ pte_offset = ppc_hash64_pteg_search(cpu, ~hash, sps, ptem, pte, pshift);
}
return pte_offset;
}
-static unsigned hpte_page_shift(const struct ppc_one_seg_page_size *sps,
- uint64_t pte0, uint64_t pte1)
-{
- int i;
-
- if (!(pte0 & HPTE64_V_LARGE)) {
- if (sps->page_shift != 12) {
- /* 4kiB page in a non 4kiB segment */
- return 0;
- }
- /* Normal 4kiB page */
- return 12;
- }
-
- for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
- const struct ppc_one_page_size *ps = &sps->enc[i];
- uint64_t mask;
-
- if (!ps->page_shift) {
- break;
- }
-
- if (ps->page_shift == 12) {
- /* L bit is set so this can't be a 4kiB page */
- continue;
- }
-
- mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN;
-
- if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) {
- return ps->page_shift;
- }
- }
-
- return 0; /* Bad page size encoding */
-}
-
unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
- uint64_t pte0, uint64_t pte1,
- unsigned *seg_page_shift)
+ uint64_t pte0, uint64_t pte1)
{
CPUPPCState *env = &cpu->env;
int i;
if (!(pte0 & HPTE64_V_LARGE)) {
- *seg_page_shift = 12;
return 12;
}
@@ -635,12 +626,10 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
shift = hpte_page_shift(sps, pte0, pte1);
if (shift) {
- *seg_page_shift = sps->page_shift;
return shift;
}
}
- *seg_page_shift = 0;
return 0;
}
@@ -701,11 +690,52 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
assert((rwx == 0) || (rwx == 1) || (rwx == 2));
+ /* Note on LPCR usage: 970 uses HID4, but our special variant
+ * of store_spr copies relevant fields into env->spr[SPR_LPCR].
+ * Similarily we filter unimplemented bits when storing into
+ * LPCR depending on the MMU version. This code can thus just
+ * use the LPCR "as-is".
+ */
+
/* 1. Handle real mode accesses */
if (((rwx == 2) && (msr_ir == 0)) || ((rwx != 2) && (msr_dr == 0))) {
- /* Translation is off */
- /* In real mode the top 4 effective address bits are ignored */
+ /* Translation is supposedly "off" */
+ /* In real mode the top 4 effective address bits are (mostly) ignored */
raddr = eaddr & 0x0FFFFFFFFFFFFFFFULL;
+
+ /* In HV mode, add HRMOR if top EA bit is clear */
+ if (msr_hv || !env->has_hv_mode) {
+ if (!(eaddr >> 63)) {
+ raddr |= env->spr[SPR_HRMOR];
+ }
+ } else {
+ /* Otherwise, check VPM for RMA vs VRMA */
+ if (env->spr[SPR_LPCR] & LPCR_VPM0) {
+ slb = &env->vrma_slb;
+ if (slb->sps) {
+ goto skip_slb_search;
+ }
+ /* Not much else to do here */
+ cs->exception_index = POWERPC_EXCP_MCHECK;
+ env->error_code = 0;
+ return 1;
+ } else if (raddr < env->rmls) {
+ /* RMA. Check bounds in RMLS */
+ raddr |= env->spr[SPR_RMOR];
+ } else {
+                /* The access failed, generate the appropriate interrupt */
+ if (rwx == 2) {
+ ppc_hash64_set_isi(cs, env, 0x08000000);
+ } else {
+ dsisr = 0x08000000;
+ if (rwx == 1) {
+ dsisr |= 0x02000000;
+ }
+ ppc_hash64_set_dsi(cs, env, eaddr, dsisr);
+ }
+ return 1;
+ }
+ }
tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK,
PAGE_READ | PAGE_WRITE | PAGE_EXEC, mmu_idx,
TARGET_PAGE_SIZE);
@@ -714,7 +744,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
/* 2. Translation is on, so look up the SLB */
slb = slb_lookup(cpu, eaddr);
-
if (!slb) {
if (rwx == 2) {
cs->exception_index = POWERPC_EXCP_ISEG;
@@ -727,6 +756,8 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
return 1;
}
+skip_slb_search:
+
/* 3. Check for segment level no-execute violation */
if ((rwx == 2) && (slb->vsid & SLB_VSID_N)) {
ppc_hash64_set_isi(cs, env, 0x10000000);
@@ -734,7 +765,7 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
}
/* 4. Locate the PTE in the hash table */
- pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte);
+ pte_offset = ppc_hash64_htab_lookup(cpu, slb, eaddr, &pte, &apshift);
if (pte_offset == -1) {
dsisr = 0x40000000;
if (rwx == 2) {
@@ -750,18 +781,6 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr,
qemu_log_mask(CPU_LOG_MMU,
"found PTE at offset %08" HWADDR_PRIx "\n", pte_offset);
- /* Validate page size encoding */
- apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1);
- if (!apshift) {
- error_report("Bad page size encoding in HPTE 0x%"PRIx64" - 0x%"PRIx64
- " @ 0x%"HWADDR_PRIx, pte.pte0, pte.pte1, pte_offset);
- /* Not entirely sure what the right action here, but machine
- * check seems reasonable */
- cs->exception_index = POWERPC_EXCP_MCHECK;
- env->error_code = 0;
- return 1;
- }
-
/* 5. Check access permissions */
pp_prot = ppc_hash64_pte_prot(cpu, slb, pte);
@@ -821,27 +840,41 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong addr)
{
CPUPPCState *env = &cpu->env;
ppc_slb_t *slb;
- hwaddr pte_offset;
+ hwaddr pte_offset, raddr;
ppc_hash_pte64_t pte;
unsigned apshift;
+ /* Handle real mode */
if (msr_dr == 0) {
/* In real mode the top 4 effective address bits are ignored */
- return addr & 0x0FFFFFFFFFFFFFFFULL;
- }
+ raddr = addr & 0x0FFFFFFFFFFFFFFFULL;
- slb = slb_lookup(cpu, addr);
- if (!slb) {
- return -1;
- }
+ /* In HV mode, add HRMOR if top EA bit is clear */
+ if ((msr_hv || !env->has_hv_mode) && !(addr >> 63)) {
+ return raddr | env->spr[SPR_HRMOR];
+ }
- pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte);
- if (pte_offset == -1) {
- return -1;
+ /* Otherwise, check VPM for RMA vs VRMA */
+ if (env->spr[SPR_LPCR] & LPCR_VPM0) {
+ slb = &env->vrma_slb;
+ if (!slb->sps) {
+ return -1;
+ }
+ } else if (raddr < env->rmls) {
+ /* RMA. Check bounds in RMLS */
+ return raddr | env->spr[SPR_RMOR];
+ } else {
+ return -1;
+ }
+ } else {
+ slb = slb_lookup(cpu, addr);
+ if (!slb) {
+ return -1;
+ }
}
- apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1);
- if (!apshift) {
+ pte_offset = ppc_hash64_htab_lookup(cpu, slb, addr, &pte, &apshift);
+ if (pte_offset == -1) {
return -1;
}
@@ -883,6 +916,90 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
tlb_flush(CPU(cpu), 1);
}
+void ppc_hash64_update_rmls(CPUPPCState *env)
+{
+ uint64_t lpcr = env->spr[SPR_LPCR];
+
+ /*
+ * This is the full 4 bits encoding of POWER8. Previous
+ * CPUs only support a subset of these but the filtering
+ * is done when writing LPCR
+ */
+ switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) {
+ case 0x8: /* 32MB */
+ env->rmls = 0x2000000ull;
+ break;
+ case 0x3: /* 64MB */
+ env->rmls = 0x4000000ull;
+ break;
+ case 0x7: /* 128MB */
+ env->rmls = 0x8000000ull;
+ break;
+ case 0x4: /* 256MB */
+ env->rmls = 0x10000000ull;
+ break;
+ case 0x2: /* 1GB */
+ env->rmls = 0x40000000ull;
+ break;
+ case 0x1: /* 16GB */
+ env->rmls = 0x400000000ull;
+ break;
+ default:
+ /* What to do here ??? */
+ env->rmls = 0;
+ }
+}
+
+void ppc_hash64_update_vrma(CPUPPCState *env)
+{
+ const struct ppc_one_seg_page_size *sps = NULL;
+ target_ulong esid, vsid, lpcr;
+ ppc_slb_t *slb = &env->vrma_slb;
+ uint32_t vrmasd;
+ int i;
+
+ /* First clear it */
+ slb->esid = slb->vsid = 0;
+ slb->sps = NULL;
+
+ /* Is VRMA enabled ? */
+ lpcr = env->spr[SPR_LPCR];
+ if (!(lpcr & LPCR_VPM0)) {
+ return;
+ }
+
+ /* Make one up. Mostly ignore the ESID which will not be
+ * needed for translation
+ */
+ vsid = SLB_VSID_VRMA;
+ vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT;
+ vsid |= (vrmasd << 4) & (SLB_VSID_L | SLB_VSID_LP);
+ esid = SLB_ESID_V;
+
+ for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
+ const struct ppc_one_seg_page_size *sps1 = &env->sps.sps[i];
+
+ if (!sps1->page_shift) {
+ break;
+ }
+
+ if ((vsid & SLB_VSID_LLP_MASK) == sps1->slb_enc) {
+ sps = sps1;
+ break;
+ }
+ }
+
+ if (!sps) {
+ error_report("Bad page size encoding esid 0x"TARGET_FMT_lx
+ " vsid 0x"TARGET_FMT_lx, esid, vsid);
+ return;
+ }
+
+ slb->vsid = vsid;
+ slb->esid = esid;
+ slb->sps = sps;
+}
+
void helper_store_lpcr(CPUPPCState *env, target_ulong val)
{
uint64_t lpcr = 0;
@@ -938,4 +1055,6 @@ void helper_store_lpcr(CPUPPCState *env, target_ulong val)
;
}
env->spr[SPR_LPCR] = lpcr;
+ ppc_hash64_update_rmls(env);
+ ppc_hash64_update_vrma(env);
}
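
(Annotation: a standalone sketch of the LPCR:RMLS decode in
ppc_hash64_update_rmls() above. LPCR_RMLS_SHIFT is defined in cpu.h, outside
this diff; the value 26 used here is an assumption for illustration.

    #include <assert.h>
    #include <stdint.h>

    /* Assumed field position of LPCR:RMLS (defined in cpu.h, not here) */
    #define LPCR_RMLS_SHIFT 26
    #define LPCR_RMLS       (0xFULL << LPCR_RMLS_SHIFT)

    /* Same POWER8 encoding table as ppc_hash64_update_rmls() */
    static uint64_t rmls_bytes(uint64_t lpcr)
    {
        switch ((lpcr & LPCR_RMLS) >> LPCR_RMLS_SHIFT) {
        case 0x8: return 0x2000000ULL;   /* 32MB  */
        case 0x3: return 0x4000000ULL;   /* 64MB  */
        case 0x7: return 0x8000000ULL;   /* 128MB */
        case 0x4: return 0x10000000ULL;  /* 256MB */
        case 0x2: return 0x40000000ULL;  /* 1GB   */
        case 0x1: return 0x400000000ULL; /* 16GB  */
        default:  return 0;              /* reserved encoding */
        }
    }

    int main(void)
    {
        /* cpu_ppc_set_papr() below sets RMLS = 1, the 16GB maximum */
        assert(rmls_bytes(1ULL << LPCR_RMLS_SHIFT) == 0x400000000ULL);
        return 0;
    }
)
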
diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
index 6423b9f791..3a7476b30a 100644
--- a/target-ppc/mmu-hash64.h
+++ b/target-ppc/mmu-hash64.h
@@ -17,8 +17,9 @@ void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
target_ulong pte_index,
target_ulong pte0, target_ulong pte1);
unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
- uint64_t pte0, uint64_t pte1,
- unsigned *seg_page_shift);
+ uint64_t pte0, uint64_t pte1);
+void ppc_hash64_update_vrma(CPUPPCState *env);
+void ppc_hash64_update_rmls(CPUPPCState *env);
#endif
/*
@@ -37,6 +38,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
#define SLB_VSID_B_256M 0x0000000000000000ULL
#define SLB_VSID_B_1T 0x4000000000000000ULL
#define SLB_VSID_VSID 0x3FFFFFFFFFFFF000ULL
+#define SLB_VSID_VRMA (0x0001FFFFFF000000ULL | SLB_VSID_B_1T)
#define SLB_VSID_PTEM (SLB_VSID_B | SLB_VSID_VSID)
#define SLB_VSID_KS 0x0000000000000800ULL
#define SLB_VSID_KP 0x0000000000000400ULL
@@ -63,7 +65,7 @@ unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
#define HPTE64_V_AVPN_SHIFT 7
#define HPTE64_V_AVPN 0x3fffffffffffff80ULL
#define HPTE64_V_AVPN_VAL(x) (((x) & HPTE64_V_AVPN) >> HPTE64_V_AVPN_SHIFT)
-#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff80ULL))
+#define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL))
#define HPTE64_V_LARGE 0x0000000000000004ULL
#define HPTE64_V_SECONDARY 0x0000000000000002ULL
#define HPTE64_V_VALID 0x0000000000000001ULL
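
(Annotation: widening the HPTE64_V_COMPARE mask from ...ff80 to ...ff83 folds
the VALID and SECONDARY bit checks into the single AVPN compare;
correspondingly, ppc_hash64_htab_lookup() now ORs HPTE64_V_VALID (and, for the
secondary PTEG, HPTE64_V_SECONDARY) into ptem. A small sketch of the effect:

    #include <assert.h>
    #include <stdint.h>

    #define HPTE64_V_COMPARE(x, y) (!(((x) ^ (y)) & 0xffffffffffffff83ULL))
    #define HPTE64_V_SECONDARY     0x0000000000000002ULL
    #define HPTE64_V_VALID         0x0000000000000001ULL

    int main(void)
    {
        uint64_t avpn = 0x1234ULL << 7;        /* an arbitrary AVPN value */
        uint64_t ptem = avpn | HPTE64_V_VALID; /* what the lookup now builds */

        /* Valid primary PTE with matching AVPN: accepted */
        assert(HPTE64_V_COMPARE(avpn | HPTE64_V_VALID, ptem));
        /* Invalid PTE: V differs, so the compare now rejects it */
        assert(!HPTE64_V_COMPARE(avpn, ptem));
        /* Secondary-hash PTE: H differs, rejected until ptem gains
         * HPTE64_V_SECONDARY for the secondary PTEG pass */
        assert(!HPTE64_V_COMPARE(avpn | HPTE64_V_VALID | HPTE64_V_SECONDARY,
                                 ptem));
        return 0;
    }
)
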
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index 843f19b748..8f257fb74a 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -8791,11 +8791,19 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu)
/* Set emulated LPCR to not send interrupts to hypervisor. Note that
* under KVM, the actual HW LPCR will be set differently by KVM itself,
* the settings below ensure proper operations with TCG in absence of
- * a real hypervisor
+ * a real hypervisor.
+ *
+ * Clearing VPM0 will also cause us to use RMOR in mmu-hash64.c for
+ * real mode accesses, which thankfully defaults to 0 and isn't
+ * accessible in guest mode.
*/
lpcr->default_value &= ~(LPCR_VPM0 | LPCR_VPM1 | LPCR_ISL | LPCR_KBV);
lpcr->default_value |= LPCR_LPES0 | LPCR_LPES1;
+ /* Set RMLS to the max (ie, 16G) */
+ lpcr->default_value &= ~LPCR_RMLS;
+ lpcr->default_value |= 1ull << LPCR_RMLS_SHIFT;
+
/* P7 and P8 has slightly different PECE bits, mostly because P8 adds
* bit 47 and 48 which are reserved on P7. Here we set them all, which
* will work as expected for both implementations
@@ -8811,6 +8819,10 @@ void cpu_ppc_set_papr(PowerPCCPU *cpu)
/* Set a full AMOR so guest can use the AMR as it sees fit */
env->spr[SPR_AMOR] = amor->default_value = 0xffffffffffffffffull;
+ /* Update some env bits based on new LPCR value */
+ ppc_hash64_update_rmls(env);
+ ppc_hash64_update_vrma(env);
+
/* Tell KVM that we're in PAPR mode */
if (kvm_enabled()) {
kvmppc_set_papr(cpu);
@@ -9516,7 +9528,7 @@ static void ppc_cpu_realizefn(DeviceState *dev, Error **errp)
PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
Error *local_err = NULL;
#if !defined(CONFIG_USER_ONLY)
- int max_smt = kvm_enabled() ? kvmppc_smt_threads() : 1;
+ int max_smt = kvmppc_smt_threads();
#endif
#if !defined(CONFIG_USER_ONLY)