summaryrefslogtreecommitdiff
path: root/target-i386/kvm.c
diff options
context:
space:
mode:
authorMarcelo Tosatti <mtosatti@redhat.com>2010-10-11 15:31:21 -0300
committerAnthony Liguori <aliguori@us.ibm.com>2010-10-20 16:15:04 -0500
commitc0532a76b407af4b276dc5a62d8178db59857ea6 (patch)
tree6a34d8986e474edd0ba63dd283cda22d31436dc3 /target-i386/kvm.c
parent983dfc3b135de0a4808a41a8ca71e1809ba6a62e (diff)
downloadqemu-c0532a76b407af4b276dc5a62d8178db59857ea6.tar.gz
MCE: Relay UCR MCE to guest
Port qemu-kvm's commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef Author: Huang Ying <ying.huang@intel.com> Date: Mon Sep 21 10:43:25 2009 +0800 MCE: Relay UCR MCE to guest UCR (uncorrected recovery) MCE is supported in recent Intel CPUs, where some hardware error such as some memory error can be reported without PCC (processor context corrupted). To recover from such MCE, the corresponding memory will be unmapped, and all processes accessing the memory will be killed via SIGBUS. For KVM, if QEMU/KVM is killed, all guest processes will be killed too. So we relay SIGBUS from host OS to guest system via a UCR MCE injection. Then guest OS can isolate corresponding memory and kill necessary guest processes only. SIGBUS sent to main thread (not VCPU threads) will be broadcast to all VCPU threads as UCR MCE. aliguori: fix build Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> Signed-off-by: Avi Kivity <avi@redhat.com> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Diffstat (limited to 'target-i386/kvm.c')
-rw-r--r--target-i386/kvm.c178
1 files changed, 176 insertions, 2 deletions
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 343fb022c8..0161f6d2fb 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -46,6 +46,13 @@
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
#ifdef KVM_CAP_EXT_CPUID
static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
}
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+ struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+ int r;
+
+ kmsrs->nmsrs = n;
+ memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+ r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+ memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+ free(kmsrs);
+ return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+ struct kvm_msr_entry msr_mcg_status = {
+ .index = MSR_MCG_STATUS,
+ };
+ int r;
+
+ r = kvm_get_msr(env, &msr_mcg_status, 1);
+ if (r == -1 || r == 0) {
+ return -1;
+ }
+ return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
struct kvm_x86_mce_data
{
CPUState *env;
struct kvm_x86_mce *mce;
+ int abort_on_error;
};
static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *_data)
struct kvm_x86_mce_data *data = _data;
int r;
+ /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
+ r = kvm_mce_in_exception(data->env);
+ if (r == -1)
+ fprintf(stderr, "Failed to get MCE status\n");
+ else if (r && !(data->mce->status & MCI_STATUS_AR))
+ return;
+
r = kvm_set_mce(data->env, data->mce);
- if (r < 0)
+ if (r < 0) {
perror("kvm_set_mce FAILED");
+ if (data->abort_on_error) {
+ abort();
+ }
+ }
}
#endif
void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
- uint64_t mcg_status, uint64_t addr, uint64_t misc)
+ uint64_t mcg_status, uint64_t addr, uint64_t misc,
+ int abort_on_error)
{
#ifdef KVM_CAP_MCE
struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
.mce = &mce,
};
+ if (!cenv->mcg_cap) {
+ fprintf(stderr, "MCE support is not enabled!\n");
+ return;
+ }
+
run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+ if (abort_on_error)
+ abort();
#endif
}
@@ -1528,3 +1584,121 @@ bool kvm_arch_stop_on_emulation_error(CPUState *env)
((env->segs[R_CS].selector & 3) != 3);
}
+static void hardware_memory_error(void)
+{
+ fprintf(stderr, "Hardware memory error!\n");
+ exit(1);
+}
+
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+ struct kvm_x86_mce mce = {
+ .bank = 9,
+ };
+ void *vaddr;
+ ram_addr_t ram_addr;
+ target_phys_addr_t paddr;
+ int r;
+
+ if ((env->mcg_cap & MCG_SER_P) && addr
+ && (code == BUS_MCEERR_AR
+ || code == BUS_MCEERR_AO)) {
+ if (code == BUS_MCEERR_AR) {
+ /* Fake an Intel architectural Data Load SRAR UCR */
+ mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+ | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+ | MCI_STATUS_AR | 0x134;
+ mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+ mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+ } else {
+ /*
+ * If there is an MCE excpetion being processed, ignore
+ * this SRAO MCE
+ */
+ r = kvm_mce_in_exception(env);
+ if (r == -1) {
+ fprintf(stderr, "Failed to get MCE status\n");
+ } else if (r) {
+ return 0;
+ }
+ /* Fake an Intel architectural Memory scrubbing UCR */
+ mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+ | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+ | 0xc0;
+ mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+ mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+ }
+ vaddr = (void *)addr;
+ if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+ !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+ fprintf(stderr, "Hardware memory error for memory used by "
+ "QEMU itself instead of guest system!\n");
+ /* Hope we are lucky for AO MCE */
+ if (code == BUS_MCEERR_AO) {
+ return 0;
+ } else {
+ hardware_memory_error();
+ }
+ }
+ mce.addr = paddr;
+ r = kvm_set_mce(env, &mce);
+ if (r < 0) {
+ fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+ abort();
+ }
+ } else
+#endif
+ {
+ if (code == BUS_MCEERR_AO) {
+ return 0;
+ } else if (code == BUS_MCEERR_AR) {
+ hardware_memory_error();
+ } else {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int kvm_on_sigbus(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+ if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
+ uint64_t status;
+ void *vaddr;
+ ram_addr_t ram_addr;
+ target_phys_addr_t paddr;
+ CPUState *cenv;
+
+ /* Hope we are lucky for AO MCE */
+ vaddr = addr;
+ if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+ !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+ fprintf(stderr, "Hardware memory error for memory used by "
+ "QEMU itself instead of guest system!: %p\n", addr);
+ return 0;
+ }
+ status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+ | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+ | 0xc0;
+ kvm_inject_x86_mce(first_cpu, 9, status,
+ MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+ (MCM_ADDR_PHYS << 6) | 0xc, 1);
+ for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+ kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+ MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+ }
+ } else
+#endif
+ {
+ if (code == BUS_MCEERR_AO) {
+ return 0;
+ } else if (code == BUS_MCEERR_AR) {
+ hardware_memory_error();
+ } else {
+ return 1;
+ }
+ }
+ return 0;
+}