/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/exec-all.h"
#include "exec/ioport.h"
#include "qemu-common.h"
#include "strings.h"
#include "sysemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpus.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "qemu/error-report.h"
#include "qemu/queue.h"
#include "qapi/error.h"
#include "migration/blocker.h"

#include <WinHvPlatform.h>
#include <WinHvEmulation.h>

/* Global accelerator state: one partition shared by every virtual CPU. */
struct whpx_state {
    uint64_t mem_quota;
    WHV_PARTITION_HANDLE partition;
};

/*
 * Registers transferred as one batch by whpx_set_registers() and
 * whpx_get_registers().  Both functions walk this table with a running
 * index and assert on the expected name at each step, so the order of the
 * entries below must stay in lock-step with those two functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterTsc,
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/* Value buffer with exactly one slot per entry of whpx_register_names. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/* Per-VCPU accelerator state, stored in CPUState::hax_vcpu. */
struct whpx_vcpu {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

static bool whpx_allowed;

struct whpx_state whpx_global;


/*
 * VP support
 */

/* Return the WHPX per-VCPU state that whpx_init_vcpu() attached to @cpu. */
static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
{
    return (struct whpx_vcpu *)cpu->hax_vcpu;
}

/*
 * Convert a QEMU segment cache entry into the hypervisor representation.
 *
 * When @v86 is non-zero the attributes are replaced by a fixed set
 * (type 3, present, DPL 3, non-system); otherwise they are taken from the
 * QEMU descriptor flags.  @r86 currently has no effect: the only statement
 * it guarded is commented out.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

/* Convert a hypervisor segment register into a QEMU segment cache entry. */
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}

/*
 * Push the QEMU CPU state of @cpu into the hypervisor.
 *
 * Fills one value per entry of whpx_register_names (asserting at every step
 * that the running index points at the expected register) and writes the
 * whole set with a single WHvSetVirtualProcessorRegisters() call.  A failure
 * of that call is reported but not propagated.
 *
 * Must run on the VCPU thread itself or while the VCPU is stopped.
 */
static void whpx_set_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    struct CPUX86State *env = (CPUArchState *)(cpu->env_ptr);
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_register_set vcxt = {0};
    HRESULT hr;
    int idx = 0;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    /* Cache the APIC values so later changes can be detected cheaply. */
    vcpu->tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    for (idx = 0; idx < CPU_NB_REGS64; idx += 1) {
        vcxt.values[idx].Reg64 = env->regs[idx];
    }

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(whpx_register_names[idx] == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(whpx_register_names[idx] == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(whpx_register_names[idx] == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(whpx_register_names[idx] == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(whpx_register_names[idx] == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    for (i = 0; i < 16; i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Fold the stack-top index into bits 11..13 of the status word. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    /* Bit i of FpTag is set when env->fptags[i] is zero. */
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterTsc);
    vcxt.values[idx++].Reg64 = env->tsc;
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = WHvSetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
                                         whpx_register_names,
                                         RTL_NUMBER_OF(whpx_register_names),
                                         &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }

    return;
}

/*
 * Pull the hypervisor register state of @cpu into the QEMU CPU state.
 *
 * Mirror image of whpx_set_registers().  CR8 and the APIC base are only
 * forwarded to the APIC model when they differ from the cached copies.
 * If the hypervisor read fails the error is reported and the QEMU state is
 * left untouched, because the value buffer holds no valid data in that case.
 *
 * Must run on the VCPU thread itself or while the VCPU is stopped.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    struct CPUX86State *env = (CPUArchState *)(cpu->env_ptr);
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx = 0;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    hr = WHvGetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
                                         whpx_register_names,
                                         RTL_NUMBER_OF(whpx_register_names),
                                         &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
        /* vcxt was never filled in; do not copy garbage into env. */
        return;
    }

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    for (idx = 0; idx < CPU_NB_REGS64; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(whpx_register_names[idx] == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(whpx_register_names[idx] == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(whpx_register_names[idx] == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(whpx_register_names[idx] == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(whpx_register_names[idx] == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
    }

    /* 8 Debug Registers - Skipped */

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    for (i = 0; i < 16; i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Bits 11..13 of the status word carry the stack-top index. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    /* env->fptags[i] is set when bit i of FpTag is clear. */
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterTsc);
    env->tsc = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    return;
}

/*
 * Emulator callback: perform a port I/O access on QEMU's I/O address space.
 * Always returns S_OK; the result of address_space_rw() is not checked.
 */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     (uint8_t *)&IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

/*
 * Emulator callback: perform a guest-physical memory access through
 * cpu_physical_memory_rw().  Always returns S_OK.
 */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

/*
 * Emulator callback: read registers of the VCPU passed as @ctx straight
 * from the hypervisor.  Returns the HRESULT of the underlying call.
 */
static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = WHvGetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
                                         RegisterNames, RegisterCount,
                                         RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

/*
 * Emulator callback: write registers of the VCPU passed as @ctx straight
 * into the hypervisor.  Returns the HRESULT of the underlying call.
 */
static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = WHvSetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
                                         RegisterNames, RegisterCount,
                                         RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator just wrote the register state. We clear the dirty state
     * so we avoid the double write on resume of the VP.  Note that the flag
     * is cleared regardless of the outcome of the write above.
     */
    cpu->vcpu_dirty = false;

    return hr;
}

/*
 * Emulator callback: translate a guest virtual address for the VCPU passed
 * as @ctx.  *TranslationResult is only written when the call succeeds.
 */
static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = WHvTranslateGva(whpx->partition, cpu->cpu_index,
                         Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

/* Callback table handed to WHvEmulatorCreateEmulator() for every VCPU. */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

/*
 * Emulate the memory access that caused the current exit.
 * Returns 0 on success, -1 if the emulator call failed or reported that
 * emulation was unsuccessful.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    WHV_EMULATOR_STATUS emu_status;

    hr = WHvEmulatorTryMmioEmulation(vcpu->emulator, cpu,
                                     &vcpu->exit_ctx.VpContext, ctx,
                                     &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access");
        return -1;
    }

    return 0;
}

/*
 * Emulate the port I/O access that caused the current exit.
 * Returns 0 on success, -1 if the emulator call failed or reported that
 * emulation was unsuccessful.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    WHV_EMULATOR_STATUS emu_status;

    hr = WHvEmulatorTryIoEmulation(vcpu->emulator, cpu,
                                   &vcpu->exit_ctx.VpContext, ctx,
                                   &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access");
        return -1;
    }

    return 0;
}

/*
 * Handle a HLT exit.
 *
 * Under the iothread lock: unless a hard interrupt is deliverable
 * (CPU_INTERRUPT_HARD with IF set) or an NMI is pending, mark the CPU as
 * halted with EXCP_HLT.  Returns 1 when the CPU was halted, 0 otherwise.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    struct CPUX86State *env = (CPUArchState *)(cpu->env_ptr);
    int ret = 0;

    qemu_mutex_lock_iothread();
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    qemu_mutex_unlock_iothread();

    return ret;
}

/*
 * Prepare interrupt-related hypervisor state before running the VCPU.
 *
 * Under the iothread lock this collects up to three register updates
 * (pending interruption, CR8, deliverability notification) and afterwards
 * applies them with a single WHvSetVirtualProcessorRegisters() call.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    struct CPUX86State *env = (CPUArchState *)(cpu->env_ptr);
    X86CPU *x86_cpu = X86_CPU(cpu);
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int = {0};
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3] = {0};
    WHV_REGISTER_NAME reg_names[3];

    qemu_mutex_lock_iothread();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* SMI requests are acknowledged but nothing is injected for them. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!vcpu->interruption_pending &&
        vcpu->interruptable && (env->eflags & IF_MASK)) {
        assert(!new_int.InterruptionPending);
        if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
            irq = cpu_get_pic_interrupt(env);
            if (irq >= 0) {
                new_int.InterruptionType = WHvX64PendingInterrupt;
                new_int.InterruptionPending = 1;
                new_int.InterruptionVector = irq;
            }
        }
    }

    /* Setup interrupt state if new one was prepared */
    if (new_int.InterruptionPending) {
        reg_values[reg_count].PendingInterruption = new_int;
        reg_names[reg_count] = WHvRegisterPendingInterruption;
        reg_count += 1;
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications.InterruptNotification
            = 1;
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    qemu_mutex_unlock_iothread();

    if (reg_count) {
        hr = WHvSetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
                                             reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }

    return;
}

/*
 * Refresh cached state from the exit context after the VCPU stopped
 * running: RFLAGS, CR8 (forwarded to the APIC under the iothread lock when
 * it changed), and the interruption-pending / interrupt-shadow bits.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    struct CPUX86State *env = (CPUArchState *)(cpu->env_ptr);
    X86CPU *x86_cpu = X86_CPU(cpu);

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        qemu_mutex_lock_iothread();
        cpu_set_apic_tpr(x86_cpu->apic_state, vcpu->tpr);
        qemu_mutex_unlock_iothread();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;

    return;
}

/*
 * Process INIT, POLL, SIPI and TPR interrupt requests before the VCPU is
 * run, and clear the halted flag when a deliverable hard interrupt or an
 * NMI is pending.  Register state is fetched from the hypervisor first
 * where the handlers below need an up-to-date env and it is not dirty.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    struct CPUX86State *env = (CPUArchState *)(cpu->env_ptr);
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {

        do_cpu_init(x86_cpu);
        cpu->vcpu_dirty = true;
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        if (!cpu->vcpu_dirty) {
            whpx_get_registers(cpu);
        }
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        if (!cpu->vcpu_dirty) {
            whpx_get_registers(cpu);
        }
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return;
}

/*
 * Run the VCPU until an exit handler asks to leave the inner loop.
 *
 * The iothread lock is dropped around the loop and re-taken before
 * returning.  Returns non-zero only when a fatal error occurred (a handler
 * or WHvRunVirtualProcessor() produced a negative result).
 */
static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    /*
     * Must start at 0: the interrupt-window and "unexpected exit" cases
     * below do not assign it and rely on the loop simply continuing.
     */
    int ret = 0;

    whpx_vcpu_process_async_events(cpu);
    if (cpu->halted) {
        cpu->exception_index = EXCP_HLT;
        atomic_set(&cpu->exit_request, false);
        return 0;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu);
            cpu->vcpu_dirty = false;
        }

        whpx_vcpu_pre_run(cpu);

        if (atomic_read(&cpu->exit_request)) {
            whpx_vcpu_kick(cpu);
        }

        hr = WHvRunVirtualProcessor(whpx->partition, cpu->cpu_index,
                                    &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            /* Allow pre_run to request a new notification. */
            vcpu->window_registered = 0;
            break;

        case WHvRunVpExitReasonX64Halt:
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonCanceled:
            cpu->exception_index = EXCP_INTERRUPT;
            ret = 1;
            break;

        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5] = {0};
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 rip, rax, rcx, rdx, rbx;

            /* Resume after the CPUID instruction. */
            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            switch (vcpu->exit_ctx.CpuidAccess.Rax) {
            case 1:
                rax = vcpu->exit_ctx.CpuidAccess.DefaultResultRax;
                /* Advertise that we are running on a hypervisor */
                rcx =
                    vcpu->exit_ctx.CpuidAccess.DefaultResultRcx |
                    CPUID_EXT_HYPERVISOR;

                rdx = vcpu->exit_ctx.CpuidAccess.DefaultResultRdx;
                rbx = vcpu->exit_ctx.CpuidAccess.DefaultResultRbx;
                break;
            default:
                rax = vcpu->exit_ctx.CpuidAccess.DefaultResultRax;
                rcx = vcpu->exit_ctx.CpuidAccess.DefaultResultRcx;
                rdx = vcpu->exit_ctx.CpuidAccess.DefaultResultRdx;
                rbx = vcpu->exit_ctx.CpuidAccess.DefaultResultRbx;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = WHvSetVirtualProcessorRegisters(whpx->partition,
                                                 cpu->cpu_index,
                                                 reg_names,
                                                 reg_count,
                                                 reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        case WHvRunVpExitReasonX64MsrAccess:
        case WHvRunVpExitReasonException:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            qemu_mutex_lock_iothread();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            qemu_mutex_unlock_iothread();
            break;
        }

    } while (!ret);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();
    current_cpu = cpu;

    atomic_set(&cpu->exit_request, false);

    return ret < 0;
}

/* run_on_cpu worker: fetch hypervisor state and mark the QEMU copy dirty. */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    whpx_get_registers(cpu);
    cpu->vcpu_dirty = true;
}

/* run_on_cpu worker: push QEMU state to the hypervisor after a reset. */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu);
    cpu->vcpu_dirty = false;
}

/* run_on_cpu worker: push QEMU state to the hypervisor after init. */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu);
    cpu->vcpu_dirty = false;
}

/* run_on_cpu worker: mark QEMU state dirty so it is written before a run. */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}

/*
 * CPU support.
 */

/* Fetch hypervisor state into QEMU unless the QEMU copy is already dirty. */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}

void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

/*
 * Vcpu support.
 */

static Error *whpx_migration_blocker;

/*
 * Create the hypervisor virtual processor and emulator for @cpu.
 *
 * The first call also installs a migration blocker.  Returns 0 on success
 * and -EINVAL on any failure; resources acquired up to the point of
 * failure are released again.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu;
    Error *local_error = NULL;

    /* Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support,"
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        (void)migrate_add_blocker(whpx_migration_blocker, &local_error);
        if (local_error) {
            error_report_err(local_error);
            /*
             * Unregister before freeing, and forget the pointer so that a
             * later call does not mistake the freed object for an installed
             * blocker.
             */
            migrate_del_blocker(whpx_migration_blocker);
            error_free(whpx_migration_blocker);
            whpx_migration_blocker = NULL;
            return -EINVAL;
        }
    }

    /* g_malloc0() aborts on allocation failure, so no NULL check here. */
    vcpu = g_malloc0(sizeof(struct whpx_vcpu));

    hr = WHvEmulatorCreateEmulator(&whpx_emu_callbacks, &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        g_free(vcpu);
        return -EINVAL;
    }

    hr = WHvCreateVirtualProcessor(whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        WHvEmulatorDestroyEmulator(vcpu->emulator);
        g_free(vcpu);
        return -EINVAL;
    }

    vcpu->interruptable = true;

    /* Force the initial QEMU state to be written before the first run. */
    cpu->vcpu_dirty = true;
    cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;

    return 0;
}

/*
 * Execute @cpu until an exception index >= EXCP_INTERRUPT is raised, then
 * return that index (resetting it to -1).  Aborts if whpx_vcpu_run()
 * reports a fatal error.
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    int ret;
    int fatal;

    for (;;) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            ret = cpu->exception_index;
            cpu->exception_index = -1;
            break;
        }

        fatal = whpx_vcpu_run(cpu);

        if (fatal) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }

    return ret;
}

/* Tear down what whpx_init_vcpu() created for @cpu. */
void whpx_destroy_vcpu(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);

    WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
    WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(cpu->hax_vcpu);
    return;
}

/* Ask the hypervisor to cancel the current run of @cpu. */
void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHvCancelRunVirtualProcessor(whpx->partition, cpu->cpu_index, 0);
}

/*
 * Memory support.
 */

/*
 * Map (@add != 0) or unmap a guest-physical range in the partition.
 * Mappings are readable and executable; they are writable unless @rom is
 * set.  Failures are reported but not propagated.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    /*
    if (add) {
        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
               (void*)start_pa, (void*)size, host_va,
               (rom ? "ROM" : "RAM"), name);
    } else {
        printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
               (void*)start_pa, (void*)size, host_va, name);
    }
    */

    if (add) {
        hr = WHvMapGpaRange(whpx->partition,
                            host_va,
                            start_pa,
                            size,
                            (WHvMapGpaRangeFlagRead |
                             WHvMapGpaRangeFlagExecute |
                             (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = WHvUnmapGpaRange(whpx->partition,
                              start_pa,
                              size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)start_pa, (void *)size, host_va, hr);
    }
}

/*
 * Map or unmap the host-page-aligned part of a RAM memory section.
 * Non-RAM sections, and sections that contain no whole host page after
 * alignment, are ignored.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Distance from start_pa up to the next host page boundary. */
    delta = qemu_real_host_page_size - (start_pa & ~qemu_real_host_page_mask);
    delta &= ~qemu_real_host_page_mask;
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Trim the tail down to a whole number of host pages. */
    size &= qemu_real_host_page_mask;
    if (!size || (start_pa & ~qemu_real_host_page_mask)) {
        return;
    }

    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}

static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}

static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}

static void whpx_transaction_begin(MemoryListener *listener)
{
}

static void whpx_transaction_commit(MemoryListener *listener)
{
}

/* Mark the whole of a RAM section dirty; no finer tracking is done here. */
static void whpx_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}

static MemoryListener whpx_memory_listener = {
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = 10,
};

static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}

/* Record the interrupt request and kick the VCPU if called from elsewhere. */
static void whpx_handle_interrupt(CPUState *cpu, int mask)
{
    cpu->interrupt_request |= mask;

    if (!qemu_cpu_is_self(cpu)) {
        qemu_cpu_kick(cpu);
    }
}

/*
 * Partition support
 */

/*
 * Create and configure the WHPX partition for machine @ms.
 *
 * Returns 0 on success.  On failure returns -ENOSPC when no hypervisor is
 * present and -EINVAL for any other error; a partition that was already
 * created is deleted again.
 */
static int whpx_accel_init(MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    /* Only CPUID leaf 1 is intercepted (see whpx_vcpu_run). */
    UINT32 cpuidExitList[] = {1};

    whpx = &whpx_global;

    memset(whpx, 0, sizeof(struct whpx_state));
    whpx->mem_quota = ms->ram_size;

    hr = WHvGetCapability(WHvCapabilityCodeHypervisorPresent, &whpx_cap,
                          sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = smp_cpus;
    hr = WHvSetPartitionProperty(whpx->partition,
                                 WHvPartitionPropertyCodeProcessorCount,
                                 &prop,
                                 sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        /* Report the value that was actually passed to the hypervisor. */
        error_report("WHPX: Failed to set partition core count to %d,"
                     " hr=%08lx", smp_cpus, hr);
        ret = -EINVAL;
        goto error;
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64CpuidExit = 1;
    hr = WHvSetPartitionProperty(whpx->partition,
                                 WHvPartitionPropertyCodeExtendedVmExits,
                                 &prop,
                                 sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable partition extended X64CpuidExit"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = WHvSetPartitionProperty(whpx->partition,
                                 WHvPartitionPropertyCodeCpuidExitList,
                                 cpuidExitList,
                                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    hr = WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    cpu_interrupt_handler = whpx_handle_interrupt;

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

  error:

    if (NULL != whpx->partition) {
        WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}

int whpx_enabled(void)
{
    return whpx_allowed;
}

static void whpx_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;
}

static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .class_init = whpx_accel_class_init,
};

static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}

type_init(whpx_type_init);