blob: e177848a36313871b6ea12fe08c5ec1eeaa5fbaf [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>
#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "ops.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static const struct x86_cpu_id vmx_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_VMX),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
enable_unrestricted_guest, bool, S_IRUGO);
bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
static u64 __read_mostly host_xss;
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))
#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
(~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
* According to test, this time is usually smaller than 128 cycles.
* ple_window: upper bound on the amount of time a guest is allowed to execute
* in a PAUSE loop. Tests indicate that most spinlocks are held for
* less than 2^12 cycles
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);
static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);
/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
static const struct {
const char *option;
bool for_parse;
} vmentry_l1d_param[] = {
[VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
[VMENTER_L1D_FLUSH_NEVER] = {"never", true},
[VMENTER_L1D_FLUSH_COND] = {"cond", true},
[VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
}
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
u64 msr;
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
}
/* If set to auto use the default l1tf mitigation method */
if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
l1tf = VMENTER_L1D_FLUSH_COND;
break;
case L1TF_MITIGATION_FULL:
case L1TF_MITIGATION_FULL_FORCE:
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
break;
}
} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
/*
* This allocation for vmx_l1d_flush_pages is not tied to a VM
* lifetime and so should not be charged to a memcg.
*/
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
vmx_l1d_flush_pages = page_address(page);
/*
* Initialize each page with a different pattern in
* order to protect against KSM in the nested
* virtualization case.
*/
for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
}
l1tf_vmx_mitigation = l1tf;
if (l1tf != VMENTER_L1D_FLUSH_NEVER)
static_branch_enable(&vmx_l1d_should_flush);
else
static_branch_disable(&vmx_l1d_should_flush);
if (l1tf == VMENTER_L1D_FLUSH_COND)
static_branch_enable(&vmx_l1d_flush_cond);
else
static_branch_disable(&vmx_l1d_flush_cond);
return 0;
}
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
if (s) {
for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
if (vmentry_l1d_param[i].for_parse &&
sysfs_streq(s, vmentry_l1d_param[i].option))
return i;
}
}
return -EINVAL;
}
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
int l1tf, ret;
l1tf = vmentry_l1d_flush_parse(s);
if (l1tf < 0)
return l1tf;
if (!boot_cpu_has(X86_BUG_L1TF))
return 0;
/*
* Has vmx_init() run already? If not then this is the pre init
* parameter parsing. In that case just store the value and let
* vmx_init() do the proper setup after enable_ept has been
* established.
*/
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
vmentry_l1d_flush_param = l1tf;
return 0;
}
mutex_lock(&vmx_l1d_flush_mutex);
ret = vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
return sprintf(s, "???\n");
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type);
void vmx_vmexit(void);
#define vmx_insn_failed(fmt...) \
do { \
WARN_ONCE(1, fmt); \
pr_warn_ratelimited(fmt); \
} while (0)
asmlinkage void vmread_error(unsigned long field, bool fault)
{
if (fault)
kvm_spurious_fault();
else
vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
}
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
ext, eptp, gpa);
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
* We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
/*
* We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
* can find which vCPU should be waken up.
*/
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
.selector = GUEST_##seg##_SELECTOR, \
.base = GUEST_##seg##_BASE, \
.limit = GUEST_##seg##_LIMIT, \
.ar_bytes = GUEST_##seg##_AR_BYTES, \
}
static const struct kvm_vmx_segment_field {
unsigned selector;
unsigned base;
unsigned limit;
unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
VMX_SEGMENT_FIELD(CS),
VMX_SEGMENT_FIELD(DS),
VMX_SEGMENT_FIELD(ES),
VMX_SEGMENT_FIELD(FS),
VMX_SEGMENT_FIELD(GS),
VMX_SEGMENT_FIELD(SS),
VMX_SEGMENT_FIELD(TR),
VMX_SEGMENT_FIELD(LDTR),
};
u64 host_efer;
static unsigned long host_idt_base;
/*
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
* vmx_msr_index[], even in i386 builds.
*/
const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
/* check_ept_pointer() should be under protection of ept_pointer_lock. */
static void check_ept_pointer_match(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
u64 tmp_eptp = INVALID_PAGE;
int i;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!VALID_PAGE(tmp_eptp)) {
tmp_eptp = to_vmx(vcpu)->ept_pointer;
} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
to_kvm_vmx(kvm)->ept_pointers_match
= EPT_POINTERS_MISMATCH;
return;
}
}
to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
}
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
struct kvm_tlb_range *range = data;
return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
range->pages);
}
static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
{
u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
/*
* FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
* of the base of EPT PML4 table, strip off EPT configuration
* information.
*/
if (range)
return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
kvm_fill_hv_flush_list_func, (void *)range);
else
return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
}
static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range)
{
struct kvm_vcpu *vcpu;
int ret = 0, i;
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
check_ept_pointer_match(kvm);
if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
kvm_for_each_vcpu(i, vcpu, kvm) {
/* If ept_pointer is invalid pointer, bypass flush request. */
if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
ret |= __hv_remote_flush_tlb_with_range(
kvm, vcpu, range);
}
} else {
ret = __hv_remote_flush_tlb_with_range(kvm,
kvm_get_vcpu(kvm, 0), range);
}
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
return ret;
}
static int hv_remote_flush_tlb(struct kvm *kvm)
{
return hv_remote_flush_tlb_with_range(kvm, NULL);
}
static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
&vcpu->kvm->arch.hyperv.hv_pa_pg;
/*
* Synthetic VM-Exit is not enabled in current code and so All
* evmcs in singe VM shares same assist page.
*/
if (!*p_hv_pa_pg)
*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!*p_hv_pa_pg)
return -ENOMEM;
evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
evmcs->partition_assist_page =
__pa(*p_hv_pa_pg);
evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
return 0;
}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
* Comment's format: document - errata name - stepping - processor name.
* Refer from
* https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
*/
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
/*
* 320767.pdf - AAP86 - B1 -
* i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
*/
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};
static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
u32 eax = cpuid_eax(0x00000001), i;
/* Clear the reserved bits */
eax &= ~(0x3U << 14 | 0xfU << 28);
for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
if (eax == vmx_preemption_cpu_tfms[i])
return true;
return false;
}
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
}
static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
for (i = 0; i < vmx->nmsrs; ++i)
if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
return i;
return -1;
}
struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
i = __find_msr_index(vmx, msr);
if (i >= 0)
return &vmx->guest_msrs[i];
return NULL;
}
void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
vmcs_clear(loaded_vmcs->vmcs);
if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
vmcs_clear(loaded_vmcs->shadow_vmcs);
loaded_vmcs->cpu = -1;
loaded_vmcs->launched = 0;
}
#ifdef CONFIG_KEXEC_CORE
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
struct loaded_vmcs *loaded_vmcs = arg;
int cpu = raw_smp_processor_id();
if (loaded_vmcs->cpu != cpu)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
vmcs_clear(loaded_vmcs->vmcs);
if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
vmcs_clear(loaded_vmcs->shadow_vmcs);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
* Ensure all writes to loaded_vmcs, including deleting it from its
* current percpu list, complete before setting loaded_vmcs->vcpu to
* -1, otherwise a different cpu can see vcpu == -1 first and add
* loaded_vmcs to its percpu list before it's deleted from this cpu's
* list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
loaded_vmcs->cpu = -1;
loaded_vmcs->launched = 0;
}
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
if (cpu != -1)
smp_call_function_single(cpu,
__loaded_vmcs_clear, loaded_vmcs, 1);
}
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
unsigned field)
{
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
vmx->segment_cache.bitmask |= mask;
return ret;
}
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
u16 *p = &vmx->segment_cache.seg[seg].selector;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
return *p;
}
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
ulong *p = &vmx->segment_cache.seg[seg].base;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
return *p;
}
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
u32 *p = &vmx->segment_cache.seg[seg].limit;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
return *p;
}
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
u32 *p = &vmx->segment_cache.seg[seg].ar;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
return *p;
}
void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
* as VMware does.
*/
if (enable_vmware_backdoor)
eb |= (1u << GP_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
* them to L1. When running L2, we will only handle the exceptions
* specified above if L1 did not want them.
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
unsigned long *msr_bitmap;
int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
return true;
msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
if (msr <= 0x1fff) {
return !!test_bit(msr, msr_bitmap + 0x800 / f);
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
return !!test_bit(msr, msr_bitmap + 0xc00 / f);
}
return true;
}
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
vm_entry_controls_clearbit(vmx, entry);
vm_exit_controls_clearbit(vmx, exit);
}
static int find_msr(struct vmx_msrs *m, unsigned int msr)
{
unsigned int i;
for (i = 0; i < m->nr; ++i) {
if (m->val[i].index == msr)
return i;
}
return -ENOENT;
}
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
int i;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
case MSR_EFER:
if (cpu_has_load_ia32_efer()) {
clear_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_EFER,
VM_EXIT_LOAD_IA32_EFER);
return;
}
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (cpu_has_load_perf_global_ctrl()) {
clear_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
return;
}
break;
}
i = find_msr(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
m->guest.val[i] = m->guest.val[m->guest.nr];
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
i = find_msr(&m->host, msr);
if (i < 0)
return;
--m->host.nr;
m->host.val[i] = m->host.val[m->host.nr];
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit,
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
u64 guest_val, u64 host_val)
{
vmcs_write64(guest_val_vmcs, guest_val);
if (host_val_vmcs != HOST_IA32_EFER)
vmcs_write64(host_val_vmcs, host_val);
vm_entry_controls_setbit(vmx, entry);
vm_exit_controls_setbit(vmx, exit);
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val, bool entry_only)
{
int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
case MSR_EFER:
if (cpu_has_load_ia32_efer()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_EFER,
VM_EXIT_LOAD_IA32_EFER,
GUEST_IA32_EFER,
HOST_IA32_EFER,
guest_val, host_val);
return;
}
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (cpu_has_load_perf_global_ctrl()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
GUEST_IA32_PERF_GLOBAL_CTRL,
HOST_IA32_PERF_GLOBAL_CTRL,
guest_val, host_val);
return;
}
break;
case MSR_IA32_PEBS_ENABLE:
/* PEBS needs a quiescent period after being disabled (to write
* a record). Disabling PEBS through VMX MSR swapping doesn't
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
i = find_msr(&m->guest, msr);
if (!entry_only)
j = find_msr(&m->host, msr);
if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
(j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
}
if (i < 0) {
i = m->guest.nr++;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
}
m->guest.val[i].index = msr;
m->guest.val[i].value = guest_val;
if (entry_only)
return;
if (j < 0) {
j = m->host.nr++;
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
m->host.val[j].index = msr;
m->host.val[j].value = host_val;
}
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
/* Shadow paging assumes NX to be available. */
if (!enable_ept)
guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
/*
* On EPT, we can't emulate NX, so we must switch EFER atomically.
* On CPUs that support "load IA32_EFER", always switch EFER
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
(enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
} else {
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
vmx->guest_msrs[efer_offset].data = guest_efer;
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
return true;
}
}
#ifdef CONFIG_X86_32
/*
* On 32-bit kernels, VM exits still load the FS and GS bases from the
* VMCS rather than the segment table. KVM uses this helper to figure
* out the current bases to poke them into the VMCS before entry.
*/
static unsigned long segment_base(u16 selector)
{
struct desc_struct *table;
unsigned long v;
if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
table = get_current_gdt_ro();
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
if (!(ldt_selector & ~SEGMENT_RPL_MASK))
return 0;
table = (struct desc_struct *)segment_base(ldt_selector);
}
v = get_desc_base(&table[selector >> 3]);
return v;
}
#endif
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
if (pt_mode == PT_MODE_SYSTEM)
return;
/*
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
* Save host state before VM entry.
*/
rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
}
}
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
if (pt_mode == PT_MODE_SYSTEM)
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
}
/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base)
{
if (unlikely(fs_sel != host->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
else
vmcs_write16(HOST_FS_SELECTOR, 0);
host->fs_sel = fs_sel;
}
if (unlikely(gs_sel != host->gs_sel)) {
if (!(gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, gs_sel);
else
vmcs_write16(HOST_GS_SELECTOR, 0);
host->gs_sel = gs_sel;
}
if (unlikely(fs_base != host->fs_base)) {
vmcs_writel(HOST_FS_BASE, fs_base);
host->fs_base = fs_base;
}
if (unlikely(gs_base != host->gs_base)) {
vmcs_writel(HOST_GS_BASE, gs_base);
host->gs_base = gs_base;
}
}
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
#endif
unsigned long fs_base, gs_base;
u16 fs_sel, gs_sel;
int i;
vmx->req_immediate_exit = false;
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
if (!vmx->guest_msrs_ready) {
vmx->guest_msrs_ready = true;
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
}
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
if (vmx->guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
host_state->ldt_sel = kvm_read_ldt();
#ifdef CONFIG_X86_64
savesegment(ds, host_state->ds_sel);
savesegment(es, host_state->es_sel);
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
save_fsgs_for_kvm();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
vmx->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = segment_base(fs_sel);
gs_base = segment_base(gs_sel);
#endif
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
vmx->guest_state_loaded = true;
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
if (!vmx->guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
++vmx->vcpu.stat.host_state_reload;
#ifdef CONFIG_X86_64
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
load_gs_index(host_state->gs_sel);
#else
loadsegment(gs, host_state->gs_sel);
#endif
}
if (host_state->fs_sel & 7)
loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
if (unlikely(host_state->ds_sel | host_state->es_sel)) {
loadsegment(ds, host_state->ds_sel);
loadsegment(es, host_state->es_sel);
}
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
vmx->guest_state_loaded = false;
vmx->guest_msrs_ready = false;
}
#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
preempt_disable();
if (vmx->guest_state_loaded)
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
preempt_disable();
if (vmx->guest_state_loaded)
wrmsrl(MSR_KERNEL_GS_BASE, data);
preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif
static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
unsigned int dest;
/*
* In case of hot-plug or hot-unplug, we may have to undo
* vmx_vcpu_pi_put even if there is no assigned device. And we
* always keep PI.NDST up to date for simplicity: it makes the
* code easier, and CPU migration is not a fast path.
*/
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
/*
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
* PI.NDST: pi_post_block is the one expected to change PID.NDST and the
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that
* matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
* correctly.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
pi_clear_sn(pi_desc);
goto after_clear_sn;
}
/* The full case. */
do {
old.control = new.control = pi_desc->control;
dest = cpu_physical_id(cpu);
if (x2apic_enabled())
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
new.sn = 0;
} while (cmpxchg64(&pi_desc->control, old.control,
new.control) != old.control);
after_clear_sn:
/*
* Clear SN before reading the bitmap. The VT-d firmware
* writes the bitmap and reads SN atomically (5.2.3 in the
* spec), so it doesn't really have a memory barrier that
* pairs with this, but we cannot do that and we need one.
*/
smp_mb__after_atomic();
if (!pi_is_pir_empty(pi_desc))
pi_set_on(pi_desc);
}
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
struct vmcs *prev;
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
/*
* Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
* this cpu's percpu list, otherwise it may not yet be deleted
* from its previous cpu's percpu list. Pairs with the
* smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
local_irq_enable();
}
prev = per_cpu(current_vmcs, cpu);
if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
/*
* No indirect branch prediction barrier needed when switching
* the active VMCS within a guest, e.g. on nested VM-Enter.
* The L1 VMM can protect itself with retpolines, IBPB or IBRS.
*/
if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
indirect_branch_prediction_barrier();
}
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
* VM exits change the host TR limit to 0x67 after a VM
* exit. This is okay, since 0x67 covers everything except
* the IO bitmap and have have code to handle the IO bitmap
* being lost after a VM exit.
*/
BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
vmx->loaded_vmcs->cpu = cpu;
}
/* Setup TSC multiplier */
if (kvm_has_tsc_control &&
vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
decache_tsc_multiplier(vmx);
}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
vmx->host_debugctlmsr = get_debugctlmsr();
}
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(vcpu))
return;
/* Set SN when the vCPU is preempted */
if (vcpu->preempted)
pi_set_sn(pi_desc);
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
vmx_prepare_switch_to_host(to_vmx(vcpu));
}
static bool emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
rflags = vmcs_readl(GUEST_RFLAGS);
if (to_vmx(vcpu)->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
save_rflags = to_vmx(vcpu)->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
to_vmx(vcpu)->rflags = rflags;
}
return to_vmx(vcpu)->rflags;
}
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
unsigned long old_rflags = vmx_get_rflags(vcpu);
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
ret |= KVM_X86_SHADOW_INT_STI;
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
return ret;
}
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
u32 interruptibility = interruptibility_old;
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
if (mask & KVM_X86_SHADOW_INT_MOV_SS)
interruptibility |= GUEST_INTR_STATE_MOV_SS;
else if (mask & KVM_X86_SHADOW_INT_STI)
interruptibility |= GUEST_INTR_STATE_STI;
if ((interruptibility != interruptibility_old))
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long value;
/*
* Any MSR write that attempts to change bits marked reserved will
* case a #GP fault.
*/
if (data & vmx->pt_desc.ctl_bitmask)
return 1;
/*
* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
return 1;
/*
* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
* and FabricEn would cause #GP, if
* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
*/
if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
!(data & RTIT_CTL_FABRIC_EN) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))
return 1;
/*
* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
* utilize encodings marked reserved will casue a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
!test_bit((data & RTIT_CTL_MTC_RANGE) >>
RTIT_CTL_MTC_RANGE_OFFSET, &value))
return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cycle_thresholds);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_CYC_THRESH) >>
RTIT_CTL_CYC_THRESH_OFFSET, &value))
return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_PSB_FREQ) >>
RTIT_CTL_PSB_FREQ_OFFSET, &value))
return 1;
/*
* If ADDRx_CFG is reserved or the encodings is >2 will
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
return 1;
return 0;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip, orig_rip;
/*
* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
* undefined behavior: Intel's SDM doesn't mandate the VMCS field be
* set when EPT misconfig occurs. In practice, real hardware updates
* VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
* (namely Hyper-V) don't set it due to it being undefined behavior,
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
#ifdef CONFIG_X86_64
/*
* We need to mask out the high 32 bits of RIP if not in 64-bit
* mode, but just finding out that we are in 64-bit mode is
* quite expensive. Only do it if there was a carry.
*/
if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
rip = (u32)rip;
#endif
kvm_rip_write(vcpu, rip);
} else {
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
return 0;
}
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
return 1;
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
/*
* Ensure that we clear the HLT state in the VMCS. We don't need to
* explicitly skip the instruction because if the HLT state is set,
* then the instruction is already executing and RIP has already been
* advanced.
*/
if (kvm_hlt_in_guest(vcpu->kvm) &&
vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned nr = vcpu->arch.exception.nr;
bool has_error_code = vcpu->arch.exception.has_error_code;
u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
kvm_deliver_exception_payload(vcpu);
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (kvm_exception_is_soft(nr))
inc_eip = vcpu->arch.event_exit_inst_len;
kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
return;
}
WARN_ON_ONCE(vmx->emulation_required);
if (kvm_exception_is_soft(nr)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
{
return cpu_has_vmx_rdtscp();
}
static bool vmx_invpcid_supported(void)
{
return cpu_has_vmx_invpcid();
}
/*
* Swap MSR entry in host/guest MSR entry array.
*/
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
struct shared_msr_entry tmp;
tmp = vmx->guest_msrs[to];
vmx->guest_msrs[to] = vmx->guest_msrs[from];
vmx->guest_msrs[from] = tmp;
}
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
* mode, as fiddling with msrs is very expensive.
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs, index;
save_nmsrs = 0;
#ifdef CONFIG_X86_64
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
index = __find_msr_index(vmx, MSR_STAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_LSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
index = __find_msr_index(vmx, MSR_EFER);
if (index >= 0 && update_transition_efer(vmx, index))
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
vmx->guest_msrs_ready = false;
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
}
static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (is_guest_mode(vcpu) &&
(vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
return vcpu->arch.tsc_offset;
}
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u64 g_tsc_offset = 0;
/*
* We're here if L1 chose not to trap WRMSR to TSC. According
* to the spec, this should set L1's TSC; The offset that L1
* set for L2 remains unchanged, and still needs to be added
* to the newly set TSC to get L2's TSC.
*/
if (is_guest_mode(vcpu) &&
(vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
g_tsc_offset = vmcs12->tsc_offset;
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vcpu->arch.tsc_offset - g_tsc_offset,
offset);
vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
return offset + g_tsc_offset;
}
/*
* nested_vmx_allowed() checks whether a guest should be allowed to use VMX
* instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
* all guests if the "nested" module option is off, and can also be disabled
* for a single guest by disabling its VMX cpuid bit.
*/
bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}
static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
uint64_t val)
{
uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
return !(val & ~valid_bits);
}
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
default:
return 1;
}
return 0;
}
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
u32 index;
switch (msr_info->index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
msr_info->data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
break;
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_UMWAIT_CONTROL:
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
msr_info->data = vmx->msr_ia32_umwait_control;
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
FEATURE_CONTROL_LMCE))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
case MSR_IA32_FEATURE_CONTROL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported() ||
(!msr_info->host_initiated &&
!(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
return 1;
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
msr_info->data = vmx->pt_desc.guest.ctl;
break;
case MSR_IA32_RTIT_STATUS:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
msr_info->data = vmx->pt_desc.guest.status;
break;
case MSR_IA32_RTIT_CR3_MATCH:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)))
return 1;
msr_info->data = vmx->pt_desc.guest.output_base;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)))
return 1;
msr_info->data = vmx->pt_desc.guest.output_mask;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
}
return kvm_get_msr_common(vcpu, msr_info);
}
return 0;
}
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
u32 index;
switch (msr_index) {
case MSR_EFER:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data);
break;
case MSR_GS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
#endif
case MSR_IA32_SYSENTER_CS:
if (is_guest_mode(vcpu))
get_vmcs12(vcpu)->guest_sysenter_cs = data;
vmcs_write32(GUEST_SYSENTER_CS, data);
break;
case MSR_IA32_SYSENTER_EIP:
if (is_guest_mode(vcpu))
get_vmcs12(vcpu)->guest_sysenter_eip = data;
vmcs_writel(GUEST_SYSENTER_EIP, data);
break;
case MSR_IA32_SYSENTER_ESP:
if (is_guest_mode(vcpu))
get_vmcs12(vcpu)->guest_sysenter_esp = data;
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_DEBUGCTLMSR:
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
case MSR_IA32_UMWAIT_CONTROL:
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
/* The reserved bit 1 and non-32 bit [63:32] should be zero */
if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
return 1;
vmx->msr_ia32_umwait_control = data;
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_has_spec_ctrl_msr(vcpu))
return 1;
if (kvm_spec_ctrl_test_value(data))
return 1;
vmx->spec_ctrl = data;
if (!data)
break;
/*
* For non-nested:
* When it's written (to non-zero) for the first time, pass
* it through.
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
* nested_vmx_merge_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
*/
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
if (!boot_cpu_has(X86_FEATURE_IBPB))
return 1;
if (!data)
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
/*
* For non-nested:
* When it's written (to non-zero) for the first time, pass
* it through.
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
* nested_vmx_merge_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
return 1;
if (is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
get_vmcs12(vcpu)->guest_ia32_pat = data;
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, data);
vcpu->arch.pat = data;
break;
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_TSC_ADJUST:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LMCE)) ||
(data & ~MCG_EXT_CTL_LMCE_EN))
return 1;
vcpu->arch.mcg_ext_ctl = data;
break;
case MSR_IA32_FEATURE_CONTROL:
if (!vmx_feature_control_msr_valid(vcpu, data) ||
(to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
return 1;
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported() ||
(!msr_info->host_initiated &&
!(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
return 1;
/*
* The only supported bit as of Skylake is bit 8, but
* it is not supported on KVM.
*/
if (data != 0)
return 1;
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,
vcpu->arch.ia32_xss, host_xss, false);
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
vmx_rtit_ctl_check(vcpu, data) ||
vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
pt_update_intercept_for_msr(vmx);
break;
case MSR_IA32_RTIT_STATUS:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(data & MSR_IA32_RTIT_STATUS_MASK))
return 1;
vmx->pt_desc.guest.status = data;
break;
case MSR_IA32_RTIT_CR3_MATCH:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
vmx->pt_desc.guest.cr3_match = data;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)) ||
(data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
return 1;
vmx->pt_desc.guest.output_base = data;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)))
return 1;
vmx->pt_desc.guest.output_mask = data;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
/* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
u64 old_msr_data = msr->data;
msr->data = data;
if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
preempt_disable();
ret = kvm_set_shared_msr(msr->index, msr->data,
msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
}
break;
}
ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
}
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
break;
case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
break;
case VCPU_EXREG_PDPTR:
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
default:
break;
}
}
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
}
static __init int vmx_disabled_by_bios(void)
{
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
if (msr & FEATURE_CONTROL_LOCKED) {
/* launched w/ TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& tboot_enabled())
return 1;
/* launched w/o TXT and VMX only enabled w/ TXT */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& !tboot_enabled()) {
printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
"activate TXT before enabling KVM\n");
return 1;
}
/* launched w/o TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& !tboot_enabled())
return 1;
}
return 0;
}
static void kvm_cpu_vmxon(u64 addr)
{
cr4_set_bits(X86_CR4_VMXE);
intel_pt_handle_vmx(1);
asm volatile ("vmxon %0" : : "m"(addr));
}
static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
/*
* This can happen if we hot-added a CPU but failed to allocate
* VP assist page for it.
*/
if (static_branch_unlikely(&enable_evmcs) &&
!hv_get_vp_assist_page(cpu))
return -EFAULT;
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
if (tboot_enabled())
test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
if ((old & test_bits) != test_bits) {
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
if (enable_ept)
ept_sync_global();
return 0;
}
static void vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v, *n;
list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
__loaded_vmcs_clear(v);
}
/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
* tricks.
*/
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex("vmxoff"));
intel_pt_handle_vmx(0);
cr4_clear_bits(X86_CR4_VMXE);
}
static void hardware_disable(void)
{
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
rdmsr(msr, vmx_msr_low, vmx_msr_high);
ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
/* Ensure minimum (required) set of control bits are supported. */
if (ctl_min & ~ctl)
return -EIO;
*result = ctl;
return 0;
}
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 min, opt, min2, opt2;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
#endif
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
CPU_BASED_RDPMC_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
#ifdef CONFIG_X86_64
if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min2 = 0;
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
return -EIO;
}
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
_cpu_based_2nd_exec_control &= ~(
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
&vmx_cap->ept, &vmx_cap->vpid);
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
} else if (vmx_cap->ept) {
vmx_cap->ept = 0;
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
vmx_cap->vpid = 0;
pr_warn_once("VPID CAP should not exist if not support "
"1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
VM_EXIT_LOAD_IA32_PAT |
VM_EXIT_LOAD_IA32_EFER |
VM_EXIT_CLEAR_BNDCFGS |
VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
if (cpu_has_broken_vmx_preemption_timer())
_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
VM_ENTRY_LOAD_IA32_PAT |
VM_ENTRY_LOAD_IA32_EFER |
VM_ENTRY_LOAD_BNDCFGS |
VM_ENTRY_PT_CONCEAL_PIP |
VM_ENTRY_LOAD_IA32_RTIT_CTL;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control) < 0)
return -EIO;
/*
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
* can't be used due to an errata where VM Exit may incorrectly clear
* IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
* MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*/
if (boot_cpu_data.x86 == 0x6) {
switch (boot_cpu_data.x86_model) {
case 26: /* AAK155 */
case 30: /* AAP115 */
case 37: /* AAT100 */
case 44: /* BC86,AAY89,BD102 */
case 46: /* BA97 */
_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
break;
default:
break;
}
}
rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
if (vmx_msr_high & (1u<<16))
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
if (((vmx_msr_high >> 18) & 15) != 6)
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
if (static_branch_unlikely(&enable_evmcs))
evmcs_sanitize_exec_ctrls(vmcs_conf);
return 0;
}
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
pages = __alloc_pages_node(node, flags, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
memset(vmcs, 0, vmcs_config.size);
/* KVM supports Enlightened VMCS v1 only */
if (static_branch_unlikely(&enable_evmcs))
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
vmcs->hdr.revision_id = vmcs_config.revision_id;
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
return vmcs;
}
void free_vmcs(struct vmcs *vmcs)
{
free_pages((unsigned long)vmcs, vmcs_config.order);
}
/*
* Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
*/
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
if (!loaded_vmcs->vmcs)
return;
loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL;
if (loaded_vmcs->msr_bitmap)
free_page((unsigned long)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
loaded_vmcs->vmcs = alloc_vmcs(false);
if (!loaded_vmcs->vmcs)
return -ENOMEM;
loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs->hv_timer_soft_disabled = false;
loaded_vmcs_init(loaded_vmcs);
if (cpu_has_vmx_msr_bitmap()) {
loaded_vmcs->msr_bitmap = (unsigned long *)
__get_free_page(GFP_KERNEL_ACCOUNT);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
if (IS_ENABLED(CONFIG_HYPERV) &&
static_branch_unlikely(&enable_evmcs) &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
struct hv_enlightened_vmcs *evmcs =
(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
memset(&loaded_vmcs->controls_shadow, 0,
sizeof(struct vmcs_controls_shadow));
return 0;
out_vmcs:
free_loaded_vmcs(loaded_vmcs);
return -ENOMEM;
}
static void free_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
free_vmcs(per_cpu(vmxarea, cpu));
per_cpu(vmxarea, cpu) = NULL;
}
}
static __init int alloc_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
}
/*
* When eVMCS is enabled, alloc_vmcs_cpu() sets
* vmcs->revision_id to KVM_EVMCS_VERSION instead of
* revision_id reported by MSR_IA32_VMX_BASIC.
*
* However, even though not explicitly documented by
* TLFS, VMXArea passed as VMXON argument should
* still be marked with revision_id reported by
* physical CPU.
*/
if (static_branch_unlikely(&enable_evmcs))
vmcs->hdr.revision_id = vmcs_config.revision_id;
per_cpu(vmxarea, cpu) = vmcs;
}
return 0;
}
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
struct kvm_segment *save)
{
if (!emulate_invalid_guest_state) {
/*
* CS and SS RPL should be equal during guest entry according
* to VMX spec, but in reality it is not always so. Since vcpu
* is in the middle of the transition from real mode to
* protected mode it is safe to assume that RPL 0 is a good
* default value.
*/
if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
save->selector &= ~SEGMENT_RPL_MASK;
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* Update real mode segment cache. It may be not up-to-date if sement
* register was written while vcpu was in a guest mode.
*/
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
vmx->rmode.vm86_active = 0;
vmx_segment_cache_clear(vmx);
vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
update_exception_bitmap(vcpu);
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_segment var = *save;
var.dpl = 0x3;
if (seg == VCPU_SREG_CS)
var.type = 0x3;
if (!emulate_invalid_guest_state) {
var.selector = var.base >> 4;
var.base = var.base & 0xffff0;
var.limit = 0xffff;
var.g = 0;
var.db = 0;
var.present = 1;
var.s = 1;
var.l = 0;
var.unusable = 0;
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
printk_once(KERN_WARNING "kvm: segment base is not "
"paragraph aligned when entering "
"protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
vmx->rmode.vm86_active = 1;
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
"called before entering vcpu\n");
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
vmx->rmode.save_rflags = flags;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
kvm_mmu_reset_context(vcpu);
}
void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
if (!msr)
return;
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
msr->data = efer;
} else {
vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
msr->data = efer & ~EFER_LME;
}
setup_msrs(vmx);
}
#ifdef CONFIG_X86_64
static void enter_lmode(struct kvm_vcpu *vcpu)
{
u32 guest_tr_ar;
vmx_segment_cache_clear(to_vmx(vcpu));
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
pr_debug_ratelimited("%s: tss fixup for long mode. \n",
__func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~VMX_AR_TYPE_MASK)
| VMX_AR_TYPE_BUSY_64_TSS);
}
vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
int vpid = to_vmx(vcpu)->vpid;
if (!vpid_sync_vcpu_addr(vpid, addr))
vpid_sync_context(vpid);
/*
* If VPIDs are not supported or enabled, then the above is a no-op.
* But we don't really need a TLB flush in that case anyway, because
* each VM entry/exit includes an implicit flush when VPID is 0.
*/
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty))
return;
if (is_pae_paging(vcpu)) {
vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
}
}
void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
if (is_pae_paging(vcpu)) {
mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty);
}
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING);
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING);
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
}
if (!(cr0 & X86_CR0_WP))
*hw_cr0 &= ~X86_CR0_WP;
}
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long hw_cr0;
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
}
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
if (enable_ept && !enable_unrestricted_guest)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
/* depends on vcpu->arch.cr0 to be set to a new value */
vmx->emulation_required = emulation_required(vcpu);
}
static int get_ept_level(struct kvm_vcpu *vcpu)
{
/* Nested EPT currently only supports 4-level walks. */
if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
return 4;
if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
return 5;
return 4;
}
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
{
u64 eptp = VMX_EPTP_MT_WB;
eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
eptp |= VMX_EPTP_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
return eptp;
}
void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
u64 eptp;
guest_cr3 = cr3;
if (enable_ept) {
eptp = construct_eptp(vcpu, cr3);
vmcs_write64(EPT_POINTER, eptp);
if (kvm_x86_ops->tlb_remote_flush) {
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
to_vmx(vcpu)->ept_pointer = eptp;
to_kvm_vmx(kvm)->ept_pointers_match
= EPT_POINTERS_CHECK;
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
}
/* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
if (is_guest_mode(vcpu))
update_guest_cr3 = false;
else if (enable_unrestricted_guest || is_paging(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
else
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
if (update_guest_cr3)
vmcs_writel(GUEST_CR3, guest_cr3);
}
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* Pass through host's Machine Check Enable value to hw_cr4, which
* is in force while we are in guest mode. Do not let guests control
* this bit, even if host CR4.MCE == 0.
*/
unsigned long hw_cr4;
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
if (cr4 & X86_CR4_UMIP) {
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
hw_cr4 &= ~X86_CR4_UMIP;
} else if (!is_guest_mode(vcpu) ||
!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
}
}
if (cr4 & X86_CR4_VMXE) {
/*
* To use VMXON (and later other VMX instructions), a guest
* must first be able to turn on cr4.VMXE (see handle_vmon()).
* So basically the check on whether to allow nested VMX
* is here. We operate under the default treatment of SMM,
* so VMX cannot be enabled under SMM.
*/
if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
return 1;
}
if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
return 1;
vcpu->arch.cr4 = cr4;
if (!enable_unrestricted_guest) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
}
/*
* SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
* hardware. To emulate this behavior, SMEP/SMAP/PKU needs
* to be manually disabled when guest switches to non-paging
* mode.
*
* If !enable_unrestricted_guest, the CPU is always running
* with CR0.PG=1 and CR4 needs to be modified.
* If enable_unrestricted_guest, the CPU automatically
* disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/
if (!is_paging(vcpu))
hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
}
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
return 0;