Quelle vmx.c Sprache: C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
*   Avi Kivity   <avi@qumranet.com>
*   Yaniv Kamay  <yaniv@qumranet.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fred.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>

#include <trace/events/ipi.h>

#include "capabilities.h"
#include "common.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
#include "posted_intr.h"

#include "mmu/spte.h"

/* Module metadata. */
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");

/*
* When built as a module, publish an x86 CPU match table keyed on
* X86_FEATURE_VMX via MODULE_DEVICE_TABLE().
* NOTE(review): presumably this is what allows the module to be autoloaded
* on VMX-capable CPUs - confirm.
*/
#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

/*
* Feature knobs exposed as module parameters.  Unless noted otherwise they
* are registered with mode 0444, i.e. readable but not writable through
* sysfs after load.
*/

/* "vpid" parameter; defaults to enabled. */
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

/* "vnmi" parameter; defaults to enabled. */
static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, 0444);

/* "flexpriority" parameter; defaults to enabled. */
bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

/* "ept" parameter; defaults to enabled. */
bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, 0444);

/* "unrestricted_guest" parameter; defaults to enabled. */
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
   enable_unrestricted_guest, bool, 0444);

/* "eptad" parameter; defaults to enabled. */
bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

/* "emulate_invalid_guest_state" parameter; defaults to true. */
static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, 0444);

/* "fasteoi" parameter; defaults to enabled. */
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);

/* These variables are defined outside this file; only registered here. */
module_param(enable_apicv, bool, 0444);
module_param(enable_ipiv, bool, 0444);

module_param(enable_device_posted_irqs, bool, 0444);

/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for their own guests. If nested=0, guests may not
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

/* "pml" parameter; defaults to enabled. */
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

/* "error_on_inconsistent_vmcs_config" parameter; defaults to true. */
static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

/* Defaults to off; mode 0644, so it is also writable at runtime. */
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

/* Flag values describing the MSR bitmap mode (x2APIC, x2APIC with APICv). */
#define MSR_BITMAP_MODE_X2APIC  1
#define MSR_BITMAP_MODE_X2APIC_APICV 2

/* Largest TSC multiplier value: all 64 bits set. */
#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/*
* Guest_tsc -> host_tsc conversion requires 64-bit division, hence the
* "preemption_timer" parameter is only registered on CONFIG_X86_64 builds.
*/
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

/* Defined outside this file; registered here as a read-only parameter. */
extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

/* CR0 bit sets named for being forced off / forced on. */
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON    \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

/* CR4 bit sets named for being forced on, per guest mode. */
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/* Complement of the RTIT_STATUS bits listed below. */
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))

/*
* These two parameters configure the controls for Pause-Loop Exiting:
* ple_gap:    upper bound on the amount of time between two successive
*             executions of PAUSE in a loop. Also indicates whether PLE is
*             enabled. According to tests, this time is usually smaller than
*             128 cycles.
* ple_window: upper bound on the amount of time a guest is allowed to execute
*             in a PAUSE loop. Tests indicate that most spinlocks are held for
*             less than 2^12 cycles.
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer to SDM volume 3b section 21.6.13 & 22.1.3.
*/
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/*
* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN).  The
* parameter is only registered when CONFIG_BROKEN is set; otherwise pt_mode
* keeps its PT_MODE_SYSTEM initializer.
*/
int __read_mostly pt_mode = PT_MODE_SYSTEM;
#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
#endif

/* LBR capabilities; read-only after init. */
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;

/*
* Static keys toggled by vmx_setup_l1d_flush(): "should_flush" is enabled for
* every mode other than "never", "flush_cond" only for the "cond" mode.
* The mutex serializes runtime updates made via vmentry_l1d_flush_set().
*/
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

/*
* Human-readable name of each flush state, indexed by the state value.
* Only entries with for_parse set are accepted as user input by
* vmentry_l1d_flush_parse(); the remaining ones are report-only.
*/
static const struct {
const char *option;
bool for_parse;
} vmentry_l1d_param[] = {
[VMENTER_L1D_FLUSH_AUTO]  = {"auto", true},
[VMENTER_L1D_FLUSH_NEVER]  = {"never", true},
[VMENTER_L1D_FLUSH_COND]  = {"cond", true},
[VMENTER_L1D_FLUSH_ALWAYS]  = {"always", true},
[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

/* Allocation order of the flush buffer: 2^4 pages. */
#define L1D_CACHE_ORDER 4
/* Flush buffer; allocated at most once by vmx_setup_l1d_flush(). */
static void *vmx_l1d_flush_pages;

/*
 * Resolve the requested L1D flush mode into the effective one, allocate the
 * software flush buffer if it is needed, and update l1tf_vmx_mitigation and
 * the two static keys accordingly.
 *
 * Returns 0 on success or -ENOMEM if the flush buffer cannot be allocated.
 */
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	unsigned int idx;
	struct page *pg;

	/* Unaffected CPUs need no flushing at all. */
	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	/* Without EPT, only record that fact. */
	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	/* The host advertises that the VM-entry flush can be skipped. */
	if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (l1tf != VMENTER_L1D_FLUSH_AUTO) {
		/* An explicit choice is overridden only by "full,force". */
		if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE)
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	} else {
		/* "auto": derive the mode from the global L1TF mitigation. */
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		case L1TF_MITIGATION_AUTO:
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		}
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * The buffer outlives any single VM, so it must not be
		 * charged to a memcg.
		 */
		pg = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!pg)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(pg);

		/*
		 * Fill every page with its own byte pattern so that KSM
		 * cannot merge them in the nested virtualization case.
		 */
		for (idx = 0; idx < 1u << L1D_CACHE_ORDER; ++idx)
			memset(vmx_l1d_flush_pages + idx * PAGE_SIZE, idx + 1,
			       PAGE_SIZE);
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf == VMENTER_L1D_FLUSH_NEVER)
		static_branch_disable(&vmx_l1d_should_flush);
	else
		static_branch_enable(&vmx_l1d_should_flush);

	if (l1tf != VMENTER_L1D_FLUSH_COND)
		static_branch_disable(&vmx_l1d_flush_cond);
	else
		static_branch_enable(&vmx_l1d_flush_cond);

	return 0;
}

/*
 * Translate a user-supplied mode name into its index in vmentry_l1d_param[].
 * Only entries marked for_parse are accepted.
 *
 * Returns the matching index, or -EINVAL for NULL or unknown input.
 */
static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int idx;

	if (!s)
		return -EINVAL;

	for (idx = 0; idx < ARRAY_SIZE(vmentry_l1d_param); idx++) {
		if (!vmentry_l1d_param[idx].for_parse)
			continue;
		if (sysfs_streq(s, vmentry_l1d_param[idx].option))
			return idx;
	}

	return -EINVAL;
}

/*
 * "set" handler of the vmentry_l1d_flush module parameter.
 *
 * Returns a negative errno for unparsable input or a failed setup, else 0.
 */
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int mode = vmentry_l1d_flush_parse(s);
	int err;

	if (mode < 0)
		return mode;

	/* Nothing to configure on CPUs without the L1TF bug. */
	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * l1tf_vmx_mitigation still being "auto" means vmx_init() has not run
	 * yet, i.e. this is the pre-init parameter parsing.  Just remember the
	 * value; vmx_init() performs the real setup once enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = mode;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	err = vmx_setup_l1d_flush(mode);
	mutex_unlock(&vmx_l1d_flush_mutex);

	return err;
}

/*
 * "get" handler of the vmentry_l1d_flush module parameter: emit the name of
 * the current mitigation state, or "???" (with a one-time warning) if the
 * state does not index into vmentry_l1d_param[].
 */
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (!WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sysfs_emit(s, "%s\n",
				  vmentry_l1d_param[l1tf_vmx_mitigation].option);

	return sysfs_emit(s, "???\n");
}

/*
 * If this vCPU has disable_fb_clear set, set FB_CLEAR_DIS in
 * MSR_IA32_MCU_OPT_CTRL and cache the written value in the vCPU.
 */
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 opt_ctrl;

	if (vmx->disable_fb_clear) {
		opt_ctrl = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL) | FB_CLEAR_DIS;
		native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, opt_ctrl);
		/* Remember the value so the MSR need not be read back later. */
		vmx->msr_ia32_mcu_opt_ctrl = opt_ctrl;
	}
}

/*
 * Counterpart of vmx_disable_fb_clear(): clear FB_CLEAR_DIS in the cached
 * MSR value and write the result back to MSR_IA32_MCU_OPT_CTRL.
 */
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (vmx->disable_fb_clear) {
		vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
		native_wrmsrq(MSR_IA32_MCU_OPT_CTRL,
			      vmx->msr_ia32_mcu_opt_ctrl);
	}
}

static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
/*
* Disable VERW's behavior of clearing CPU buffers for the guest if the
* CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
* the mitigation. Disabling the clearing behavior provides a
* performance boost for guests that aren't aware that manually clearing
* CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
    (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
    !boot_cpu_has_bug(X86_BUG_MDS) &&
    !boot_cpu_has_bug(X86_BUG_TAA);

/*
* If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
* at VMEntry. Skip the MSR read/write when a guest has no use case to
* execute VERW.
*/
if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
  vmx->disable_fb_clear = false;
}

/*
* Custom get/set handlers for the "vmentry_l1d_flush" module parameter,
* registered with mode 0644 (readable and writable).
*/
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

/* Forward declaration; the definition is not in this part of the file. */
static u32 vmx_segment_access_rights(struct kvm_segment *var);

/* Declared here, defined outside this part of the file. */
void vmx_vmexit(void);

/*
* Report a failed VMX instruction: WARN once, and additionally emit a
* rate-limited pr_warn.  The arguments are expanded into both calls, so any
* argument expression is evaluated twice.
*/
#define vmx_insn_failed(fmt...)  \
do {     \
WARN_ONCE(1, fmt);  \
pr_warn_ratelimited(fmt); \
} while (0)

/* Report a failed VMREAD of @field via vmx_insn_failed(). */
noinline void vmread_error(unsigned long field)
{
vmx_insn_failed("vmread failed: field=%lx\n", field);
}

#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * VMREAD failure handler used when the compiler lacks asm-goto-with-outputs.
 * A faulting VMREAD is passed to kvm_spurious_fault(); any other failure is
 * reported through vmread_error(), bracketed by instrumentation_begin/end
 * because this function is noinstr.
 */
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
{
	if (fault) {
		kvm_spurious_fault();
		return;
	}

	instrumentation_begin();
	vmread_error(field);
	instrumentation_end();
}
#endif

/*
* Report a failed VMWRITE of @value to @field, including the current
* VM_INSTRUCTION_ERROR value read from the VMCS.
*/
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
   field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

/*
* Report a failed VMCLEAR of @vmcs (physical address @phys_addr), including
* the current VM_INSTRUCTION_ERROR value read from the VMCS.
*/
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
   vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

/*
* Report a failed VMPTRLD of @vmcs (physical address @phys_addr), including
* the current VM_INSTRUCTION_ERROR value read from the VMCS.
*/
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
   vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

/* Report a failed INVVPID with its extent type, VPID and linear address. */
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
   ext, vpid, gva);
}

/* Report a failed INVEPT with its extent type and EPT pointer. */
noinline void invept_error(unsigned long ext, u64 eptp)
{
vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}

/*
* Per-CPU VMCS pointer.
* NOTE(review): presumably the per-CPU VMXON region - confirm against users.
*/
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
/*
* Per-CPU "current" VMCS pointer; __loaded_vmcs_clear() resets it to NULL
* when the VMCS it points to is cleared on that CPU.
*/
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
* We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/* Bitmap with VMX_NR_VPIDS bits, and the spinlock declared alongside it. */
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

/* Global VMCS configuration and VMX capabilities; read-only after init. */
struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability vmx_capability __ro_after_init;

/*
* Build the kvm_vmx_segment_fields[] entry for segment @seg: the VMCS field
* encodings of its selector, base, limit and access-rights bytes.
*/
#define VMX_SEGMENT_FIELD(seg)     \
[VCPU_SREG_##seg] = {                                   \
  .selector = GUEST_##seg##_SELECTOR,  \
  .base = GUEST_##seg##_BASE,      \
  .limit = GUEST_##seg##_LIMIT,      \
  .ar_bytes = GUEST_##seg##_AR_BYTES,     \
}

/*
* VMCS field encodings for every guest segment register, indexed by
* VCPU_SREG_*.  Used by the vmx_read_guest_seg_*() cache helpers.
*/
static const struct kvm_vmx_segment_field {
unsigned selector;
unsigned base;
unsigned limit;
unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
VMX_SEGMENT_FIELD(CS),
VMX_SEGMENT_FIELD(DS),
VMX_SEGMENT_FIELD(ES),
VMX_SEGMENT_FIELD(FS),
VMX_SEGMENT_FIELD(GS),
VMX_SEGMENT_FIELD(SS),
VMX_SEGMENT_FIELD(TR),
VMX_SEGMENT_FIELD(LDTR),
};

/* NOTE(review): presumably the host IDT base address - set outside this part of the file. */
static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
/*
* "enlightened_vmcs" module parameter (0444), on by default.
* hv_init_evmcs() clears it when eVMCS cannot be used.
*/
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

/*
 * Fill in the fields of the vCPU's currently loaded (enlightened) VMCS that
 * enable the nested flush hypercall.
 *
 * Returns 0 on success, -ENOMEM if no partition assist page is available.
 */
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
	hpa_t assist_pa = hv_get_partition_assist_page(vcpu);
	struct hv_enlightened_vmcs *evmcs;

	if (assist_pa == INVALID_PAGE)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page = assist_pa;
	/* The VM pointer doubles as the VM id. */
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

/*
 * Decide at init time whether Hyper-V's enlightened VMCS is used, and hook
 * up the L2 TLB flush enlightenment when the host offers direct flush.
 */
static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * eVMCS must be recommended by the host, and the host must support
	 * eVMCS v1 or above; otherwise give up on it.
	 */
	if (!(ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) ||
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) <
	     KVM_EVMCS_VERSION) {
		enlightened_vmcs = false;
		return;
	}

	/* Every online CPU needs a VP assist page. */
	for_each_online_cpu(cpu) {
		if (hv_get_vp_assist_page(cpu))
			continue;

		enlightened_vmcs = false;
		break;
	}

	if (enlightened_vmcs) {
		pr_info("Using Hyper-V Enlightened VMCS\n");
		static_branch_enable(&__kvm_is_using_evmcs);
	}

	if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
		vt_x86_ops.enable_l2_tlb_flush = hv_enable_l2_tlb_flush;
}

/*
 * Undo the eVMCS-related state in the current CPU's VP assist page.  Does
 * nothing unless eVMCS is in use.
 */
static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *assist;

	if (!kvm_is_using_evmcs())
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled and the
	 * CPU doesn't have a VP assist page allocated.
	 */
	assist = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!assist))
		return;

	/*
	 * Reset everything so that non-enlightened VMCS access can be used
	 * later, e.g. after reloading the module with enlightened_vmcs=0.
	 */
	assist->nested_control.features.directhypercall = 0;
	assist->current_nested_vmcs = 0;
	assist->enlighten_vmentry = 0;
}

#else /* IS_ENABLED(CONFIG_HYPERV) */
/* Without CONFIG_HYPERV the eVMCS init/reset hooks are empty stubs. */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
* CPUID.1:EAX signatures matched by cpu_has_broken_vmx_preemption_timer().
*
* Format of the per-entry comments: document - erratum name - stepping -
* processor name.  Taken from
* https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
*/
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
* 320767.pdf - AAP86  - B1 -
* i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
*/
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};

/*
 * Return true if this CPU's CPUID.1:EAX signature (reserved bits masked off)
 * is listed in vmx_preemption_cpu_tfms[].
 */
static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 signature = cpuid_eax(0x00000001);
	u32 idx;

	/* Drop the reserved bits 15:14 and 31:28 before comparing. */
	signature &= ~(0x3U << 14 | 0xfU << 28);

	for (idx = 0; idx < ARRAY_SIZE(vmx_preemption_cpu_tfms); idx++) {
		if (vmx_preemption_cpu_tfms[idx] == signature)
			return true;
	}

	return false;
}

/*
* Return true when flexpriority is enabled and @vcpu has an in-kernel local
* APIC.
*/
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
return flexpriority_enabled && lapic_in_kernel(vcpu);
}

/*
 * Look up the vCPU's user-return MSR entry for @msr.
 *
 * Returns a pointer into vmx->guest_uret_msrs[], or NULL if @msr has no
 * user-return slot.
 */
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int slot = kvm_find_user_return_msr(msr);

	if (slot < 0)
		return NULL;

	return &vmx->guest_uret_msrs[slot];
}

/*
 * Update the guest value of a user-return MSR.  If the MSR is currently
 * loaded into hardware, the new value is pushed to hardware first (with
 * preemption disabled); the cached value is only updated when that
 * succeeds.
 *
 * Returns 0 on success, else the error from kvm_set_user_return_msr().
 */
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	/* The slot number is the entry's index within guest_uret_msrs[]. */
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int err;

	if (!msr->load_into_hardware) {
		msr->data = data;
		return 0;
	}

	preempt_disable();
	err = kvm_set_user_return_msr(slot, data, msr->mask);
	preempt_enable();
	if (err)
		return err;

	msr->data = data;
	return 0;
}

/*
* Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
*
* Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
* atomically track post-VMXON state, e.g. this may be called in NMI context.
* Eat all faults, as all other faults on VMXOFF are mode related, i.e.
* faults are guaranteed to be due to the !post-VMXON check unless the CPU is
* magically in RM, VM86, compat mode, or at CPL>0.
*
* Returns 0 if VMXOFF executed without faulting, -EIO if it faulted.
*/
static int kvm_cpu_vmxoff(void)
{
/* A fault on VMXOFF is routed to the "fault" label via the exception table. */
asm goto("1: vmxoff\n\t"
     _ASM_EXTABLE(1b, %l[fault])
     ::: "cc", "memory" : fault);

cr4_clear_bits(X86_CR4_VMXE);
return 0;

fault:
/* CR4.VMXE is cleared on the failure path as well. */
cr4_clear_bits(X86_CR4_VMXE);
return -EIO;
}

/*
 * Emergency path: flag that KVM is rebooting, VMCLEAR every VMCS (and shadow
 * VMCS) on this CPU's loaded list, then leave VMX operation.
 */
void vmx_emergency_disable_virtualization_cpu(void)
{
	struct loaded_vmcs *lv;
	int cpu = raw_smp_processor_id();

	kvm_rebooting = true;

	/*
	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
	 * set in task context.  If this races with VMX being disabled by an
	 * NMI, VMCLEAR and VMXOFF may #UD, but KVM will eat those faults
	 * because kvm_rebooting is set.
	 */
	if (__read_cr4() & X86_CR4_VMXE) {
		list_for_each_entry(lv, &per_cpu(loaded_vmcss_on_cpu, cpu),
				    loaded_vmcss_on_cpu_link) {
			vmcs_clear(lv->vmcs);
			if (lv->shadow_vmcs)
				vmcs_clear(lv->shadow_vmcs);
		}

		kvm_cpu_vmxoff();
	}
}

/*
* VMCLEAR @arg (a struct loaded_vmcs) on the CPU this runs on and detach it
* from that CPU's loaded list.  Invoked through smp_call_function_single()
* by loaded_vmcs_clear(), so it is expected to run on loaded_vmcs->cpu.
*/
static void __loaded_vmcs_clear(void *arg)
{
struct loaded_vmcs *loaded_vmcs = arg;
int cpu = raw_smp_processor_id();

if (loaded_vmcs->cpu != cpu)
  return; /* vcpu migration can race with cpu offline */
/* Forget the VMCS as this CPU's current one before clearing it. */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
  per_cpu(current_vmcs, cpu) = NULL;

vmcs_clear(loaded_vmcs->vmcs);
/* The shadow VMCS is only cleared if this VMCS has been launched. */
if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
  vmcs_clear(loaded_vmcs->shadow_vmcs);

list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

/*
* Ensure all writes to loaded_vmcs, including deleting it from its
* current percpu list, complete before setting loaded_vmcs->cpu to
* -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
* and add loaded_vmcs to its percpu list before it's deleted from this
* cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();

loaded_vmcs->cpu = -1;
loaded_vmcs->launched = 0;
}

/*
 * Clear @loaded_vmcs on whichever CPU it is recorded as loaded on, waiting
 * for the remote call to finish.  A cpu of -1 means there is nothing to do.
 */
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu == -1)
		return;

	smp_call_function_single(cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
}

/*
 * Test-and-set the "cached" bit of one field of one guest segment.
 *
 * If the segment cache as a whole is not marked available, it is marked
 * available and emptied first.
 *
 * @vmx:   vCPU whose segment cache is consulted
 * @seg:   segment index (VCPU_SREG_*)
 * @field: field index (SEG_FIELD_*)
 *
 * Returns true if the field was already cached, false if the caller must
 * read it from the VMCS and store it in the cache.
 */
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	/*
	 * Shift an unsigned constant: with eight segments and (presumably)
	 * four fields per segment the bit index can reach 31, and shifting
	 * a signed 1 into the sign bit is undefined behavior in C.
	 */
	u32 mask = 1u << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

/*
 * Return the selector of guest segment @seg, reading it from the VMCS and
 * caching it on first use.
 */
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	if (vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		return vmx->segment_cache.seg[seg].selector;

	vmx->segment_cache.seg[seg].selector =
		vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return vmx->segment_cache.seg[seg].selector;
}

/*
 * Return the base of guest segment @seg, reading it from the VMCS and
 * caching it on first use.
 */
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	if (vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		return vmx->segment_cache.seg[seg].base;

	vmx->segment_cache.seg[seg].base =
		vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return vmx->segment_cache.seg[seg].base;
}

/*
 * Return the limit of guest segment @seg, reading it from the VMCS and
 * caching it on first use.
 */
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	if (vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		return vmx->segment_cache.seg[seg].limit;

	vmx->segment_cache.seg[seg].limit =
		vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return vmx->segment_cache.seg[seg].limit;
}

/*
 * Return the access-rights bytes of guest segment @seg, reading them from
 * the VMCS and caching them on first use.
 */
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	if (vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		return vmx->segment_cache.seg[seg].ar;

	vmx->segment_cache.seg[seg].ar =
		vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return vmx->segment_cache.seg[seg].ar;
}

/*
* Recompute the set of intercepted exceptions for @vcpu and write it to the
* EXCEPTION_BITMAP field of the current VMCS.  When not in guest mode, the
* #PF error-code mask/match fields are written as well.
*
* The order of the adjustments below matters: later steps deliberately
* override earlier ones (e.g. vm86 mode sets every bit, after which the #PF
* bit may still be cleared).
*/
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;

/* Baseline: #PF, #UD, #MC, #DB and #AC are intercepted. */
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
      (1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
* #VE isn't used for VMX.  To test against unexpected changes
* related to #VE for VMX, intercept unexpected #VE and warn on it.
*/
if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
  eb |= 1u << VE_VECTOR;
/*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
* as VMware does.
*/
if (enable_vmware_backdoor)
  eb |= (1u << GP_VECTOR);
/* Intercept #BP only when software breakpoints are used for guest debug. */
if ((vcpu->guest_debug &
      (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
  eb |= 1u << BP_VECTOR;
/* With vm86 emulation of real mode active, intercept everything. */
if (to_vmx(vcpu)->rmode.vm86_active)
  eb = ~0;
if (!vmx_need_pf_intercept(vcpu))
  eb &= ~(1u << PF_VECTOR);

/*
* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
* them to L1. When running L2, we will only handle the exceptions
* specified above if L1 did not want them.
*/
if (is_guest_mode(vcpu))
  eb |= get_vmcs12(vcpu)->exception_bitmap;
else {
  int mask = 0, match = 0;

  if (enable_ept && (eb & (1u << PF_VECTOR))) {
   /*
* If EPT is enabled, #PF is currently only intercepted
* if MAXPHYADDR is smaller on the guest than on the
* host.  In that case we only care about present,
* non-reserved faults.  For vmcs02, however, PFEC_MASK
* and PFEC_MATCH are set in prepare_vmcs02_rare.
*/
   mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
   match = PFERR_PRESENT_MASK;
  }
  vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
  vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}

/*
* Disabling xfd interception indicates that dynamic xfeatures
* might be used in the guest. Always trap #NM in this case
* to save guest xfd_err timely.
*/
if (vcpu->arch.xfd_no_write_intercept)
  eb |= (1u << NM_VECTOR);

vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Tell whether writes to @msr are intercepted under the currently loaded
 * MSR bitmap.  If MSR bitmaps are not in use at all, every write is
 * reported as intercepted.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
	if (exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)
		return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap,
						 msr);

	return true;
}

/*
 * Assemble the VMX_RUN_* flags for the next run of @vmx.
 */
unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
{
	/* An already launched VMCS is entered with VMRESUME. */
	unsigned int run_flags = vmx->loaded_vmcs->launched ? VMX_RUN_VMRESUME : 0;

	/*
	 * When SPEC_CTRL writes are not intercepted the guest may change the
	 * MSR without a VM-exit, so it has to be read back after VM-exit and
	 * stored in vmx->spec_ctrl.
	 */
	if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
		run_flags |= VMX_RUN_SAVE_SPEC_CTRL;

	if (static_branch_unlikely(&cpu_buf_vm_clear) &&
	    kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
		run_flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;

	return run_flags;
}

/*
* Clear the given VM-entry control bit(s) @entry and VM-exit control bit(s)
* @exit for @vmx.  Used by clear_atomic_switch_msr() for MSRs that are
* switched through dedicated controls.
*/
static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
  unsigned long entry, unsigned long exit)
{
vm_entry_controls_clearbit(vmx, entry);
vm_exit_controls_clearbit(vmx, exit);
}

/*
 * Find @msr among the first m->nr entries of an MSR load/store list.
 *
 * Returns the index of the first matching entry, or -ENOENT if absent.
 */
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int slot = 0;

	while (slot < m->nr) {
		if (m->val[slot].index == msr)
			return slot;
		++slot;
	}

	return -ENOENT;
}

/*
 * Stop atomically switching @msr on VM-entry/VM-exit.
 *
 * EFER and PERF_GLOBAL_CTRL are handled by clearing their dedicated
 * entry/exit controls when the CPU supports them.  Any other MSR (or those
 * two without CPU support) is removed from the guest and host autoload
 * lists; removal moves the last list entry into the freed slot.
 */
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	struct msr_autoload *m = &vmx->msr_autoload;
	int slot;

	if (msr == MSR_EFER && cpu_has_load_ia32_efer()) {
		clear_atomic_switch_msr_special(vmx,
						VM_ENTRY_LOAD_IA32_EFER,
						VM_EXIT_LOAD_IA32_EFER);
		return;
	}

	if (msr == MSR_CORE_PERF_GLOBAL_CTRL &&
	    cpu_has_load_perf_global_ctrl()) {
		clear_atomic_switch_msr_special(vmx,
						VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
						VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
		return;
	}

	/* Guest (VM-entry load) list. */
	slot = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (slot >= 0) {
		--m->guest.nr;
		m->guest.val[slot] = m->guest.val[m->guest.nr];
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}

	/* Host (VM-exit load) list. */
	slot = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (slot >= 0) {
		--m->host.nr;
		m->host.val[slot] = m->host.val[m->host.nr];
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
}

/*
* Switch an MSR through dedicated VMCS fields: write the guest value to
* @guest_val_vmcs, the host value to @host_val_vmcs, and set the @entry and
* @exit control bits.
*
* The host value is not written when @host_val_vmcs is HOST_IA32_EFER.
* NOTE(review): presumably because the host EFER field is programmed
* elsewhere - confirm.
*/
static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
  unsigned long entry, unsigned long exit,
  unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
  u64 guest_val, u64 host_val)
{
vmcs_write64(guest_val_vmcs, guest_val);
if (host_val_vmcs != HOST_IA32_EFER)
  vmcs_write64(host_val_vmcs, host_val);
vm_entry_controls_setbit(vmx, entry);
vm_exit_controls_setbit(vmx, exit);
}

/*
* Arrange for @msr to be atomically switched: loaded with @guest_val on
* VM-entry and, unless @entry_only, with @host_val on VM-exit.
*
* EFER and PERF_GLOBAL_CTRL use their dedicated VMCS controls when the CPU
* supports them; everything else goes through the guest/host autoload
* lists.  If either list is full the request is dropped with a one-time
* warning.
*/
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
      u64 guest_val, u64 host_val, bool entry_only)
{
/* j starts at 0 so that the "host list full" check is skipped for entry_only. */
int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;

switch (msr) {
case MSR_EFER:
  if (cpu_has_load_ia32_efer()) {
   add_atomic_switch_msr_special(vmx,
     VM_ENTRY_LOAD_IA32_EFER,
     VM_EXIT_LOAD_IA32_EFER,
     GUEST_IA32_EFER,
     HOST_IA32_EFER,
     guest_val, host_val);
   return;
  }
  break;
case MSR_CORE_PERF_GLOBAL_CTRL:
  if (cpu_has_load_perf_global_ctrl()) {
   add_atomic_switch_msr_special(vmx,
     VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
     VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
     GUEST_IA32_PERF_GLOBAL_CTRL,
     HOST_IA32_PERF_GLOBAL_CTRL,
     guest_val, host_val);
   return;
  }
  break;
case MSR_IA32_PEBS_ENABLE:
  /*
* PEBS needs a quiescent period after being disabled (to write
* a record).  Disabling PEBS through VMX MSR swapping doesn't
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
  wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}

/* Look for existing slots so that an MSR is never listed twice. */
i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (!entry_only)
  j = vmx_find_loadstore_msr_slot(&m->host, msr);

if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
     (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
  printk_once(KERN_WARNING "Not enough msr switch entries. "
    "Can't add msr %x\n", msr);
  return;
}
/* Append a new guest entry if the MSR was not listed yet. */
if (i < 0) {
  i = m->guest.nr++;
  vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
}
m->guest.val[i].index = msr;
m->guest.val[i].value = guest_val;

if (entry_only)
  return;

/* Same for the host entry. */
if (j < 0) {
  j = m->host.nr++;
  vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
m->host.val[j].index = msr;
m->host.val[j].value = host_val;
}

/*
* Decide how the guest's EFER is switched in and out.
*
* Returns false when EFER is handled through the atomic switch mechanism
* (or when no user-return slot exists for MSR_EFER), and true when the
* vCPU's user-return MSR entry for EFER has been filled in with the value
* and mask to use.
*/
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
int i;

/* Shadow paging assumes NX to be available.  */
if (!enable_ept)
  guest_efer |= EFER_NX;

/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
  ignore_bits &= ~(u64)EFER_SCE;
#endif

/*
* On EPT, we can't emulate NX, so we must switch EFER atomically.
* On CPUs that support "load IA32_EFER", always switch EFER
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
     (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
  if (!(guest_efer & EFER_LMA))
   guest_efer &= ~EFER_LME;
  /* No switching is needed when guest and host values are identical. */
  if (guest_efer != kvm_host.efer)
   add_atomic_switch_msr(vmx, MSR_EFER,
           guest_efer, kvm_host.efer, false);
  else
   clear_atomic_switch_msr(vmx, MSR_EFER);
  return false;
}

i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0)
  return false;

clear_atomic_switch_msr(vmx, MSR_EFER);

/* For the ignored bits, use the host's values instead of the guest's. */
guest_efer &= ~ignore_bits;
guest_efer |= kvm_host.efer & ignore_bits;

vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;

return true;
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 *
 * Returns the descriptor base for @selector, or 0 for a null selector (or
 * a null LDT selector when @selector refers to the LDT).
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *descs;
	u16 ldt_sel;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	descs = get_current_gdt_ro();

	/* LDT-relative selector: resolve the LDT's own base first. */
	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		ldt_sel = kvm_read_ldt();
		if (!(ldt_sel & ~SEGMENT_RPL_MASK))
			return 0;

		descs = (struct desc_struct *)segment_base(ldt_sel);
	}

	return get_desc_base(&descs[selector >> 3]);
}
#endif

/*
* Return true when PT is in host-guest mode and the guest's RTIT_CTL does
* not have TraceEn set.
*/
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
return vmx_pt_mode_is_host_guest() &&
        !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

/* Validate a guest-supplied PT output base address for @vcpu. */
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
/* The base must be 128-byte aligned and a legal physical address. */
return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

/*
 * Write the PT context @ctx to the RTIT MSRs: status, output base/mask,
 * CR3 match and the first @addr_range address-range pairs.
 */
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 range;

	wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);

	/* The A/B MSRs of consecutive ranges are two MSR indices apart. */
	for (range = 0; range < addr_range; range++) {
		u32 offset = range * 2;

		wrmsrq(MSR_IA32_RTIT_ADDR0_A + offset, ctx->addr_a[range]);
		wrmsrq(MSR_IA32_RTIT_ADDR0_B + offset, ctx->addr_b[range]);
	}
}

/*
 * Read the RTIT MSRs into @ctx: status, output base, output mask, CR3 match,
 * and then @addr_range pairs of ADDRn_A/ADDRn_B.
 */
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 range;

	rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);

	/* Each address range occupies two consecutive MSR indices (A, then B). */
	for (range = 0; range < addr_range; range++) {
		rdmsrq(MSR_IA32_RTIT_ADDR0_A + range * 2, ctx->addr_a[range]);
		rdmsrq(MSR_IA32_RTIT_ADDR0_B + range * 2, ctx->addr_b[range]);
	}
}

/*
 * Prepare PT state before running the guest.  Nothing is done in system
 * mode.  Otherwise the host's RTIT_CTL is always recorded, and the remaining
 * host PT MSRs are swapped for the guest's only if the guest has TraceEn set.
 */
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);

	if (!(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN))
		return;

	/* RTIT_CTL is zeroed before the other PT MSRs are saved and reloaded. */
	wrmsrq(MSR_IA32_RTIT_CTL, 0);
	pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}

/*
 * Undo pt_guest_enter() after the guest has run.  Nothing is done in system
 * mode.  If the guest had TraceEn set, its PT MSRs are saved and the host's
 * are reloaded; the host's RTIT_CTL is then rewritten if it was non-zero.
 */
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	bool guest_tracing;

	if (vmx_pt_mode_is_system())
		return;

	guest_tracing = vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN;
	if (guest_tracing) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (!vmx->pt_desc.host.ctl)
		return;

	wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

/*
 * Update the host FS/GS selector and base fields in the VMCS, using @host as
 * a cache so that a field is only rewritten when its value changed.
 *
 * A selector with any of its low three bits set is not written as-is; zero
 * is written to the VMCS instead, while the real selector is still recorded
 * in @host.
 */
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(host->fs_sel != fs_sel)) {
		vmcs_write16(HOST_FS_SELECTOR, (fs_sel & 7) ? 0 : fs_sel);
		host->fs_sel = fs_sel;
	}

	if (unlikely(host->gs_sel != gs_sel)) {
		vmcs_write16(HOST_GS_SELECTOR, (gs_sel & 7) ? 0 : gs_sel);
		host->gs_sel = gs_sel;
	}

	if (unlikely(host->fs_base != fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}

	if (unlikely(host->gs_base != gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

/*
 * Get host and guest register state ready for running the guest.
 *
 * In order, this function:
 *  - pushes the vCPU's user-return MSRs flagged load_into_hardware into
 *    hardware, once, until guest_uret_msrs_loaded is cleared again;
 *  - syncs vmcs12 to the shadow VMCS if a sync was requested;
 *  - returns early if guest state is already loaded;
 *  - otherwise snapshots the host LDT/DS/ES/FS/GS selectors and FS/GS bases,
 *    installs the guest's MSR_KERNEL_GS_BASE (64-bit only), records the host
 *    FS/GS values in the VMCS, and marks guest state as loaded.
 */
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vcpu_vt *vt = to_vt(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
#endif
unsigned long fs_base, gs_base;
u16 fs_sel, gs_sel;
int i;

/*
 * Note that guest MSRs to be saved/restored can also be changed
 * when guest state is loaded. This happens when guest transitions
 * to/from long-mode by setting MSR_EFER.LMA.
 */
if (!vmx->guest_uret_msrs_loaded) {
  vmx->guest_uret_msrs_loaded = true;
  for (i = 0; i < kvm_nr_uret_msrs; ++i) {
   /* Skip MSRs that are not to be loaded into hardware. */
   if (!vmx->guest_uret_msrs[i].load_into_hardware)
    continue;

   kvm_set_user_return_msr(i,
      vmx->guest_uret_msrs[i].data,
      vmx->guest_uret_msrs[i].mask);
  }
}

if (vmx->nested.need_vmcs12_to_shadow_sync)
  nested_sync_vmcs12_to_shadow(vcpu);

/* Everything below only needs to run once per guest-state load. */
if (vt->guest_state_loaded)
  return;

host_state = &vmx->loaded_vmcs->host_state;

/*
 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 * allow segment selectors with cpl > 0 or ti == 1.
 */
host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
savesegment(ds, host_state->ds_sel);
savesegment(es, host_state->es_sel);

gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
  /* Take FS/GS state from current->thread after refreshing it. */
  current_save_fsgs();
  fs_sel = current->thread.fsindex;
  gs_sel = current->thread.gsindex;
  fs_base = current->thread.fsbase;
  vt->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
  /* Otherwise read the selectors and MSRs directly. */
  savesegment(fs, fs_sel);
  savesegment(gs, gs_sel);
  fs_base = read_msr(MSR_FS_BASE);
  vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}

/* The host value was saved above; install the guest's value. */
wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = segment_base(fs_sel);
gs_base = segment_base(gs_sel);
#endif

vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
vt->guest_state_loaded = true;
}

/*
 * Restore the host state that vmx_prepare_switch_to_guest() saved.  This is
 * a nop if guest state is not currently loaded.  Otherwise the guest's
 * MSR_KERNEL_GS_BASE is read back (64-bit only), the host's LDT and segment
 * selectors are reloaded where needed, the host's MSR_KERNEL_GS_BASE is
 * restored (64-bit only), and both the guest_state_loaded and
 * guest_uret_msrs_loaded flags are cleared.
 */
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;

if (!vmx->vt.guest_state_loaded)
  return;

host_state = &vmx->loaded_vmcs->host_state;

++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
/* Capture the guest's current value before the host's is restored below. */
rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
/* Reload LDT and GS if an LDT was in use or GS has any low three bits set. */
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
  kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
  load_gs_index(host_state->gs_sel);
#else
  loadsegment(gs, host_state->gs_sel);
#endif
}
/* FS only needs an explicit reload if any of its low three bits are set. */
if (host_state->fs_sel & 7)
  loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
/* DS/ES are reloaded only if at least one of them was non-zero. */
if (unlikely(host_state->ds_sel | host_state->es_sel)) {
  loadsegment(ds, host_state->ds_sel);
  loadsegment(es, host_state->es_sel);
}
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
vmx->vt.guest_state_loaded = false;
vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
/*
 * Return the guest's MSR_KERNEL_GS_BASE.  If guest state is currently loaded,
 * the cached copy is first refreshed from the hardware MSR; the check and the
 * MSR read are done with preemption disabled.
 */
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
preempt_disable();
if (vmx->vt.guest_state_loaded)
  rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}

/*
 * Set the guest's MSR_KERNEL_GS_BASE to @data.  The cached copy is always
 * updated; the hardware MSR is also written if guest state is currently
 * loaded, with the check and the MSR write done with preemption disabled.
 */
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
preempt_disable();
if (vmx->vt.guest_state_loaded)
  wrmsrq(MSR_KERNEL_GS_BASE, data);
preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif

/*
 * Recompute the vCPU's PLE window via __grow_ple_window().  If the value
 * changed, flag it dirty and emit a tracepoint with the new and old values.
 */
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int prev = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(prev, ple_window, ple_window_grow,
					    ple_window_max);
	if (vmx->ple_window == prev)
		return;

	vmx->ple_window_dirty = true;
	trace_kvm_ple_window_update(vcpu->vcpu_id, vmx->ple_window, prev);
}

/*
 * Recompute the vCPU's PLE window via __shrink_ple_window(), passing the
 * module's ple_window as both the base and the final argument.  If the value
 * changed, flag it dirty and emit a tracepoint with the new and old values.
 */
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int prev = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(prev, ple_window,
					      ple_window_shrink, ple_window);
	if (vmx->ple_window == prev)
		return;

	vmx->ple_window_dirty = true;
	trace_kvm_ple_window_update(vcpu->vcpu_id, vmx->ple_window, prev);
}

/*
 * Make the vCPU's loaded_vmcs the current VMCS on @cpu.
 *
 * If the VMCS was last loaded on a different CPU it is first cleared and
 * added to @cpu's loaded_vmcss_on_cpu list.  The VMCS is then loaded if it is
 * not already @cpu's current VMCS.  Finally, on a CPU change, a TLB flush is
 * requested and the per-CPU host fields (TR base, GDTR base and, where
 * applicable, SYSENTER_ESP) are rewritten before recording @cpu in the
 * loaded_vmcs.
 */
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
struct vmcs *prev;

if (!already_loaded) {
  loaded_vmcs_clear(vmx->loaded_vmcs);
  local_irq_disable();

  /*
   * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
   * this cpu's percpu list, otherwise it may not yet be deleted
   * from its previous cpu's percpu list.  Pairs with the
   * smp_wmb() in __loaded_vmcs_clear().
   */
  smp_rmb();

  list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
    &per_cpu(loaded_vmcss_on_cpu, cpu));
  local_irq_enable();
}

/* Only issue a VMCS load if a different VMCS is current on this CPU. */
prev = per_cpu(current_vmcs, cpu);
if (prev != vmx->loaded_vmcs->vmcs) {
  per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
  vmcs_load(vmx->loaded_vmcs->vmcs);
}

if (!already_loaded) {
  void *gdt = get_current_gdt_ro();

  /*
   * Flush all EPTP/VPID contexts, the new pCPU may have stale
   * TLB entries from its previous association with the vCPU.
   */
  kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

  /*
   * Linux uses per-cpu TSS and GDT, so set these when switching
   * processors.  See 22.2.4.
   */
  vmcs_writel(HOST_TR_BASE,
       (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
  vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

  if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
   /* 22.2.3 */
   vmcs_writel(HOST_IA32_SYSENTER_ESP,
        (unsigned long)(cpu_entry_stack(cpu) + 1));
  }

  /* Record the new CPU last, after the per-CPU fields are written. */
  vmx->loaded_vmcs->cpu = cpu;
}
}

/*
 * Switches to the specified vcpu until a matching vcpu_put().  The caller is
 * assumed to already hold the vcpu mutex.
 *
 * If the vCPU is being loaded after having been scheduled out, and the VM
 * does not have pause-in-guest enabled, its PLE window is shrunk first.
 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	if (vcpu->scheduled_out) {
		if (!kvm_pause_in_guest(vcpu->kvm))
			shrink_ple_window(vcpu);
	}

	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_vcpu_pi_load(vcpu, cpu);
}

/*
 * Counterpart of vmx_vcpu_load(): runs the posted-interrupt put hook and then
 * switches register state back to the host.
 */
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);

vmx_prepare_switch_to_host(to_vmx(vcpu));
}

/*
 * Emulation is required when emulate_invalid_guest_state is enabled and the
 * vCPU's current guest state is not valid.
 */
bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	if (!emulate_invalid_guest_state)
		return false;

	return !vmx_guest_state_valid(vcpu);
}

/*
 * Return the guest's RFLAGS.  The value is cached in vmx->rflags; on a cache
 * miss it is read from the VMCS and the register is marked available.  When
 * vm86 mode is active, only the guest-owned bits come from the VMCS and the
 * remaining bits are taken from the value saved in rmode.save_rflags.
 */
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS))
		return vmx->rflags;

	kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
	rflags = vmcs_readl(GUEST_RFLAGS);

	if (vmx->rmode.vm86_active) {
		save_rflags = vmx->rmode.save_rflags;
		rflags = (rflags & RMODE_GUEST_OWNED_EFLAGS_BITS) |
			 (save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS);
	}

	vmx->rflags = rflags;
	return vmx->rflags;
}

/*
 * Set the guest's RFLAGS, updating both the cached copy and the VMCS.  When
 * vm86 mode is active the requested value is saved in rmode.save_rflags and
 * the VMCS gets the value with IOPL and VM forced on.  If the VM flag
 * changed, emulation_required is re-evaluated.
 */
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long prev_rflags;

	/*
	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
	 * is an unrestricted guest in order to mark L2 as needing emulation
	 * if L1 runs L2 as a restricted guest.
	 */
	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	prev_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;

	if (!vmx->rmode.vm86_active) {
		vmcs_writel(GUEST_RFLAGS, rflags);
	} else {
		vmx->rmode.save_rflags = rflags;
		vmcs_writel(GUEST_RFLAGS,
			    rflags | X86_EFLAGS_IOPL | X86_EFLAGS_VM);
	}

	/* Compare against the cached (unmodified) value, not the VMCS one. */
	if ((prev_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}

/* Returns true if the IF bit is set in the guest's RFLAGS. */
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	unsigned long rflags = vmx_get_rflags(vcpu);

	return (rflags & X86_EFLAGS_IF) != 0;
}

/*
 * Translate the VMCS guest interruptibility state into KVM's
 * KVM_X86_SHADOW_INT_* flags (STI and MOV SS blocking only).
 */
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 intr_state = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int shadow;

	shadow = (intr_state & GUEST_INTR_STATE_STI) ? KVM_X86_SHADOW_INT_STI : 0;
	if (intr_state & GUEST_INTR_STATE_MOV_SS)
		shadow |= KVM_X86_SHADOW_INT_MOV_SS;

	return shadow;
}

/*
 * Program the STI / MOV SS blocking bits of the VMCS guest interruptibility
 * state from @mask.  At most one of the two bits is set, with MOV SS taking
 * precedence over STI.  The VMCS is only written if the value changed.
 */
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 current_state = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 updated;

	updated = current_state &
		  ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		updated |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		updated |= GUEST_INTR_STATE_STI;

	if (updated == current_state)
		return;

	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, updated);
}

/*
 * Validate a value about to be written to the guest's IA32_RTIT_CTL.
 *
 * Returns 0 if @data is acceptable and 1 if the write should be rejected.
 * The checks use the vCPU's PT descriptor: its reserved-bit mask, its cached
 * guest RTIT_CTL, its capability data and its number of address ranges.
 */
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long value;

/*
 * Any MSR write that attempts to change bits marked reserved will
 * cause a #GP fault.
 */
if (data & vmx->pt_desc.ctl_bitmask)
  return 1;

/*
 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
 * result in a #GP unless the same write also clears TraceEn.
 */
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
     (data & RTIT_CTL_TRACEEN) &&
     data != vmx->pt_desc.guest.ctl)
  return 1;

/*
 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
 * and FabricEn would cause #GP, if
 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
 */
if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
  !(data & RTIT_CTL_FABRIC_EN) &&
  !intel_pt_validate_cap(vmx->pt_desc.caps,
     PT_CAP_single_range_output))
  return 1;

/*
 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
 * utilize encodings marked reserved will cause a #GP fault.  For each
 * field, the capability value is used as a bitmap of allowed encodings
 * and the requested encoding must have its bit set.
 */
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
   !test_bit((data & RTIT_CTL_MTC_RANGE) >>
   RTIT_CTL_MTC_RANGE_OFFSET, &value))
  return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps,
      PT_CAP_cycle_thresholds);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
   !test_bit((data & RTIT_CTL_CYC_THRESH) >>
   RTIT_CTL_CYC_THRESH_OFFSET, &value))
  return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
   !test_bit((data & RTIT_CTL_PSB_FREQ) >>
   RTIT_CTL_PSB_FREQ_OFFSET, &value))
  return 1;

/*
 * An ADDRx_CFG field that is non-zero for an address range the vCPU
 * does not have, or whose encoding is >2, will cause a #GP fault.
 */
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
  return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
  return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
  return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
  return 1;

return 0;
}

/*
 * Decide whether KVM may emulate the current instruction.
 *
 * Returns X86EMUL_PROPAGATE_FAULT (after queueing #UD) for exits taken in
 * enclave mode, X86EMUL_UNHANDLEABLE_VECTORING if an event was being vectored
 * and @emul_type cannot be emulated in that situation, and X86EMUL_CONTINUE
 * otherwise.  @insn and @insn_len are not used here.
 */
int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
				  void *insn, int insn_len)
{
	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
	 * not point at the failing instruction, and even if it did, the code
	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
	 * so that guest userspace can't DoS the guest simply by triggering
	 * emulation (enclaves are CPL3 only).
	 */
	if (vmx_get_exit_reason(vcpu).enclave_mode) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* No event was being vectored: nothing further to check. */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK))
		return X86EMUL_CONTINUE;

	/* Check that emulation is possible during event vectoring */
	if (kvm_can_emulate_event_vectoring(emul_type))
		return X86EMUL_CONTINUE;

	return X86EMUL_UNHANDLEABLE_VECTORING;
}

/*
 * Advance the guest's RIP past the instruction that caused the VM-Exit and
 * clear any interrupt shadow.
 *
 * Normally the length comes from VMCS.VM_EXIT_INSTRUCTION_LEN.  For an EPT
 * misconfig exit while running under a hypervisor, the instruction is skipped
 * through the emulator instead (EMULTYPE_SKIP).
 *
 * Returns 1 on success, 0 if the emulator-based skip failed.
 */
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
unsigned long rip, orig_rip;
u32 instr_len;

/*
 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
 * set when EPT misconfig occurs.  In practice, real hardware updates
 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
 * (namely Hyper-V) don't set it due to it being undefined behavior,
 * i.e. we end up advancing IP with some random value.
 */
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
     exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
  instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

  /*
   * Emulating an enclave's instructions isn't supported as KVM
   * cannot access the enclave's memory or its true RIP, e.g. the
   * vmcs.GUEST_RIP points at the exit point of the enclave, not
   * the RIP that actually triggered the VM-Exit.  But, because
   * most instructions that cause VM-Exit will #UD in an enclave,
   * most instruction-based VM-Exits simply do not occur.
   *
   * There are a few exceptions, notably the debug instructions
   * INT1ICEBRK and INT3, as they are allowed in debug enclaves
   * and generate #DB/#BP as expected, which KVM might intercept.
   * But again, the CPU does the dirty work and saves an instr
   * length of zero so VMMs don't shoot themselves in the foot.
   * WARN if KVM tries to skip a non-zero length instruction on
   * a VM-Exit from an enclave.
   */
  if (!instr_len)
   goto rip_updated;

  WARN_ONCE(exit_reason.enclave_mode,
     "skipping instruction after SGX enclave VM-Exit");

  orig_rip = kvm_rip_read(vcpu);
  rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
  /*
   * We need to mask out the high 32 bits of RIP if not in 64-bit
   * mode, but just finding out that we are in 64-bit mode is
   * quite expensive.  Only do it if there was a carry.  The
   * "== 3" test matches when bits 31 and 32 both changed and no
   * higher bit did.
   */
  if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
   rip = (u32)rip;
#endif
  kvm_rip_write(vcpu, rip);
} else {
  /* Let the emulator decode the instruction and skip it. */
  if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
   return 0;
}

rip_updated:
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);

return 1;
}

/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.  Does nothing unless the vCPU is in guest mode.
 */
void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!is_guest_mode(vcpu))
		return;

	/*
	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
	 * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
	 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
	 * intercepted #DB deliberately avoids single-step #DB and MTF updates
	 * as ICEBP is higher priority than both.  As instruction emulation is
	 * completed at this point (i.e. KVM is at the instruction boundary),
	 * any #DB exception pending delivery must be a debug-trap of lower
	 * priority than MTF.
	 *
	 * So MTF is not pending if vmcs12 doesn't enable it, or if either
	 * exception slot holds a pending exception other than #DB.
	 */
	if (!nested_cpu_has_mtf(vmcs12) ||
	    (vcpu->arch.exception.pending &&
	     vcpu->arch.exception.vector != DB_VECTOR) ||
	    (vcpu->arch.exception_vmexit.pending &&
	     vcpu->arch.exception_vmexit.vector != DB_VECTOR)) {
		vmx->nested.mtf_pending = false;
		return;
	}

	/*
	 * Record the pending MTF state to be delivered in
	 * vmx_check_nested_events().
	 */
	vmx->nested.mtf_pending = true;
	kvm_make_request(KVM_REQ_EVENT, vcpu);
}

/*
 * Skip the current instruction: update the nested MTF bookkeeping first, then
 * advance RIP.  Returns the result of skip_emulated_instruction().
 */
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
}

/*
 * If the VM has HLT-in-guest enabled and the VMCS activity state is HLT,
 * switch the activity state back to ACTIVE.
 */
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/*
	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
	 * explicitly skip the instruction because if the HLT state is set,
	 * then the instruction is already executing and RIP has already been
	 * advanced.
	 */
	if (!kvm_hlt_in_guest(vcpu->kvm))
		return;

	if (vmcs_read32(GUEST_ACTIVITY_STATE) != GUEST_ACTIVITY_HLT)
		return;

	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

/*
 * Inject the vCPU's queued exception into the guest.
 *
 * The exception payload is delivered first.  In vm86 mode the exception is
 * injected through the real-mode interrupt path; otherwise it is programmed
 * into the VM-entry interruption-information field as a soft or hard
 * exception, and any HLT activity state is cleared.
 */
void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_deliver_exception_payload(vcpu, ex);

	if (ex->has_error_code) {
		/*
		 * Despite the error code being architecturally defined as 32
		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
		 * VMX don't actually support setting bits 31:16.  Hardware
		 * will (should) never provide a bogus error code, but AMD CPUs
		 * do generate error codes with bits 31:16 set, and so KVM's
		 * ABI lets userspace shove in arbitrary 32-bit values.  Drop
		 * the upper bits to avoid VM-Fail, losing information that
		 * doesn't really exist is preferable to killing the VM.
		 */
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		/* Only soft exceptions advance EIP by the instruction length. */
		int inc_eip = kvm_exception_is_soft(ex->vector) ?
			      vcpu->arch.event_exit_inst_len : 0;

		kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->vt.emulation_required);

	if (!kvm_exception_is_soft(ex->vector)) {
		intr_info |= INTR_TYPE_HARD_EXCEPTION;
	} else {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	}

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	vmx_clear_hlt(vcpu);
}

/*
 * Set the load_into_hardware flag of the user-return MSR entry for @msr.
 * Silently does nothing if the vCPU has no entry for @msr.
 */
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
			       bool load_into_hardware)
{
	struct vmx_uret_msr *entry = vmx_find_uret_msr(vmx, msr);

	if (entry)
		entry->load_into_hardware = load_into_hardware;
}

/*
 * Configuring user return MSRs to automatically save, load, and restore MSRs
 * that need to be shoved into hardware when running the guest.  Note, omitting
 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
 * loaded into hardware when running the guest.
 */
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
	bool load_tsc_aux;
#ifdef CONFIG_X86_64
	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	bool load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
				 (vmx->vcpu.arch.efer & EFER_SCE);

	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
	/* EFER is loaded as a uret MSR only if update_transition_efer() says so. */
	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

	/* TSC_AUX is needed if the guest has either RDTSCP or RDPID. */
	load_tsc_aux = guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
		       guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID);
	vmx_setup_uret_msr(vmx, MSR_TSC_AUX, load_tsc_aux);

	/*
	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
	 * kernel and old userspace.  If those guests run on a tsx=off host, do
	 * allow guests to use TSX_CTRL, but don't change the value in hardware
	 * so that TSX remains always disabled.
	 */
	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));

	/*
	 * The set of MSRs to load may have changed, reload MSRs before the
	 * next VM-Enter.
	 */
	vmx->guest_uret_msrs_loaded = false;
}

/*
 * Return the TSC offset from vmcs12, or 0 if vmcs12 does not enable TSC
 * offsetting.
 */
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
		return 0;

	return vmcs12->tsc_offset;
}

/*
 * Return the TSC multiplier from vmcs12 if vmcs12 enables both TSC offsetting
 * and TSC scaling; otherwise return KVM's default scaling ratio.
 */
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) ||
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
		return kvm_caps.default_tsc_scaling_ratio;

	return vmcs12->tsc_multiplier;
}

/* Write the vCPU's current TSC offset into the VMCS TSC_OFFSET field. */
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}

/* Write the vCPU's current TSC scaling ratio into the VMCS TSC_MULTIPLIER field. */
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}

/*
 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
 * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
 * backwards compatibility even though KVM doesn't support emulating SMX.  And
 * because userspace can set "VMX in SMX", the guest must also be allowed to
 * set it, e.g. if the MSR is left unlocked and the guest does a RMW operation.
 */
#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED    | \
     FEAT_CTL_VMX_ENABLED_INSIDE_SMX  | \
     FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
     FEAT_CTL_SGX_LC_ENABLED   | \
     FEAT_CTL_SGX_ENABLED   | \
     FEAT_CTL_LMCE_ENABLED)

/*
 * Check whether a write to IA32_FEATURE_CONTROL is acceptable.
 *
 * Host-initiated writes may set any bit in KVM_SUPPORTED_FEATURE_CONTROL.
 * Guest writes are rejected once the MSR is locked, and are otherwise limited
 * to the vCPU's msr_ia32_feature_control_valid_bits.
 */
static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
						    struct msr_data *msr)
{
	uint64_t allowed;

	/*
	 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
	 * exposed to the guest.
	 */
	WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
		     ~KVM_SUPPORTED_FEATURE_CONTROL);

	if (msr->host_initiated) {
		allowed = KVM_SUPPORTED_FEATURE_CONTROL;
	} else {
		if (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)
			return false;

		allowed = vmx->msr_ia32_feature_control_valid_bits;
	}

	return (msr->data & ~allowed) == 0;
}

/*
 * Read a VMX feature MSR.  Only the emulated VMX MSR range is handled here:
 * it yields 1 when nested support is disabled and otherwise the result of
 * vmx_get_vmx_msr() on the nested config.  Every other MSR index returns
 * KVM_MSR_RET_UNSUPPORTED.
 */
int vmx_get_feature_msr(u32 msr, u64 *data)
{
	if (msr < KVM_FIRST_EMULATED_VMX_MSR || msr > KVM_LAST_EMULATED_VMX_MSR)
		return KVM_MSR_RET_UNSUPPORTED;

	if (!nested)
		return 1;

	return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
}

/*
 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 *
 * MSRs without a dedicated case are looked up in the vCPU's user-return MSR
 * array and, failing that, handed to kvm_get_msr_common().
 */
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
u32 index;

switch (msr_info->index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
  msr_info->data = vmcs_readl(GUEST_FS_BASE);
  break;
case MSR_GS_BASE:
  msr_info->data = vmcs_readl(GUEST_GS_BASE);
  break;
case MSR_KERNEL_GS_BASE:
  msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
  break;
#endif
case MSR_EFER:
  /* Always served by common code, never from the uret MSR array. */
  return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSX_CTRL:
  if (!msr_info->host_initiated &&
      !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
   return 1;
  goto find_uret_msr;
case MSR_IA32_UMWAIT_CONTROL:
  if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
   return 1;

  msr_info->data = vmx->msr_ia32_umwait_control;
  break;
case MSR_IA32_SPEC_CTRL:
  if (!msr_info->host_initiated &&
      !guest_has_spec_ctrl_msr(vcpu))
   return 1;

  msr_info->data = to_vmx(vcpu)->spec_ctrl;
  break;
case MSR_IA32_SYSENTER_CS:
  msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
  break;
case MSR_IA32_SYSENTER_EIP:
  msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
  break;
case MSR_IA32_SYSENTER_ESP:
  msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
  break;
case MSR_IA32_BNDCFGS:
  if (!kvm_mpx_supported() ||
      (!msr_info->host_initiated &&
       !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
   return 1;
  msr_info->data = vmcs_read64(GUEST_BNDCFGS);
  break;
case MSR_IA32_MCG_EXT_CTL:
  /* Guest reads require LMCE to be enabled in the feature control MSR. */
  if (!msr_info->host_initiated &&
      !(vmx->msr_ia32_feature_control &
        FEAT_CTL_LMCE_ENABLED))
   return 1;
  msr_info->data = vcpu->arch.mcg_ext_ctl;
  break;
case MSR_IA32_FEAT_CTL:
  msr_info->data = vmx->msr_ia32_feature_control;
  break;
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
  if (!msr_info->host_initiated &&
      !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
   return 1;
  msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
   [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
  break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
  if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
   return 1;
  if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
        &msr_info->data))
   return 1;
#ifdef CONFIG_KVM_HYPERV
  /*
   * Enlightened VMCS v1 doesn't have certain VMCS fields but
   * instead of just ignoring the features, different Hyper-V
   * versions are either trying to use them and fail or do some
   * sanity checking and refuse to boot. Filter all unsupported
   * features out.
   */
  if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
   nested_evmcs_filter_control_msr(vcpu, msr_info->index,
       &msr_info->data);
#endif
  break;
/* The RTIT (Intel PT) MSRs are only readable in PT host-guest mode. */
case MSR_IA32_RTIT_CTL:
  if (!vmx_pt_mode_is_host_guest())
   return 1;
  msr_info->data = vmx->pt_desc.guest.ctl;
  break;
case MSR_IA32_RTIT_STATUS:
  if (!vmx_pt_mode_is_host_guest())
   return 1;
  msr_info->data = vmx->pt_desc.guest.status;
  break;
case MSR_IA32_RTIT_CR3_MATCH:
  if (!vmx_pt_mode_is_host_guest() ||
   !intel_pt_validate_cap(vmx->pt_desc.caps,
      PT_CAP_cr3_filtering))
   return 1;
  msr_info->data = vmx->pt_desc.guest.cr3_match;
  break;
case MSR_IA32_RTIT_OUTPUT_BASE:
  /* Requires at least one of the ToPA or single-range output caps. */
  if (!vmx_pt_mode_is_host_guest() ||
   (!intel_pt_validate_cap(vmx->pt_desc.caps,
     PT_CAP_topa_output) &&
    !intel_pt_validate_cap(vmx->pt_desc.caps,
     PT_CAP_single_range_output)))
   return 1;
  msr_info->data = vmx->pt_desc.guest.output_base;
  break;
case MSR_IA32_RTIT_OUTPUT_MASK:
  /* Requires at least one of the ToPA or single-range output caps. */
  if (!vmx_pt_mode_is_host_guest() ||
   (!intel_pt_validate_cap(vmx->pt_desc.caps,
     PT_CAP_topa_output) &&
    !intel_pt_validate_cap(vmx->pt_desc.caps,
     PT_CAP_single_range_output)))
   return 1;
  msr_info->data = vmx->pt_desc.guest.output_mask;
  break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
  /* Even offsets select addr_a[], odd offsets addr_b[], of range index/2. */
  index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
  if (!vmx_pt_mode_is_host_guest() ||
      (index >= 2 * vmx->pt_desc.num_address_ranges))
   return 1;
  if (index % 2)
   msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
  else
   msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
  break;
case MSR_IA32_DEBUGCTLMSR:
  msr_info->data = vmx_guest_debugctl_read();
  break;
default:
find_uret_msr:
  /* Prefer the vCPU's uret MSR entry; fall back to common code. */
  msr = vmx_find_uret_msr(vmx, msr_info->index);
  if (msr) {
   msr_info->data = msr->data;
   break;
  }
  return kvm_get_msr_common(vcpu, msr_info);
}

return 0;
}

/*
 * Truncate a SYSENTER address value.  On 64-bit kernels, the value is cut to
 * 32 bits if the guest does not have the LM (long mode) CPU capability.
 * Otherwise it is cast to unsigned long.
 */
static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
          u64 data)
{
#ifdef CONFIG_X86_64
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
  return (u32)data;
#endif
return (unsigned long)data;
}

/*
 * Build the mask of IA32_DEBUGCTL bits that may be set for @vcpu.
 *
 * Each feature bit requires host support; beyond that it is allowed either
 * for any host-initiated access or when the guest itself has the feature.
 */
u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
	u64 supported = 0;

	if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
		if (host_initiated ||
		    guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
			supported |= DEBUGCTLMSR_BUS_LOCK_DETECT;
	}

	if (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) {
		if (host_initiated || intel_pmu_lbr_is_enabled(vcpu))
			supported |= DEBUGCTLMSR_LBR |
				     DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
	}

	if (boot_cpu_has(X86_FEATURE_RTM)) {
		if (host_initiated ||
		    guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))
			supported |= DEBUGCTLMSR_RTM_DEBUG;
	}

	return supported;
}

/*
 * Check whether @data is an acceptable IA32_DEBUGCTL value for @vcpu.
 *
 * Unsupported BTF and LBR bits are tolerated: they are reported through
 * kvm_pr_unimpl_wrmsr() and then ignored.  Any other unsupported bit makes
 * the value invalid.
 */
bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
{
	const u64 tolerated = DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR;
	u64 rejected;

	rejected = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
	if (rejected & tolerated) {
		kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
		rejected &= ~tolerated;
	}

	return rejected == 0;
}

/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
u32 index;

switch (msr_index) {
case MSR_EFER:
  ret = kvm_set_msr_common(vcpu, msr_info);
  break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
  vmx_segment_cache_clear(vmx);
  vmcs_writel(GUEST_FS_BASE, data);
  break;
case MSR_GS_BASE:
  vmx_segment_cache_clear(vmx);
  vmcs_writel(GUEST_GS_BASE, data);
  break;
case MSR_KERNEL_GS_BASE:
  vmx_write_guest_kernel_gs_base(vmx, data);
  break;
case MSR_IA32_XFD:
  ret = kvm_set_msr_common(vcpu, msr_info);
  /*
* Always intercepting WRMSR could incur non-negligible
* overhead given xfd might be changed frequently in
* guest context switch. Disable write interception
* upon the first write with a non-zero value (indicating
* potential usage on dynamic xfeatures). Also update
* exception bitmap to trap #NM for proper virtualization
* of guest xfd_err.
*/
  if (!ret && data) {
   vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
            MSR_TYPE_RW);
   vcpu->arch.xfd_no_write_intercept = true;
   vmx_update_exception_bitmap(vcpu);
  }
  break;
#endif
case MSR_IA32_SYSENTER_CS:
  if (is_guest_mode(vcpu))
   get_vmcs12(vcpu)->guest_sysenter_cs = data;
  vmcs_write32(GUEST_SYSENTER_CS, data);
  break;
case MSR_IA32_SYSENTER_EIP:
  if (is_guest_mode(vcpu)) {
   data = nested_vmx_truncate_sysenter_addr(vcpu, data);
   get_vmcs12(vcpu)->guest_sysenter_eip = data;
  }
  vmcs_writel(GUEST_SYSENTER_EIP, data);
  break;
case MSR_IA32_SYSENTER_ESP:
  if (is_guest_mode(vcpu)) {
   data = nested_vmx_truncate_sysenter_addr(vcpu, data);
   get_vmcs12(vcpu)->guest_sysenter_esp = data;
  }
  vmcs_writel(GUEST_SYSENTER_ESP, data);
  break;
case MSR_IA32_DEBUGCTLMSR:
  if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
   return 1;

  data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);

  if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
      VM_EXIT_SAVE_DEBUG_CONTROLS)
   get_vmcs12(vcpu)->guest_ia32_debugctl = data;

  vmx_guest_debugctl_write(vcpu, data);

  if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
      (data & DEBUGCTLMSR_LBR))
   intel_pmu_create_guest_lbr_event(vcpu);
  return 0;
case MSR_IA32_BNDCFGS:
  if (!kvm_mpx_supported() ||
      (!msr_info->host_initiated &&
       !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
   return 1;
  if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
      (data & MSR_IA32_BNDCFGS_RSVD))
   return 1;

  if (is_guest_mode(vcpu) &&
      ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
       (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
   get_vmcs12(vcpu)->guest_bndcfgs = data;

  vmcs_write64(GUEST_BNDCFGS, data);
  break;
case MSR_IA32_UMWAIT_CONTROL:
  if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
   return 1;

  /* The reserved bit 1 and non-32 bit [63:32] should be zero */
  if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
   return 1;

  vmx->msr_ia32_umwait_control = data;
  break;
case MSR_IA32_SPEC_CTRL:
  if (!msr_info->host_initiated &&
      !guest_has_spec_ctrl_msr(vcpu))
   return 1;

  if (kvm_spec_ctrl_test_value(data))
   return 1;

  vmx->spec_ctrl = data;
  if (!data)
   break;

  /*
* For non-nested:
* When it's written (to non-zero) for the first time, pass
* it through.
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
* nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.14 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.