Ziele Untersuchung
mit Columbo Integrität von
Datenbanken Interaktion und
Portierbarkeit Ergonomie der
Schnittstellen

Angebot Produkte Projekt Beratung

Mittel Analytik Modellierung Sprachen Algebra Logik Hardware Denken Kreativität

Zusammenhänge Gesellschaft Wirtschaft Branche Firma


products/sources/formale Sprachen/C/Linux/arch/x86/kvm/svm/ (Open Source Betriebssystem Version 6.17.9^©) Datei vom 24.10.2025 mit Größe 152 kB

Quelle svm.c Sprache: C

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "smm.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>
#include <linux/mutex.h>

#include <asm/apic.h>
#include <asm/msr.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/reboot.h>
#include <asm/fpu/api.h>

#include <trace/events/ipi.h>

#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
/*
 * CPU match table, only built when MODULE is defined.  It matches on the
 * X86_FEATURE_SVM feature flag and is published through
 * MODULE_DEVICE_TABLE().  The empty entry terminates the table.
 */
static const struct x86_cpu_id svm_cpu_id[] = {
X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

/*
 * Segment type values passed to init_sys_seg() when init_vmcb() sets up the
 * LDTR and TR system segments.
 */
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

/* Set by svm_init_erratum_383() after it has written MSR_AMD64_DC_CFG. */
static bool erratum_383_found __read_mostly;

/*
* Set osvw_len to a higher value when updated Revision Guides
* are published and we know what the new status bits are.
*
* Both values are narrowed/merged per CPU in
* svm_enable_virtualization_cpu() and consumed by svm_init_osvw().
*/
static uint64_t osvw_len = 4, osvw_status;

/*
 * Per-CPU copy of the value last written to MSR_AMD64_TSC_RATIO by
 * __svm_write_tsc_multiplier(); used to skip redundant MSR writes.
 */
static DEFINE_PER_CPU(u64, current_tsc_ratio);

/*
* These two parameters configure the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering (indicated
* by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
* count value. On VMRUN this value is loaded into an internal counter.
* Each time a pause instruction is executed, this counter is decremented
* until it reaches zero at which time a #VMEXIT is generated if pause
* intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
* Intercept Filtering for more details.
* This also indicates whether the PLE logic is enabled.
*
* pause_filter_thresh: In addition, some processor families support advanced
* pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an upper
* bound on the amount of time a guest is allowed to execute in a pause loop.
* In this mode, a 16-bit pause filter threshold field is added in the
* VMCB. The threshold value is a cycle count that is used to reset the
* pause counter. As with simple pause filtering, VMRUN loads the pause
* count value from VMCB into an internal counter. Then, on each pause
* instruction the hardware checks the elapsed number of cycles since
* the most recent pause instruction against the pause filter threshold.
* If the elapsed cycle count is greater than the pause filter threshold,
* then the internal pause count is reloaded from the VMCB and execution
* continues. If the elapsed cycle count is less than the pause filter
* threshold, then the internal pause count is decremented. If the count
* value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
* triggered. If advanced pause filtering is supported and the pause filter
* threshold field is set to zero, the filter will operate in the simpler,
* count-only mode.
*
* All pause filter parameters below are unsigned shorts exposed read-only
* (mode 0444) as module parameters.
*/

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/*
* Use nested page tables by default.  Note, NPT may get forced off by
* svm_hardware_setup() if it's unsupported by hardware or the host kernel.
*/
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, 0444);

/* enable/disable Next RIP Save */
int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
int lbrv = true;
module_param(lbrv, int, 0444);

/*
 * Checked by svm_disable_virtualization_cpu() before it restores the default
 * TSC ratio; presumably the global enable for TSC scaling (other users are
 * outside this part of the file).
 */
static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
* enable / disable AVIC.  Because the defaults differ for APICv
* support between VMX and SVM we cannot use module_param_named.
*/
static bool avic;
module_param(avic, bool, 0444);
/* enable_ipiv is defined outside this file section; only exposed here. */
module_param(enable_ipiv, bool, 0444);

/* enable_device_posted_irqs is likewise defined elsewhere. */
module_param(enable_device_posted_irqs, bool, 0444);

/* The only writable (0644) parameter in this group. */
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);

/* When set, init_vmcb() enables the SMI intercept. */
bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

/* When set, init_vmcb() sets V_NMI_ENABLE_MASK in the VMCB int_ctl field. */
bool vnmi = true;
module_param(vnmi, bool, 0444);

/*
 * When set, svm_set_efer() intercepts #GP for non-SEV guests once the guest
 * enables EFER.SVME.
 */
static bool svm_gp_erratum_intercept = true;

/* Opcode bytes 0x0f 0xaa; NOTE(review): used outside this section - confirm. */
static u8 rsm_ins_bytes[] = "\x0f\xaa";

/*
 * Physical address of the I/O permission map: stored in the VMCB's
 * iopm_base_pa by init_vmcb() and freed by svm_hardware_unsetup().
 */
static unsigned long iopm_base;

/* Per-CPU SVM state, see svm_cpu_init()/svm_cpu_uninit(). */
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);

/* NOTE(review): users are outside this section; presumably serializes VMCB dumps. */
static DEFINE_MUTEX(vmcb_dump_mutex);

/*
* Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
*
* RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
* defer the restoration of TSC_AUX until the CPU returns to userspace.
*/
int tsc_aux_uret_slot __ro_after_init = -1;

/*
 * Return the root level to use for nested page tables: 5-level or 4-level
 * on 64-bit kernels (depending on pgtable_l5_enabled()), PAE otherwise.
 */
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	if (pgtable_l5_enabled())
		return PT64_ROOT_5LEVEL;

	return PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

/*
 * Install a new guest EFER value.
 *
 * vcpu->arch.efer receives @efer unmodified; the value written to the current
 * VMCB is adjusted for shadow paging (when NPT is disabled) and always has
 * EFER_SVME set.  A change of the guest's EFER.SVME bit additionally tears
 * down or allocates nested virtualization state.
 *
 * Returns 0 on success, or the error from svm_allocate_nested(), in which
 * case vcpu->arch.efer is rolled back to its previous value.
 */
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 old_efer = vcpu->arch.efer;
vcpu->arch.efer = efer;

if (!npt_enabled) {
  /* Shadow paging assumes NX to be available.  */
  efer |= EFER_NX;

  /* Without LMA, do not expose LME in the VMCB copy either. */
  if (!(efer & EFER_LMA))
   efer &= ~EFER_LME;
}

/* Only act when the guest actually toggles EFER.SVME. */
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
  if (!(efer & EFER_SVME)) {
   svm_leave_nested(vcpu);
   svm_set_gif(svm, true);
   /* #GP intercept is still needed for vmware backdoor */
   if (!enable_vmware_backdoor)
    clr_exception_intercept(svm, GP_VECTOR);

   /*
* Free the nested guest state, unless we are in SMM.
* In this case we will return to the nested guest
* as soon as we leave SMM.
*/
   if (!is_smm(vcpu))
    svm_free_nested(svm);

  } else {
   int ret = svm_allocate_nested(svm);

   if (ret) {
    /* Undo the early store so the vCPU keeps its old EFER. */
    vcpu->arch.efer = old_efer;
    return ret;
   }

   /*
* Never intercept #GP for SEV guests, KVM can't
* decrypt guest memory to workaround the erratum.
*/
   if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
    set_exception_intercept(svm, GP_VECTOR);
  }
}

/* The VMCB copy always carries EFER_SVME, regardless of the guest's view. */
svm->vmcb->save.efer = efer | EFER_SVME;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
return 0;
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 ret = 0;

if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
  ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
struct vcpu_svm *svm = to_svm(vcpu);

if (mask == 0)
  svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
else
  svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

/*
 * Advance the guest RIP past the current instruction.
 *
 * @commit_side_effects: when true, the interrupt shadow is cleared after the
 *                       skip.  When false, the shadow is left alone and, if
 *                       the skip had to be emulated, the VMCB's RFLAGS is
 *                       restored to its pre-emulation value.
 *
 * Returns 1 on success and 0 if the EMULTYPE_SKIP emulation failed.
 */
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
        bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long old_rflags;

/*
* SEV-ES does not expose the next RIP. The RIP update is controlled by
* the type of exit and the #VC handler in the guest.
*/
if (sev_es_guest(vcpu->kvm))
  goto done;

/* Prefer the next RIP provided in the VMCB when it is available. */
if (nrips && svm->vmcb->control.next_rip != 0) {
  WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
  svm->next_rip = svm->vmcb->control.next_rip;
}

if (!svm->next_rip) {
  /* No next RIP known: fall back to emulating the skip. */
  if (unlikely(!commit_side_effects))
   old_rflags = svm->vmcb->save.rflags;

  if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
   return 0;

  /* old_rflags was saved above under the same condition. */
  if (unlikely(!commit_side_effects))
   svm->vmcb->save.rflags = old_rflags;
} else {
  kvm_rip_write(vcpu, svm->next_rip);
}

done:
if (likely(commit_side_effects))
  svm_set_interrupt_shadow(vcpu, 0);

return 1;
}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
return __svm_skip_emulated_instruction(vcpu, true);
}

/*
 * Prepare RIP bookkeeping for injecting a soft interrupt/exception.
 *
 * Determines the RIP following the current instruction, records the old and
 * next RIP (plus the CS base) in the vCPU's soft_int_* fields, and programs
 * the VMCB's next_rip when the CPU supports NRIPS.
 *
 * Returns 0 on success, -EIO if the instruction could not be skipped.
 */
static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
{
unsigned long rip, old_rip = kvm_rip_read(vcpu);
struct vcpu_svm *svm = to_svm(vcpu);

/*
* Due to architectural shortcomings, the CPU doesn't always provide
* NextRIP, e.g. if KVM intercepted an exception that occurred while
* the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
* the instruction even if NextRIP is supported to acquire the next
* RIP so that it can be shoved into the NextRIP field, otherwise
* hardware will fail to advance guest RIP during event injection.
* Drop the exception/interrupt if emulation fails and effectively
* retry the instruction, it's the least awful option.  If NRIPS is
* in use, the skip must not commit any side effects such as clearing
* the interrupt shadow or RFLAGS.RF.
*/
if (!__svm_skip_emulated_instruction(vcpu, !nrips))
  return -EIO;

/* RIP now points past the instruction. */
rip = kvm_rip_read(vcpu);

/*
* Save the injection information, even when using next_rip, as the
* VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
* doesn't complete due to a VM-Exit occurring while the CPU is
* vectoring the event.   Decoding the instruction isn't guaranteed to
* work as there may be no backing instruction, e.g. if the event is
* being injected by L1 for L2, or if the guest is patching INT3 into
* a different instruction.
*/
svm->soft_int_injected = true;
svm->soft_int_csbase = svm->vmcb->save.cs.base;
svm->soft_int_old_rip = old_rip;
svm->soft_int_next_rip = rip;

/* With nrips the skip above was only temporary: rewind RIP. */
if (nrips)
  kvm_rip_write(vcpu, old_rip);

if (static_cpu_has(X86_FEATURE_NRIPS))
  svm->vmcb->control.next_rip = rip;

return 0;
}

static void svm_inject_exception(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct vcpu_svm *svm = to_svm(vcpu);

kvm_deliver_exception_payload(vcpu, ex);

if (kvm_exception_is_soft(ex->vector) &&
     svm_update_soft_interrupt_rip(vcpu))
  return;

svm->vmcb->control.event_inj = ex->vector
  | SVM_EVTINJ_VALID
  | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
  | SVM_EVTINJ_TYPE_EXEPT;
svm->vmcb->control.event_inj_err = ex->error_code;
}

/*
 * On CPUs flagged with X86_BUG_AMD_TLB_MMATCH, set bit 47 of
 * MSR_AMD64_DC_CFG and record that the erratum 383 handling is active.
 * Nothing happens if the MSR cannot be read.
 */
static void svm_init_erratum_383(void)
{
	u64 dc_cfg;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	if (native_read_msr_safe(MSR_AMD64_DC_CFG, &dc_cfg) != 0)
		return;

	dc_cfg |= (1ULL << 47);
	native_write_msr_safe(MSR_AMD64_DC_CFG, dc_cfg);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
/*
* Guests should see errata 400 and 415 as fixed (assuming that
* HLT and IO instructions are intercepted).
*/
vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
vcpu->arch.osvw.status = osvw_status & ~(6ULL);

/*
* By increasing VCPU's osvw.length to 3 we are telling the guest that
* all osvw.status bits inside that length, including bit 0 (which is
* reserved for erratum 298), are valid. However, if host processor's
* osvw_len is 0 then osvw_status[0] carries no information. We need to
* be conservative here and therefore we tell the guest that erratum 298
* is present (because we really don't know).
*/
if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
  vcpu->arch.osvw.status |= 1;
}

/*
 * Check whether the CPU this runs on can be used for SVM: it must be an AMD
 * or Hygon part with the SVM feature, and the kernel must not itself be
 * running with guest memory encryption.  Logs the reason on failure.
 */
static bool __kvm_is_svm_supported(void)
{
	const int this_cpu = smp_processor_id();
	struct cpuinfo_x86 *info = &cpu_data(this_cpu);
	bool vendor_ok = info->x86_vendor == X86_VENDOR_AMD ||
			 info->x86_vendor == X86_VENDOR_HYGON;

	if (!vendor_ok) {
		pr_err("CPU %d isn't AMD or Hygon\n", this_cpu);
		return false;
	}

	if (!cpu_has(info, X86_FEATURE_SVM)) {
		pr_err("SVM not supported by CPU %d\n", this_cpu);
		return false;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return false;
	}

	return true;
}

/*
 * Wrapper around __kvm_is_svm_supported() that disables migration for the
 * duration of the check, so the check runs on a single CPU.
 */
static bool kvm_is_svm_supported(void)
{
	bool ret;

	migrate_disable();
	ret = __kvm_is_svm_supported();
	migrate_enable();

	return ret;
}

/* Return 0 if the current CPU supports SVM, -EIO otherwise. */
static int svm_check_processor_compat(void)
{
	return __kvm_is_svm_supported() ? 0 : -EIO;
}

/*
 * Program MSR_AMD64_TSC_RATIO with @multiplier on the current CPU, skipping
 * the MSR write when the per-CPU cached value already matches.
 */
static void __svm_write_tsc_multiplier(u64 multiplier)
{
	if (__this_cpu_read(current_tsc_ratio) != multiplier) {
		wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
		__this_cpu_write(current_tsc_ratio, multiplier);
	}
}

/* Return the SEV-ES host save area embedded in @sd's per-CPU save area. */
static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
	struct sev_es_save_area *hostsa = &sd->save_area->host_sev_es_save;

	return hostsa;
}

/*
 * Disable SVM on the current CPU: clear the host save area MSR and, if
 * EFER.SVME is set, force GIF=1 and then clear EFER.SVME.
 */
static inline void kvm_cpu_svm_disable(void)
{
	uint64_t cur_efer;

	wrmsrq(MSR_VM_HSAVE_PA, 0);
	rdmsrq(MSR_EFER, cur_efer);

	/* SVM already off: nothing more to do. */
	if (!(cur_efer & EFER_SVME))
		return;

	/*
	 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
	 * NMI aren't blocked.
	 */
	stgi();
	wrmsrq(MSR_EFER, cur_efer & ~EFER_SVME);
}

/*
 * Emergency path for turning SVM off on the current CPU.  kvm_rebooting is
 * set before SVM is disabled; unlike svm_disable_virtualization_cpu() this
 * path touches neither the TSC ratio nor the PMU virtualization state.
 */
static void svm_emergency_disable_virtualization_cpu(void)
{
kvm_rebooting = true;

kvm_cpu_svm_disable();
}

/*
 * Regular path for turning SVM off on the current CPU: restore the default
 * TSC ratio (when tsc_scaling is set), disable SVM, then disable PMU
 * virtualization support.
 */
static void svm_disable_virtualization_cpu(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
  __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);

kvm_cpu_svm_disable();

amd_pmu_disable_virt();
}

/*
 * Enable SVM on the current CPU.
 *
 * Initializes the per-CPU ASID bookkeeping, sets EFER.SVME, programs the host
 * save area MSR, resets the TSC ratio MSR (when present), folds this CPU's
 * OSVW information into the global osvw_len/osvw_status, applies the erratum
 * 383 handling and enables PMU virtualization support.
 *
 * Returns 0 on success, -EBUSY if EFER.SVME is already set.
 */
static int svm_enable_virtualization_cpu(void)
{

struct svm_cpu_data *sd;
uint64_t efer;
int me = raw_smp_processor_id();

/* Refuse to take over a CPU that already has SVM enabled. */
rdmsrq(MSR_EFER, efer);
if (efer & EFER_SVME)
  return -EBUSY;

sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
/* Start past max_asid. */
sd->next_asid = sd->max_asid + 1;
/* ASIDs up to max_sev_asid are kept out of the normal range. */
sd->min_asid = max_sev_asid + 1;

wrmsrq(MSR_EFER, efer | EFER_SVME);

wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);

if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
  /*
* Set the default value, even if we don't use TSC scaling
* to avoid having stale value in the msr
*/
  __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
}

/*
* Get OSVW bits.
*
* Note that it is possible to have a system with mixed processor
* revisions and therefore different OSVW bits. If bits are not the same
* on different processors then choose the worst case (i.e. if erratum
* is present on one processor and not on another then assume that the
* erratum is present everywhere).
*/
if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
  u64 len, status = 0;
  int err;

  err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
  if (!err)
   err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);

  if (err)
   osvw_status = osvw_len = 0;
  else {
   /* Keep the smallest length and the union of status bits seen. */
   if (len < osvw_len)
    osvw_len = len;
   osvw_status |= status;
   /* Drop status bits beyond the agreed length. */
   osvw_status &= (1ULL << osvw_len) - 1;
  }
} else
  osvw_status = osvw_len = 0;

svm_init_erratum_383();

amd_pmu_enable_virt();

return 0;
}

/*
 * Release the per-CPU SVM resources of @cpu: the sev_vmcbs array and the
 * host save area page.  A CPU without a save area is left untouched.
 */
static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *cpu_data_ptr = per_cpu_ptr(&svm_data, cpu);

	if (cpu_data_ptr->save_area) {
		kfree(cpu_data_ptr->sev_vmcbs);
		__free_page(__sme_pa_to_page(cpu_data_ptr->save_area_pa));
		cpu_data_ptr->save_area_pa = 0;
		cpu_data_ptr->save_area = NULL;
	}
}

/*
 * Set up the per-CPU SVM data of @cpu: zero the structure, allocate the host
 * save area page on the CPU's node and run the SEV per-CPU initialization.
 *
 * Returns 0 on success, -ENOMEM if the page allocation fails, or the error
 * from sev_cpu_init() (the page is freed again in that case).
 */
static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
	struct page *area_page;
	int err;

	memset(sd, 0, sizeof(*sd));

	area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
	if (!area_page)
		return -ENOMEM;

	err = sev_cpu_init(sd);
	if (err) {
		__free_page(area_page);
		return err;
	}

	sd->save_area = page_address(area_page);
	sd->save_area_pa = __sme_page_pa(area_page);

	return 0;
}

static void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;

vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);

recalc_intercepts(svm);
}

/*
 * Drop all debug register intercepts in vmcb01 by zeroing the INTERCEPT_DR
 * word, then recompute the effective intercepts.
 */
static void clr_dr_intercepts(struct vcpu_svm *svm)
{
	svm->vmcb01.ptr->control.intercepts[INTERCEPT_DR] = 0;

	recalc_intercepts(svm);
}

/*
 * Test the write-intercept bit for @msr in the MSR permission bitmap that is
 * currently in effect: the nested bitmap while in guest mode, the vCPU's own
 * bitmap otherwise.
 */
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	void *bitmap;

	/*
	 * For non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	if (is_guest_mode(vcpu))
		bitmap = svm->nested.msrpm;
	else
		bitmap = svm->msrpm;

	return svm_test_msr_bitmap_write(bitmap, msr);
}

/*
 * Set or clear the read and/or write intercept for @msr in the vCPU's MSR
 * permission bitmap.
 *
 * @type: MSR_TYPE_R and/or MSR_TYPE_W, selecting which accesses to update.
 * @set:  true to intercept, false to pass through.  A pass-through request
 *        is only honored if kvm_msr_allowed() permits the access; otherwise
 *        the intercept is (re)set.
 *
 * Always flags the nested MSR bitmap for recalculation afterwards.
 */
void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	void *bitmap = svm->msrpm;

	/* Don't disable interception for MSRs userspace wants to handle. */
	if (type & MSR_TYPE_R) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			svm_set_msr_bitmap_read(bitmap, msr);
		else
			svm_clear_msr_bitmap_read(bitmap, msr);
	}

	if (type & MSR_TYPE_W) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			svm_set_msr_bitmap_write(bitmap, msr);
		else
			svm_clear_msr_bitmap_write(bitmap, msr);
	}

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}

/*
 * Allocate a permissions map of at least @size bytes (rounded up to a
 * power-of-two number of pages) and fill it with 1s.
 *
 * Returns the kernel virtual address of the map, or NULL if the page
 * allocation fails.
 */
void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
	const unsigned int order = get_order(size);
	struct page *map_pages;
	void *map;

	map_pages = alloc_pages(gfp_mask, order);
	if (!map_pages)
		return NULL;

	map = page_address(map_pages);

	/*
	 * Set all bits in the permissions map so that all MSR and I/O accesses
	 * are intercepted by default.
	 */
	memset(map, 0xff, PAGE_SIZE * (1 << order));

	return map;
}

static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);

svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);

if (sev_es_guest(vcpu->kvm))
  svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
}

/*
 * Switch the x2APIC MSRs listed below between intercepted and pass-through.
 * Does nothing when the requested state is already in effect or when x2AVIC
 * is not enabled; otherwise the new state is remembered in
 * svm->x2avic_msrs_intercepted.
 */
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
{
	static const u32 x2avic_msrs[] = {
		X2APIC_MSR(APIC_ID),
		X2APIC_MSR(APIC_LVR),
		X2APIC_MSR(APIC_TASKPRI),
		X2APIC_MSR(APIC_ARBPRI),
		X2APIC_MSR(APIC_PROCPRI),
		X2APIC_MSR(APIC_EOI),
		X2APIC_MSR(APIC_RRR),
		X2APIC_MSR(APIC_LDR),
		X2APIC_MSR(APIC_DFR),
		X2APIC_MSR(APIC_SPIV),
		X2APIC_MSR(APIC_ISR),
		X2APIC_MSR(APIC_TMR),
		X2APIC_MSR(APIC_IRR),
		X2APIC_MSR(APIC_ESR),
		X2APIC_MSR(APIC_ICR),
		X2APIC_MSR(APIC_ICR2),

		/*
		 * Note!  Always intercept LVTT, as TSC-deadline timer mode
		 * isn't virtualized by hardware, and the CPU will generate a
		 * #GP instead of a #VMEXIT.
		 */
		X2APIC_MSR(APIC_LVTTHMR),
		X2APIC_MSR(APIC_LVTPC),
		X2APIC_MSR(APIC_LVT0),
		X2APIC_MSR(APIC_LVT1),
		X2APIC_MSR(APIC_LVTERR),
		X2APIC_MSR(APIC_TMICT),
		X2APIC_MSR(APIC_TMCCT),
		X2APIC_MSR(APIC_TDCR),
	};
	int idx;

	if (intercept == svm->x2avic_msrs_intercepted || !x2avic_enabled)
		return;

	for (idx = 0; idx < ARRAY_SIZE(x2avic_msrs); idx++)
		svm_set_intercept_for_msr(&svm->vcpu, x2avic_msrs[idx],
					  MSR_TYPE_RW, intercept);

	svm->x2avic_msrs_intercepted = intercept;
}

/* Free the pages backing an MSR permission map of MSRPM_SIZE bytes. */
void svm_vcpu_free_msrpm(void *msrpm)
{
	struct page *first_page = virt_to_page(msrpm);

	__free_pages(first_page, get_order(MSRPM_SIZE));
}

/*
 * Recompute the vCPU's MSR intercepts.
 *
 * A fixed set of MSRs is always passed through (subject to the filtering
 * done in svm_set_intercept_for_msr()); the rest depends on module
 * parameters, host CPU features, guest CPUID capabilities and the current
 * SPEC_CTRL value.  SEV-ES specific adjustments are applied last.
 */
static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

/* MSRs that are passed through unconditionally. */
svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);

#ifdef CONFIG_X86_64
svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
#endif

if (lbrv)
  svm_recalc_lbr_msr_intercepts(vcpu);

/* Write-only command MSRs: pass through only if the guest has them. */
if (cpu_feature_enabled(X86_FEATURE_IBPB))
  svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
       !guest_has_pred_cmd_msr(vcpu));

if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
  svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
       !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

/*
* Disable interception of SPEC_CTRL if KVM doesn't need to manually
* context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
* the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
* using SPEC_CTRL.
*/
if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
  svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
       !guest_has_spec_ctrl_msr(vcpu));
else
  svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
       !svm->spec_ctrl);

/*
* Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
* as AMD hardware only stores 32 bits, whereas Intel CPUs track 64 bits.
*/
svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
      guest_cpuid_is_intel_compatible(vcpu));
svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
      guest_cpuid_is_intel_compatible(vcpu));

/* Only reads of APERF/MPERF are passed through. */
if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
  svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
  svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
}

if (sev_es_guest(vcpu->kvm))
  sev_es_recalc_msr_intercepts(vcpu);

/*
* x2APIC intercepts are modified on-demand and cannot be filtered by
* userspace.
*/
}

void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
to_vmcb->save.dbgctl  = from_vmcb->save.dbgctl;
to_vmcb->save.br_from  = from_vmcb->save.br_from;
to_vmcb->save.br_to  = from_vmcb->save.br_to;
to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;

vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}

static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
}

/*
 * Enable LBR virtualization in the current VMCB and then refresh the LBR MSR
 * intercepts.  The order matters: the intercept recalculation reads the
 * virt_ext bit that __svm_enable_lbrv() just set.
 */
void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
__svm_enable_lbrv(vcpu);
svm_recalc_lbr_msr_intercepts(vcpu);
}

static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
}

/*
 * Re-evaluate whether LBR virtualization should be on for the current VMCB.
 *
 * It is wanted when the guest has DEBUGCTL.LBR set, or when running a nested
 * guest whose L1 has the LBRV capability and enabled it in its controls.
 * The VMCB bit is toggled only if the wanted state differs from the current
 * one; the LBR MSR intercepts are recalculated in every case.
 */
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool lbrv_on = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
	bool want_lbrv = svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR;

	if (!want_lbrv && is_guest_mode(vcpu) &&
	    guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV))
		want_lbrv = svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK;

	if (want_lbrv != lbrv_on) {
		if (want_lbrv)
			__svm_enable_lbrv(vcpu);
		else
			__svm_disable_lbrv(vcpu);
	}

	/*
	 * During nested transitions, it is possible that the current VMCB has
	 * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
	 * In this case, even though LBR_CTL does not need an update, intercepts
	 * do, so always recalculate the intercepts here.
	 */
	svm_recalc_lbr_msr_intercepts(vcpu);
}

/*
 * Leave NMI single-step mode.  Unless KVM_GUESTDBG_SINGLESTEP is set in the
 * vCPU's guest_debug flags, RFLAGS.TF and RFLAGS.RF are cleared again if the
 * saved guest RFLAGS did not have them set.
 */
void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	if (svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)
		return;

	/* Clear our flags if they were not set by the guest */
	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
		svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;

	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
		svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
}

/*
 * Grow the vCPU's pause filter count using the pause_filter_count* module
 * parameters.  No-op when PAUSE is handled in the guest.  If the value
 * changes, the VMCB is marked dirty and the update is traced.
 */
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int prev;

	if (kvm_pause_in_guest(vcpu->kvm))
		return;

	prev = control->pause_filter_count;
	control->pause_filter_count = __grow_ple_window(prev,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

	if (control->pause_filter_count == prev)
		return;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
	trace_kvm_ple_window_update(vcpu->vcpu_id,
				    control->pause_filter_count, prev);
}

/*
 * Shrink the vCPU's pause filter count using the pause_filter_count* module
 * parameters, with pause_filter_count as the lower bound.  No-op when PAUSE
 * is handled in the guest.  If the value changes, the VMCB is marked dirty
 * and the update is traced.
 */
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int prev;

	if (kvm_pause_in_guest(vcpu->kvm))
		return;

	prev = control->pause_filter_count;
	control->pause_filter_count = __shrink_ple_window(prev,
							  pause_filter_count,
							  pause_filter_count_shrink,
							  pause_filter_count);

	if (control->pause_filter_count == prev)
		return;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
	trace_kvm_ple_window_update(vcpu->vcpu_id,
				    control->pause_filter_count, prev);
}

/*
 * Undo the hardware setup: tear down SEV support, release the per-CPU SVM
 * data of every possible CPU, and free the I/O permission map.
 */
static void svm_hardware_unsetup(void)
{
int cpu;

sev_hardware_unsetup();

for_each_possible_cpu(cpu)
  svm_cpu_uninit(cpu);

/* iopm_base holds a physical address; convert it back to its pages. */
__free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
iopm_base = 0;
}

/*
 * Initialize @seg as a present read/write data segment with selector 0,
 * base 0 and a 64 KiB limit.
 */
static void init_seg(struct vmcb_seg *seg)
{
	seg->base = 0;
	seg->limit = 0xffff;
	seg->selector = 0;

	/* Read/Write Data Segment */
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
		      SVM_SELECTOR_WRITE_MASK;
}

/*
 * Initialize @seg as a present system segment of the given @type with
 * selector 0, base 0 and a 64 KiB limit.
 */
static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->base = 0;
	seg->limit = 0xffff;
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
}

static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

return svm->nested.ctl.tsc_offset;
}

static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

return svm->tsc_ratio_msr;
}

static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}

void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
preempt_disable();
if (to_svm(vcpu)->guest_state_loaded)
  __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
preempt_enable();
}

/*
 * Recompute the instruction intercepts that depend on guest CPUID features:
 * INVPCID, RDTSCP and VMLOAD/VMSAVE.
 */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
	 * roots, or if INVPCID is disabled in the guest to inject #UD.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
		if (npt_enabled &&
		    guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID))
			svm_clr_intercept(svm, INTERCEPT_INVPCID);
		else
			svm_set_intercept(svm, INTERCEPT_INVPCID);
	}

	/* RDTSCP is intercepted only when the guest lacks the feature. */
	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
			svm_set_intercept(svm, INTERCEPT_RDTSCP);
		else
			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
	}

	if (guest_cpuid_is_intel_compatible(vcpu)) {
		svm_set_intercept(svm, INTERCEPT_VMLOAD);
		svm_set_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
	} else if (vls) {
		/*
		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
		 * in VMCB and clear intercepts to avoid #VMEXIT.
		 */
		svm_clr_intercept(svm, INTERCEPT_VMLOAD);
		svm_clr_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
	}
}

/*
 * Recompute every intercept that depends on guest CPUID: first the
 * instruction intercepts, then the MSR intercepts.  Also invoked at the end
 * of init_vmcb().
 */
static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu)
{
svm_recalc_instruction_intercepts(vcpu);
svm_recalc_msr_intercepts(vcpu);
}

/*
 * Bring vmcb01 of @vcpu into its initial state: program the default
 * intercepts, point the VMCB at the I/O and MSR permission maps, load the
 * initial segment/descriptor-table state, and apply the feature-dependent
 * tweaks (NPT, pause filtering, AVIC, vNMI, vGIF, bus lock detection, SEV,
 * Hyper-V).  Finishes by recalculating the CPUID-dependent intercepts,
 * marking the whole VMCB dirty and enabling GIF.
 */
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
struct vmcb_control_area *control = &vmcb->control;
struct vmcb_save_area *save = &vmcb->save;

/* Control register intercepts. */
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
svm_set_intercept(svm, INTERCEPT_CR4_READ);
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
/* CR8 writes are only intercepted while APICv is inactive. */
if (!kvm_vcpu_apicv_active(vcpu))
  svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

set_dr_intercepts(svm);

/* Exception intercepts. */
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
set_exception_intercept(svm, MC_VECTOR);
set_exception_intercept(svm, AC_VECTOR);
set_exception_intercept(svm, DB_VECTOR);
/*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
* as VMware does.
*/
if (enable_vmware_backdoor)
  set_exception_intercept(svm, GP_VECTOR);

svm_set_intercept(svm, INTERCEPT_INTR);
svm_set_intercept(svm, INTERCEPT_NMI);

if (intercept_smi)
  svm_set_intercept(svm, INTERCEPT_SMI);

/* Instruction intercepts that are set unconditionally at this point. */
svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
svm_set_intercept(svm, INTERCEPT_RDPMC);
svm_set_intercept(svm, INTERCEPT_CPUID);
svm_set_intercept(svm, INTERCEPT_INVD);
svm_set_intercept(svm, INTERCEPT_INVLPG);
svm_set_intercept(svm, INTERCEPT_INVLPGA);
svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
svm_set_intercept(svm, INTERCEPT_MSR_PROT);
svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
svm_set_intercept(svm, INTERCEPT_VMRUN);
svm_set_intercept(svm, INTERCEPT_VMMCALL);
svm_set_intercept(svm, INTERCEPT_VMLOAD);
svm_set_intercept(svm, INTERCEPT_VMSAVE);
svm_set_intercept(svm, INTERCEPT_STGI);
svm_set_intercept(svm, INTERCEPT_CLGI);
svm_set_intercept(svm, INTERCEPT_SKINIT);
svm_set_intercept(svm, INTERCEPT_WBINVD);
svm_set_intercept(svm, INTERCEPT_XSETBV);
svm_set_intercept(svm, INTERCEPT_RDPRU);
svm_set_intercept(svm, INTERCEPT_RSM);

if (!kvm_mwait_in_guest(vcpu->kvm)) {
  svm_set_intercept(svm, INTERCEPT_MONITOR);
  svm_set_intercept(svm, INTERCEPT_MWAIT);
}

/* Use the idle-HLT intercept instead of the plain HLT one when available. */
if (!kvm_hlt_in_guest(vcpu->kvm)) {
  if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
   svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
  else
   svm_set_intercept(svm, INTERCEPT_HLT);
}

/* Permission maps and interrupt control. */
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;

/* Initial segment state. */
init_seg(&save->es);
init_seg(&save->ss);
init_seg(&save->ds);
init_seg(&save->fs);
init_seg(&save->gs);

save->cs.selector = 0xf000;
save->cs.base = 0xffff0000;
/* Executable/Readable Code Segment */
save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
  SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
save->cs.limit = 0xffff;

save->gdtr.base = 0;
save->gdtr.limit = 0xffff;
save->idtr.base = 0;
save->idtr.limit = 0xffff;

init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

if (npt_enabled) {
  /* Setup VMCB for Nested Paging */
  control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
  /* With NPT, drop the INVLPG, #PF and CR3 intercepts set above. */
  svm_clr_intercept(svm, INTERCEPT_INVLPG);
  clr_exception_intercept(svm, PF_VECTOR);
  svm_clr_intercept(svm, INTERCEPT_CR3_READ);
  svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
  save->g_pat = vcpu->arch.pat;
  save->cr3 = 0;
}
/* Reset the ASID bookkeeping. */
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;

svm->nested.vmcb12_gpa = INVALID_GPA;
svm->nested.last_vmcb12_gpa = INVALID_GPA;

/* Pause filtering, see the pause_filter_* module parameters. */
if (!kvm_pause_in_guest(vcpu->kvm)) {
  control->pause_filter_count = pause_filter_count;
  if (pause_filter_thresh)
   control->pause_filter_thresh = pause_filter_thresh;
  svm_set_intercept(svm, INTERCEPT_PAUSE);
} else {
  svm_clr_intercept(svm, INTERCEPT_PAUSE);
}

if (kvm_vcpu_apicv_active(vcpu))
  avic_init_vmcb(svm, vmcb);

/* Note: vNMI/vGIF are set on the current VMCB (svm->vmcb), not on 'vmcb'. */
if (vnmi)
  svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;

if (vgif) {
  /* With virtual GIF, STGI/CLGI no longer need to be intercepted. */
  svm_clr_intercept(svm, INTERCEPT_STGI);
  svm_clr_intercept(svm, INTERCEPT_CLGI);
  svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}

if (vcpu->kvm->arch.bus_lock_detection_enabled)
  svm_set_intercept(svm, INTERCEPT_BUSLOCK);

if (sev_guest(vcpu->kvm))
  sev_init_vmcb(svm);

svm_hv_init_vmcb(vmcb);

/* Apply the CPUID-dependent instruction and MSR intercepts. */
svm_recalc_intercepts_after_set_cpuid(vcpu);

vmcb_mark_all_dirty(vmcb);

enable_gif(svm);
}

/*
 * Reset work that svm_vcpu_reset() performs only when the reset is not an
 * INIT event: initialize OSVW state, optionally stuff the microcode version,
 * and return SVM-private bookkeeping to its defaults.
 */
static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_init_osvw(vcpu);

	/* The microcode revision is only stuffed when the quirk is enabled. */
	if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
		vcpu->arch.microcode_version = 0x01000065;

	svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
	svm->nmi_masked = false;
	svm->awaiting_iret_completion = false;

	if (sev_es_guest(kvm))
		sev_es_vcpu_reset(svm);
}

/*
 * Reset the SVM state of @vcpu.  The speculation-control shadows are always
 * zeroed and the VMCB is always re-initialized.  For an INIT event
 * (@init_event true) the SNP protected guest state hook runs before the VMCB
 * init; otherwise __svm_vcpu_reset() runs after it.
 */
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->virt_spec_ctrl = 0;
	svm->spec_ctrl = 0;

	if (init_event) {
		sev_snp_init_protected_guest_state(vcpu);
		init_vmcb(vcpu);
		return;
	}

	init_vmcb(vcpu);
	__svm_vcpu_reset(vcpu);
}

/*
 * Make @target_vmcb the active VMCB of @svm: remember the tracking structure
 * in ->current_vmcb and cache its ->ptr in ->vmcb.
 */
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
{
	svm->vmcb = target_vmcb->ptr;
	svm->current_vmcb = target_vmcb;
}

/*
 * Allocate and wire up the per-vCPU SVM resources: the vmcb01 page, a VMSA
 * page for SEV-ES guests, AVIC state and the MSR permission map.
 *
 * Returns 0 on success.  On failure returns -ENOMEM (page or msrpm
 * allocation failed) or the error from avic_init_vcpu(), after freeing any
 * page allocated here.
 */
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct page *vmsa_pg = NULL;
	struct page *vmcb_pg;
	struct vcpu_svm *svm;
	int ret = -ENOMEM;

	/* to_svm() relies on the vcpu being the first member of vcpu_svm. */
	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	vmcb_pg = snp_safe_alloc_page();
	if (!vmcb_pg)
		return ret;

	/*
	 * SEV-ES guests need a dedicated VMSA page that holds the encrypted
	 * register state of the guest.
	 */
	if (sev_es_guest(vcpu->kvm)) {
		vmsa_pg = snp_safe_alloc_page();
		if (!vmsa_pg)
			goto free_vmcb;
	}

	ret = avic_init_vcpu(svm);
	if (ret)
		goto free_vmsa;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		ret = -ENOMEM;
		goto free_vmsa;
	}

	svm->x2avic_msrs_intercepted = true;

	/* Publish vmcb01 and make it the current VMCB. */
	svm->vmcb01.ptr = page_address(vmcb_pg);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb_pg) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	if (vmsa_pg)
		svm->sev_es.vmsa = page_address(vmsa_pg);

	svm->guest_state_loaded = false;

	return 0;

free_vmsa:
	if (vmsa_pg)
		__free_page(vmsa_pg);
free_vmcb:
	__free_page(vmcb_pg);
	return ret;
}

/*
 * Tear down the SVM side of @vcpu: leave and free nested state, release
 * SEV per-vCPU state, then free the vmcb01 page and the MSR permission map.
 * Warns once if the vCPU's ir_list is not empty at this point.
 */
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct page *vmcb01_pg;

	WARN_ON_ONCE(!list_empty(&svm->ir_list));

	svm_leave_nested(vcpu);
	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	vmcb01_pg = __sme_pa_to_page(svm->vmcb01.pa);
	__free_page(vmcb01_pg);
	svm_vcpu_free_msrpm(svm->msrpm);
}

#ifdef CONFIG_CPU_MITIGATIONS
/*
 * srso_lock is taken by svm_srso_vm_init() and svm_srso_vm_destroy() around
 * the 0 <=> 1 transitions of srso_nr_vms.
 */
static DEFINE_SPINLOCK(srso_lock);
/* Incremented by svm_srso_vm_init(), decremented by svm_srso_vm_destroy(). */
static atomic_t srso_nr_vms;

static void svm_srso_clear_bp_spec_reduce(void *ign)
{
struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);

if (!sd->bp_spec_reduce_set)
  return;

msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
sd->bp_spec_reduce_set = false;
}

/*
 * Drop one reference from srso_nr_vms.  When the count reaches zero, take
 * srso_lock and, if the count is still zero, clear BP_SPEC_REDUCE on every
 * CPU.  Does nothing when X86_FEATURE_SRSO_BP_SPEC_REDUCE is not enabled.
 */
static void svm_srso_vm_destroy(void)
{
if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
  return;

/* Fast path: other VMs still hold a reference, nothing to clear. */
if (atomic_dec_return(&srso_nr_vms))
  return;

/* Scoped guard: the lock is held until the function returns. */
guard(spinlock)(&srso_lock);

/*
* Verify a new VM didn't come along, acquire the lock, and increment
* the count before this task acquired the lock.
*/
if (atomic_read(&srso_nr_vms))
  return;

/* Last argument '1' asks on_each_cpu() to wait for completion. */
on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
}

/*
 * Take one reference on srso_nr_vms.  A non-zero count is bumped locklessly;
 * the 0 => 1 transition is done under srso_lock.  Does nothing when
 * X86_FEATURE_SRSO_BP_SPEC_REDUCE is not enabled.
 */
static void svm_srso_vm_init(void)
{
if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
  return;

/*
* Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
* transition, i.e. destroying the last VM, is fully complete, e.g. so
* that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
*/
if (atomic_inc_not_zero(&srso_nr_vms))
  return;

/* Scoped guard: the lock is held until the function returns. */
guard(spinlock)(&srso_lock);

atomic_inc(&srso_nr_vms);
}
#else
/* !CONFIG_CPU_MITIGATIONS: SRSO VM accounting compiles to empty stubs. */
static void svm_srso_vm_init(void) { }
static void svm_srso_vm_destroy(void) { }
#endif

/*
 * Prepare host/guest state before running @vcpu.  The GHCB is unmapped for
 * SEV-ES guests on every call; the remaining work (host state save, TSC
 * multiplier, TSC_AUX user-return MSR, BP_SPEC_REDUCE) is done only while
 * svm->guest_state_loaded is false, and the flag is set at the end.
 */
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);

if (sev_es_guest(vcpu->kvm))
  sev_es_unmap_ghcb(svm);

/* Everything below is one-shot until guest_state_loaded is cleared. */
if (svm->guest_state_loaded)
  return;

/*
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
vmsave(sd->save_area_pa);
if (sev_es_guest(vcpu->kvm))
  sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));

if (tsc_scaling)
  __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);

/*
* TSC_AUX is always virtualized (context switched by hardware) for
* SEV-ES guests when the feature is available.  For non-SEV-ES guests,
* context switch TSC_AUX via the user_return MSR infrastructure (not
* all CPUs support TSC_AUX virtualization).
*/
if (likely(tsc_aux_uret_slot >= 0) &&
     (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
  kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

/*
* Set BP_SPEC_REDUCE at most once per CPU; the per-CPU flag records that
* the MSR bit is set so that svm_srso_clear_bp_spec_reduce() can undo it.
*/
if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
     !sd->bp_spec_reduce_set) {
  sd->bp_spec_reduce_set = true;
  msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
}
svm->guest_state_loaded = true;
}

static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->guest_state_loaded = false;
}

/*
 * Load hook for @vcpu on @cpu: shrink the PLE window if the vCPU had been
 * scheduled out (and PAUSE is not left to the guest), and load AVIC state
 * when APICv is active.
 */
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	if (vcpu->scheduled_out) {
		if (!kvm_pause_in_guest(vcpu->kvm))
			shrink_ple_window(vcpu);
	}

	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_load(vcpu, cpu);
}

/*
 * Put hook for @vcpu: release AVIC state when APICv is active, mark the
 * guest state as unloaded and account a host state reload.
 */
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_put(vcpu);

	svm_prepare_host_switch(vcpu);

	vcpu->stat.host_state_reload++;
}

/*
 * Return the guest RFLAGS from the VMCB.  While nmi_singlestep is active,
 * TF and RF are hidden unless they were set in the RFLAGS value saved in
 * nmi_singlestep_guest_rflags.
 */
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long flags = svm->vmcb->save.rflags;
	unsigned long hidden = 0;

	if (!svm->nmi_singlestep)
		return flags;

	/* Hide our flags if they were not set by the guest. */
	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
		hidden |= X86_EFLAGS_TF;
	if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
		hidden |= X86_EFLAGS_RF;

	return flags & ~hidden;
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (to_svm(vcpu)->nmi_singlestep)
  rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

       /*
        * Any change of EFLAGS.VM is accompanied by a reload of SS
        * (caused by either a task switch or an inter-privilege IRET),
        * so we do not need to update the CPL here.
        */
to_svm(vcpu)->vmcb->save.rflags = rflags;
}

/*
 * Report whether the guest's interrupt flag is set.  For SEV-ES guests the
 * answer comes from SVM_GUEST_INTERRUPT_MASK in the VMCB's int_state; for
 * all other guests it comes from RFLAGS.IF.
 */
static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
{
	struct vmcb *vmcb = to_svm(vcpu)->vmcb;

	if (sev_es_guest(vcpu->kvm))
		return vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK;

	return kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
}

/*
 * Mark @reg as available and, where needed, refresh its cached value.  Only
 * VCPU_EXREG_PDPTR is supported; any other register flags a KVM bug.
 */
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	kvm_register_mark_available(vcpu, reg);

	if (reg != VCPU_EXREG_PDPTR) {
		KVM_BUG_ON(1, vcpu->kvm);
		return;
	}

	/*
	 * When !npt_enabled, mmu->pdptrs[] is already available since
	 * it is always updated per SDM when moving to CRs.
	 */
	if (npt_enabled)
		load_pdptrs(vcpu, kvm_read_cr3(vcpu));
}

/*
 * Request a VINTR intercept and, if the intercept is in effect, program a
 * dummy virtual interrupt (vector 0, priority 0xf) so that a #VMEXIT occurs
 * when the guest can take interrupts.
 */
static void svm_set_vintr(struct vcpu_svm *svm)
{
	struct vmcb_control_area *ctl;

	/* The fields programmed below are ignored when AVIC is enabled. */
	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));

	svm_set_intercept(svm, INTERCEPT_VINTR);

	/*
	 * Recalculating intercepts may have cleared the VINTR intercept.  If
	 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
	 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
	 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
	 * interrupts will never be unblocked while L2 is running.
	 */
	if (!svm_is_intercept(svm, INTERCEPT_VINTR))
		return;

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	ctl = &svm->vmcb->control;
	ctl->int_vector = 0x0;
	ctl->int_ctl = (ctl->int_ctl & ~V_INTR_PRIO_MASK) |
		       V_IRQ_MASK | (0xf << V_INTR_PRIO_SHIFT);
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static void svm_clear_vintr(struct vcpu_svm *svm)
{
svm_clr_intercept(svm, INTERCEPT_VINTR);

/* Drop int_ctl fields related to VINTR injection.  */
svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
if (is_guest_mode(&svm->vcpu)) {
  svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

  WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
   (svm->nested.ctl.int_ctl & V_TPR_MASK));

  svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
   V_IRQ_INJECTION_BITS_MASK;

  svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
}

vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;

switch (seg) {
case VCPU_SREG_CS: return &save->cs;
case VCPU_SREG_DS: return &save->ds;
case VCPU_SREG_ES: return &save->es;
case VCPU_SREG_FS: return &save01->fs;
case VCPU_SREG_GS: return &save01->gs;
case VCPU_SREG_SS: return &save->ss;
case VCPU_SREG_TR: return &save01->tr;
case VCPU_SREG_LDTR: return &save01->ldtr;
}
BUG();
return NULL;
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_seg *s = svm_seg(vcpu, seg);

return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
       struct kvm_segment *var, int seg)
{
struct vmcb_seg *s = svm_seg(vcpu, seg);

var->base = s->base;
var->limit = s->limit;
var->selector = s->selector;
var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;

/*
* AMD CPUs circa 2014 track the G bit for all segments except CS.
* However, the SVM spec states that the G bit is not observed by the
* CPU, and some VMware virtual CPUs drop the G bit for all segments.
* So let's synthesize a legal G bit for all segments, this helps
* running KVM nested. It also helps cross-vendor migration, because
* Intel's vmentry has a check on the 'G' bit.
*/
var->g = s->limit > 0xfffff;

/*
* AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
var->unusable = !var->present;

switch (seg) {
case VCPU_SREG_TR:
  /*
* Work around a bug where the busy flag in the tr selector
* isn't exposed
*/
  var->type |= 0x2;
  break;
case VCPU_SREG_DS:
case VCPU_SREG_ES:
case VCPU_SREG_FS:
case VCPU_SREG_GS:
  /*
* The accessed bit must always be set in the segment
* descriptor cache, although it can be cleared in the
* descriptor, the cached bit always remains at 1. Since
* Intel has a check on this, set it here to support
* cross-vendor migration.
*/
  if (!var->unusable)
   var->type |= 0x1;
  break;
case VCPU_SREG_SS:
  /*
* On AMD CPUs sometimes the DB bit in the segment
* descriptor is left as 1, although the whole segment has
* been made unusable. Clear it here to pass an Intel VMX
* entry check when cross vendor migrating.
*/
  if (var->unusable)
   var->db = 0;
  /* This is symmetric with svm_set_segment() */
  var->dpl = to_svm(vcpu)->vmcb->save.cpl;
  break;
}
}

/* Return the CPL recorded in the current VMCB's save area. */
static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
	return to_svm(vcpu)->vmcb->save.cpl;
}

static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
struct kvm_segment cs;

svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
*db = cs.db;
*l = cs.l;
}

static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);

dt->size = svm->vmcb->save.idtr.limit;
dt->address = svm->vmcb->save.idtr.base;
}

static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);

svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);

dt->size = svm->vmcb->save.gdtr.limit;
dt->address = svm->vmcb->save.gdtr.base;
}

static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);

svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct vcpu_svm *svm = to_svm(vcpu);

/*
* For guests that don't set guest_state_protected, the cr3 update is
* handled via kvm_mmu_load() while entering the guest. For guests
* that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
* VMCB save area now, since the save area will become the initial
* contents of the VMSA, and future VMCB save area updates won't be
* seen.
*/
if (sev_es_guest(vcpu->kvm)) {
  svm->vmcb->save.cr3 = cr3;
  vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
}

/* CR0 validity hook: performs no SVM-specific check and accepts every @cr0. */
static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
return true;
}

/*
 * Install a new guest CR0.  @cr0 is what the guest sees (stored in
 * vcpu->arch.cr0); the value written to the VMCB ("hcr0") may differ: PG and
 * WP are forced on without NPT, and CD/NW are cleared under the
 * KVM_X86_QUIRK_CD_NW_CLEARED quirk.  Except for SEV-ES guests, CR0
 * read/write intercepts are enabled exactly when the two values differ.
 */
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 hcr0 = cr0;
/* Sampled before vcpu->arch.cr0 is updated below. */
bool old_paging = is_paging(vcpu);

#ifdef CONFIG_X86_64
/* Track long-mode activation/deactivation when paging is toggled. */
if (vcpu->arch.efer & EFER_LME) {
  if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
   vcpu->arch.efer |= EFER_LMA;
   if (!vcpu->arch.guest_state_protected)
    svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
  }

  if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
   vcpu->arch.efer &= ~EFER_LMA;
   if (!vcpu->arch.guest_state_protected)
    svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
  }
}
#endif
vcpu->arch.cr0 = cr0;

if (!npt_enabled) {
  hcr0 |= X86_CR0_PG | X86_CR0_WP;
  /* svm_set_cr4() depends on is_paging(), so redo it on a change. */
  if (old_paging != is_paging(vcpu))
   svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
}

/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
* reboot
*/
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
  hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

svm->vmcb->save.cr0 = hcr0;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);

/*
* SEV-ES guests must always keep the CR intercepts cleared. CR
* tracking is done using the CR write traps.
*/
if (sev_es_guest(vcpu->kvm))
  return;

if (hcr0 == cr0) {
  /* Selective CR0 write remains on.  */
  svm_clr_intercept(svm, INTERCEPT_CR0_READ);
  svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
  /* Guest-visible and hardware CR0 differ: intercept all accesses. */
  svm_set_intercept(svm, INTERCEPT_CR0_READ);
  svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
}
}

/* CR4 validity hook: performs no SVM-specific check and accepts every @cr4. */
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
return true;
}

void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = vcpu->arch.cr4;

vcpu->arch.cr4 = cr4;
if (!npt_enabled) {
  cr4 |= X86_CR4_PAE;

  if (!is_paging(vcpu))
   cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
}
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);

if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
  vcpu->arch.cpuid_dynamic_bits_dirty = true;
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
       struct kvm_segment *var, int seg)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_seg *s = svm_seg(vcpu, seg);

s->base = var->base;
s->limit = var->limit;
s->selector = var->selector;
s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

/*
* This is always accurate, except if SYSRET returned to a segment
* with SS.DPL != 3.  Intel does not have this quirk, and always
* forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
* would entail passing the CPL to userspace and back.
*/
if (seg == VCPU_SREG_SS)
  /* This is symmetric with svm_get_segment() */
  svm->vmcb->save.cpl = (var->dpl & 3);

vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}

/*
 * Recompute the #BP intercept: it is set only when guest debugging is
 * enabled and software breakpoints are in use, and cleared otherwise.
 */
static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	clr_exception_intercept(svm, BP_VECTOR);

	if ((vcpu->guest_debug & KVM_GUESTDBG_ENABLE) &&
	    (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP))
		set_exception_intercept(svm, BP_VECTOR);
}

static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
if (sd->next_asid > sd->max_asid) {
  ++sd->asid_generation;
  sd->next_asid = sd->min_asid;
  svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
  vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}

svm->current_vmcb->asid_generation = sd->asid_generation;
svm->asid = sd->next_asid++;
}

/*
 * Write @value to DR6 in the VMCB, marking the debug-register state dirty
 * only when the value actually changes.  Nothing is done for guests with
 * protected state.
 */
static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
	struct vmcb *vmcb = to_svm(vcpu)->vmcb;

	if (vcpu->arch.guest_state_protected)
		return;

	if (likely(value == vmcb->save.dr6))
		return;

	vmcb->save.dr6 = value;
	vmcb_mark_dirty(vmcb, VMCB_DR);
}

static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
  return;

get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
get_debugreg(vcpu->arch.db[3], 3);
/*
* We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
* because db_interception might need it.  We can do it before vmentry.
*/
vcpu->arch.dr6 = svm->vmcb->save.dr6;
vcpu->arch.dr7 = svm->vmcb->save.dr7;
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
set_dr_intercepts(svm);
}

static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);

if (vcpu->arch.guest_state_protected)
  return;

svm->vmcb->save.dr7 = value;
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}

static int pf_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;

return kvm_handle_page_fault(vcpu, error_code, fault_address,
   static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
   svm->vmcb->control.insn_bytes : NULL,
   svm->vmcb->control.insn_len);
}

static int npf_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int rc;

u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;

/*
* WARN if hardware generates a fault with an error code that collides
* with KVM-defined sythentic flags.  Clear the flags and continue on,
* i.e. don't terminate the VM, as KVM can't possibly be relying on a
* flag that KVM doesn't know about.
*/
if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
  error_code &= ~PFERR_SYNTHETIC_MASK;

if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
  error_code |= PFERR_PRIVATE_ACCESS;

trace_kvm_page_fault(vcpu, fault_address, error_code);
rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
    static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
    svm->vmcb->control.insn_bytes : NULL,
    svm->vmcb->control.insn_len);

if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
  sev_handle_rmp_fault(vcpu, fault_address, error_code);

return rc;
}

/*
 * #DB intercept handler.
 *
 * If neither userspace debugging (single-step / hardware breakpoints) nor
 * KVM's own NMI single-step is active, the #DB is queued for the guest and
 * 1 is returned.  Otherwise NMI single-step is torn down if it was active,
 * and the exit is reported to userspace as KVM_EXIT_DEBUG (return 0) when
 * userspace debugging is enabled; if not, 1 is returned.
 */
static int db_interception(struct kvm_vcpu *vcpu)
{
	const unsigned long dbg_mask = KVM_GUESTDBG_SINGLESTEP |
				       KVM_GUESTDBG_USE_HW_BP;
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *run = vcpu->run;

	if (!(vcpu->guest_debug & dbg_mask) && !svm->nmi_singlestep) {
		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;

		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
		return 1;
	}

	if (svm->nmi_singlestep) {
		disable_nmi_singlestep(svm);
		/* Make sure we check for pending NMIs upon entry */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	if (!(vcpu->guest_debug & dbg_mask))
		return 1;

	run->exit_reason = KVM_EXIT_DEBUG;
	run->debug.arch.dr6 = svm->vmcb->save.dr6;
	run->debug.arch.dr7 = svm->vmcb->save.dr7;
	run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
	run->debug.arch.exception = DB_VECTOR;

	return 0;
}

static int bp_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;

kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
kvm_run->debug.arch.exception = BP_VECTOR;
return 0;
}

/* #UD intercept handler: delegates entirely to handle_ud(). */
static int ud_interception(struct kvm_vcpu *vcpu)
{
return handle_ud(vcpu);
}

/*
 * #AC intercept handler: queue an #AC with error code 0 for the guest and
 * return 1.
 */
static int ac_interception(struct kvm_vcpu *vcpu)
{
kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
return 1;
}

/*
 * Detect and clean up after AMD erratum 383.
 *
 * Returns false if the erratum is not present on this system, if
 * MC0_STATUS cannot be read, or if it does not carry the erratum's
 * signature.  Otherwise clears the MCi_STATUS registers of banks 0-5, clears
 * bit 2 of MCG_STATUS, flushes the TLB and returns true.
 */
static bool is_erratum_383(void)
{
	u64 status;
	int bank;

	if (!erratum_383_found)
		return false;

	if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &status))
		return false;

	/* Bit 62 may or may not be set for this mce */
	if ((status & ~(1ULL << 62)) != 0xb600000000010015ULL)
		return false;

	/* Clear MCi_STATUS registers */
	for (bank = 0; bank < 6; bank++)
		native_write_msr_safe(MSR_IA32_MCx_STATUS(bank), 0);

	if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &status)) {
		status &= ~(1ULL << 2);
		native_write_msr_safe(MSR_IA32_MCG_STATUS, status);
	}

	/* Flush tlb to evict multi-match entries */
	__flush_tlb_all();

	return true;
}

/*
 * Handle a machine check taken while running @vcpu: either forward it to
 * the host's machine check handling, or, if it was caused by erratum 383,
 * request a triple fault for the guest.
 */
static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
	if (!is_erratum_383()) {
		/*
		 * On an #MC intercept the MCE handler is not called
		 * automatically in the host. So do it by hand here.
		 */
		kvm_machine_check();
		return;
	}

	/*
	 * Erratum 383 triggered. Guest state is corrupt so kill the
	 * guest.
	 */
	pr_err("Guest triggered AMD Erratum 383\n");

	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
}

/*
 * #MC intercept handler: does no work itself and returns 1.
 * NOTE(review): presumably the machine check is processed via
 * svm_handle_mce() on another path - confirm against the caller.
 */
static int mc_interception(struct kvm_vcpu *vcpu)
{
return 1;
}

/*
 * SHUTDOWN intercept handler: for non-SEV-ES guests, wipe the VMCB, leave
 * SMM if necessary and INIT the vCPU; then report KVM_EXIT_SHUTDOWN to
 * userspace.  Always returns 0.
 */
static int shutdown_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *run = vcpu->run;

	/*
	 * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
	 * userspace.  At a platform view, INIT is acceptable behavior as
	 * there exist bare metal platforms that automatically INIT the CPU
	 * in response to shutdown.
	 *
	 * The VM save area for SEV-ES guests has already been encrypted so it
	 * cannot be reinitialized, i.e. synthesizing INIT is futile.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto exit_to_userspace;

	clear_page(svm->vmcb);
#ifdef CONFIG_KVM_SMM
	if (is_smm(vcpu))
		kvm_smm_changed(vcpu, false);
#endif
	kvm_vcpu_reset(vcpu, true);

exit_to_userspace:
	run->exit_reason = KVM_EXIT_SHUTDOWN;
	return 0;
}

/*
 * IOIO intercept handler.  Decodes port, size and direction from
 * exit_info_1.  String I/O goes to the SEV-ES helper or to the instruction
 * emulator; everything else takes the fast PIO path, with the next RIP taken
 * from exit_info_2.
 */
static int io_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	/* Original author's open question: address size bug? */
	u32 io_info = svm->vmcb->control.exit_info_1;
	unsigned int port = io_info >> 16;
	int size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
	int in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
	int string = (io_info & SVM_IOIO_STR_MASK) != 0;

	++vcpu->stat.io_exits;

	if (!string) {
		svm->next_rip = svm->vmcb->control.exit_info_2;
		return kvm_fast_pio(vcpu, size, port, in);
	}

	if (sev_es_guest(vcpu->kvm))
		return sev_es_string_io(svm, size, port, in);

	return kvm_emulate_instruction(vcpu, 0);
}

/* NMI intercept handler: does no work in this function and returns 1. */
static int nmi_interception(struct kvm_vcpu *vcpu)
{
return 1;
}

/* SMI intercept handler: does no work in this function and returns 1. */
static int smi_interception(struct kvm_vcpu *vcpu)
{
return 1;
}

/* INTR intercept handler: account the IRQ exit and return 1. */
static int intr_interception(struct kvm_vcpu *vcpu)
{
	vcpu->stat.irq_exits++;

	return 1;
}

/*
 * Common handler for intercepted VMLOAD (@vmload true) and VMSAVE
 * (@vmload false).  Maps the guest page addressed by RAX as vmcb12, skips
 * the instruction, and copies the VMLOAD/VMSAVE state in the requested
 * direction.  On VMLOAD the upper halves of the SYSENTER EIP/ESP shadows are
 * zeroed.
 *
 * Returns 1 if the permission check fails or the page cannot be mapped (a
 * #GP is injected when the mapping fails with -EINVAL); otherwise returns
 * the result of kvm_skip_emulated_instruction().
 */
static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map vmcb12_map;
	struct vmcb *vmcb12;
	int rc;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &vmcb12_map);
	if (rc == -EINVAL)
		kvm_inject_gp(vcpu, 0);
	if (rc)
		return 1;

	vmcb12 = vmcb12_map.hva;

	rc = kvm_skip_emulated_instruction(vcpu);

	if (!vmload) {
		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
	} else {
		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
		svm->sysenter_eip_hi = 0;
		svm->sysenter_esp_hi = 0;
	}

	kvm_vcpu_unmap(vcpu, &vmcb12_map);

	return rc;
}

/* VMLOAD intercept handler: the load direction of the shared helper. */
static int vmload_interception(struct kvm_vcpu *vcpu)
{
return vmload_vmsave_interception(vcpu, true);
}

/* VMSAVE intercept handler: the save direction of the shared helper. */
static int vmsave_interception(struct kvm_vcpu *vcpu)
{
return vmload_vmsave_interception(vcpu, false);
}

/*
 * VMRUN intercept handler: returns 1 if the nested SVM permission check
 * fails, otherwise the result of nested_svm_vmrun().
 */
static int vmrun_interception(struct kvm_vcpu *vcpu)
{
	return nested_svm_check_permissions(vcpu) ? 1 : nested_svm_vmrun(vcpu);
}

/*
 * Decode results of svm_instr_opcode().  The non-zero values also index the
 * lookup tables in emulate_svm_instr().
 */
enum {
NONE_SVM_INSTR,
SVM_INSTR_VMRUN,
SVM_INSTR_VMLOAD,
SVM_INSTR_VMSAVE,
};

/*
 * Classify the instruction held in the vCPU's emulation context.  Only a
 * decode with ctxt->b == 0x1 and opcode_len == 2 is considered; its ModRM
 * byte then selects VMRUN (0xd8), VMLOAD (0xda) or VMSAVE (0xdb).  Anything
 * else yields NONE_SVM_INSTR.
 */
static int svm_instr_opcode(struct kvm_vcpu *vcpu)
{
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;

	if (ctxt->b == 0x1 && ctxt->opcode_len == 2) {
		switch (ctxt->modrm) {
		case 0xd8:
			return SVM_INSTR_VMRUN;
		case 0xda:
			return SVM_INSTR_VMLOAD;
		case 0xdb:
			return SVM_INSTR_VMSAVE;
		default:
			break;
		}
	}

	return NONE_SVM_INSTR;
}

/*
 * Emulate the SVM instruction identified by @opcode (an SVM_INSTR_* value
 * other than NONE_SVM_INSTR, which has no table entry).  In guest mode a
 * simple nested #VMEXIT with the matching exit code is synthesized;
 * otherwise the instruction's regular intercept handler is invoked.
 */
static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
{
	const int guest_mode_exit_codes[] = {
		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
	};
	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
		[SVM_INSTR_VMRUN] = vmrun_interception,
		[SVM_INSTR_VMLOAD] = vmload_interception,
		[SVM_INSTR_VMSAVE] = vmsave_interception,
	};
	struct vcpu_svm *svm = to_svm(vcpu);
	int rc;

	if (!is_guest_mode(vcpu))
		return svm_instr_handlers[opcode](vcpu);

	/* Returns '1' or -errno on failure, '0' on success. */
	rc = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);

	return rc ? rc : 1;
}

/*
 * #GP handling code. Note that #GP can be triggered under the following two
 * cases:
 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
 *      some AMD CPUs when EAX of these instructions are in the reserved memory
 *      regions (e.g. SMM memory on host).
 *   2) VMware backdoor
 *
 * Every other #GP is re-injected into the guest with its original error
 * code.
 */
static int gp_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 error_code = svm->vmcb->control.exit_info_1;
	int opcode;

	/* Both #GP cases have zero error_code */
	if (error_code)
		goto reinject;

	/* Decode the instruction for usage later */
	if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
		goto reinject;

	opcode = svm_instr_opcode(vcpu);

	if (opcode != NONE_SVM_INSTR) {
		/* All SVM instructions expect page aligned RAX */
		if (svm->vmcb->save.rax & ~PAGE_MASK)
			goto reinject;

		return emulate_svm_instr(vcpu, opcode);
	}

	/*
	 * VMware backdoor emulation on #GP interception only handles
	 * IN{S}, OUT{S}, and RDPMC, and is not attempted in guest mode.
	 */
	if (enable_vmware_backdoor && !is_guest_mode(vcpu))
		return kvm_emulate_instruction(vcpu,
			EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);

reinject:
	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
	return 1;
}

/*
 * Set or clear the global interrupt flag for @svm, adjusting the STGI and
 * VINTR intercepts to match and requesting event re-evaluation when GIF is
 * set while something is pending.
 */
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!value) {
		disable_gif(svm);

		/*
		 * After a CLGI no interrupts should come.  But if vGIF is
		 * in use, we still rely on the VINTR intercept (rather than
		 * STGI) to detect an open interrupt window.
		 */
		if (!vgif)
			svm_clear_vintr(svm);
		return;
	}

	/*
	 * If VGIF is enabled, the STGI intercept is only added to
	 * detect the opening of the SMI/NMI window; remove it now.
	 * Likewise, clear the VINTR intercept, we will set it
	 * again while processing KVM_REQ_EVENT if needed.
	 */
	if (vgif)
		svm_clr_intercept(svm, INTERCEPT_STGI);
	if (svm_is_intercept(svm, INTERCEPT_VINTR))
		svm_clear_vintr(svm);

	enable_gif(svm);

	if (vcpu->arch.smi_pending ||
	    vcpu->arch.nmi_pending ||
	    kvm_cpu_has_injectable_intr(vcpu) ||
	    kvm_apic_has_pending_init_or_sipi(vcpu))
		kvm_make_request(KVM_REQ_EVENT, vcpu);
}

/*
 * STGI intercept handler: after the nested SVM permission check, skip the
 * instruction and set GIF.  Returns 1 if the check fails, otherwise the
 * result of kvm_skip_emulated_instruction().
 */
static int stgi_interception(struct kvm_vcpu *vcpu)
{
	int skipped;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	skipped = kvm_skip_emulated_instruction(vcpu);
	svm_set_gif(to_svm(vcpu), true);

	return skipped;
}

/*
 * CLGI intercept handler: after the nested SVM permission check, skip the
 * instruction and clear GIF.  Returns 1 if the check fails, otherwise the
 * result of kvm_skip_emulated_instruction().
 */
static int clgi_interception(struct kvm_vcpu *vcpu)
{
	int skipped;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	skipped = kvm_skip_emulated_instruction(vcpu);
	svm_set_gif(to_svm(vcpu), false);

	return skipped;
}

/*
 * INVLPGA intercept handler: takes the address from RAX (truncated to 32
 * bits outside long mode) and the ASID from RCX, traces the request, and
 * invalidates the address exactly like INVLPG; the ASID is only traced.
 */
static int invlpga_interception(struct kvm_vcpu *vcpu)
{
	gva_t gva = kvm_rax_read(vcpu);
	u32 asid = kvm_rcx_read(vcpu);

	/* FIXME: Handle an address size prefix. */
	if (!is_long_mode(vcpu))
		gva = (u32)gva;

	trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);

	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
	kvm_mmu_invlpg(vcpu, gva);

	return kvm_skip_emulated_instruction(vcpu);
}

static int skinit_interception(struct kvm_vcpu *vcpu)
{
trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));

kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}

/*
 * Task switch intercept handler.
 *
 * Derives the TSS selector from exit_info_1 and the task switch reason from
 * exit_info_2 / exit_int_info, drops the event that was being delivered
 * when the switch happened through a gate, skips the triggering instruction
 * where required, and finally hands over to kvm_task_switch().
 *
 * Returns 0 if skipping the instruction fails, otherwise the result of
 * kvm_task_switch().
 */
static int task_switch_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u16 tss_selector;
int reason;
/* NOTE: int_type and type below hold the same masked value. */
int int_type = svm->vmcb->control.exit_int_info &
  SVM_EXITINTINFO_TYPE_MASK;
int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
uint32_t type =
  svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
uint32_t idt_v =
  svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
bool has_error_code = false;
u32 error_code = 0;

tss_selector = (u16)svm->vmcb->control.exit_info_1;

/* Reason priority: IRET, then JMP, then gate (valid exit_int_info), else CALL. */
if (svm->vmcb->control.exit_info_2 &
     (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
  reason = TASK_SWITCH_IRET;
else if (svm->vmcb->control.exit_info_2 &
   (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
  reason = TASK_SWITCH_JMP;
else if (idt_v)
  reason = TASK_SWITCH_GATE;
else
  reason = TASK_SWITCH_CALL;

/* For a gate, cancel the pending event of the type that was in flight. */
if (reason == TASK_SWITCH_GATE) {
  switch (type) {
  case SVM_EXITINTINFO_TYPE_NMI:
   vcpu->arch.nmi_injected = false;
   break;
  case SVM_EXITINTINFO_TYPE_EXEPT:
   if (svm->vmcb->control.exit_info_2 &
       (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
    has_error_code = true;
    /* The error code is the low 32 bits of exit_info_2. */
    error_code =
     (u32)svm->vmcb->control.exit_info_2;
   }
   kvm_clear_exception_queue(vcpu);
   break;
  case SVM_EXITINTINFO_TYPE_INTR:
  case SVM_EXITINTINFO_TYPE_SOFT:
   kvm_clear_interrupt_queue(vcpu);
   break;
  default:
   break;
  }
}

/*
* Skip the instruction unless the switch came through a gate for an event
* other than a software interrupt or an #OF/#BP exception.
*/
if (reason != TASK_SWITCH_GATE ||
     int_type == SVM_EXITINTINFO_TYPE_SOFT ||
     (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
      (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
  if (!svm_skip_emulated_instruction(vcpu))
   return 0;
}

/* Only software interrupts pass a vector on; everything else passes -1. */
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
  int_vec = -1;

return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
          has_error_code, error_code);
}

/* Clear the IRET intercept; a no-op for SEV-ES guests. */
static void svm_clr_iret_intercept(struct vcpu_svm *svm)
{
	if (sev_es_guest(svm->vcpu.kvm))
		return;

	svm_clr_intercept(svm, INTERCEPT_IRET);
}

/* Set the IRET intercept; a no-op for SEV-ES guests. */
static void svm_set_iret_intercept(struct vcpu_svm *svm)
{
	if (sev_es_guest(svm->vcpu.kvm))
		return;

	svm_set_intercept(svm, INTERCEPT_IRET);
}

static int iret_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);

WARN_ON_ONCE(sev_es_guest(vcpu->kvm));

++vcpu->stat.nmi_window_exits;
svm->awaiting_iret_completion = true;

svm_clr_iret_intercept(svm);
svm->nmi_iret_rip = kvm_rip_read(vcpu);

kvm_make_request(KVM_REQ_EVENT, vcpu);
--> --------------------

--> maximum size reached

--> --------------------

Messung V0.5

¤ Dauer der Verarbeitung: 0.26 Sekunden ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.