arch/x86/kvm/svm/svm.c
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "smm.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>
#include <linux/mutex.h>

#include <asm/apic.h>
#include <asm/msr.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/reboot.h>
#include <asm/fpu/api.h>

#include <trace/events/ipi.h>

#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
        X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

static bool erratum_383_found __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);

/*
 * These two parameters configure the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
 *      count value. On VMRUN this value is loaded into an internal counter.
 *      Each time a pause instruction is executed, this counter is decremented
 *      until it reaches zero, at which time a #VMEXIT is generated if pause
 *      intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *      Intercept Filtering for more details.
 *      This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *      upper bound on the amount of time a guest is allowed to execute in a
 *      pause loop.  In this mode, a 16-bit pause filter threshold field is
 *      added to the VMCB. The threshold value is a cycle count that is used
 *      to reset the pause counter. As with simple pause filtering, VMRUN
 *      loads the pause count value from the VMCB into an internal counter.
 *      Then, on each pause instruction the hardware checks the elapsed number
 *      of cycles since the most recent pause instruction against the pause
 *      filter threshold. If the elapsed cycle count is greater than the pause
 *      filter threshold, then the internal pause count is reloaded from the
 *      VMCB and execution continues. If the elapsed cycle count is less than
 *      the pause filter threshold, then the internal pause count is
 *      decremented. If the count value is less than zero and PAUSE intercept
 *      is enabled, a #VMEXIT is triggered. If advanced pause filtering is
 *      supported and the pause filter threshold field is set to zero, the
 *      filter will operate in the simpler, count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/*
 * Use nested page tables by default.  Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, 0444);

/* enable/disable Next RIP Save */
int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
int lbrv = true;
module_param(lbrv, int, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

module_param(enable_device_posted_irqs, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

bool vnmi = true;
module_param(vnmi, bool, 0444);

module_param(enable_mediated_pmu, bool, 0444);

static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

DEFINE_PER_CPU(struct svm_cpu_data, svm_data);

static DEFINE_MUTEX(vmcb_dump_mutex);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
int tsc_aux_uret_slot __ro_after_init = -1;

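/*
 * Select the root level for NPT page tables: 5-level when the host kernel
 * itself runs with 5-level paging, otherwise 4-level (PAE on 32-bit hosts).
 */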
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
        return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
        return PT32E_ROOT_LEVEL;
#endif
}

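/*
 * Propagate a guest EFER write into the VMCB.  Without NPT, shadow paging
 * needs NX and ties LME to LMA.  Toggling EFER.SVME allocates or frees the
 * nested virtualization state and updates the #GP intercept used for the
 * SVM instruction erratum workaround.  EFER.SVME is forced on in the VMCB's
 * copy of EFER.
 */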
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 old_efer = vcpu->arch.efer;
        vcpu->arch.efer = efer;

        if (!npt_enabled) {
                /* Shadow paging assumes NX to be available.  */
                efer |= EFER_NX;

                if (!(efer & EFER_LMA))
                        efer &= ~EFER_LME;
        }

        if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
                if (!(efer & EFER_SVME)) {
                        svm_leave_nested(vcpu);
                        /* #GP intercept is still needed for vmware backdoor */
                        if (!enable_vmware_backdoor)
                                clr_exception_intercept(svm, GP_VECTOR);

                        /*
                         * Free the nested guest state, unless we are in SMM.
                         * In this case we will return to the nested guest
                         * as soon as we leave SMM.
                         */
                        if (!is_smm(vcpu))
                                svm_free_nested(svm);

                } else {
                        int ret = svm_allocate_nested(svm);

                        if (ret) {
                                vcpu->arch.efer = old_efer;
                                return ret;
                        }

                        /*
                         * Never intercept #GP for SEV guests, KVM can't
                         * decrypt guest memory to work around the erratum.
                         */
                        if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
                                set_exception_intercept(svm, GP_VECTOR);
                }
        }

        svm->vmcb->save.efer = efer | EFER_SVME;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
        return 0;
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 ret = 0;

        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
        return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (mask == 0)
                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
        else
                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}

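/*
 * Skip the instruction that triggered the current exit.  Use the hardware
 * provided NextRIP when available, else fall back to the emulator.  If
 * @commit_side_effects is false, RFLAGS is preserved and the interrupt
 * shadow is left untouched so the skip can be used purely to compute the
 * next RIP.
 */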
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
                                           int emul_type,
                                           bool commit_side_effects)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_rflags;

        /*
         * SEV-ES does not expose the next RIP. The RIP update is controlled by
         * the type of exit and the #VC handler in the guest.
         */
        if (sev_es_guest(vcpu->kvm))
                goto done;

        if (nrips && svm->vmcb->control.next_rip != 0) {
                WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
                svm->next_rip = svm->vmcb->control.next_rip;
        }

        if (!svm->next_rip) {
                if (unlikely(!commit_side_effects))
                        old_rflags = svm->vmcb->save.rflags;

                if (!kvm_emulate_instruction(vcpu, emul_type))
                        return 0;

                if (unlikely(!commit_side_effects))
                        svm->vmcb->save.rflags = old_rflags;
        } else {
                kvm_rip_write(vcpu, svm->next_rip);
        }

done:
        if (likely(commit_side_effects))
                svm_set_interrupt_shadow(vcpu, 0);

        return 1;
}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}

static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
        const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
                              EMULTYPE_SET_SOFT_INT_VECTOR(vector);
        unsigned long rip, old_rip = kvm_rip_read(vcpu);
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * Due to architectural shortcomings, the CPU doesn't always provide
         * NextRIP, e.g. if KVM intercepted an exception that occurred while
         * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
         * the instruction even if NextRIP is supported to acquire the next
         * RIP so that it can be shoved into the NextRIP field, otherwise
         * hardware will fail to advance guest RIP during event injection.
         * Drop the exception/interrupt if emulation fails and effectively
         * retry the instruction, it's the least awful option.  If NRIPS is
         * in use, the skip must not commit any side effects such as clearing
         * the interrupt shadow or RFLAGS.RF.
         */
        if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
                return -EIO;

        rip = kvm_rip_read(vcpu);

        /*
         * Save the injection information, even when using next_rip, as the
         * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
         * doesn't complete due to a VM-Exit occurring while the CPU is
         * vectoring the event.   Decoding the instruction isn't guaranteed to
         * work as there may be no backing instruction, e.g. if the event is
         * being injected by L1 for L2, or if the guest is patching INT3 into
         * a different instruction.
         */
        svm->soft_int_injected = true;
        svm->soft_int_csbase = svm->vmcb->save.cs.base;
        svm->soft_int_old_rip = old_rip;
        svm->soft_int_next_rip = rip;

        if (nrips)
                kvm_rip_write(vcpu, old_rip);

        if (static_cpu_has(X86_FEATURE_NRIPS))
                svm->vmcb->control.next_rip = rip;

        return 0;
}

static void svm_inject_exception(struct kvm_vcpu *vcpu)
{
        struct kvm_queued_exception *ex = &vcpu->arch.exception;
        struct vcpu_svm *svm = to_svm(vcpu);

        kvm_deliver_exception_payload(vcpu, ex);

        if (kvm_exception_is_soft(ex->vector) &&
            svm_update_soft_interrupt_rip(vcpu, ex->vector))
                return;

        svm->vmcb->control.event_inj = ex->vector
                | SVM_EVTINJ_VALID
                | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
                | SVM_EVTINJ_TYPE_EXEPT;
        svm->vmcb->control.event_inj_err = ex->error_code;
}

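/*
 * Work around AMD erratum 383: set bit 47 of MSR_AMD64_DC_CFG on affected
 * CPUs and remember that the erratum is present so that the machine check
 * handler can recognize the MCEs it generates.
 */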
static void svm_init_erratum_383(void)
{
        u64 val;

        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
                return;

        /* Use _safe variants to not break nested virtualization */
        if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val))
                return;

        val |= (1ULL << 47);

        native_write_msr_safe(MSR_AMD64_DC_CFG, val);

        erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
        /*
         * Guests should see errata 400 and 415 as fixed (assuming that
         * HLT and IO instructions are intercepted).
         */
        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
        vcpu->arch.osvw.status = osvw_status & ~(6ULL);

        /*
         * By increasing VCPU's osvw.length to 3 we are telling the guest that
         * all osvw.status bits inside that length, including bit 0 (which is
         * reserved for erratum 298), are valid. However, if host processor's
         * osvw_len is 0 then osvw_status[0] carries no information. We need to
         * be conservative here and therefore we tell the guest that erratum 298
         * is present (because we really don't know).
         */
        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
                vcpu->arch.osvw.status |= 1;
}

static bool __kvm_is_svm_supported(void)
{
        int cpu = smp_processor_id();
        struct cpuinfo_x86 *c = &cpu_data(cpu);

        if (c->x86_vendor != X86_VENDOR_AMD &&
            c->x86_vendor != X86_VENDOR_HYGON) {
                pr_err("CPU %d isn't AMD or Hygon\n", cpu);
                return false;
        }

        if (!cpu_has(c, X86_FEATURE_SVM)) {
                pr_err("SVM not supported by CPU %d\n", cpu);
                return false;
        }

        if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
                pr_info("KVM is unsupported when running as an SEV guest\n");
                return false;
        }

        return true;
}

static bool kvm_is_svm_supported(void)
{
        bool supported;

        migrate_disable();
        supported = __kvm_is_svm_supported();
        migrate_enable();

        return supported;
}

static int svm_check_processor_compat(void)
{
        if (!__kvm_is_svm_supported())
                return -EIO;

        return 0;
}

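/* Write the TSC ratio MSR only if it differs from this CPU's cached value. */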
static void __svm_write_tsc_multiplier(u64 multiplier)
{
        if (multiplier == __this_cpu_read(current_tsc_ratio))
                return;

        wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
        __this_cpu_write(current_tsc_ratio, multiplier);
}

static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
        return &sd->save_area->host_sev_es_save;
}

static inline void kvm_cpu_svm_disable(void)
{
        uint64_t efer;

        wrmsrq(MSR_VM_HSAVE_PA, 0);
        rdmsrq(MSR_EFER, efer);
        if (efer & EFER_SVME) {
                /*
                 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
                 * NMI aren't blocked.
                 */
                stgi();
                wrmsrq(MSR_EFER, efer & ~EFER_SVME);
        }
}

static void svm_emergency_disable_virtualization_cpu(void)
{
        kvm_rebooting = true;

        kvm_cpu_svm_disable();
}

static void svm_disable_virtualization_cpu(void)
{
        /* Make sure we clean up behind us */
        if (tsc_scaling)
                __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);

        kvm_cpu_svm_disable();

        amd_pmu_disable_virt();
}

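/*
 * Per-CPU hardware enable: set EFER.SVME, point VM_HSAVE_PA at this CPU's
 * host save area, initialize ASID allocation, reset the TSC ratio, and
 * accumulate the worst-case OSVW (OS Visible Workaround) bits across CPUs.
 */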
static int svm_enable_virtualization_cpu(void)
{

        struct svm_cpu_data *sd;
        uint64_t efer;
        int me = raw_smp_processor_id();

        rdmsrq(MSR_EFER, efer);
        if (efer & EFER_SVME)
                return -EBUSY;

        sd = per_cpu_ptr(&svm_data, me);
        sd->asid_generation = 1;
        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
        sd->next_asid = sd->max_asid + 1;
        sd->min_asid = max_sev_asid + 1;

        wrmsrq(MSR_EFER, efer | EFER_SVME);

        wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);

        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
                /*
                 * Set the default value, even if we don't use TSC scaling,
                 * to avoid leaving a stale value in the MSR.
                 */
                __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
        }


        /*
         * Get OSVW bits.
         *
         * Note that it is possible to have a system with mixed processor
         * revisions and therefore different OSVW bits. If bits are not the same
         * on different processors then choose the worst case (i.e. if erratum
         * is present on one processor and not on another then assume that the
         * erratum is present everywhere).
         */
        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
                u64 len, status = 0;
                int err;

                err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
                if (!err)
                        err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);

                if (err)
                        osvw_status = osvw_len = 0;
                else {
                        if (len < osvw_len)
                                osvw_len = len;
                        osvw_status |= status;
                        osvw_status &= (1ULL << osvw_len) - 1;
                }
        } else
                osvw_status = osvw_len = 0;

        svm_init_erratum_383();

        amd_pmu_enable_virt();

        return 0;
}

static void svm_cpu_uninit(int cpu)
{
        struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);

        if (!sd->save_area)
                return;

        kfree(sd->sev_vmcbs);
        __free_page(__sme_pa_to_page(sd->save_area_pa));
        sd->save_area_pa = 0;
        sd->save_area = NULL;
}

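/*
 * Allocate this CPU's host save area (from an SNP-safe page) and per-CPU SEV
 * data.  The save area's physical address is later programmed into
 * MSR_VM_HSAVE_PA when virtualization is enabled on the CPU.
 */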
static int svm_cpu_init(int cpu)
{
        struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
        struct page *save_area_page;
        int ret = -ENOMEM;

        memset(sd, 0, sizeof(struct svm_cpu_data));
        save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
        if (!save_area_page)
                return ret;

        ret = sev_cpu_init(sd);
        if (ret)
                goto free_save_area;

        sd->save_area = page_address(save_area_page);
        sd->save_area_pa = __sme_page_pa(save_area_page);
        return 0;

free_save_area:
        __free_page(save_area_page);
        return ret;

}

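/* Intercept all DR0-DR7 reads and writes in vmcb01 (the L1 VMCB). */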
static void set_dr_intercepts(struct vcpu_svm *svm)
{
        struct vmcb *vmcb = svm->vmcb01.ptr;

        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
        vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);

        recalc_intercepts(svm);
}

static void clr_dr_intercepts(struct vcpu_svm *svm)
{
        struct vmcb *vmcb = svm->vmcb01.ptr;

        vmcb->control.intercepts[INTERCEPT_DR] = 0;

        recalc_intercepts(svm);
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
        /*
         * For the non-nested case: if the L01 MSR bitmap does not intercept
         * the MSR, then we need to save it.
         *
         * For the nested case: if the L02 MSR bitmap does not intercept the
         * MSR, then we need to save it.
         */
        void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
                                            to_svm(vcpu)->msrpm;

        return svm_test_msr_bitmap_write(msrpm, msr);
}

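/*
 * Set or clear read/write interception of @msr in the vCPU's MSR permission
 * bitmap.  Interception is never cleared for accesses that a userspace MSR
 * filter wants to handle, and any change forces the nested (merged) bitmap
 * to be recalculated.
 */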
void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        void *msrpm = svm->msrpm;

        /* Don't disable interception for MSRs userspace wants to handle. */
        if (type & MSR_TYPE_R) {
                if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
                        svm_clear_msr_bitmap_read(msrpm, msr);
                else
                        svm_set_msr_bitmap_read(msrpm, msr);
        }

        if (type & MSR_TYPE_W) {
                if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
                        svm_clear_msr_bitmap_write(msrpm, msr);
                else
                        svm_set_msr_bitmap_write(msrpm, msr);
        }

        svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
        svm->nested.force_msr_bitmap_recalc = true;
}

void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
        unsigned int order = get_order(size);
        struct page *pages = alloc_pages(gfp_mask, order);
        void *pm;

        if (!pages)
                return NULL;

        /*
         * Set all bits in the permissions map so that all MSR and I/O accesses
         * are intercepted by default.
         */
        pm = page_address(pages);
        memset(pm, 0xff, PAGE_SIZE * (1 << order));

        return pm;
}

static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);

        if (intercept == svm->lbr_msrs_intercepted)
                return;

        svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
        svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
        svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
        svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);

        if (sev_es_guest(vcpu->kvm))
                svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);

        svm->lbr_msrs_intercepted = intercept;
}

void svm_vcpu_free_msrpm(void *msrpm)
{
        __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}

static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
{
        bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu);
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        int i;

        if (!enable_mediated_pmu)
                return;

        /* Legacy counters are always available for AMD CPUs with a PMU. */
        for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++)
                svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i,
                                          MSR_TYPE_RW, intercept);

        intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE);
        for (i = 0; i < pmu->nr_arch_gp_counters; i++)
                svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i,
                                          MSR_TYPE_RW, intercept);

        for ( ; i < kvm_pmu_cap.num_counters_gp; i++)
                svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i,
                                             MSR_TYPE_RW);

        intercept = kvm_need_perf_global_ctrl_intercept(vcpu);
        svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
                                  MSR_TYPE_RW, intercept);
        svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
                                  MSR_TYPE_RW, intercept);
        svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
                                  MSR_TYPE_RW, intercept);
        svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
                                  MSR_TYPE_RW, intercept);
}

static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
        svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);

#ifdef CONFIG_X86_64
        svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
        svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
        svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
        svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
        svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
        svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
#endif

        if (lbrv)
                svm_recalc_lbr_msr_intercepts(vcpu);

        if (cpu_feature_enabled(X86_FEATURE_IBPB))
                svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
                                          !guest_has_pred_cmd_msr(vcpu));

        if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
                svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
                                          !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

        /*
         * Disable interception of SPEC_CTRL if KVM doesn't need to manually
         * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
         * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
         * using SPEC_CTRL.
         */
        if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
                svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
                                          !guest_has_spec_ctrl_msr(vcpu));
        else
                svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
                                          !svm->spec_ctrl);

        /*
         * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
         * as AMD hardware only stores 32 bits, whereas Intel CPUs track 64 bits.
         */
        svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
                                  guest_cpuid_is_intel_compatible(vcpu));
        svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
                                  guest_cpuid_is_intel_compatible(vcpu));

        if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
                svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
                svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
        }

        if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
                bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);

                svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled);
                svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled);
                svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled);
                svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled);
                svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled);
                svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled);
        }

        if (sev_es_guest(vcpu->kvm))
                sev_es_recalc_msr_intercepts(vcpu);

        svm_recalc_pmu_msr_intercepts(vcpu);

        /*
         * x2APIC intercepts are modified on-demand and cannot be filtered by
         * userspace.
         */
}

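/*
 * Copy the LBR related save area fields (DBGCTL and the last branch/last
 * exception records) from one VMCB to another, e.g. across nested
 * transitions between vmcb01 and vmcb02.
 */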
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
        to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
        to_vmcb->save.br_from           = from_vmcb->save.br_from;
        to_vmcb->save.br_to             = from_vmcb->save.br_to;
        to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
        to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;

        vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}

static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
        to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
}

void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
        __svm_enable_lbrv(vcpu);
        svm_recalc_lbr_msr_intercepts(vcpu);
}

static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
        KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
        to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
}

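/*
 * Enable or disable LBR virtualization to match the guest's DEBUGCTL.LBR
 * setting (or, for a nested guest with LBRV, L1's LBR_CTL control), and
 * keep the LBR MSR intercepts in sync.
 */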
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
        bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
                            (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
                            (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));

        if (enable_lbrv && !current_enable_lbrv)
                __svm_enable_lbrv(vcpu);
        else if (!enable_lbrv && current_enable_lbrv)
                __svm_disable_lbrv(vcpu);

        /*
         * During nested transitions, it is possible that the current VMCB has
         * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
         * In this case, even though LBR_CTL does not need an update, intercepts
         * do, so always recalculate the intercepts here.
         */
        svm_recalc_lbr_msr_intercepts(vcpu);
}

void disable_nmi_singlestep(struct vcpu_svm *svm)
{
        svm->nmi_singlestep = false;

        if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
                /* Clear our flags if they were not set by the guest */
                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
                        svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
                        svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
        }
}

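/*
 * Adaptively grow the PAUSE filter count (bounded by pause_filter_count_max)
 * so a vCPU that keeps hitting PAUSE exits spins longer before the next exit;
 * shrink_ple_window() below does the inverse.
 */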
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;

        if (kvm_pause_in_guest(vcpu->kvm))
                return;

        control->pause_filter_count = __grow_ple_window(old,
                                                        pause_filter_count,
                                                        pause_filter_count_grow,
                                                        pause_filter_count_max);

        if (control->pause_filter_count != old) {
                vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
                trace_kvm_ple_window_update(vcpu->vcpu_id,
                                            control->pause_filter_count, old);
        }
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;

        if (kvm_pause_in_guest(vcpu->kvm))
                return;

        control->pause_filter_count =
                                __shrink_ple_window(old,
                                                    pause_filter_count,
                                                    pause_filter_count_shrink,
                                                    pause_filter_count);
        if (control->pause_filter_count != old) {
                vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
                trace_kvm_ple_window_update(vcpu->vcpu_id,
                                            control->pause_filter_count, old);
        }
}

static void svm_hardware_unsetup(void)
{
        int cpu;

        avic_hardware_unsetup();

        sev_hardware_unsetup();

        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);

        __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
        iopm_base = 0;
}

static void init_seg(struct vmcb_seg *seg)
{
        seg->selector = 0;
        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
        seg->limit = 0xffff;
        seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
        seg->selector = 0;
        seg->attrib = SVM_SELECTOR_P_MASK | type;
        seg->limit = 0xffff;
        seg->base = 0;
}

static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        return svm->nested.ctl.tsc_offset;
}

static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        return svm->tsc_ratio_msr;
}

static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
        svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}

void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
        preempt_disable();
        if (to_svm(vcpu)->guest_state_loaded)
                __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
        preempt_enable();
}

/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * Intercept INVPCID if shadow paging is enabled to sync/free shadow
         * roots, or if INVPCID is disabled in the guest to inject #UD.
         */
        if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
                if (!npt_enabled ||
                    !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
                        svm_set_intercept(svm, INTERCEPT_INVPCID);
                else
                        svm_clr_intercept(svm, INTERCEPT_INVPCID);
        }

        if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
                if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
                        svm_clr_intercept(svm, INTERCEPT_RDTSCP);
                else
                        svm_set_intercept(svm, INTERCEPT_RDTSCP);
        }

        /*
         * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here; it is
         * always set if vls is enabled, and if the intercepts are set the
         * bit is meaningless anyway.
         */
        if (guest_cpuid_is_intel_compatible(vcpu)) {
                svm_set_intercept(svm, INTERCEPT_VMLOAD);
                svm_set_intercept(svm, INTERCEPT_VMSAVE);
        } else {
                /*
                 * If hardware supports Virtual VMLOAD VMSAVE then enable it
                 * in VMCB and clear intercepts to avoid #VMEXIT.
                 */
                if (vls) {
                        svm_clr_intercept(svm, INTERCEPT_VMLOAD);
                        svm_clr_intercept(svm, INTERCEPT_VMSAVE);
                }
        }

        if (kvm_need_rdpmc_intercept(vcpu))
                svm_set_intercept(svm, INTERCEPT_RDPMC);
        else
                svm_clr_intercept(svm, INTERCEPT_RDPMC);
}

static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
{
        svm_recalc_instruction_intercepts(vcpu);
        svm_recalc_msr_intercepts(vcpu);
}

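/*
 * Establish the default vmcb01 state for a vCPU: the baseline instruction,
 * CR, DR and exception intercepts, reset-time segment and descriptor table
 * state, and the optional features negotiated at module load (NPT, AVIC,
 * vNMI, vGIF, virtual VMLOAD/VMSAVE, PAUSE filtering, ...).
 */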
static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb01.ptr;
        struct vmcb_control_area *control = &vmcb->control;
        struct vmcb_save_area *save = &vmcb->save;

        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
        svm_set_intercept(svm, INTERCEPT_CR4_READ);
        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
        svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

        set_dr_intercepts(svm);

        set_exception_intercept(svm, PF_VECTOR);
        set_exception_intercept(svm, UD_VECTOR);
        set_exception_intercept(svm, MC_VECTOR);
        set_exception_intercept(svm, AC_VECTOR);
        set_exception_intercept(svm, DB_VECTOR);
        /*
         * Guest access to VMware backdoor ports could legitimately
         * trigger #GP because of TSS I/O permission bitmap.
         * We intercept those #GP and allow access to them anyway
         * as VMware does.
         */
        if (enable_vmware_backdoor)
                set_exception_intercept(svm, GP_VECTOR);

        svm_set_intercept(svm, INTERCEPT_INTR);
        svm_set_intercept(svm, INTERCEPT_NMI);

        if (intercept_smi)
                svm_set_intercept(svm, INTERCEPT_SMI);

        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
        svm_set_intercept(svm, INTERCEPT_RDPMC);
        svm_set_intercept(svm, INTERCEPT_CPUID);
        svm_set_intercept(svm, INTERCEPT_INVD);
        svm_set_intercept(svm, INTERCEPT_INVLPG);
        svm_set_intercept(svm, INTERCEPT_INVLPGA);
        svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
        svm_set_intercept(svm, INTERCEPT_MSR_PROT);
        svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
        svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
        svm_set_intercept(svm, INTERCEPT_VMRUN);
        svm_set_intercept(svm, INTERCEPT_VMMCALL);
        svm_set_intercept(svm, INTERCEPT_VMLOAD);
        svm_set_intercept(svm, INTERCEPT_VMSAVE);
        svm_set_intercept(svm, INTERCEPT_STGI);
        svm_set_intercept(svm, INTERCEPT_CLGI);
        svm_set_intercept(svm, INTERCEPT_SKINIT);
        svm_set_intercept(svm, INTERCEPT_WBINVD);
        svm_set_intercept(svm, INTERCEPT_XSETBV);
        svm_set_intercept(svm, INTERCEPT_RDPRU);
        svm_set_intercept(svm, INTERCEPT_RSM);

        if (!kvm_mwait_in_guest(vcpu->kvm)) {
                svm_set_intercept(svm, INTERCEPT_MONITOR);
                svm_set_intercept(svm, INTERCEPT_MWAIT);
        }

        if (!kvm_hlt_in_guest(vcpu->kvm)) {
                if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
                        svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
                else
                        svm_set_intercept(svm, INTERCEPT_HLT);
        }

        control->iopm_base_pa = iopm_base;
        control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
        control->int_ctl = V_INTR_MASKING_MASK;

        init_seg(&save->es);
        init_seg(&save->ss);
        init_seg(&save->ds);
        init_seg(&save->fs);
        init_seg(&save->gs);

        save->cs.selector = 0xf000;
        save->cs.base = 0xffff0000;
        /* Executable/Readable Code Segment */
        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
        save->cs.limit = 0xffff;

        save->gdtr.base = 0;
        save->gdtr.limit = 0xffff;
        save->idtr.base = 0;
        save->idtr.limit = 0xffff;

        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
                svm_clr_intercept(svm, INTERCEPT_INVLPG);
                clr_exception_intercept(svm, PF_VECTOR);
                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
        }
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;

        svm->nested.vmcb12_gpa = INVALID_GPA;
        svm->nested.last_vmcb12_gpa = INVALID_GPA;

        if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
                if (pause_filter_thresh)
                        control->pause_filter_thresh = pause_filter_thresh;
                svm_set_intercept(svm, INTERCEPT_PAUSE);
        } else {
                svm_clr_intercept(svm, INTERCEPT_PAUSE);
        }

        if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
                svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;

        if (enable_apicv && irqchip_in_kernel(vcpu->kvm))
                avic_init_vmcb(svm, vmcb);

        if (vnmi)
                svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;

        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
                svm_clr_intercept(svm, INTERCEPT_CLGI);
                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
        }

        if (vls)
                svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

        if (vcpu->kvm->arch.bus_lock_detection_enabled)
                svm_set_intercept(svm, INTERCEPT_BUSLOCK);

        if (sev_guest(vcpu->kvm))
                sev_init_vmcb(svm, init_event);

        svm_hv_init_vmcb(vmcb);

        kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);

        vmcb_mark_all_dirty(vmcb);

        enable_gif(svm);
}

static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm_init_osvw(vcpu);

        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
                vcpu->arch.microcode_version = 0x01000065;
        svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;

        svm->nmi_masked = false;
        svm->awaiting_iret_completion = false;
}

static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm->spec_ctrl = 0;
        svm->virt_spec_ctrl = 0;

        init_vmcb(vcpu, init_event);

        if (!init_event)
                __svm_vcpu_reset(vcpu);
}

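/* Make @target_vmcb (vmcb01 or the nested vmcb02) the vCPU's active VMCB. */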
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
{
        svm->current_vmcb = target_vmcb;
        svm->vmcb = target_vmcb->ptr;
}

static int svm_vcpu_precreate(struct kvm *kvm)
{
        return avic_alloc_physical_id_table(kvm);
}

static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm;
        struct page *vmcb01_page;
        int err;

        BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
        svm = to_svm(vcpu);

        err = -ENOMEM;
        vmcb01_page = snp_safe_alloc_page();
        if (!vmcb01_page)
                goto out;

        err = sev_vcpu_create(vcpu);
        if (err)
                goto error_free_vmcb_page;

        err = avic_init_vcpu(svm);
        if (err)
                goto error_free_sev;

        svm->msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->msrpm) {
                err = -ENOMEM;
                goto error_free_sev;
        }

        svm->x2avic_msrs_intercepted = true;
        svm->lbr_msrs_intercepted = true;

        svm->vmcb01.ptr = page_address(vmcb01_page);
        svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
        svm_switch_vmcb(svm, &svm->vmcb01);

        svm->guest_state_loaded = false;

        return 0;

error_free_sev:
        sev_free_vcpu(vcpu);
error_free_vmcb_page:
        __free_page(vmcb01_page);
out:
        return err;
}

static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        WARN_ON_ONCE(!list_empty(&svm->ir_list));

        svm_leave_nested(vcpu);
        svm_free_nested(svm);

        sev_free_vcpu(vcpu);

        __free_page(__sme_pa_to_page(svm->vmcb01.pa));
        svm_vcpu_free_msrpm(svm->msrpm);
}

#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_SPINLOCK(srso_lock);
static atomic_t srso_nr_vms;

static void svm_srso_clear_bp_spec_reduce(void *ign)
{
        struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);

        if (!sd->bp_spec_reduce_set)
                return;

        msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
        sd->bp_spec_reduce_set = false;
}

static void svm_srso_vm_destroy(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                return;

        if (atomic_dec_return(&srso_nr_vms))
                return;

        guard(spinlock)(&srso_lock);

        /*
         * Re-check under the lock, as a new VM may have come along, acquired
         * the lock, and incremented the count before this task acquired it.
         */
        if (atomic_read(&srso_nr_vms))
                return;

        on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
}

static void svm_srso_vm_init(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
                return;

        /*
         * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
         * transition, i.e. destroying the last VM, is fully complete, e.g. so
         * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
         */
        if (atomic_inc_not_zero(&srso_nr_vms))
                return;

        guard(spinlock)(&srso_lock);

        atomic_inc(&srso_nr_vms);
}
#else
static void svm_srso_vm_init(void) { }
static void svm_srso_vm_destroy(void) { }
#endif

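/*
 * Lazily switch to guest state: save host state that will be restored by
 * VMLOAD (or by hardware on SEV-ES VMEXIT), program the guest's TSC scaling
 * ratio, queue TSC_AUX for restoration via the user-return MSR machinery,
 * and set the SRSO BP_SPEC_REDUCE bit on this CPU if not already done.
 */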
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);

        if (sev_es_guest(vcpu->kvm))
                sev_es_unmap_ghcb(svm);

        if (svm->guest_state_loaded)
                return;

        /*
         * Save additional host state that will be restored on VMEXIT (sev-es)
         * or subsequent vmload of host save area.
         */
        vmsave(sd->save_area_pa);
        if (sev_es_guest(vcpu->kvm))
                sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));

        if (tsc_scaling)
                __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);

        /*
         * TSC_AUX is always virtualized (context switched by hardware) for
         * SEV-ES guests when the feature is available.  For non-SEV-ES guests,
         * context switch TSC_AUX via the user_return MSR infrastructure (not
         * all CPUs support TSC_AUX virtualization).
         */
        if (likely(tsc_aux_uret_slot >= 0) &&
            (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
                kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

        if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
            !sd->bp_spec_reduce_set) {
                sd->bp_spec_reduce_set = true;
                msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
        }
        svm->guest_state_loaded = true;
}

static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
        to_svm(vcpu)->guest_state_loaded = false;
}

static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
                shrink_ple_window(vcpu);

        if (kvm_vcpu_apicv_active(vcpu))
                avic_vcpu_load(vcpu, cpu);
}

static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
        if (kvm_vcpu_apicv_active(vcpu))
                avic_vcpu_put(vcpu);

        svm_prepare_host_switch(vcpu);

        ++vcpu->stat.host_state_reload;
}

static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long rflags = svm->vmcb->save.rflags;

        if (svm->nmi_singlestep) {
                /* Hide our flags if they were not set by the guest */
                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
                        rflags &= ~X86_EFLAGS_TF;
                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
                        rflags &= ~X86_EFLAGS_RF;
        }
        return rflags;
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
        if (to_svm(vcpu)->nmi_singlestep)
                rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

        /*
         * Any change of EFLAGS.VM is accompanied by a reload of SS
         * (caused by either a task switch or an inter-privilege IRET),
         * so we do not need to update the CPL here.
         */
        to_svm(vcpu)->vmcb->save.rflags = rflags;
}

static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
{
        struct vmcb *vmcb = to_svm(vcpu)->vmcb;

        return sev_es_guest(vcpu->kvm)
                ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
                : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
}

static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
        kvm_register_mark_available(vcpu, reg);

        switch (reg) {
        case VCPU_EXREG_PDPTR:
                /*
                 * When !npt_enabled, mmu->pdptrs[] is already available since
                 * it is always updated per SDM when moving to CRs.
                 */
                if (npt_enabled)
                        load_pdptrs(vcpu, kvm_read_cr3(vcpu));
                break;
        default:
                KVM_BUG_ON(1, vcpu->kvm);
        }
}

static void svm_set_vintr(struct vcpu_svm *svm)
{
        struct vmcb_control_area *control;

        /*
         * The following fields are ignored when AVIC is enabled
         */
        WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));

        svm_set_intercept(svm, INTERCEPT_VINTR);

        /*
         * Recalculating intercepts may have cleared the VINTR intercept.  If
         * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
         * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
         * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
         * interrupts will never be unblocked while L2 is running.
         */
        if (!svm_is_intercept(svm, INTERCEPT_VINTR))
                return;

        /*
         * This is just a dummy VINTR to actually cause a vmexit to happen.
         * Actual injection of virtual interrupts happens through EVENTINJ.
         */
        control = &svm->vmcb->control;
        control->int_vector = 0x0;
        control->int_ctl &= ~V_INTR_PRIO_MASK;
        control->int_ctl |= V_IRQ_MASK |
                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static void svm_clear_vintr(struct vcpu_svm *svm)
{
        svm_clr_intercept(svm, INTERCEPT_VINTR);

        /* Drop int_ctl fields related to VINTR injection.  */
        svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
        if (is_guest_mode(&svm->vcpu)) {
                svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
                        (svm->nested.ctl.int_ctl & V_TPR_MASK));

                svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
                        V_IRQ_INJECTION_BITS_MASK;

                svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
        }

        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

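/*
 * Return the VMCB segment register for @seg.  FS, GS, TR and LDTR are always
 * read from vmcb01, as they are context switched via VMLOAD/VMSAVE; the
 * remaining segments come from the currently active VMCB.
 */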
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
        struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;

        switch (seg) {
        case VCPU_SREG_CS: return &save->cs;
        case VCPU_SREG_DS: return &save->ds;
        case VCPU_SREG_ES: return &save->es;
        case VCPU_SREG_FS: return &save01->fs;
        case VCPU_SREG_GS: return &save01->gs;
        case VCPU_SREG_SS: return &save->ss;
        case VCPU_SREG_TR: return &save01->tr;
        case VCPU_SREG_LDTR: return &save01->ldtr;
        }
        BUG();
        return NULL;
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
        struct vmcb_seg *s = svm_seg(vcpu, seg);

        return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg)
{
        struct vmcb_seg *s = svm_seg(vcpu, seg);

        var->base = s->base;
        var->limit = s->limit;
        var->selector = s->selector;
        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;

        /*
         * AMD CPUs circa 2014 track the G bit for all segments except CS.
         * However, the SVM spec states that the G bit is not observed by the
         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
         * So let's synthesize a legal G bit for all segments; this helps
         * running KVM nested.  It also helps cross-vendor migration, because
         * Intel's vmentry has a check on the 'G' bit.
         */
        var->g = s->limit > 0xfffff;

        /*
         * AMD's VMCB does not have an explicit unusable field, so emulate it
         * for cross-vendor migration purposes by treating "not present" as
         * unusable.
         */
        var->unusable = !var->present;

        switch (seg) {
        case VCPU_SREG_TR:
                /*
                 * Work around a bug where the busy flag in the tr selector
                 * isn't exposed
                 */
                var->type |= 0x2;
                break;
        case VCPU_SREG_DS:
        case VCPU_SREG_ES:
        case VCPU_SREG_FS:
        case VCPU_SREG_GS:
                /*
                 * The accessed bit must always be set in the segment
                 * descriptor cache: even if it is cleared in the descriptor
                 * itself, the cached bit remains 1.  Since Intel checks this
                 * on VM-entry, set it here to support cross-vendor migration.
                 */
                if (!var->unusable)
                        var->type |= 0x1;
                break;
        case VCPU_SREG_SS:
                /*
                 * On AMD CPUs sometimes the DB bit in the segment
                 * descriptor is left as 1, although the whole segment has
                 * been made unusable. Clear it here to pass an Intel VMX
                 * entry check when cross vendor migrating.
                 */
                if (var->unusable)
                        var->db = 0;
                /* This is symmetric with svm_set_segment() */
                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
                break;
        }
}

static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

        return save->cpl;
}

static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
        struct kvm_segment cs;

        svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
        *db = cs.db;
        *l = cs.l;
}

static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        dt->size = svm->vmcb->save.idtr.limit;
        dt->address = svm->vmcb->save.idtr.base;
}

static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm->vmcb->save.idtr.limit = dt->size;
        svm->vmcb->save.idtr.base = dt->address;
        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        dt->size = svm->vmcb->save.gdtr.limit;
        dt->address = svm->vmcb->save.gdtr.base;
}

static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm->vmcb->save.gdtr.limit = dt->size;
        svm->vmcb->save.gdtr.base = dt->address;
        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * For guests that don't set guest_state_protected, the cr3 update is
         * handled via kvm_mmu_load() while entering the guest. For guests
         * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
         * VMCB save area now, since the save area will become the initial
         * contents of the VMSA, and future VMCB save area updates won't be
         * seen.
         */
        if (sev_es_guest(vcpu->kvm)) {
                svm->vmcb->save.cr3 = cr3;
                vmcb_mark_dirty(svm->vmcb, VMCB_CR);
        }
}

static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        return true;
}

void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 hcr0 = cr0;
        bool old_paging = is_paging(vcpu);

#ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME) {
                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
                        vcpu->arch.efer |= EFER_LMA;
                        if (!vcpu->arch.guest_state_protected)
                                svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
                }

                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
                        vcpu->arch.efer &= ~EFER_LMA;
                        if (!vcpu->arch.guest_state_protected)
                                svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
                }
        }
#endif
        vcpu->arch.cr0 = cr0;

        if (!npt_enabled) {
                hcr0 |= X86_CR0_PG | X86_CR0_WP;
                if (old_paging != is_paging(vcpu))
                        svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
        }

        /*
         * Re-enable caching here because the QEMU BIOS does not do it;
         * otherwise there is some delay at reboot.
         */
        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
                hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

        svm->vmcb->save.cr0 = hcr0;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);

        /*
         * SEV-ES guests must always keep the CR intercepts cleared. CR
         * tracking is done using the CR write traps.
         */
        if (sev_es_guest(vcpu->kvm))
                return;

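        /*
         * If KVM shadows any CR0 bits (CD/NW cleared via the quirk above, or
         * PG/WP forced for shadow paging), the guest's CR0 differs from the
         * hardware value, so CR0 reads and writes must be intercepted for
         * the guest to observe its own value.
         */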
        if (hcr0 == cr0) {
                /* Selective CR0 write remains on.  */
                svm_clr_intercept(svm, INTERCEPT_CR0_READ);
                svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
        } else {
                svm_set_intercept(svm, INTERCEPT_CR0_READ);
                svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
        }
}

static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        return true;
}

void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
        unsigned long old_cr4 = vcpu->arch.cr4;

        vcpu->arch.cr4 = cr4;
        if (!npt_enabled) {
                cr4 |= X86_CR4_PAE;

                if (!is_paging(vcpu))
                        cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
        }
        cr4 |= host_cr4_mce;
        to_svm(vcpu)->vmcb->save.cr4 = cr4;
        vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);

        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                vcpu->arch.cpuid_dynamic_bits_dirty = true;
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_seg *s = svm_seg(vcpu, seg);

        s->base = var->base;
        s->limit = var->limit;
        s->selector = var->selector;
        s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
        s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
        s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
        s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
        s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
        s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
        s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
        s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

        /*
         * This is always accurate, except if SYSRET returned to a segment
         * with SS.DPL != 3.  Intel does not have this quirk, and always
         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
         * would entail passing the CPL to userspace and back.
         */
        if (seg == VCPU_SREG_SS)
                /* This is symmetric with svm_get_segment() */
                svm->vmcb->save.cpl = (var->dpl & 3);

        vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}

static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

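        /*
         * Intercept #BP only while userspace is debugging the guest with
         * software breakpoints; otherwise let the guest handle its own #BP
         * exceptions.
         */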
        clr_exception_intercept(svm, BP_VECTOR);

        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
                        set_exception_intercept(svm, BP_VECTOR);
        }
}

static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
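        /*
         * Hand out the next ASID; when the pool is exhausted, start a new
         * generation and flush all ASIDs so that stale translations from
         * guests that previously used these ASIDs cannot be reused.
         */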
        if (sd->next_asid > sd->max_asid) {
                ++sd->asid_generation;
                sd->next_asid = sd->min_asid;
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }

        svm->current_vmcb->asid_generation = sd->asid_generation;
        svm->asid = sd->next_asid++;
}

static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
        struct vmcb *vmcb = to_svm(vcpu)->vmcb;

        if (vcpu->arch.guest_state_protected)
                return;

        if (unlikely(value != vmcb->save.dr6)) {
                vmcb->save.dr6 = value;
                vmcb_mark_dirty(vmcb, VMCB_DR);
        }
}

static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
                return;

        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        get_debugreg(vcpu->arch.db[3], 3);
        /*
         * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
         * because db_interception might need it.  We can do it before vmentry.
         */
        vcpu->arch.dr6 = svm->vmcb->save.dr6;
        vcpu->arch.dr7 = svm->vmcb->save.dr7;
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        set_dr_intercepts(svm);
}

static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (vcpu->arch.guest_state_protected)
                return;

        svm->vmcb->save.dr7 = value;
        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}

static int pf_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;

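        /*
         * With decode assists, hardware provides the bytes of the faulting
         * instruction, sparing KVM a guest memory fetch if the fault needs
         * to be emulated.
         */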
        return kvm_handle_page_fault(vcpu, error_code, fault_address,
                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                        svm->vmcb->control.insn_bytes : NULL,
                        svm->vmcb->control.insn_len);
}

static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
                                         void *insn, int insn_len);

static int npf_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int rc;

        u64 error_code = svm->vmcb->control.exit_info_1;
        gpa_t gpa = svm->vmcb->control.exit_info_2;

        /*
         * WARN if hardware generates a fault with an error code that collides
         * with KVM-defined synthetic flags.  Clear the flags and continue on,
         * i.e. don't terminate the VM, as KVM can't possibly be relying on a
         * flag that KVM doesn't know about.
         */
        if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
                error_code &= ~PFERR_SYNTHETIC_MASK;

        /*
         * Expedite fast MMIO kicks if the next RIP is known and KVM is
         * allowed to emulate in response to the page fault, e.g. skipping
         * the current instruction is wrong if the #NPF occurred while
         * vectoring an event.
         */
        if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) {
                const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE;

                if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0))
                        return 1;

                if (nrips && svm->vmcb->control.next_rip &&
                    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                        trace_kvm_fast_mmio(gpa);
                        return kvm_skip_emulated_instruction(vcpu);
                }
        }

        if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
                error_code |= PFERR_PRIVATE_ACCESS;

        trace_kvm_page_fault(vcpu, gpa, error_code);
        rc = kvm_mmu_page_fault(vcpu, gpa, error_code,
                                static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
                                svm->vmcb->control.insn_bytes : NULL,
                                svm->vmcb->control.insn_len);

        if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
                sev_handle_rmp_fault(vcpu, gpa, error_code);

        return rc;
}

static int db_interception(struct kvm_vcpu *vcpu)
{
        struct kvm_run *kvm_run = vcpu->run;
        struct vcpu_svm *svm = to_svm(vcpu);

        if (!(vcpu->guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
                !svm->nmi_singlestep) {
                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
                kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
                return 1;
        }

        if (svm->nmi_singlestep) {
                disable_nmi_singlestep(svm);
                /* Make sure we check for pending NMIs upon entry */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }

        if (vcpu->guest_debug &
            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
                kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
                kvm_run->debug.arch.pc =
                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
                kvm_run->debug.arch.exception = DB_VECTOR;
                return 0;
        }

        return 1;
}

static int bp_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_run *kvm_run = vcpu->run;

        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
        kvm_run->debug.arch.exception = BP_VECTOR;
        return 0;
}

static int ud_interception(struct kvm_vcpu *vcpu)
{
        return handle_ud(vcpu);
}

static int ac_interception(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
        return 1;
}

static bool is_erratum_383(void)
{
        int i;
        u64 value;

        if (!erratum_383_found)
                return false;

        if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value))
                return false;

        /* Bit 62 may or may not be set for this mce */
        value &= ~(1ULL << 62);

        if (value != 0xb600000000010015ULL)
                return false;

        /* Clear MCi_STATUS registers */
        for (i = 0; i < 6; ++i)
                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0);

        if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) {
                value &= ~(1ULL << 2);
                native_write_msr_safe(MSR_IA32_MCG_STATUS, value);
        }

        /* Flush tlb to evict multi-match entries */
        __flush_tlb_all();

        return true;
}

static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
        if (is_erratum_383()) {
                /*
                 * Erratum 383 triggered. Guest state is corrupt so kill the
                 * guest.
                 */
                pr_err("Guest triggered AMD Erratum 383\n");

                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

                return;
        }

        /*
         * On an #MC intercept the MCE handler is not called automatically in
         * the host. So do it by hand here.
         */
        kvm_machine_check();
}

static int mc_interception(struct kvm_vcpu *vcpu)
{
        return 1;
}

static int shutdown_interception(struct kvm_vcpu *vcpu)
{
        struct kvm_run *kvm_run = vcpu->run;
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
         * the VMCB in a known good state.  Unfortunately, KVM doesn't have
         * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
         * userspace.  From a platform perspective, INIT is acceptable
         * behavior as there exist bare metal platforms that automatically
         * INIT the CPU in response to shutdown.
         *
         * The VM save area for SEV-ES guests has already been encrypted so it
         * cannot be reinitialized, i.e. synthesizing INIT is futile.
         */
        if (!sev_es_guest(vcpu->kvm)) {
                clear_page(svm->vmcb);
#ifdef CONFIG_KVM_SMM
                if (is_smm(vcpu))
                        kvm_smm_changed(vcpu, false);
#endif
                kvm_vcpu_reset(vcpu, true);
        }

        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
}

static int io_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
        unsigned port;

        ++vcpu->stat.io_exits;
        string = (io_info & SVM_IOIO_STR_MASK) != 0;
        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
        port = io_info >> 16;
        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;

        if (string) {
                if (sev_es_guest(vcpu->kvm))
                        return sev_es_string_io(svm, size, port, in);
                else
                        return kvm_emulate_instruction(vcpu, 0);
        }

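        /*
         * For IN/OUT, EXITINFO2 holds the RIP of the instruction following
         * the IO instruction; use it to skip the instruction once the fast
         * PIO access completes.
         */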
        svm->next_rip = svm->vmcb->control.exit_info_2;

        return kvm_fast_pio(vcpu, size, port, in);
}

static int nmi_interception(struct kvm_vcpu *vcpu)
{
        return 1;
}

static int smi_interception(struct kvm_vcpu *vcpu)
{
        return 1;
}

static int intr_interception(struct kvm_vcpu *vcpu)
{
        ++vcpu->stat.irq_exits;
        return 1;
}

static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb12;
        struct kvm_host_map map;
        int ret;

        if (nested_svm_check_permissions(vcpu))
                return 1;

        ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
        if (ret) {
                if (ret == -EINVAL)
                        kvm_inject_gp(vcpu, 0);
                return 1;
        }

        vmcb12 = map.hva;

        ret = kvm_skip_emulated_instruction(vcpu);

        /* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */
        if (vmload) {
                svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12);
                svm->sysenter_eip_hi = 0;
                svm->sysenter_esp_hi = 0;
        } else {
                svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr);
        }

        kvm_vcpu_unmap(vcpu, &map);

        return ret;
}

static int vmload_interception(struct kvm_vcpu *vcpu)
{
        return vmload_vmsave_interception(vcpu, true);
}

static int vmsave_interception(struct kvm_vcpu *vcpu)
{
        return vmload_vmsave_interception(vcpu, false);
}

static int vmrun_interception(struct kvm_vcpu *vcpu)
{
        if (nested_svm_check_permissions(vcpu))
                return 1;

        return nested_svm_vmrun(vcpu);
}

enum {
        NONE_SVM_INSTR,
        SVM_INSTR_VMRUN,
        SVM_INSTR_VMLOAD,
        SVM_INSTR_VMSAVE,
};

/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
static int svm_instr_opcode(struct kvm_vcpu *vcpu)
{
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;

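        /*
         * VMRUN, VMLOAD and VMSAVE all share the two-byte opcode 0F 01, with
         * the ModRM byte selecting the actual instruction.
         */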
        if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
                return NONE_SVM_INSTR;

        switch (ctxt->modrm) {
        case 0xd8: /* VMRUN */
                return SVM_INSTR_VMRUN;
        case 0xda: /* VMLOAD */
                return SVM_INSTR_VMLOAD;
        case 0xdb: /* VMSAVE */
                return SVM_INSTR_VMSAVE;
        default:
                break;
        }

        return NONE_SVM_INSTR;
}

static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
{
        const int guest_mode_exit_codes[] = {
                [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
        };
        int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
                [SVM_INSTR_VMRUN] = vmrun_interception,
                [SVM_INSTR_VMLOAD] = vmload_interception,
                [SVM_INSTR_VMSAVE] = vmsave_interception,
        };
        struct vcpu_svm *svm = to_svm(vcpu);
        int ret;

        if (is_guest_mode(vcpu)) {
                /* Returns '1' or -errno on failure, '0' on success. */
                ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
                if (ret)
                        return ret;
                return 1;
        }
        return svm_instr_handlers[opcode](vcpu);
}

/*
 * #GP handling code. Note that #GP can be triggered in the following two
 * cases:
 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
 *      some AMD CPUs when the EAX operand of these instructions points into
 *      a reserved memory region (e.g. SMM memory on the host).
 *   2) VMware backdoor
 */
static int gp_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 error_code = svm->vmcb->control.exit_info_1;
        int opcode;

        /* Both #GP cases have zero error_code */
        if (error_code)
                goto reinject;

        /* Decode the instruction for usage later */
        if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
                goto reinject;

        opcode = svm_instr_opcode(vcpu);

        if (opcode == NONE_SVM_INSTR) {
                if (!enable_vmware_backdoor)
                        goto reinject;

                /*
                 * VMware backdoor emulation on #GP interception only handles
                 * IN{S}, OUT{S}, and RDPMC.
                 */
                if (!is_guest_mode(vcpu))
                        return kvm_emulate_instruction(vcpu,
                                EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
        } else {
                /* All SVM instructions expect page aligned RAX */
                if (svm->vmcb->save.rax & ~PAGE_MASK)
                        goto reinject;

                return emulate_svm_instr(vcpu, opcode);
        }

reinject:
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
        return 1;
}

void svm_set_gif(struct vcpu_svm *svm, bool value)
{
        if (value) {
                /*
                 * If VGIF is enabled, the STGI intercept is only added to
                 * detect the opening of the SMI/NMI window; remove it now.
                 * Likewise, clear the VINTR intercept; we will set it
                 * again while processing KVM_REQ_EVENT if needed.
                 */
                if (vgif)
                        svm_clr_intercept(svm, INTERCEPT_STGI);
                if (svm_is_intercept(svm, INTERCEPT_VINTR))
                        svm_clear_vintr(svm);

                enable_gif(svm);
                if (svm->vcpu.arch.smi_pending ||
                    svm->vcpu.arch.nmi_pending ||
                    kvm_cpu_has_injectable_intr(&svm->vcpu) ||
                    kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
                        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
        } else {
                disable_gif(svm);

                /*
                 * After a CLGI no interrupts should come.  But if vGIF is
                 * in use, we still rely on the VINTR intercept (rather than
                 * STGI) to detect an open interrupt window.
                 */
                if (!vgif)
                        svm_clear_vintr(svm);
        }
}

static int stgi_interception(struct kvm_vcpu *vcpu)
{
        int ret;

        if (nested_svm_check_permissions(vcpu))
                return 1;

        ret = kvm_skip_emulated_instruction(vcpu);
        svm_set_gif(to_svm(vcpu), true);
        return ret;
}

static int clgi_interception(struct kvm_vcpu *vcpu)
{
        int ret;

        if (nested_svm_check_permissions(vcpu))
                return 1;

        ret = kvm_skip_emulated_instruction(vcpu);
        svm_set_gif(to_svm(vcpu), false);
        return ret;
}

static int invlpga_interception(struct kvm_vcpu *vcpu)
{
        gva_t gva = kvm_rax_read(vcpu);
        u32 asid = kvm_rcx_read(vcpu);

        /* FIXME: Handle an address size prefix. */
        if (!is_long_mode(vcpu))
                gva = (u32)gva;

        trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);

        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
        kvm_mmu_invlpg(vcpu, gva);

        return kvm_skip_emulated_instruction(vcpu);
}

static int skinit_interception(struct kvm_vcpu *vcpu)
{
        trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));

        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}

static int task_switch_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u16 tss_selector;
        int reason;
        int int_type = svm->vmcb->control.exit_int_info &
                SVM_EXITINTINFO_TYPE_MASK;
        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
        uint32_t type =
                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
        uint32_t idt_v =
                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
        bool has_error_code = false;
        u32 error_code = 0;

        tss_selector = (u16)svm->vmcb->control.exit_info_1;

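        /*
         * EXITINFO2 encodes why the task switch happened: dedicated bits
         * flag an explicit IRET or far JMP; otherwise, if the switch was
         * triggered while an event was being delivered (exit_int_info
         * valid), it went through a task gate, else assume a CALL.
         */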
        if (svm->vmcb->control.exit_info_2 &
            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
                reason = TASK_SWITCH_IRET;
        else if (svm->vmcb->control.exit_info_2 &
                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
                reason = TASK_SWITCH_JMP;
        else if (idt_v)
                reason = TASK_SWITCH_GATE;
        else
                reason = TASK_SWITCH_CALL;

        if (reason == TASK_SWITCH_GATE) {
                switch (type) {
                case SVM_EXITINTINFO_TYPE_NMI:
                        vcpu->arch.nmi_injected = false;
                        break;
                case SVM_EXITINTINFO_TYPE_EXEPT:
                        if (svm->vmcb->control.exit_info_2 &
                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
                                has_error_code = true;
                                error_code =
                                        (u32)svm->vmcb->control.exit_info_2;
                        }
                        kvm_clear_exception_queue(vcpu);
                        break;
                case SVM_EXITINTINFO_TYPE_INTR:
                case SVM_EXITINTINFO_TYPE_SOFT:
                        kvm_clear_interrupt_queue(vcpu);
                        break;
                default:
                        break;
                }
        }

        if (reason != TASK_SWITCH_GATE ||
            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
                if (!svm_skip_emulated_instruction(vcpu))
                        return 0;
        }

        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
                int_vec = -1;

        return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
                               has_error_code, error_code);
}

static void svm_clr_iret_intercept(struct vcpu_svm *svm)
{
        if (!sev_es_guest(svm->vcpu.kvm))
                svm_clr_intercept(svm, INTERCEPT_IRET);
}

static void svm_set_iret_intercept(struct vcpu_svm *svm)
{
        if (!sev_es_guest(svm->vcpu.kvm))
                svm_set_intercept(svm, INTERCEPT_IRET);
}

static int iret_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        WARN_ON_ONCE(sev_es_guest(vcpu->kvm));

        ++vcpu->stat.nmi_window_exits;
        svm->awaiting_iret_completion = true;

        svm_clr_iret_intercept(svm);
        svm->nmi_iret_rip = kvm_rip_read(vcpu);

        kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 1;
}

static int invlpg_interception(struct kvm_vcpu *vcpu)
{
        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
                return kvm_emulate_instruction(vcpu, 0);

        kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
        return kvm_skip_emulated_instruction(vcpu);
}

static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_instruction(vcpu, 0);
}

static int rsm_interception(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}

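/*
 * Emulate L1's selective CR0 write intercept: if L2 changes any CR0 bits
 * outside SVM_CR0_SELECTIVE_MASK (i.e. bits other than TS and MP),
 * synthesize a SVM_EXIT_CR0_SEL_WRITE exit to L1 instead of having KVM
 * handle the write itself.
 */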
static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
                                            unsigned long val)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr0 = vcpu->arch.cr0;
        bool ret = false;

        if (!is_guest_mode(vcpu) ||
            (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
                return false;

        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
        val &= ~SVM_CR0_SELECTIVE_MASK;

        if (cr0 ^ val) {
                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
        }

        return ret;
}

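/*
 * With decode assists, bit 63 of EXITINFO1 indicates that the MOV CRx decode
 * information (the GPR number in the low bits) is valid.
 */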
#define CR_VALID (1ULL << 63)

static int cr_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int reg, cr;
        unsigned long val;
        int err;

        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
                return emulate_on_interception(vcpu);

        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
                return emulate_on_interception(vcpu);

        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
                cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
        else
                cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;

        err = 0;
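        /*
         * SVM numbers the WRITE_CRn exit codes exactly 16 above the matching
         * READ_CRn codes, so after normalizing against SVM_EXIT_READ_CR0 a
         * value >= 16 identifies a "mov to CRn".
         */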
        if (cr >= 16) { /* mov to cr */
                cr -= 16;
                val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
                        if (!check_selective_cr0_intercepted(vcpu, val))
                                err = kvm_set_cr0(vcpu, val);
                        else
                                return 1;

                        break;
                case 3:
                        err = kvm_set_cr3(vcpu, val);
                        break;
                case 4:
                        err = kvm_set_cr4(vcpu, val);
                        break;
                case 8:
                        err = kvm_set_cr8(vcpu, val);
                        break;
                default:
                        WARN(1, "unhandled write to CR%d", cr);
                        kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
        } else { /* mov from cr */
                switch (cr) {
                case 0:
                        val = kvm_read_cr0(vcpu);
                        break;
                case 2:
                        val = vcpu->arch.cr2;
                        break;
                case 3:
                        val = kvm_read_cr3(vcpu);
                        break;
                case 4:
                        val = kvm_read_cr4(vcpu);
                        break;
                case 8:
                        val = kvm_get_cr8(vcpu);
                        break;
                default:
                        WARN(1, "unhandled read from CR%d", cr);
                        kvm_queue_exception(vcpu, UD_VECTOR);
                        return 1;
                }
                kvm_register_write(vcpu, reg, val);
                trace_kvm_cr_read(cr, val);
        }
        return kvm_complete_insn_gp(vcpu, err);
}

static int cr_trap(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long old_value, new_value;
        unsigned int cr;
        int ret = 0;

        new_value = (unsigned long)svm->vmcb->control.exit_info_1;

        cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
        switch (cr) {
        case 0:
                old_value = kvm_read_cr0(vcpu);
                svm_set_cr0(vcpu, new_value);

                kvm_post_set_cr0(vcpu, old_value, new_value);
                break;
        case 4:
                old_value = kvm_read_cr4(vcpu);
                svm_set_cr4(vcpu, new_value);

                kvm_post_set_cr4(vcpu, old_value, new_value);
                break;
        case 8:
                ret = kvm_set_cr8(vcpu, new_value);
                break;
        default:
                WARN(1, "unhandled CR%d write trap", cr);
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }

        return kvm_complete_insn_gp(vcpu, ret);
}

static int dr_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int reg, dr;
        int err = 0;

        /*
         * SEV-ES intercepts DR7 only to disable guest debugging, and the
         * guest issues a VMGEXIT for DR7 writes only.  KVM cannot change DR7
         * (always swapped as type 'A'), so return early.
         */
        if (sev_es_guest(vcpu->kvm))
                return 1;

        if (vcpu->guest_debug == 0) {
                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                clr_dr_intercepts(svm);
                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }

        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
                return emulate_on_interception(vcpu);

        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
        if (dr >= 16) { /* mov to DRn  */
                dr -= 16;
                err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
        } else {
                kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
        }

        return kvm_complete_insn_gp(vcpu, err);
}

static int cr8_write_interception(struct kvm_vcpu *vcpu)
{
        u8 cr8_prev = kvm_get_cr8(vcpu);
        int r;

        WARN_ON_ONCE(kvm_vcpu_apicv_active(vcpu));

        /* instruction emulation calls kvm_set_cr8() */
        r = cr_interception(vcpu);
        if (lapic_in_kernel(vcpu))
                return r;
        if (cr8_prev <= kvm_get_cr8(vcpu))
                return r;
        vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
        return 0;
}

static int efer_trap(struct kvm_vcpu *vcpu)
{
        struct msr_data msr_info;
        int ret;

        /*
         * Clear the EFER_SVME bit from EFER. The SVM code always sets this
         * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
         * whether the guest has X86_FEATURE_SVM - this avoids a failure if
         * the guest doesn't have X86_FEATURE_SVM.
         */
        msr_info.host_initiated = false;
        msr_info.index = MSR_EFER;
        msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
        ret = kvm_set_msr_common(vcpu, &msr_info);

        return kvm_complete_insn_gp(vcpu, ret);
}

static int svm_get_feature_msr(u32 msr, u64 *data)
{
        *data = 0;

        switch (msr) {
        case MSR_AMD64_DE_CFG:
                if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
                        *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
                break;
        default:
                return KVM_MSR_RET_UNSUPPORTED;
        }

        return 0;
}

static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
                                      struct msr_data *msr_info)
{
        return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected &&
               msr_info->index != MSR_IA32_XSS &&
               !msr_write_intercepted(vcpu, msr_info->index);
}

static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (sev_es_prevent_msr_access(vcpu, msr_info)) {
                msr_info->data = 0;
                return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
        }

        switch (msr_info->index) {
        case MSR_AMD64_TSC_RATIO:
                if (!msr_info->host_initiated &&
                    !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
                        return 1;
                msr_info->data = svm->tsc_ratio_msr;
                break;
        case MSR_STAR:
                msr_info->data = svm->vmcb01.ptr->save.star;
                break;
#ifdef CONFIG_X86_64
        case MSR_LSTAR:
                msr_info->data = svm->vmcb01.ptr->save.lstar;
                break;
        case MSR_CSTAR:
                msr_info->data = svm->vmcb01.ptr->save.cstar;
                break;
        case MSR_GS_BASE:
                msr_info->data = svm->vmcb01.ptr->save.gs.base;
                break;
        case MSR_FS_BASE:
                msr_info->data = svm->vmcb01.ptr->save.fs.base;
                break;
        case MSR_KERNEL_GS_BASE:
                msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
                break;
        case MSR_SYSCALL_MASK:
                msr_info->data = svm->vmcb01.ptr->save.sfmask;
                break;
#endif
        case MSR_IA32_SYSENTER_CS:
                msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
                break;
        case MSR_IA32_SYSENTER_EIP:
                msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
                if (guest_cpuid_is_intel_compatible(vcpu))
                        msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
                break;
        case MSR_IA32_SYSENTER_ESP:
                msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
                if (guest_cpuid_is_intel_compatible(vcpu))
                        msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
                break;
        case MSR_IA32_S_CET:
                msr_info->data = svm->vmcb->save.s_cet;
                break;
        case MSR_IA32_INT_SSP_TAB:
                msr_info->data = svm->vmcb->save.isst_addr;
                break;
        case MSR_KVM_INTERNAL_GUEST_SSP:
                msr_info->data = svm->vmcb->save.ssp;
                break;
        case MSR_TSC_AUX:
                msr_info->data = svm->tsc_aux;
                break;
        case MSR_IA32_DEBUGCTLMSR:
                msr_info->data = svm->vmcb->save.dbgctl;
                break;
        case MSR_IA32_LASTBRANCHFROMIP:
                msr_info->data = svm->vmcb->save.br_from;
                break;
        case MSR_IA32_LASTBRANCHTOIP:
                msr_info->data = svm->vmcb->save.br_to;
                break;
        case MSR_IA32_LASTINTFROMIP:
                msr_info->data = svm->vmcb->save.last_excp_from;
                break;
        case MSR_IA32_LASTINTTOIP:
                msr_info->data = svm->vmcb->save.last_excp_to;
                break;
        case MSR_VM_HSAVE_PA:
                msr_info->data = svm->nested.hsave_msr;
                break;
        case MSR_VM_CR:
                msr_info->data = svm->nested.vm_cr_msr;
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;

                if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                        msr_info->data = svm->vmcb->save.spec_ctrl;
                else
                        msr_info->data = svm->spec_ctrl;
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
                        return 1;

                msr_info->data = svm->virt_spec_ctrl;
                break;
        case MSR_F15H_IC_CFG: {

                int family, model;

                family = guest_cpuid_family(vcpu);
                model  = guest_cpuid_model(vcpu);

                if (family < 0 || model < 0)
                        return kvm_get_msr_common(vcpu, msr_info);

                msr_info->data = 0;

                if (family == 0x15 &&
                    (model >= 0x2 && model < 0x20))
                        msr_info->data = 0x1E;
                }
                break;
        case MSR_AMD64_DE_CFG:
                msr_info->data = svm->msr_decfg;
                break;
        default:
                return kvm_get_msr_common(vcpu, msr_info);
        }
        return 0;
}

static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
        struct vcpu_svm *svm = to_svm(vcpu);
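
        /*
         * For SEV-ES guests the MSR access came in via VMGEXIT, so on
         * failure reflect a #GP back through the GHCB instead of queueing
         * the exception directly.
         */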
        if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
                return kvm_complete_insn_gp(vcpu, err);

        svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
        return 1;
}

static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int svm_dis, chg_mask;

        if (data & ~SVM_VM_CR_VALID_MASK)
                return 1;

        chg_mask = SVM_VM_CR_VALID_MASK;

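        /*
         * If SVM_DIS is already set, treat the LOCK and DIS bits as
         * read-only and drop them from the changeable mask.
         */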
        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);

        svm->nested.vm_cr_msr &= ~chg_mask;
        svm->nested.vm_cr_msr |= (data & chg_mask);

        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;

        /* check for svm_disable while efer.svme is set */
        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
                return 1;

        return 0;
}

static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int ret = 0;

        u32 ecx = msr->index;
        u64 data = msr->data;

        if (sev_es_prevent_msr_access(vcpu, msr))
                return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;

        switch (ecx) {
        case MSR_AMD64_TSC_RATIO:

                if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {

                        if (!msr->host_initiated)
                                return 1;
                        /*
                         * In case TSC scaling is not enabled, always
                         * leave this MSR at the default value.
                         *
                         * Due to a bug in QEMU 6.2.0, it tries to set this
                         * MSR to 0 if TSC scaling is not enabled.
                         * Ignore this value as well.
                         */
                        if (data != 0 && data != svm->tsc_ratio_msr)
                                return 1;
                        break;
                }

                if (data & SVM_TSC_RATIO_RSVD)
                        return 1;

                svm->tsc_ratio_msr = data;

                if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
                    is_guest_mode(vcpu))
                        nested_svm_update_tsc_ratio_msr(vcpu);

                break;
        case MSR_IA32_CR_PAT:
                ret = kvm_set_msr_common(vcpu, msr);
                if (ret)
                        break;

                svm->vmcb01.ptr->save.g_pat = data;
                if (is_guest_mode(vcpu))
                        nested_vmcb02_compute_g_pat(svm);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr->host_initiated &&
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;

                if (kvm_spec_ctrl_test_value(data))
                        return 1;

                if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                        svm->vmcb->save.spec_ctrl = data;
                else
                        svm->spec_ctrl = data;
                if (!data)
                        break;

                /*
                 * For non-nested:
                 * When it's written (to non-zero) for the first time, pass
                 * it through.
                 *
                 * For nested:
                 * The handling of the MSR bitmap for L2 guests is done in
                 * nested_svm_merge_msrpm().
                 * We update the L1 MSR bit as well since it will end up
                 * touching the MSR anyway now.
                 */
                svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
                break;
        case MSR_AMD64_VIRT_SPEC_CTRL:
                if (!msr->host_initiated &&
                    !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
                        return 1;

                if (data & ~SPEC_CTRL_SSBD)
                        return 1;

                svm->virt_spec_ctrl = data;
                break;
        case MSR_STAR:
                svm->vmcb01.ptr->save.star = data;
                break;
#ifdef CONFIG_X86_64
        case MSR_LSTAR:
                svm->vmcb01.ptr->save.lstar = data;
                break;
        case MSR_CSTAR:
                svm->vmcb01.ptr->save.cstar = data;
                break;
        case MSR_GS_BASE:
                svm->vmcb01.ptr->save.gs.base = data;
                break;
        case MSR_FS_BASE:
                svm->vmcb01.ptr->save.fs.base = data;
                break;
        case MSR_KERNEL_GS_BASE:
                svm->vmcb01.ptr->save.kernel_gs_base = data;
                break;
        case MSR_SYSCALL_MASK:
                svm->vmcb01.ptr->save.sfmask = data;
                break;
#endif
        case MSR_IA32_SYSENTER_CS:
                svm->vmcb01.ptr->save.sysenter_cs = data;
                break;
        case MSR_IA32_SYSENTER_EIP:
                svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
                /*
                 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
                 * when we spoof an Intel vendor ID (for cross-vendor
                 * migration).  In this case we use this intercept to track
                 * the high 32-bit part of these MSRs to support Intel's
                 * implementation of SYSENTER/SYSEXIT.
                 */
                svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_IA32_SYSENTER_ESP:
                svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
                svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
                break;
        case MSR_IA32_S_CET:
                svm->vmcb->save.s_cet = data;
                vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
                break;
        case MSR_IA32_INT_SSP_TAB:
                svm->vmcb->save.isst_addr = data;
                vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
                break;
        case MSR_KVM_INTERNAL_GUEST_SSP:
                svm->vmcb->save.ssp = data;
                vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
                break;
        case MSR_TSC_AUX:
                /*
                 * TSC_AUX is always virtualized for SEV-ES guests when the
                 * feature is available. The user return MSR support is not
                 * required in this case because TSC_AUX is restored on #VMEXIT
                 * from the host save area.
                 */
                if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
                        break;

                /*
                 * TSC_AUX is usually changed only during boot and never read
                 * directly.  Intercept TSC_AUX and switch it via user return.
                 */
                preempt_disable();
                ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
                preempt_enable();
                if (ret)
                        break;

                svm->tsc_aux = data;
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!lbrv) {
                        kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
                        break;
                }

                /*
                 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
                 * way to communicate lack of support to the guest.
                 */
                if (data & DEBUGCTLMSR_BTF) {
                        kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
                        data &= ~DEBUGCTLMSR_BTF;
                }

                if (data & DEBUGCTL_RESERVED_BITS)
                        return 1;

                if (svm->vmcb->save.dbgctl == data)
                        break;

                svm->vmcb->save.dbgctl = data;
                vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
                svm_update_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
                /*
                 * Old kernels did not validate the value written to
                 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
                 * value to allow live migrating buggy or malicious guests
                 * originating from those kernels.
                 */
                if (!msr->host_initiated && !page_address_valid(vcpu, data))
                        return 1;

                svm->nested.hsave_msr = data & PAGE_MASK;
                break;
        case MSR_VM_CR:
                return svm_set_vm_cr(vcpu, data);
        case MSR_VM_IGNNE:
                kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
                break;
        case MSR_AMD64_DE_CFG: {
                u64 supported_de_cfg;

                if (svm_get_feature_msr(ecx, &supported_de_cfg))
                        return 1;

                if (data & ~supported_de_cfg)
                        return 1;

                svm->msr_decfg = data;
                break;
        }
        default:
                return kvm_set_msr_common(vcpu, msr);
        }
        return ret;
}

static int msr_interception(struct kvm_vcpu *vcpu)
{
        if (to_svm(vcpu)->vmcb->control.exit_info_1)
                return kvm_emulate_wrmsr(vcpu);
        else
                return kvm_emulate_rdmsr(vcpu);
}

static int interrupt_window_interception(struct kvm_vcpu *vcpu)
{
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        svm_clear_vintr(to_svm(vcpu));

        /*
         * If not running nested, for AVIC the only reason to end up here is
         * ExtINTs.  In this case AVIC was temporarily disabled in order to
         * request the IRQ window, and we have to re-enable it.
         *
         * If running nested, still remove the VM-wide AVIC inhibit to
         * support the case in which the interrupt window was requested when
         * the vCPU was not running nested.
         *
         * All vCPUs which are still running nested will keep their AVIC
         * inhibited due to the per-vCPU AVIC inhibition.
         */
        kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);

        ++vcpu->stat.irq_window_exits;
        return 1;
}

static int pause_interception(struct kvm_vcpu *vcpu)
{
        bool in_kernel;
        /*
         * CPL is not made available for an SEV-ES guest, therefore
         * vcpu->arch.preempted_in_kernel can never be true.  Just
         * set in_kernel to false as well.
         */
        in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;

        grow_ple_window(vcpu);

        kvm_vcpu_on_spin(vcpu, in_kernel);
        return kvm_skip_emulated_instruction(vcpu);
}

static int invpcid_interception(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long type;
        gva_t gva;

        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }

        /*
         * For an INVPCID intercept:
         * EXITINFO1 provides the linear address of the memory operand.
         * EXITINFO2 provides the contents of the register operand.
         */
        type = svm->vmcb->control.exit_info_2;
        gva = svm->vmcb->control.exit_info_1;

        /*
         * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the
         *        stack segment is used.  The intercept takes priority over all
         *        #GP checks except CPL>0, but somehow still generates a linear
         *        address?  The APM is sorely lacking.
         */
        if (is_noncanonical_address(gva, vcpu, 0)) {
                kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
                return 1;
        }

        return kvm_handle_invpcid(vcpu, type, gva);
}

static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
         * execute the bus-locking instruction.  Set the bus lock counter to '1'
         * to effectively step past the bus lock.
         */
        if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
                svm->vmcb->control.bus_lock_counter = 1;

        return 1;
}

static int bus_lock_exit(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
        vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;

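        /*
         * Remember the RIP of the bus-locking instruction so that the
         * completion callback can tell whether userspace moved RIP before
         * re-entering the guest.
         */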
        vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
        vcpu->arch.complete_userspace_io = complete_userspace_buslock;

        if (is_guest_mode(vcpu))
                svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;

        return 0;
}

static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_READ_CR0]                     = cr_interception,
        [SVM_EXIT_READ_CR3]                     = cr_interception,
        [SVM_EXIT_READ_CR4]                     = cr_interception,
        [SVM_EXIT_READ_CR8]                     = cr_interception,
        [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
        [SVM_EXIT_READ_DR0]                     = dr_interception,
        [SVM_EXIT_READ_DR1]                     = dr_interception,
        [SVM_EXIT_READ_DR2]                     = dr_interception,
        [SVM_EXIT_READ_DR3]                     = dr_interception,
        [SVM_EXIT_READ_DR4]                     = dr_interception,
        [SVM_EXIT_READ_DR5]                     = dr_interception,
        [SVM_EXIT_READ_DR6]                     = dr_interception,
        [SVM_EXIT_READ_DR7]                     = dr_interception,
        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
        [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
        [SVM_EXIT_SMI]                          = smi_interception,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
        [SVM_EXIT_IRET]                         = iret_interception,
        [SVM_EXIT_INVD]                         = kvm_emulate_invd,
        [SVM_EXIT_PAUSE]                        = pause_interception,
        [SVM_EXIT_HLT]                          = kvm_emulate_halt,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
        [SVM_EXIT_MSR]                          = msr_interception,
        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
        [SVM_EXIT_VMRUN]                        = vmrun_interception,
        [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
        [SVM_EXIT_VMLOAD]                       = vmload_interception,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
        [SVM_EXIT_SKINIT]                       = skinit_interception,
        [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
        [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
        [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
        [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
        [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
        [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
        [SVM_EXIT_INVPCID]                      = invpcid_interception,
        [SVM_EXIT_IDLE_HLT]                     = kvm_emulate_halt,
        [SVM_EXIT_NPF]                          = npf_interception,
        [SVM_EXIT_BUS_LOCK]                     = bus_lock_exit,
        [SVM_EXIT_RSM]                          = rsm_interception,
        [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
        [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
#ifdef CONFIG_KVM_AMD_SEV
        [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
#endif
};

static void dump_vmcb(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
        struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
        char *vm_type;

        if (!dump_invalid_vmcb) {
                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
                return;
        }

        guard(mutex)(&vmcb_dump_mutex);

        vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
                  sev_es_guest(vcpu->kvm) ? "SEV-ES" :
                  sev_guest(vcpu->kvm) ? "SEV" : "SVM";

        pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
               vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
        pr_err("VMCB Control Area:\n");
        pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
        pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
        pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
        pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
        pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
        pr_err("%-20s%08x %08x\n", "intercepts:",
              control->intercepts[INTERCEPT_WORD3],
               control->intercepts[INTERCEPT_WORD4]);
        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
        pr_err("%-20s%d\n", "pause filter threshold:",
               control->pause_filter_thresh);
        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
        pr_err("%-20s%d\n", "asid:", control->asid);
        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
        pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
        pr_err("%-20s%08x\n", "int_state:", control->int_state);
        pr_err("%-20s%016llx\n", "exit_code:", control->exit_code);
        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
        pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
        pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
        pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
        pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
        pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
        pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
        pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
        pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
        pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);

        if (sev_es_guest(vcpu->kvm)) {
                save = sev_decrypt_vmsa(vcpu);
                if (!save)
                        goto no_vmsa;

                save01 = save;
        }

        pr_err("VMCB State Save Area:\n");
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "es:",
               save->es.selector, save->es.attrib,
               save->es.limit, save->es.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "cs:",
               save->cs.selector, save->cs.attrib,
               save->cs.limit, save->cs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ss:",
               save->ss.selector, save->ss.attrib,
               save->ss.limit, save->ss.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ds:",
               save->ds.selector, save->ds.attrib,
               save->ds.limit, save->ds.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "fs:",
               save01->fs.selector, save01->fs.attrib,
               save01->fs.limit, save01->fs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gs:",
               save01->gs.selector, save01->gs.attrib,
               save01->gs.limit, save01->gs.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "gdtr:",
               save->gdtr.selector, save->gdtr.attrib,
               save->gdtr.limit, save->gdtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "ldtr:",
               save01->ldtr.selector, save01->ldtr.attrib,
               save01->ldtr.limit, save01->ldtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "idtr:",
               save->idtr.selector, save->idtr.attrib,
               save->idtr.limit, save->idtr.base);
        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
               "tr:",
               save01->tr.selector, save01->tr.attrib,
               save01->tr.limit, save01->tr.base);
        pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
               save->vmpl, save->cpl, save->efer);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cr0:", save->cr0, "cr2:", save->cr2);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cr3:", save->cr3, "cr4:", save->cr4);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "dr6:", save->dr6, "dr7:", save->dr7);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rip:", save->rip, "rflags:", save->rflags);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "rsp:", save->rsp, "rax:", save->rax);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "s_cet:", save->s_cet, "ssp:", save->ssp);
        pr_err("%-15s %016llx\n",
               "isst_addr:", save->isst_addr);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "star:", save01->star, "lstar:", save01->lstar);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "cstar:", save01->cstar, "sfmask:", save01->sfmask);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "kernel_gs_base:", save01->kernel_gs_base,
               "sysenter_cs:", save01->sysenter_cs);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "sysenter_esp:", save01->sysenter_esp,
               "sysenter_eip:", save01->sysenter_eip);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "br_from:", save->br_from, "br_to:", save->br_to);
        pr_err("%-15s %016llx %-13s %016llx\n",
               "excp_from:", save->last_excp_from,
               "excp_to:", save->last_excp_to);

        if (sev_es_guest(vcpu->kvm)) {
                struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;

                pr_err("%-15s %016llx\n",
                       "sev_features", vmsa->sev_features);

                pr_err("%-15s %016llx %-13s %016llx\n",
                       "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp);
                pr_err("%-15s %016llx\n",
                       "u_cet:", vmsa->u_cet);

                pr_err("%-15s %016llx %-13s %016llx\n",
                       "rax:", vmsa->rax, "rbx:", vmsa->rbx);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "r8:", vmsa->r8, "r9:", vmsa->r9);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "r10:", vmsa->r10, "r11:", vmsa->r11);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "r12:", vmsa->r12, "r13:", vmsa->r13);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "r14:", vmsa->r14, "r15:", vmsa->r15);
                pr_err("%-15s %016llx %-13s %016llx\n",
                       "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
        } else {
                pr_err("%-15s %016llx %-13s %016lx\n",
                       "rax:", save->rax, "rbx:",
                       vcpu->arch.regs[VCPU_REGS_RBX]);
                pr_err("%-15s %016lx %-13s %016lx\n",
                       "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
                       "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
                pr_err("%-15s %016lx %-13s %016lx\n",
                       "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
                       "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
                pr_err("%-15s %016lx %-13s %016llx\n",
                       "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
                       "rsp:", save->rsp);
#ifdef CONFIG_X86_64
                pr_err("%-15s %016lx %-13s %016lx\n",
                       "r8:", vcpu->arch.regs[VCPU_REGS_R8],
                       "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
                pr_err("%-15s %016lx %-13s %016lx\n",
                       "r10:", vcpu->arch.regs[VCPU_REGS_R10],
                       "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
                pr_err("%-15s %016lx %-13s %016lx\n",
                       "r12:", vcpu->arch.regs[VCPU_REGS_R12],
                       "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
                pr_err("%-15s %016lx %-13s %016lx\n",
                       "r14:", vcpu->arch.regs[VCPU_REGS_R14],
                       "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
#endif
        }

no_vmsa:
        if (sev_es_guest(vcpu->kvm))
                sev_free_decrypted_vmsa(vcpu, save);
}

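/*
 * Dispatch a #VMEXIT to its handler.  When retpolines are enabled, the most
 * common exit reasons are checked directly to avoid an indirect call through
 * svm_exit_handlers[]; everything else (and any exit code KVM doesn't
 * recognize) goes through the table or is reported to userspace as an
 * unexpected exit.
 */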
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
{
        u32 exit_code = __exit_code;

        /*
         * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN
         * failed.  Report all such errors to userspace (note, VMEXIT_INVALID,
         * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()).  Skip
         * the check when running as a VM, as KVM has historically left garbage
         * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if
         * the underlying kernel is buggy.
         */
        if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) &&
            (u64)exit_code != __exit_code)
                goto unexpected_vmexit;

#ifdef CONFIG_MITIGATION_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
                return msr_interception(vcpu);
        else if (exit_code == SVM_EXIT_VINTR)
                return interrupt_window_interception(vcpu);
        else if (exit_code == SVM_EXIT_INTR)
                return intr_interception(vcpu);
        else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT)
                return kvm_emulate_halt(vcpu);
        else if (exit_code == SVM_EXIT_NPF)
                return npf_interception(vcpu);
#ifdef CONFIG_KVM_AMD_SEV
        else if (exit_code == SVM_EXIT_VMGEXIT)
                return sev_handle_vmgexit(vcpu);
#endif
#endif
        if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
                goto unexpected_vmexit;

        exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers));
        if (!svm_exit_handlers[exit_code])
                goto unexpected_vmexit;

        return svm_exit_handlers[exit_code](vcpu);

unexpected_vmexit:
        dump_vmcb(vcpu);
        kvm_prepare_unexpected_reason_exit(vcpu, __exit_code);
        return 0;
}

static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
                              u64 *info1, u64 *info2,
                              u32 *intr_info, u32 *error_code)
{
        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

        *reason = control->exit_code;
        *info1 = control->exit_info_1;
        *info2 = control->exit_info_2;
        *intr_info = control->exit_int_info;
        if ((*intr_info & SVM_EXITINTINFO_VALID) &&
            (*intr_info & SVM_EXITINTINFO_VALID_ERR))
                *error_code = control->exit_int_info_err;
        else
                *error_code = 0;
}

static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
                               u32 *error_code)
{
        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

        *intr_info = control->event_inj;

        if ((*intr_info & SVM_EXITINTINFO_VALID) &&
            (*intr_info & SVM_EXITINTINFO_VALID_ERR))
                *error_code = control->event_inj_err;
        else
                *error_code = 0;

}

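/*
 * Top-level #VMEXIT handler: sync CR0/CR3 from the VMCB (except for SEV-ES
 * guests, which use CR write traps), give L1 a chance to handle exits that
 * occurred while L2 was running, report VMRUN failures to userspace, and
 * otherwise dispatch to svm_invoke_exit_handler().
 */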
static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_run *kvm_run = vcpu->run;

        /* SEV-ES guests must use the CR write traps to track CR registers. */
        if (!sev_es_guest(vcpu->kvm)) {
                if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
                        vcpu->arch.cr0 = svm->vmcb->save.cr0;
                if (npt_enabled)
                        vcpu->arch.cr3 = svm->vmcb->save.cr3;
        }

        if (is_guest_mode(vcpu)) {
                int vmexit;

                trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);

                vmexit = nested_svm_exit_special(svm);

                if (vmexit == NESTED_EXIT_CONTINUE)
                        vmexit = nested_svm_exit_handled(svm);

                if (vmexit == NESTED_EXIT_DONE)
                        return 1;
        }

        if (svm_is_vmrun_failure(svm->vmcb->control.exit_code)) {
                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                kvm_run->fail_entry.hardware_entry_failure_reason
                        = svm->vmcb->control.exit_code;
                kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                dump_vmcb(vcpu);
                return 0;
        }

        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;

        return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code);
}

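/*
 * Per-VMRUN preparation: if the vCPU moved to a new physical CPU, invalidate
 * the VMCB clean bits and force a new ASID; SEV guests get their ASID from
 * pre_sev_run() instead.
 */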
static int pre_svm_run(struct kvm_vcpu *vcpu)
{
        struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * If the previous vmrun of the vmcb occurred on a different physical
         * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
         * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
         */
        if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
                svm->current_vmcb->asid_generation = 0;
                vmcb_mark_all_dirty(svm->vmcb);
                svm->current_vmcb->cpu = vcpu->cpu;
        }

        if (sev_guest(vcpu->kvm))
                return pre_sev_run(svm, vcpu->cpu);

        /* FIXME: handle wraparound of asid_generation */
        if (svm->current_vmcb->asid_generation != sd->asid_generation)
                new_asid(svm, sd);

        return 0;
}

static void svm_inject_nmi(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

        if (svm->nmi_l1_to_l2)
                return;

        /*
         * No need to manually track NMI masking when vNMI is enabled, hardware
         * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
         * case where software directly injects an NMI.
         */
        if (!is_vnmi_enabled(svm)) {
                svm->nmi_masked = true;
                svm_set_iret_intercept(svm);
        }
        ++vcpu->stat.nmi_injections;
}

static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (!is_vnmi_enabled(svm))
                return false;

        return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
}

static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (!is_vnmi_enabled(svm))
                return false;

        if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
                return false;

        svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);

        /*
         * Because the pending NMI is serviced by hardware, KVM can't know when
         * the NMI is "injected", but for all intents and purposes, passing the
         * NMI off to hardware counts as injection.
         */
        ++vcpu->stat.nmi_injections;

        return true;
}

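/*
 * Inject the pending IRQ via EVENTINJ.  Soft interrupts (INTn) need RIP and
 * next_rip fixed up first, see svm_update_soft_interrupt_rip().
 */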
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
        struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 type;

        if (intr->soft) {
                if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
                        return;

                type = SVM_EVTINJ_TYPE_SOFT;
        } else {
                type = SVM_EVTINJ_TYPE_INTR;
        }

        trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
        ++vcpu->stat.irq_injections;

        svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}

void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
                                     int trig_mode, int vector)
{
        /*
         * apic->apicv_active must be read after vcpu->mode.
         * Pairs with smp_store_release in vcpu_enter_guest.
         */
        bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);

        /* Note, this is called iff the local APIC is in-kernel. */
        if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
                /* Process the interrupt via kvm_check_and_inject_events(). */
                kvm_make_request(KVM_REQ_EVENT, vcpu);
                kvm_vcpu_kick(vcpu);
                return;
        }

        trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
        if (in_guest_mode) {
                /*
                 * Signal the doorbell to tell hardware to inject the IRQ.  If
                 * the vCPU exits the guest before the doorbell chimes, hardware
                 * will automatically process AVIC interrupts at the next VMRUN.
                 */
                avic_ring_doorbell(vcpu);
        } else {
                /*
                 * Wake the vCPU if it was blocking.  KVM will then detect the
                 * pending IRQ when checking if the vCPU has a wake event.
                 */
                kvm_vcpu_wake_up(vcpu);
        }
}

static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
                                  int trig_mode, int vector)
{
        kvm_lapic_set_irr(vector, apic);

        /*
         * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
         * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
         * the read of guest_mode.  This guarantees that either VMRUN will see
         * and process the new vIRR entry, or that svm_complete_interrupt_delivery
         * will signal the doorbell if the CPU has already entered the guest.
         */
        smp_mb__after_atomic();
        svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
}

static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * SEV-ES guests must always keep the CR intercepts cleared. CR
         * tracking is done using the CR write traps.
         */
        if (sev_es_guest(vcpu->kvm))
                return;

        if (nested_svm_virtualize_tpr(vcpu))
                return;

        svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);

        if (irr == -1)
                return;

        if (tpr >= irr)
                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}

static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (is_vnmi_enabled(svm))
                return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
        else
                return svm->nmi_masked;
}

static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (is_vnmi_enabled(svm)) {
                if (masked)
                        svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
                else
                        svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;

        } else {
                svm->nmi_masked = masked;
                if (masked)
                        svm_set_iret_intercept(svm);
                else
                        svm_clr_iret_intercept(svm);
        }
}

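/*
 * NMIs are blocked if GIF=0, if NMIs are masked, or if the vCPU is in an
 * interrupt shadow.  When L2 is running and L1 intercepts NMIs, the NMI is
 * never blocked from KVM's perspective as it simply triggers a #VMEXIT.
 */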
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;

        if (!gif_set(svm))
                return true;

        if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
                return false;

        if (svm_get_nmi_mask(vcpu))
                return true;

        return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
}

static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        if (svm->nested.nested_run_pending)
                return -EBUSY;

        if (svm_nmi_blocked(vcpu))
                return 0;

        /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
                return -EBUSY;
        return 1;
}

bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;

        if (!gif_set(svm))
                return true;

        if (is_guest_mode(vcpu)) {
                /* As long as interrupts are being delivered...  */
                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
                    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
                        return true;

                /* ... vmexits aren't blocked by the interrupt shadow  */
                if (nested_exit_on_intr(svm))
                        return false;
        } else {
                if (!svm_get_if_flag(vcpu))
                        return true;
        }

        return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
}

static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (svm->nested.nested_run_pending)
                return -EBUSY;

        if (svm_interrupt_blocked(vcpu))
                return 0;

        /*
         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
         * e.g. if the IRQ arrived asynchronously after checking nested events.
         */
        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
                return -EBUSY;

        return 1;
}

static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
         * get that intercept, this function will be called again though and
         * we'll get the vintr intercept. However, if the vGIF feature is
         * enabled, the STGI interception will not occur. Enable the irq
         * window under the assumption that the hardware will set the GIF.
         */
        if (vgif || gif_set(svm)) {
                /*
                 * An IRQ window is not needed when AVIC is enabled, unless
                 * there is a pending ExtINT, which cannot be injected via
                 * AVIC.  In that case, KVM needs to temporarily disable AVIC
                 * and fall back to injecting the IRQ via V_IRQ.
                 *
                 * If running nested, AVIC is already locally inhibited
                 * on this vCPU, therefore there is no need to request
                 * the VM wide AVIC inhibition.
                 */
                if (!is_guest_mode(vcpu))
                        kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);

                svm_set_vintr(svm);
        }
}

static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * If NMIs are outright masked, i.e. the vCPU is already handling an
         * NMI, and KVM has not yet intercepted an IRET, then there is nothing
         * more to do at this time as KVM has already enabled IRET intercepts.
         * If KVM has already intercepted IRET, then single-step over the IRET,
         * as NMIs aren't architecturally unmasked until the IRET completes.
         *
         * If vNMI is enabled, KVM should never request an NMI window if NMIs
         * are masked, as KVM allows at most one to-be-injected NMI and one
         * pending NMI.  If two NMIs arrive simultaneously, KVM will inject one
         * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are
         * unmasked.  KVM _will_ request an NMI window in some situations, e.g.
         * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately
         * inject the NMI.  In those situations, KVM needs to single-step over
         * the STI shadow or intercept STGI.
         */
        if (svm_get_nmi_mask(vcpu)) {
                WARN_ON_ONCE(is_vnmi_enabled(svm));

                if (!svm->awaiting_iret_completion)
                        return; /* IRET will cause a vm exit */
        }

        /*
         * SEV-ES guests are responsible for signaling when a vCPU is ready to
         * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
         * KVM can't intercept and single-step IRET to detect when NMIs are
         * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
         *
         * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
         * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
         * supported NAEs in the GHCB protocol.
         */
        if (sev_es_guest(vcpu->kvm))
                return;

        if (!gif_set(svm)) {
                if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                return; /* STGI will cause a vm exit */
        }

        /*
         * Something prevents the NMI from being injected.  Single-step over
         * the possible problem (IRET, exception injection, or interrupt
         * shadow).
         */
        svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
        svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}

static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
         * A TLB flush for the current ASID flushes both "host" and "guest" TLB
         * entries, and thus is a superset of Hyper-V's fine grained flushing.
         */
        kvm_hv_vcpu_purge_flush_tlb(vcpu);

        /*
         * Flush only the current ASID even if the TLB flush was invoked via
         * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
         * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
         * unconditionally does a TLB flush on both nested VM-Enter and nested
         * VM-Exit (via kvm_mmu_reset_context()).
         */
        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
        else
                svm->current_vmcb->asid_generation--;
}

static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
{
        hpa_t root_tdp = vcpu->arch.mmu->root.hpa;

        /*
         * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
         * flush the NPT mappings via hypercall as flushing the ASID only
         * affects virtual to physical mappings, it does not invalidate guest
         * physical to host physical mappings.
         */
        if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
                hyperv_flush_guest_mapping(root_tdp);

        svm_flush_tlb_asid(vcpu);
}

static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
{
        /*
         * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
         * flushes should be routed to hv_flush_remote_tlbs() without requesting
         * a "regular" remote flush.  Reaching this point means either there's
         * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
         * which might be fatal to the guest.  Yell, but try to recover.
         */
        if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
                hv_flush_remote_tlbs(vcpu->kvm);

        svm_flush_tlb_asid(vcpu);
}

static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        invlpga(gva, svm->vmcb->control.asid);
}

static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);

        svm_flush_tlb_asid(vcpu);
}

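/*
 * Propagate the guest's V_TPR back into the emulated local APIC's CR8 after
 * a VMRUN (and the reverse before VMRUN, see sync_lapic_to_cr8()), unless
 * the TPR is being virtualized on behalf of L1.
 */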
static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (nested_svm_virtualize_tpr(vcpu))
                return;

        if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
                kvm_set_cr8(vcpu, cr8);
        }
}

static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 cr8;

        if (nested_svm_virtualize_tpr(vcpu))
                return;

        cr8 = kvm_get_cr8(vcpu);
        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
}

static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
                                        int type)
{
        bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
        bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
         * associated with the original soft exception/interrupt.  next_rip is
         * cleared on all exits that can occur while vectoring an event, so KVM
         * needs to manually set next_rip for re-injection.  Unlike the !nrips
         * case below, this needs to be done if and only if KVM is re-injecting
         * the same event, i.e. if the event is a soft exception/interrupt,
         * otherwise next_rip is unused on VMRUN.
         */
        if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
            kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
                svm->vmcb->control.next_rip = svm->soft_int_next_rip;
        /*
         * If NRIPS isn't enabled, KVM must manually advance RIP prior to
         * injecting the soft exception/interrupt.  That advancement needs to
         * be unwound if vectoring didn't complete.  Note, the new event may
         * not be the injected event, e.g. if KVM injected an INTn, the INTn
         * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
         * be the reported vectored event, but RIP still needs to be unwound.
         */
        else if (!nrips && (is_soft || is_exception) &&
                 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
                kvm_rip_write(vcpu, svm->soft_int_old_rip);
}

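/*
 * Process EXITINTINFO after a #VMEXIT: any event that was being delivered
 * when the exit occurred is re-queued so it can be re-injected on the next
 * VMRUN, and NMI/IRET single-step tracking is updated.
 */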
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u8 vector;
        int type;
        u32 exitintinfo = svm->vmcb->control.exit_int_info;
        bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
        bool soft_int_injected = svm->soft_int_injected;

        svm->nmi_l1_to_l2 = false;
        svm->soft_int_injected = false;

        /*
         * If we've made progress since setting awaiting_iret_completion, we've
         * executed an IRET and can allow NMI injection.
         */
        if (svm->awaiting_iret_completion &&
            kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
                svm->awaiting_iret_completion = false;
                svm->nmi_masked = false;
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        }

        vcpu->arch.nmi_injected = false;
        kvm_clear_exception_queue(vcpu);
        kvm_clear_interrupt_queue(vcpu);

        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
                return;

        kvm_make_request(KVM_REQ_EVENT, vcpu);

        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

        if (soft_int_injected)
                svm_complete_soft_interrupt(vcpu, vector, type);

        switch (type) {
        case SVM_EXITINTINFO_TYPE_NMI:
                vcpu->arch.nmi_injected = true;
                svm->nmi_l1_to_l2 = nmi_l1_to_l2;
                break;
        case SVM_EXITINTINFO_TYPE_EXEPT: {
                u32 error_code = 0;

                /*
                 * Never re-inject a #VC exception.
                 */
                if (vector == X86_TRAP_VC)
                        break;

                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
                        error_code = svm->vmcb->control.exit_int_info_err;

                kvm_requeue_exception(vcpu, vector,
                                      exitintinfo & SVM_EXITINTINFO_VALID_ERR,
                                      error_code);
                break;
        }
        case SVM_EXITINTINFO_TYPE_INTR:
                kvm_queue_interrupt(vcpu, vector, false);
                break;
        case SVM_EXITINTINFO_TYPE_SOFT:
                kvm_queue_interrupt(vcpu, vector, true);
                break;
        default:
                break;
        }

}

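/*
 * Cancel an injection that was programmed into EVENTINJ but never delivered:
 * stash the event in exit_int_info so that svm_complete_interrupts() will
 * re-queue it.
 */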
static void svm_cancel_injection(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;

        control->exit_int_info = control->event_inj;
        control->exit_int_info_err = control->event_inj_err;
        control->event_inj = 0;
        svm_complete_interrupts(vcpu);
}

static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
        if (to_kvm_sev_info(vcpu->kvm)->need_init)
                return -EINVAL;

        return 1;
}

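/*
 * Handle exits that can be serviced with IRQs disabled and without touching
 * guest memory, hence the hard requirement on a valid next_rip.  L2 exits
 * always take the slow path.
 */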
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb_control_area *control = &svm->vmcb->control;

        /*
         * Next RIP must be provided as IRQs are disabled, and accessing guest
         * memory to decode the instruction might fault, i.e. might sleep.
         */
        if (!nrips || !control->next_rip)
                return EXIT_FASTPATH_NONE;

        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;

        switch (control->exit_code) {
        case SVM_EXIT_MSR:
                if (!control->exit_info_1)
                        break;
                return handle_fastpath_wrmsr(vcpu);
        case SVM_EXIT_HLT:
                return handle_fastpath_hlt(vcpu);
        case SVM_EXIT_INVD:
                return handle_fastpath_invd(vcpu);
        default:
                break;
        }

        return EXIT_FASTPATH_NONE;
}

static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
        struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
        struct vcpu_svm *svm = to_svm(vcpu);

        guest_state_enter_irqoff();

        /*
         * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
         * VMRUN controls whether or not physical IRQs are masked (KVM always
         * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
         * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
         * into guest state if delivery of an event during VMRUN triggers a
         * #VMEXIT, and the guest_state transitions already tell lockdep that
         * IRQs are being enabled/disabled.  Note!  GIF=0 for the entirety of
         * this path, so IRQs aren't actually unmasked while running host code.
         */
        raw_local_irq_enable();

        amd_clear_divider();

        if (sev_es_guest(vcpu->kvm))
                __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
                                      sev_es_host_save_area(sd));
        else
                __svm_vcpu_run(svm, spec_ctrl_intercepted);

        raw_local_irq_disable();

        guest_state_exit_irqoff();
}

static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
        bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
        struct vcpu_svm *svm = to_svm(vcpu);
        bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);

        trace_kvm_entry(vcpu, force_immediate_exit);

        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

        /*
         * Disable singlestep if we're injecting an interrupt/exception.
         * We don't want our modified rflags to be pushed on the stack where
         * we might not be able to easily reset them if we later disable NMI
         * singlestep.
         */
        if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
                /*
                 * Event injection happens before external interrupts cause a
                 * vmexit and interrupts are disabled here, so smp_send_reschedule
                 * is enough to force an immediate vmexit.
                 */
                disable_nmi_singlestep(svm);
                force_immediate_exit = true;
        }

        if (force_immediate_exit)
                smp_send_reschedule(vcpu->cpu);

        if (pre_svm_run(vcpu)) {
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
                vcpu->run->fail_entry.cpu = vcpu->cpu;
                return EXIT_FASTPATH_EXIT_USERSPACE;
        }

        sync_lapic_to_cr8(vcpu);

        if (unlikely(svm->asid != svm->vmcb->control.asid)) {
                svm->vmcb->control.asid = svm->asid;
                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
        }
        svm->vmcb->save.cr2 = vcpu->arch.cr2;

        if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
            kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
                svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;

        svm_hv_update_vp_id(svm->vmcb, vcpu);

        /*
         * Run with all-zero DR6 unless the guest can write DR6 freely, so that
         * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
         * KVM's snapshot is only necessary when DR accesses won't exit.
         */
        if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
                svm_set_dr6(vcpu, vcpu->arch.dr6);
        else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
                svm_set_dr6(vcpu, DR6_ACTIVE_LOW);

        clgi();

        /*
         * Hardware only context switches DEBUGCTL if LBR virtualization is
         * enabled.  Manually load DEBUGCTL if necessary (and restore it after
         * VM-Exit), as running with the host's DEBUGCTL can negatively affect
         * guest state and can even be fatal, e.g. due to Bus Lock Detect.
         */
        if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
            vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
                update_debugctlmsr(svm->vmcb->save.dbgctl);

        kvm_wait_lapic_expire(vcpu);

        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
         * it's non-zero. Since vmentry is serialising on affected CPUs, there
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);

        svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);

        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
                x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);

        if (!sev_es_guest(vcpu->kvm)) {
                vcpu->arch.cr2 = svm->vmcb->save.cr2;
                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
                vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
        }
        vcpu->arch.regs_dirty = 0;

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

        if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
            vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
                update_debugctlmsr(vcpu->arch.host_debugctl);

        stgi();

        /* Any pending NMI will happen here */

        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                kvm_after_interrupt(vcpu);

        sync_cr8_to_lapic(vcpu);

        svm->next_rip = 0;
        if (is_guest_mode(vcpu)) {
                nested_sync_control_from_vmcb02(svm);

                /* Track VMRUNs that have made it past consistency checking */
                if (svm->nested.nested_run_pending &&
                    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
                        ++vcpu->stat.nested_run;

                svm->nested.nested_run_pending = 0;
        }

        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

        /*
         * Unconditionally mask off the CLEAR_RAP bit; the AND is just as cheap
         * as a TEST+Jcc that would avoid it.
         */
        if (cpu_feature_enabled(X86_FEATURE_ERAPS))
                svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;

        vmcb_mark_all_clean(svm->vmcb);

        /* if exit due to PF check for async PF */
        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
                vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();

        vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;

        if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
                rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);

        trace_kvm_exit(vcpu, KVM_ISA_SVM);

        svm_complete_interrupts(vcpu);

        return svm_exit_handlers_fastpath(vcpu);
}

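/*
 * Load the new MMU root.  With NPT, the root goes in nested_cr3 and save.cr3
 * carries the guest's own CR3; with shadow paging, save.cr3 points at the
 * shadow root (ORed with the active PCID when the guest is using 64-bit
 * paging).
 */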
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                             int root_level)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        unsigned long cr3;

        if (npt_enabled) {
                svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);

                hv_track_root_tdp(vcpu, root_hpa);

                cr3 = vcpu->arch.cr3;
        } else if (root_level >= PT64_ROOT_4LEVEL) {
                cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
        } else {
                /* PCID in the guest should be impossible with a 32-bit MMU. */
                WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
                cr3 = root_hpa;
        }

        svm->vmcb->save.cr3 = cr3;
        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}

static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        /*
         * Patch in the VMMCALL instruction:
         */
        hypercall[0] = 0x0f;
        hypercall[1] = 0x01;
        hypercall[2] = 0xd9;
}

/*
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation). Be sure to check the kvm parameter before using it.
 */
static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
        switch (index) {
        case MSR_IA32_MCG_EXT_CTL:
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                return false;
        case MSR_IA32_SMBASE:
                if (!IS_ENABLED(CONFIG_KVM_SMM))
                        return false;
                /* SEV-ES guests do not support SMM, so report false */
                if (kvm && sev_es_guest(kvm))
                        return false;
                break;
        default:
                break;
        }

        return true;
}

static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /*
         * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
         * can only disable all variants of by disallowing CR4.OSXSAVE from
         * being set.  As a result, if the host has XSAVE and XSAVES, and the
         * guest has XSAVE enabled, the guest can execute XSAVES without
         * faulting.  Treat XSAVES as enabled in this case regardless of
         * whether it's advertised to the guest so that KVM context switches
         * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
         * the guest read/write access to the host's XSS.
         */
        guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
                             boot_cpu_has(X86_FEATURE_XSAVES) &&
                             guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));

        /*
         * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
         * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
         * SVM on Intel is bonkers and extremely unlikely to work).
         */
        if (guest_cpuid_is_intel_compatible(vcpu))
                guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);

        if (sev_guest(vcpu->kvm))
                sev_vcpu_after_set_cpuid(svm);
}

static bool svm_has_wbinvd_exit(void)
{
        return true;
}

#define PRE_EX(exit)  { .exit_code = (exit), \
                        .stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
                        .stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
                        .stage = X86_ICPT_POST_MEMACCESS, }

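/*
 * Map emulator intercept IDs to the equivalent SVM exit code and the stage
 * at which the intercept check is performed; used by svm_check_intercept()
 * to decide whether an emulated instruction should #VMEXIT to L1.
 */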
static const struct __x86_intercept {
        u32 exit_code;
        enum x86_intercept_stage stage;
} x86_intercept_map[] = {
        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
        [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM

static int svm_check_intercept(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage,
                               struct x86_exception *exception)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        int vmexit, ret = X86EMUL_CONTINUE;
        struct __x86_intercept icpt_info;
        struct vmcb *vmcb = svm->vmcb;

        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
                goto out;

        icpt_info = x86_intercept_map[info->intercept];

        if (stage != icpt_info.stage)
                goto out;

        switch (icpt_info.exit_code) {
        case SVM_EXIT_READ_CR0:
                if (info->intercept == x86_intercept_cr_read)
                        icpt_info.exit_code += info->modrm_reg;
                break;
        case SVM_EXIT_WRITE_CR0: {
                unsigned long cr0, val;

                /*
                 * Adjust the exit code accordingly if a CR other than CR0 is
                 * being written, and skip straight to the common handling as
                 * only CR0 has an additional selective intercept.
                 */
                if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
                        icpt_info.exit_code += info->modrm_reg;
                        break;
                }

                /*
                 * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
                 * selective CR0 intercept is triggered (the common logic will
                 * treat the selective intercept as being enabled).  Note, the
                 * unconditional intercept has higher priority, i.e. this is
                 * only relevant if *only* the selective intercept is enabled.
                 */
                if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
                    !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
                        break;

                /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
                if (info->intercept == x86_intercept_clts)
                        break;

                /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
                if (info->intercept == x86_intercept_lmsw) {
                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
                        break;
                }

                /*
                 * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
                 * other than SVM_CR0_SELECTIVE_MASK is changed.
                 */
                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
                if (cr0 ^ val)
                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
                break;
        }
        case SVM_EXIT_READ_DR0:
        case SVM_EXIT_WRITE_DR0:
                icpt_info.exit_code += info->modrm_reg;
                break;
        case SVM_EXIT_MSR:
                if (info->intercept == x86_intercept_wrmsr)
                        vmcb->control.exit_info_1 = 1;
                else
                        vmcb->control.exit_info_1 = 0;
                break;
        case SVM_EXIT_PAUSE:
                /*
                 * This intercept also fires for plain NOP, but PAUSE is
                 * REP NOP, so check for the REP prefix here.
                 */
                if (info->rep_prefix != REPE_PREFIX)
                        goto out;
                break;
        case SVM_EXIT_IOIO: {
                u64 exit_info;
                u32 bytes;

                if (info->intercept == x86_intercept_in ||
                    info->intercept == x86_intercept_ins) {
                        exit_info = ((info->src_val & 0xffff) << 16) |
                                SVM_IOIO_TYPE_MASK;
                        bytes = info->dst_bytes;
                } else {
                        exit_info = (info->dst_val & 0xffff) << 16;
                        bytes = info->src_bytes;
                }

                if (info->intercept == x86_intercept_outs ||
                    info->intercept == x86_intercept_ins)
                        exit_info |= SVM_IOIO_STR_MASK;

                if (info->rep_prefix)
                        exit_info |= SVM_IOIO_REP_MASK;

                bytes = min(bytes, 4u);

                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

                vmcb->control.exit_info_1 = exit_info;
                vmcb->control.exit_info_2 = info->next_rip;

                break;
        }
        default:
                break;
        }

        /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
        if (static_cpu_has(X86_FEATURE_NRIPS))
                vmcb->control.next_rip  = info->next_rip;
        vmcb->control.exit_code = icpt_info.exit_code;
        vmexit = nested_svm_exit_handled(svm);

        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
                                           : X86EMUL_CONTINUE;

out:
        return ret;
}

static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
        switch (to_svm(vcpu)->vmcb->control.exit_code) {
        case SVM_EXIT_EXCP_BASE + MC_VECTOR:
                svm_handle_mce(vcpu);
                break;
        case SVM_EXIT_INTR:
                vcpu->arch.at_instruction_boundary = true;
                break;
        default:
                break;
        }
}

static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
        /* [63:9] are reserved. */
        vcpu->arch.mcg_cap &= 0x1ff;
}

#ifdef CONFIG_KVM_SMM
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        /* Per APM Vol.2 15.22.2 "Response to SMI" */
        if (!gif_set(svm))
                return true;

        return is_smm(vcpu);
}

static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        if (svm->nested.nested_run_pending)
                return -EBUSY;

        if (svm_smi_blocked(vcpu))
                return 0;

        /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
                return -EBUSY;

        return 1;
}

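/*
 * Save SMM entry state.  If the SMI arrived while L2 was running, force a
 * synthetic VM-Exit to L1, record the SVM-guest flag and VMCB12 GPA in the
 * 64-bit SMRAM image, and stash L1's VMCB01 state in the HSAVE area so it
 * survives SMM (see the comment above the kvm_vcpu_map() call below).
 */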
static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_host_map map_save;
        int ret;

        if (!is_guest_mode(vcpu))
                return 0;

        /*
         * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
         * responsible for ensuring nested SVM and SMIs are mutually exclusive.
         */

        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
                return 1;

        smram->smram64.svm_guest_flag = 1;
        smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;

        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

        ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
        if (ret)
                return ret;

        /*
         * KVM uses VMCB01 to store L1 host state while L2 runs, but
         * VMCB01 is going to be used during SMM and thus that state would
         * be lost.  Temporarily save the non-VMLOAD/VMSAVE state to the
         * host save area pointed to by MSR_VM_HSAVE_PA.  The APM guarantees
         * that the format of the area is identical to the guest save area
         * at offset 0x400 (matches the offset of 'struct vmcb_save_area'
         * within 'struct vmcb').  Note: the HSAVE area may also be used by
         * the L1 hypervisor to save additional host context (e.g. KVM does
         * that, see svm_prepare_switch_to_guest()), which must be
         * preserved.
         */
        if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
                return 1;

        BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

        svm_copy_vmrun_state(map_save.hva + 0x400,
                             &svm->vmcb01.ptr->save);

        kvm_vcpu_unmap(vcpu, &map_save);
        return 0;
}

static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        struct kvm_host_map map, map_save;
        struct vmcb *vmcb12;
        int ret;

        const struct kvm_smram_state_64 *smram64 = &smram->smram64;

        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
                return 0;

        /*
         * svm_guest_flag is non-zero iff the SMI arrived while the vCPU was
         * in guest mode.
         */
        if (!smram64->svm_guest_flag)
                return 0;

        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
                return 1;

        if (!(smram64->efer & EFER_SVME))
                return 1;

        if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
                return 1;

        ret = 1;
        if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
                goto unmap_map;

        if (svm_allocate_nested(svm))
                goto unmap_save;

        /*
         * Restore L1 host state from L1 HSAVE area as VMCB01 was
         * used during SMM (see svm_enter_smm())
         */

        svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);

        /*
         * Enter the nested guest now
         */

        vmcb_mark_all_dirty(svm->vmcb01.ptr);

        vmcb12 = map.hva;
        nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
        nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);

        if (nested_svm_check_cached_vmcb12(vcpu) < 0)
                goto unmap_save;

        if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa,
                                 vmcb12, false) != 0)
                goto unmap_save;

        ret = 0;
        svm->nested.nested_run_pending = 1;

unmap_save:
        kvm_vcpu_unmap(vcpu, &map_save);
unmap_map:
        kvm_vcpu_unmap(vcpu, &map);
        return ret;
}

static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        if (!gif_set(svm)) {
                if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                /* STGI will cause a vm exit */
        } else {
                /* We must be in SMM; RSM will cause a vmexit anyway.  */
        }
}
#endif

static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
                                         void *insn, int insn_len)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        bool smep, smap, is_user;
        u64 error_code;

        /* Check that emulation is possible during event vectoring */
        if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
            !kvm_can_emulate_event_vectoring(emul_type))
                return X86EMUL_UNHANDLEABLE_VECTORING;

        /* Emulation is always possible when KVM has access to all guest state. */
        if (!sev_guest(vcpu->kvm))
                return X86EMUL_CONTINUE;

        /* #UD and #GP should never be intercepted for SEV guests. */
        WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
                                  EMULTYPE_TRAP_UD_FORCED |
                                  EMULTYPE_VMWARE_GP));

        /*
         * Emulation is impossible for SEV-ES guests as KVM doesn't have access
         * to guest register state.
         */
        if (sev_es_guest(vcpu->kvm))
                return X86EMUL_RETRY_INSTR;

        /*
         * Emulation is possible if the instruction is already decoded, e.g.
         * when completing I/O after returning from userspace.
         */
        if (emul_type & EMULTYPE_NO_DECODE)
                return X86EMUL_CONTINUE;

        /*
         * Emulation is possible for SEV guests if and only if a prefilled
         * buffer containing the bytes of the intercepted instruction is
         * available. SEV guest memory is encrypted with a guest specific key
         * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
         * decode garbage.
         *
         * If KVM is NOT trying to simply skip an instruction, inject #UD if
         * KVM reached this point without an instruction buffer.  In practice,
         * this path should never be hit by a well-behaved guest, e.g. KVM
         * doesn't intercept #UD or #GP for SEV guests, but this path is still
         * theoretically reachable, e.g. via unaccelerated fault-like AVIC
         * access, and needs to be handled by KVM to avoid putting the guest
         * into an infinite loop.  Injecting #UD is somewhat arbitrary, but
         * it's the least awful option given the lack of insight into the guest.
         *
         * If KVM is trying to skip an instruction, simply resume the guest.
         * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
         * will attempt to re-inject the INT3/INTO and skip the instruction.
         * In that scenario, retrying the INT3/INTO and hoping the guest will
         * make forward progress is the only option that has a chance of
         * success (and in practice it will work the vast majority of the time).
         */
        if (unlikely(!insn)) {
                if (emul_type & EMULTYPE_SKIP)
                        return X86EMUL_UNHANDLEABLE;

                kvm_queue_exception(vcpu, UD_VECTOR);
                return X86EMUL_PROPAGATE_FAULT;
        }

        /*
         * Emulate for SEV guests if the insn buffer is not empty.  The buffer
         * will be empty if the DecodeAssist microcode cannot fetch bytes for
         * the faulting instruction because the code fetch itself faulted, e.g.
         * the guest attempted to fetch from emulated MMIO or a guest page
         * table used to translate CS:RIP resides in emulated MMIO.
         */
        if (likely(insn_len))
                return X86EMUL_CONTINUE;

        /*
         * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
         *
         * Errata:
         * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
         * possible that CPU microcode implementing DecodeAssist will fail to
         * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
         * be '0'.  This happens because microcode reads CS:RIP using a _data_
         * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
         * gives up and does not fill the instruction bytes buffer.
         *
         * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
         * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
         * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
         * GuestIntrBytes field of the VMCB.
         *
         * This does _not_ mean that the erratum has been encountered, as the
         * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
         * #PF, e.g. if the guest attempts to execute from emulated MMIO and
         * encounters a reserved/not-present #PF.
         *
         * To hit the erratum, the following conditions must be true:
         *    1. CR4.SMAP=1 (obviously).
         *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
         *       have been hit as the guest would have encountered a SMEP
         *       violation #PF, not a #NPF.
         *    3. The #NPF is not due to a code fetch, in which case failure to
         *       retrieve the instruction bytes is legitimate (see above).
         *
         * In addition, don't apply the erratum workaround if the #NPF occurred
         * while translating guest page tables (see below).
         */
        error_code = svm->vmcb->control.exit_info_1;
        if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
                goto resume_guest;

        smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
        smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
        is_user = svm_get_cpl(vcpu) == 3;
        if (smap && (!smep || is_user)) {
                pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");

                /*
                 * If the fault occurred in userspace, arbitrarily inject #GP
                 * to avoid killing the guest and to hopefully avoid confusing
                 * the guest kernel too much, e.g. injecting #PF would not be
                 * coherent with respect to the guest's page tables.  Request
                 * triple fault if the fault occurred in the kernel as there's
                 * no fault that KVM can inject without confusing the guest.
                 * In practice, the triple fault is moot as no sane SEV kernel
                 * will execute from user memory while also running with SMAP=1.
                 */
                if (is_user)
                        kvm_inject_gp(vcpu, 0);
                else
                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
                return X86EMUL_PROPAGATE_FAULT;
        }

resume_guest:
        /*
         * If the erratum was not hit, simply resume the guest and let it fault
         * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
         * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
         * userspace will kill the guest, and letting the emulator read garbage
         * will yield random behavior and potentially corrupt the guest.
         *
         * Simply resuming the guest is technically not a violation of the SEV
         * architecture.  AMD's APM states that all code fetches and page table
         * accesses for SEV guests are encrypted, regardless of the C-Bit.  The
         * APM also states that encrypted accesses to MMIO are "ignored", but
         * doesn't explicitly define "ignored", i.e. doing nothing and letting
         * the guest spin is technically "ignoring" the access.
         */
        return X86EMUL_RETRY_INSTR;
}

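/* Per the APM, INIT is held pending while GIF is clear. */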
static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
        struct vcpu_svm *svm = to_svm(vcpu);

        return !gif_set(svm);
}

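/*
 * For SEV-ES (and SNP) guests, KVM can't modify the protected register state
 * to emulate a SIPI, so delivery is handed off to the SEV-specific handler;
 * all other guests use the common path.
 */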
static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
        if (!sev_es_guest(vcpu->kvm))
                return kvm_vcpu_deliver_sipi_vector(vcpu, vector);

        sev_vcpu_deliver_sipi_vector(vcpu, vector);
}

static void svm_vm_destroy(struct kvm *kvm)
{
        avic_vm_destroy(kvm);
        sev_vm_destroy(kvm);

        svm_srso_vm_destroy();
}

static int svm_vm_init(struct kvm *kvm)
{
        int type = kvm->arch.vm_type;

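        /*
         * VM types other than "default" and "software protected" are the SEV
         * flavors and need SEV-specific initialization; SEV-ES and SNP guests
         * additionally have register state that KVM cannot access.
         */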
        if (type != KVM_X86_DEFAULT_VM &&
            type != KVM_X86_SW_PROTECTED_VM) {
                kvm->arch.has_protected_state =
                        (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
                to_kvm_sev_info(kvm)->need_init = true;

                kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
                kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
        }

        if (!pause_filter_count || !pause_filter_thresh)
                kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);

        if (enable_apicv) {
                int ret = avic_vm_init(kvm);
                if (ret)
                        return ret;
        }

        svm_srso_vm_init();
        return 0;
}

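/*
 * Allocate the AVIC/APIC backing page via snp_safe_alloc_page() so that the
 * page also satisfies the additional placement restrictions that apply when
 * SEV-SNP is enabled.
 */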
static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
{
        struct page *page = snp_safe_alloc_page();

        if (!page)
                return NULL;

        return page_address(page);
}

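/* SVM's implementation of the vendor callbacks used by common x86 KVM code. */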
struct kvm_x86_ops svm_x86_ops __initdata = {
        .name = KBUILD_MODNAME,

        .check_processor_compatibility = svm_check_processor_compat,

        .hardware_unsetup = svm_hardware_unsetup,
        .enable_virtualization_cpu = svm_enable_virtualization_cpu,
        .disable_virtualization_cpu = svm_disable_virtualization_cpu,
        .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
        .has_emulated_msr = svm_has_emulated_msr,

        .vcpu_precreate = svm_vcpu_precreate,
        .vcpu_create = svm_vcpu_create,
        .vcpu_free = svm_vcpu_free,
        .vcpu_reset = svm_vcpu_reset,

        .vm_size = sizeof(struct kvm_svm),
        .vm_init = svm_vm_init,
        .vm_destroy = svm_vm_destroy,

        .prepare_switch_to_guest = svm_prepare_switch_to_guest,
        .vcpu_load = svm_vcpu_load,
        .vcpu_put = svm_vcpu_put,
        .vcpu_blocking = avic_vcpu_blocking,
        .vcpu_unblocking = avic_vcpu_unblocking,

        .update_exception_bitmap = svm_update_exception_bitmap,
        .get_feature_msr = svm_get_feature_msr,
        .get_msr = svm_get_msr,
        .set_msr = svm_set_msr,
        .get_segment_base = svm_get_segment_base,
        .get_segment = svm_get_segment,
        .set_segment = svm_set_segment,
        .get_cpl = svm_get_cpl,
        .get_cpl_no_cache = svm_get_cpl,
        .get_cs_db_l_bits = svm_get_cs_db_l_bits,
        .is_valid_cr0 = svm_is_valid_cr0,
        .set_cr0 = svm_set_cr0,
        .post_set_cr3 = sev_post_set_cr3,
        .is_valid_cr4 = svm_is_valid_cr4,
        .set_cr4 = svm_set_cr4,
        .set_efer = svm_set_efer,
        .get_idt = svm_get_idt,
        .set_idt = svm_set_idt,
        .get_gdt = svm_get_gdt,
        .set_gdt = svm_set_gdt,
        .set_dr7 = svm_set_dr7,
        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
        .cache_reg = svm_cache_reg,
        .get_rflags = svm_get_rflags,
        .set_rflags = svm_set_rflags,
        .get_if_flag = svm_get_if_flag,

        .flush_tlb_all = svm_flush_tlb_all,
        .flush_tlb_current = svm_flush_tlb_current,
        .flush_tlb_gva = svm_flush_tlb_gva,
        .flush_tlb_guest = svm_flush_tlb_guest,

        .vcpu_pre_run = svm_vcpu_pre_run,
        .vcpu_run = svm_vcpu_run,
        .handle_exit = svm_handle_exit,
        .skip_emulated_instruction = svm_skip_emulated_instruction,
        .update_emulated_instruction = NULL,
        .set_interrupt_shadow = svm_set_interrupt_shadow,
        .get_interrupt_shadow = svm_get_interrupt_shadow,
        .patch_hypercall = svm_patch_hypercall,
        .inject_irq = svm_inject_irq,
        .inject_nmi = svm_inject_nmi,
        .is_vnmi_pending = svm_is_vnmi_pending,
        .set_vnmi_pending = svm_set_vnmi_pending,
        .inject_exception = svm_inject_exception,
        .cancel_injection = svm_cancel_injection,
        .interrupt_allowed = svm_interrupt_allowed,
        .nmi_allowed = svm_nmi_allowed,
        .get_nmi_mask = svm_get_nmi_mask,
        .set_nmi_mask = svm_set_nmi_mask,
        .enable_nmi_window = svm_enable_nmi_window,
        .enable_irq_window = svm_enable_irq_window,
        .update_cr8_intercept = svm_update_cr8_intercept,

        .x2apic_icr_is_split = true,
        .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
        .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
        .apicv_post_state_restore = avic_apicv_post_state_restore,
        .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,

        .get_exit_info = svm_get_exit_info,
        .get_entry_info = svm_get_entry_info,

        .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

        .has_wbinvd_exit = svm_has_wbinvd_exit,

        .get_l2_tsc_offset = svm_get_l2_tsc_offset,
        .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
        .write_tsc_offset = svm_write_tsc_offset,
        .write_tsc_multiplier = svm_write_tsc_multiplier,

        .load_mmu_pgd = svm_load_mmu_pgd,

        .check_intercept = svm_check_intercept,
        .handle_exit_irqoff = svm_handle_exit_irqoff,

        .nested_ops = &svm_nested_ops,

        .deliver_interrupt = svm_deliver_interrupt,
        .pi_update_irte = avic_pi_update_irte,
        .setup_mce = svm_setup_mce,

#ifdef CONFIG_KVM_SMM
        .smi_allowed = svm_smi_allowed,
        .enter_smm = svm_enter_smm,
        .leave_smm = svm_leave_smm,
        .enable_smi_window = svm_enable_smi_window,
#endif

#ifdef CONFIG_KVM_AMD_SEV
        .dev_get_attr = sev_dev_get_attr,
        .mem_enc_ioctl = sev_mem_enc_ioctl,
        .mem_enc_register_region = sev_mem_enc_register_region,
        .mem_enc_unregister_region = sev_mem_enc_unregister_region,
        .guest_memory_reclaimed = sev_guest_memory_reclaimed,

        .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
        .vm_move_enc_context_from = sev_vm_move_enc_context_from,
#endif
        .check_emulate_instruction = svm_check_emulate_instruction,

        .apic_init_signal_blocked = svm_apic_init_signal_blocked,

        .recalc_intercepts = svm_recalc_intercepts,
        .complete_emulated_msr = svm_complete_emulated_msr,

        .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
        .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
        .alloc_apic_backing_page = svm_alloc_apic_backing_page,

        .gmem_prepare = sev_gmem_prepare,
        .gmem_invalidate = sev_gmem_invalidate,
        .gmem_max_mapping_level = sev_gmem_max_mapping_level,
};

/*
 * The default MMIO mask is a single bit (excluding the present bit),
 * which could conflict with the memory encryption bit. Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
        unsigned int enc_bit, mask_bit;
        u64 msr, mask;

        /* If there is no memory encryption support, use existing mask */
        if (cpuid_eax(0x80000000) < 0x8000001f)
                return;

        /* If memory encryption is not enabled, use existing mask */
        rdmsrq(MSR_AMD64_SYSCFG, msr);
        if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
                return;

        enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
        mask_bit = boot_cpu_data.x86_phys_bits;

        /* Increment the mask bit if it is the same as the encryption bit */
        if (enc_bit == mask_bit)
                mask_bit++;

        /*
         * If the mask bit location is below 52, then some bits above the
         * physical addressing limit will always be reserved, so use the
         * rsvd_bits() function to generate the mask. This mask, along with
         * the present bit, will be used to generate a page fault with
         * PFER.RSV = 1.
         *
         * If the mask bit location is 52 (or above), then clear the mask.
         */
        mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

        kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}

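/*
 * Compute KVM's supported CPU capabilities for SVM, advertising nested SVM,
 * PMU, and SEV features only when the corresponding hardware support and
 * module parameters allow it.
 */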
static __init void svm_set_cpu_caps(void)
{
        kvm_initialize_cpu_caps();

        kvm_caps.supported_perf_cap = 0;

        kvm_cpu_cap_clear(X86_FEATURE_IBT);

        /* CPUID 0x80000001 and 0x8000000A (SVM features) */
        if (nested) {
                kvm_cpu_cap_set(X86_FEATURE_SVM);
                kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);

                /*
                 * KVM currently flushes TLBs on *every* nested SVM transition,
                 * and so for all intents and purposes KVM supports flushing by
                 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
                 */
                kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);

                if (nrips)
                        kvm_cpu_cap_set(X86_FEATURE_NRIPS);

                if (npt_enabled)
                        kvm_cpu_cap_set(X86_FEATURE_NPT);

                if (tsc_scaling)
                        kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);

                if (vls)
                        kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
                if (lbrv)
                        kvm_cpu_cap_set(X86_FEATURE_LBRV);

                if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
                        kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);

                if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
                        kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

                if (vgif)
                        kvm_cpu_cap_set(X86_FEATURE_VGIF);

                if (vnmi)
                        kvm_cpu_cap_set(X86_FEATURE_VNMI);

                /* Nested VM can receive #VMEXIT instead of triggering #GP */
                kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
        }

        if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
                kvm_caps.has_bus_lock_exit = true;

        /* CPUID 0x80000008 */
        if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
            boot_cpu_has(X86_FEATURE_AMD_SSBD))
                kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

        if (enable_pmu) {
                /*
                 * Enumerate support for PERFCTR_CORE if and only if KVM has
                 * access to enough counters to virtualize "core" support,
                 * otherwise limit vPMU support to the legacy number of counters.
                 */
                if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
                        kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
                                                          kvm_pmu_cap.num_counters_gp);
                else
                        kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);

                if (kvm_pmu_cap.version != 2 ||
                    !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
                        kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
        }

        /* CPUID 0x8000001F (SME/SEV features) */
        sev_set_cpu_caps();

        /*
         * Clear capabilities that are automatically configured by common code,
         * but that require explicit SVM support (that isn't yet implemented).
         */
        kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
        kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM);

        kvm_setup_xss_caps();
        kvm_finalize_cpu_caps();
}

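/*
 * One-time module setup: validate and latch hardware features, configure the
 * MMU and supported capabilities, allocate the I/O permissions map, and
 * initialize per-CPU data.
 */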
static __init int svm_hardware_setup(void)
{
        void *iopm_va;
        int cpu, r;

        /*
         * NX is required for shadow paging and for NPT if the NX huge pages
         * mitigation is enabled.
         */
        if (!boot_cpu_has(X86_FEATURE_NX)) {
                pr_err_ratelimited("NX (Execute Disable) not supported\n");
                return -EOPNOTSUPP;
        }
        kvm_enable_efer_bits(EFER_NX);

        kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
                                     XFEATURE_MASK_BNDCSR);

        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
                kvm_enable_efer_bits(EFER_FFXSR);

        if (tsc_scaling) {
                if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
                        tsc_scaling = false;
                } else {
                        pr_info("TSC scaling supported\n");
                        kvm_caps.has_tsc_control = true;
                }
        }
        kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
        kvm_caps.tsc_scaling_ratio_frac_bits = 32;

        tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

        if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
                kvm_enable_efer_bits(EFER_AUTOIBRS);

        /* Check for pause filtering support */
        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
                pause_filter_count = 0;
                pause_filter_thresh = 0;
        } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
                pause_filter_thresh = 0;
        }

        if (nested) {
                pr_info("Nested Virtualization enabled\n");
                kvm_enable_efer_bits(EFER_SVME);
                if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
                        kvm_enable_efer_bits(EFER_LMSLE);

                r = nested_svm_init_msrpm_merge_offsets();
                if (r)
                        return r;
        }

        /*
         * KVM's MMU doesn't support using 2-level paging for itself, and thus
         * NPT isn't supported if the host is using 2-level paging since host
         * CR4 is unchanged on VMRUN.
         */
        if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
                npt_enabled = false;

        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;

        /* Force VM NPT level equal to the host's paging level */
        kvm_configure_mmu(npt_enabled, get_npt_level(),
                          get_npt_level(), PG_LEVEL_1G);
        pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));

        /*
         * On AMD processors, the PTE's accessed bit appears to be set by the
         * CPU hardware before the #NPF VM-Exit.  This is not the expected
         * behaviour and KVM's tests fail because of it.  Work around the
         * issue by disabling support for GUEST_MAXPHYADDR < HOST_MAXPHYADDR
         * when NPT is enabled; userspace can query the
         * KVM_CAP_SMALLER_MAXPHYADDR extension to find out whether the
         * support is available and decide how to handle it.  If future AMD
         * CPU models change the behaviour described above, this variable can
         * be changed accordingly.
         */
        allow_smaller_maxphyaddr = !npt_enabled;

        /* Setup shadow_me_value and shadow_me_mask */
        kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);

        svm_adjust_mmio_mask();

        nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);

        if (lbrv) {
                if (!boot_cpu_has(X86_FEATURE_LBRV))
                        lbrv = false;
                else
                        pr_info("LBR virtualization supported\n");
        }

        iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
        if (!iopm_va)
                return -ENOMEM;

        iopm_base = __sme_set(__pa(iopm_va));

        /*
         * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
         * may be modified by svm_adjust_mmio_mask()), as well as nrips.
         */
        sev_hardware_setup();

        svm_hv_hardware_setup();

        enable_apicv = avic_hardware_setup();
        if (!enable_apicv) {
                enable_ipiv = false;
                svm_x86_ops.vcpu_blocking = NULL;
                svm_x86_ops.vcpu_unblocking = NULL;
                svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
        }

        if (vls) {
                if (!npt_enabled ||
                    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
                    !IS_ENABLED(CONFIG_X86_64)) {
                        vls = false;
                } else {
                        pr_info("Virtual VMLOAD VMSAVE supported\n");
                }
        }

        if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
                svm_gp_erratum_intercept = false;

        if (vgif) {
                if (!boot_cpu_has(X86_FEATURE_VGIF))
                        vgif = false;
                else
                        pr_info("Virtual GIF supported\n");
        }

        vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
        if (vnmi)
                pr_info("Virtual NMI enabled\n");

        if (!vnmi) {
                svm_x86_ops.is_vnmi_pending = NULL;
                svm_x86_ops.set_vnmi_pending = NULL;
        }

        if (!enable_pmu)
                pr_info("PMU virtualization is disabled\n");

        svm_set_cpu_caps();

        kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;

        for_each_possible_cpu(cpu) {
                r = svm_cpu_init(cpu);
                if (r)
                        goto err;
        }

        return 0;

err:
        svm_hardware_unsetup();
        return r;
}
static struct kvm_x86_init_ops svm_init_ops __initdata = {
        .hardware_setup = svm_hardware_setup,

        .runtime_ops = &svm_x86_ops,
        .pmu_ops = &amd_pmu_ops,
};

static void __svm_exit(void)
{
        kvm_x86_vendor_exit();
}

static int __init svm_init(void)
{
        int r;

        KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);

        __unused_size_checks();

        if (!kvm_is_svm_supported())
                return -EOPNOTSUPP;

        r = kvm_x86_vendor_init(&svm_init_ops);
        if (r)
                return r;

        /*
         * Common KVM initialization _must_ come last; after this point,
         * /dev/kvm is exposed to userspace!
         */
        r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
                     THIS_MODULE);
        if (r)
                goto err_kvm_init;

        return 0;

err_kvm_init:
        __svm_exit();
        return r;
}

static void __exit svm_exit(void)
{
        kvm_exit();
        __svm_exit();
}

module_init(svm_init)
module_exit(svm_exit)