arch/x86/kvm/vmx/vmx.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fred.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>

#include <trace/events/ipi.h>

#include "capabilities.h"
#include "common.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
#include "posted_intr.h"

#include "mmu/spte.h"

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
        X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, 0444);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, 0444);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
                        enable_unrestricted_guest, bool, 0444);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, 0444);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);

module_param(enable_apicv, bool, 0444);
module_param(enable_ipiv, bool, 0444);

module_param(enable_device_posted_irqs, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be hypervisors for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC          1
#define MSR_BITMAP_MODE_X2APIC_APICV    2

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

module_param(enable_mediated_pmu, bool, 0444);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON                            \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
        RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
        RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
        RTIT_STATUS_BYTECNT))

/*
 * These two parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop.  Also indicates whether PLE is
 *             enabled.  According to testing, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop.  Tests indicate that most spinlocks are held
 *             for less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC;
 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
#endif

struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;

#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
        const char *option;
        bool for_parse;
} vmentry_l1d_param[] = {
        [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
        [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
        [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
        [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
        [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
        [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

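/*
 * Resolve the requested L1TF mitigation mode, allocate the flush pages if a
 * software-based L1D flush may be needed, and flip the static branches that
 * gate vmx_l1d_flush() accordingly.
 */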
static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
        struct page *page;
        unsigned int i;

        if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
                l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
                return 0;
        }

        if (!enable_ept) {
                l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
                return 0;
        }

        if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
                l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
                return 0;
        }

        /* If set to auto use the default l1tf mitigation method */
        if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
                switch (l1tf_mitigation) {
                case L1TF_MITIGATION_OFF:
                        l1tf = VMENTER_L1D_FLUSH_NEVER;
                        break;
                case L1TF_MITIGATION_AUTO:
                case L1TF_MITIGATION_FLUSH_NOWARN:
                case L1TF_MITIGATION_FLUSH:
                case L1TF_MITIGATION_FLUSH_NOSMT:
                        l1tf = VMENTER_L1D_FLUSH_COND;
                        break;
                case L1TF_MITIGATION_FULL:
                case L1TF_MITIGATION_FULL_FORCE:
                        l1tf = VMENTER_L1D_FLUSH_ALWAYS;
                        break;
                }
        } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
                l1tf = VMENTER_L1D_FLUSH_ALWAYS;
        }

        if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
            !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
                /*
                 * This allocation for vmx_l1d_flush_pages is not tied to a VM
                 * lifetime and so should not be charged to a memcg.
                 */
                page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
                if (!page)
                        return -ENOMEM;
                vmx_l1d_flush_pages = page_address(page);

                /*
                 * Initialize each page with a different pattern in
                 * order to protect against KSM in the nested
                 * virtualization case.
                 */
                for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
                        memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
                               PAGE_SIZE);
                }
        }

        l1tf_vmx_mitigation = l1tf;

        if (l1tf != VMENTER_L1D_FLUSH_NEVER)
                static_branch_enable(&vmx_l1d_should_flush);
        else
                static_branch_disable(&vmx_l1d_should_flush);

        if (l1tf == VMENTER_L1D_FLUSH_COND)
                static_branch_enable(&vmx_l1d_flush_cond);
        else
                static_branch_disable(&vmx_l1d_flush_cond);
        return 0;
}

static int vmx_setup_l1d_flush(void)
{
        /*
         * Hand in the mitigation mode that was stored by the pre-module-init
         * parameter parser.  If no parameter was given, it will contain
         * 'auto', which will be turned into the default 'cond' mitigation mode.
         */
        return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
}

static void vmx_cleanup_l1d_flush(void)
{
        if (vmx_l1d_flush_pages) {
                free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
                vmx_l1d_flush_pages = NULL;
        }
        /* Restore state so sysfs ignores VMX */
        l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}

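/*
 * Map the vmentry_l1d_flush module parameter string to its
 * enum vmx_l1d_flush_state value, or return -EINVAL if unrecognized.
 */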
static int vmentry_l1d_flush_parse(const char *s)
{
        unsigned int i;

        if (s) {
                for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
                        if (vmentry_l1d_param[i].for_parse &&
                            sysfs_streq(s, vmentry_l1d_param[i].option))
                                return i;
                }
        }
        return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
        int l1tf, ret;

        l1tf = vmentry_l1d_flush_parse(s);
        if (l1tf < 0)
                return l1tf;

        if (!boot_cpu_has(X86_BUG_L1TF))
                return 0;

        /*
         * Has vmx_init() run already? If not then this is the pre init
         * parameter parsing. In that case just store the value and let
         * vmx_init() do the proper setup after enable_ept has been
         * established.
         */
        if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
                vmentry_l1d_flush_param = l1tf;
                return 0;
        }

        mutex_lock(&vmx_l1d_flush_mutex);
        ret = __vmx_setup_l1d_flush(l1tf);
        mutex_unlock(&vmx_l1d_flush_mutex);
        return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
        if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
                return sysfs_emit(s, "???\n");

        return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 * flushing it requires reading 64 KiB because the replacement algorithm is
 * not exactly LRU.  This could be sized at runtime via topology information,
 * but as all relevant affected CPUs have a 32 KiB L1D cache there is no
 * point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
        int size = PAGE_SIZE << L1D_CACHE_ORDER;

        if (!static_branch_unlikely(&vmx_l1d_should_flush))
                return;

        /*
         * This code is only executed when the flush mode is 'cond' or
         * 'always'
         */
        if (static_branch_likely(&vmx_l1d_flush_cond)) {
                /*
                 * Clear the per-cpu flush bit, it gets set again if the vCPU
                 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
                 * exits to userspace, or if KVM reaches one of the unsafe
                 * VMEXIT handlers, e.g. if KVM calls into the emulator,
                 * or from the interrupt handlers.
                 */
                if (!kvm_get_cpu_l1tf_flush_l1d())
                        return;
                kvm_clear_cpu_l1tf_flush_l1d();
        }

        vcpu->stat.l1d_flush++;

        if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
                native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
                return;
        }

        asm volatile(
                /* First ensure the pages are in the TLB */
                "xorl   %%eax, %%eax\n"
                ".Lpopulate_tlb:\n\t"
                "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
                "addl   $4096, %%eax\n\t"
                "cmpl   %%eax, %[size]\n\t"
                "jne    .Lpopulate_tlb\n\t"
                "xorl   %%eax, %%eax\n\t"
                "cpuid\n\t"
                /* Now fill the cache */
                "xorl   %%eax, %%eax\n"
                ".Lfill_cache:\n"
                "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
                "addl   $64, %%eax\n\t"
                "cmpl   %%eax, %[size]\n\t"
                "jne    .Lfill_cache\n\t"
                "lfence\n"
                :: [flush_pages] "r" (vmx_l1d_flush_pages),
                    [size] "r" (size)
                : "eax", "ebx", "ecx", "edx");
}

#else /* CONFIG_CPU_MITIGATIONS */
static int vmx_setup_l1d_flush(void)
{
        l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
        return 0;
}
static void vmx_cleanup_l1d_flush(void)
{
        l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
}
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
        pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
        return 0;
}
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
        return sysfs_emit(s, "never\n");
}
#endif

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
        .set = vmentry_l1d_flush_set,
        .get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
        u64 msr;

        if (!vmx->disable_fb_clear)
                return;

        msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
        msr |= FB_CLEAR_DIS;
        native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
        /* Cache the MSR value to avoid reading it later */
        vmx->msr_ia32_mcu_opt_ctrl = msr;
}

static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
        if (!vmx->disable_fb_clear)
                return;

        vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
        native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}

static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
        /*
         * Disable VERW's behavior of clearing CPU buffers for the guest if the
         * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
         * the mitigation. Disabling the clearing behavior provides a
         * performance boost for guests that aren't aware that manually clearing
         * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
         * and VM-Exit.
         */
        vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
                                (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
                                !boot_cpu_has_bug(X86_BUG_MDS) &&
                                !boot_cpu_has_bug(X86_BUG_TAA);

        /*
         * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
         * at VMEntry. Skip the MSR read/write when a guest has no use case to
         * execute VERW.
         */
        if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
           ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
            (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
            (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
            (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
            (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
                vmx->disable_fb_clear = false;
}

static u32 vmx_segment_access_rights(struct kvm_segment *var);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)         \
do {                                    \
        WARN_ONCE(1, fmt);              \
        pr_warn_ratelimited(fmt);       \
} while (0)

noinline void vmread_error(unsigned long field)
{
        vmx_insn_failed("vmread failed: field=%lx\n", field);
}

#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
{
        if (fault) {
                kvm_spurious_fault();
        } else {
                instrumentation_begin();
                vmread_error(field);
                instrumentation_end();
        }
}
#endif

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
        vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
                        field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
        vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
                        vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
        vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
                        vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
        vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
                        ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp)
{
        vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of VMCSs loaded on that CPU.  This is
 * needed when a CPU is brought down and we need to VMCLEAR all VMCSs loaded
 * on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability vmx_capability __ro_after_init;

#define VMX_SEGMENT_FIELD(seg)                                  \
        [VCPU_SREG_##seg] = {                                   \
                .selector = GUEST_##seg##_SELECTOR,             \
                .base = GUEST_##seg##_BASE,                     \
                .limit = GUEST_##seg##_LIMIT,                   \
                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
        }

static const struct kvm_vmx_segment_field {
        unsigned selector;
        unsigned base;
        unsigned limit;
        unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
        VMX_SEGMENT_FIELD(CS),
        VMX_SEGMENT_FIELD(DS),
        VMX_SEGMENT_FIELD(ES),
        VMX_SEGMENT_FIELD(FS),
        VMX_SEGMENT_FIELD(GS),
        VMX_SEGMENT_FIELD(SS),
        VMX_SEGMENT_FIELD(TR),
        VMX_SEGMENT_FIELD(LDTR),
};


static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
        struct hv_enlightened_vmcs *evmcs;
        hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);

        if (partition_assist_page == INVALID_PAGE)
                return -ENOMEM;

        evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

        evmcs->partition_assist_page = partition_assist_page;
        evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
        evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

        return 0;
}

static __init void hv_init_evmcs(void)
{
        int cpu;

        if (!enlightened_vmcs)
                return;

        /*
         * Enlightened VMCS usage should be recommended by Hyper-V, and the
         * host needs to support eVMCS v1 or above.
         */
        if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
            (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
             KVM_EVMCS_VERSION) {

                /* Check that we have assist pages on all online CPUs */
                for_each_online_cpu(cpu) {
                        if (!hv_get_vp_assist_page(cpu)) {
                                enlightened_vmcs = false;
                                break;
                        }
                }

                if (enlightened_vmcs) {
                        pr_info("Using Hyper-V Enlightened VMCS\n");
                        static_branch_enable(&__kvm_is_using_evmcs);
                }

                if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
                        vt_x86_ops.enable_l2_tlb_flush
                                = hv_enable_l2_tlb_flush;
        } else {
                enlightened_vmcs = false;
        }
}

static void hv_reset_evmcs(void)
{
        struct hv_vp_assist_page *vp_ap;

        if (!kvm_is_using_evmcs())
                return;

        /*
         * KVM should enable eVMCS if and only if all CPUs have a VP assist
         * page, and should reject CPU onlining if eVMCS is enabled but the
         * CPU doesn't have a VP assist page allocated.
         */
        vp_ap = hv_get_vp_assist_page(smp_processor_id());
        if (WARN_ON_ONCE(!vp_ap))
                return;

        /*
         * Reset everything to support using non-enlightened VMCS access later
         * (e.g. when we reload the module with enlightened_vmcs=0)
         */
        vp_ap->nested_control.features.directhypercall = 0;
        vp_ap->current_nested_vmcs = 0;
        vp_ap->enlighten_vmentry = 0;
}

#else /* IS_ENABLED(CONFIG_HYPERV) */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment format: document - errata name - stepping - processor name.
 * Taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
 /* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
        u32 eax = cpuid_eax(0x00000001), i;

        /* Clear the reserved bits */
        eax &= ~(0x3U << 14 | 0xfU << 28);
        for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
                if (eax == vmx_preemption_cpu_tfms[i])
                        return true;

        return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
        return flexpriority_enabled && lapic_in_kernel(vcpu);
}

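/*
 * Find the vCPU's user-return MSR entry for @msr, or NULL if the MSR isn't
 * managed via the user-return framework.
 */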
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
        int i;

        i = kvm_find_user_return_msr(msr);
        if (i >= 0)
                return &vmx->guest_uret_msrs[i];
        return NULL;
}

static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
                                  struct vmx_uret_msr *msr, u64 data)
{
        unsigned int slot = msr - vmx->guest_uret_msrs;
        int ret = 0;

        if (msr->load_into_hardware) {
                preempt_disable();
                ret = kvm_set_user_return_msr(slot, data, msr->mask);
                preempt_enable();
        }
        if (!ret)
                msr->data = data;
        return ret;
}

/*
 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
 *
 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 * atomically track post-VMXON state, e.g. this may be called in NMI context.
 * Eat all faults, as all other VMXOFF faults are mode related, i.e.
 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
 * magically in RM, VM86, compat mode, or at CPL>0.
 */
static int kvm_cpu_vmxoff(void)
{
        asm goto("1: vmxoff\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
                          ::: "cc", "memory" : fault);

        cr4_clear_bits(X86_CR4_VMXE);
        return 0;

fault:
        cr4_clear_bits(X86_CR4_VMXE);
        return -EIO;
}

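/*
 * Emergency-disable VMX on this CPU, e.g. on reboot or kexec: VMCLEAR all
 * loaded VMCSs and execute VMXOFF, eating any faults.
 */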
void vmx_emergency_disable_virtualization_cpu(void)
{
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;

        kvm_rebooting = true;

        /*
         * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
         * set in task context.  If this races with VMX being disabled by an
         * NMI, VMCLEAR and VMXOFF may #UD, but KVM will eat those faults
         * because kvm_rebooting is set.
         */
        if (!(__read_cr4() & X86_CR4_VMXE))
                return;

        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link) {
                vmcs_clear(v->vmcs);
                if (v->shadow_vmcs)
                        vmcs_clear(v->shadow_vmcs);
        }

        kvm_cpu_vmxoff();
}

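/*
 * VMCLEAR the VMCS and remove it from the percpu list of loaded VMCSs.
 * Runs on the CPU where the VMCS was last loaded, via IPI from
 * loaded_vmcs_clear().
 */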
static void __loaded_vmcs_clear(void *arg)
{
        struct loaded_vmcs *loaded_vmcs = arg;
        int cpu = raw_smp_processor_id();

        if (loaded_vmcs->cpu != cpu)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;

        vmcs_clear(loaded_vmcs->vmcs);
        if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
                vmcs_clear(loaded_vmcs->shadow_vmcs);

        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

        /*
         * Ensure all writes to loaded_vmcs, including deleting it from its
         * current percpu list, complete before setting loaded_vmcs->cpu to
         * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
         * and add loaded_vmcs to its percpu list before it's deleted from this
         * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
         */
        smp_wmb();

        loaded_vmcs->cpu = -1;
        loaded_vmcs->launched = 0;
}

static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
        int cpu = loaded_vmcs->cpu;

        if (cpu != -1)
                smp_call_function_single(cpu,
                         __loaded_vmcs_clear, loaded_vmcs, 1);
}

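/*
 * Test-and-set the valid bit for a cached segment field, resetting the
 * entire cache if segment state hasn't yet been marked available.  Returns
 * true if the field was already cached.
 */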
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
                                       unsigned field)
{
        bool ret;
        u32 mask = 1 << (seg * SEG_FIELD_NR + field);

        if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
                kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
                vmx->segment_cache.bitmask = 0;
        }
        ret = vmx->segment_cache.bitmask & mask;
        vmx->segment_cache.bitmask |= mask;
        return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
        u16 *p = &vmx->segment_cache.seg[seg].selector;

        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
                *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
        return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
        ulong *p = &vmx->segment_cache.seg[seg].base;

        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
                *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
        return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
        u32 *p = &vmx->segment_cache.seg[seg].limit;

        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
                *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
        return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
        u32 *p = &vmx->segment_cache.seg[seg].ar;

        if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
                *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
        return *p;
}

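/*
 * Recompute the guest's exception bitmap, merging the exceptions KVM always
 * intercepts with guest_debug, real-mode and nested (vmcs12) requirements.
 */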
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
        u32 eb;

        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
             (1u << DB_VECTOR) | (1u << AC_VECTOR);
        /*
         * #VE isn't used for VMX.  To test against unexpected changes
         * related to #VE for VMX, intercept unexpected #VE and warn on it.
         */
        if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
                eb |= 1u << VE_VECTOR;
        /*
         * Guest access to VMware backdoor ports could legitimately
         * trigger #GP because of TSS I/O permission bitmap.
         * We intercept those #GP and allow access to them anyway
         * as VMware does.
         */
        if (enable_vmware_backdoor)
                eb |= (1u << GP_VECTOR);
        if ((vcpu->guest_debug &
             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
                eb |= 1u << BP_VECTOR;
        if (to_vmx(vcpu)->rmode.vm86_active)
                eb = ~0;
        if (!vmx_need_pf_intercept(vcpu))
                eb &= ~(1u << PF_VECTOR);

        /*
         * When we are running a nested L2 guest and L1 specified for it a
         * certain exception bitmap, we must trap the same exceptions and pass
         * them to L1. When running L2, we will only handle the exceptions
         * specified above if L1 did not want them.
         */
        if (is_guest_mode(vcpu))
                eb |= get_vmcs12(vcpu)->exception_bitmap;
        else {
                int mask = 0, match = 0;

                if (enable_ept && (eb & (1u << PF_VECTOR))) {
                        /*
                         * If EPT is enabled, #PF is currently only intercepted
                         * if MAXPHYADDR is smaller on the guest than on the
                         * host.  In that case we only care about present,
                         * non-reserved faults.  For vmcs02, however, PFEC_MASK
                         * and PFEC_MATCH are set in prepare_vmcs02_rare.
                         */
                        mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
                        match = PFERR_PRESENT_MASK;
                }
                vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
                vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
        }

        /*
         * Disabling xfd interception indicates that dynamic xfeatures
         * might be used in the guest. Always trap #NM in this case
         * to save guest xfd_err timely.
         */
        if (vcpu->arch.xfd_no_write_intercept)
                eb |= (1u << NM_VECTOR);

        vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check whether writes to the MSR are intercepted in the currently loaded
 * MSR bitmap.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
        if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
                return true;

        return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}

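/*
 * Compute the VMX_RUN_* flags consumed by the VM-Enter assembly code, e.g.
 * whether to VMLAUNCH or VMRESUME and whether SPEC_CTRL must be saved on
 * VM-Exit.
 */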
unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
{
        unsigned int flags = 0;

        if (vmx->loaded_vmcs->launched)
                flags |= VMX_RUN_VMRESUME;

        /*
         * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
         * to change it directly without causing a vmexit.  In that case read
         * it after vmexit and store it in vmx->spec_ctrl.
         */
        if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
                flags |= VMX_RUN_SAVE_SPEC_CTRL;

        if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
            kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
                flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;

        return flags;
}

static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                unsigned long entry, unsigned long exit)
{
        vm_entry_controls_clearbit(vmx, entry);
        vm_exit_controls_clearbit(vmx, exit);
}

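/* Find the slot of @msr in an autoload/autostore list, or -ENOENT. */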
static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
        unsigned int i;

        for (i = 0; i < m->nr; ++i) {
                if (m->val[i].index == msr)
                        return i;
        }
        return -ENOENT;
}

static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr,
                                unsigned long vmcs_count_field)
{
        int i;

        i = vmx_find_loadstore_msr_slot(m, msr);
        if (i < 0)
                return;

        --m->nr;
        m->val[i] = m->val[m->nr];
        vmcs_write32(vmcs_count_field, m->nr);
}

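/*
 * Stop atomically switching @msr on VM-Entry/VM-Exit, using the dedicated
 * VMCS controls for EFER and PERF_GLOBAL_CTRL when they are supported.
 */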
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
        struct msr_autoload *m = &vmx->msr_autoload;

        switch (msr) {
        case MSR_EFER:
                if (cpu_has_load_ia32_efer()) {
                        clear_atomic_switch_msr_special(vmx,
                                        VM_ENTRY_LOAD_IA32_EFER,
                                        VM_EXIT_LOAD_IA32_EFER);
                        return;
                }
                break;
        case MSR_CORE_PERF_GLOBAL_CTRL:
                if (cpu_has_load_perf_global_ctrl()) {
                        clear_atomic_switch_msr_special(vmx,
                                        VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
                                        VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
                        return;
                }
                break;
        }

        vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT);
        vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT);
}

static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                unsigned long entry, unsigned long exit,
                unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
                u64 guest_val, u64 host_val)
{
        vmcs_write64(guest_val_vmcs, guest_val);
        if (host_val_vmcs != HOST_IA32_EFER)
                vmcs_write64(host_val_vmcs, host_val);
        vm_entry_controls_setbit(vmx, entry);
        vm_exit_controls_setbit(vmx, exit);
}

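/*
 * Add @msr with @value to an autoload/autostore list, reusing an existing
 * slot if the MSR is already present, else appending a new slot and bumping
 * the VMCS count field.
 */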
static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value,
                             unsigned long vmcs_count_field, struct kvm *kvm)
{
        int i;

        i = vmx_find_loadstore_msr_slot(m, msr);
        if (i < 0) {
                if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm))
                        return;

                i = m->nr++;
                m->val[i].index = msr;
                vmcs_write32(vmcs_count_field, m->nr);
        }
        m->val[i].value = value;
}

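/*
 * Atomically switch @msr to @guest_val on VM-Entry and back to @host_val on
 * VM-Exit, preferring dedicated VMCS controls over the autoload lists.
 */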
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                                  u64 guest_val, u64 host_val)
{
        struct msr_autoload *m = &vmx->msr_autoload;
        struct kvm *kvm = vmx->vcpu.kvm;

        switch (msr) {
        case MSR_EFER:
                if (cpu_has_load_ia32_efer()) {
                        add_atomic_switch_msr_special(vmx,
                                        VM_ENTRY_LOAD_IA32_EFER,
                                        VM_EXIT_LOAD_IA32_EFER,
                                        GUEST_IA32_EFER,
                                        HOST_IA32_EFER,
                                        guest_val, host_val);
                        return;
                }
                break;
        case MSR_CORE_PERF_GLOBAL_CTRL:
                if (cpu_has_load_perf_global_ctrl()) {
                        add_atomic_switch_msr_special(vmx,
                                        VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
                                        VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
                                        GUEST_IA32_PERF_GLOBAL_CTRL,
                                        HOST_IA32_PERF_GLOBAL_CTRL,
                                        guest_val, host_val);
                        return;
                }
                break;
        case MSR_IA32_PEBS_ENABLE:
                /*
                 * PEBS needs a quiescent period after being disabled (to
                 * write a record).  Disabling PEBS through VMX MSR swapping
                 * doesn't provide that period, so a CPU could write the
                 * host's record into the guest's memory.
                 */
                wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
        }

        vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm);
        vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm);
}

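/*
 * Decide how EFER is switched across VM transitions: via VMCS controls or
 * the MSR autoload lists when possible, otherwise via the user-return MSR
 * framework.  Returns true iff the user-return path is used.
 */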
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
        u64 guest_efer = vmx->vcpu.arch.efer;
        u64 ignore_bits = 0;
        int i;

        /* Shadow paging assumes NX to be available.  */
        if (!enable_ept)
                guest_efer |= EFER_NX;

        /*
         * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
        ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
#endif

        /*
         * On EPT, we can't emulate NX, so we must switch EFER atomically.
         * On CPUs that support "load IA32_EFER", always switch EFER
         * atomically, since it's faster than switching it manually.
         */
        if (cpu_has_load_ia32_efer() ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                if (guest_efer != kvm_host.efer)
                        add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer);
                else
                        clear_atomic_switch_msr(vmx, MSR_EFER);
                return false;
        }

        i = kvm_find_user_return_msr(MSR_EFER);
        if (i < 0)
                return false;

        clear_atomic_switch_msr(vmx, MSR_EFER);

        guest_efer &= ~ignore_bits;
        guest_efer |= kvm_host.efer & ignore_bits;

        vmx->guest_uret_msrs[i].data = guest_efer;
        vmx->guest_uret_msrs[i].mask = ~ignore_bits;

        return true;
}

static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
{
        vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT,
                         vmx->vcpu.kvm);
}

static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr)
{
        vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT);
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
        struct desc_struct *table;
        unsigned long v;

        if (!(selector & ~SEGMENT_RPL_MASK))
                return 0;

        table = get_current_gdt_ro();

        if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
                u16 ldt_selector = kvm_read_ldt();

                if (!(ldt_selector & ~SEGMENT_RPL_MASK))
                        return 0;

                table = (struct desc_struct *)segment_base(ldt_selector);
        }
        v = get_desc_base(&table[selector >> 3]);
        return v;
}
#endif

static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
        return vmx_pt_mode_is_host_guest() &&
               !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
        /* The base must be 128-byte aligned and a legal physical address. */
        return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
        u32 i;

        wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
        wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
        wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
        wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
        for (i = 0; i < addr_range; i++) {
                wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
                wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
        }
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
        u32 i;

        rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
        rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
        rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
        rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
        for (i = 0; i < addr_range; i++) {
                rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
                rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
        }
}

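/*
 * Switch the Intel PT MSRs to the guest's values ahead of VM-Entry when in
 * host-guest mode; tracing is stopped while the MSRs are swapped.
 */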
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
        if (vmx_pt_mode_is_system())
                return;

        /*
         * GUEST_IA32_RTIT_CTL is already set in the VMCS.
         * Save host state before VM entry.
         */
        rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
                wrmsrq(MSR_IA32_RTIT_CTL, 0);
                pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
                pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
        }
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
        if (vmx_pt_mode_is_system())
                return;

        if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
                pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
                pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
        }

        /*
         * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
         * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
         */
        if (vmx->pt_desc.host.ctl)
                wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

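/*
 * Update the host FS/GS selectors and bases in the VMCS, skipping the
 * VMWRITEs when the cached values already match.
 */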
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
                        unsigned long fs_base, unsigned long gs_base)
{
        if (unlikely(fs_sel != host->fs_sel)) {
                if (!(fs_sel & 7))
                        vmcs_write16(HOST_FS_SELECTOR, fs_sel);
                else
                        vmcs_write16(HOST_FS_SELECTOR, 0);
                host->fs_sel = fs_sel;
        }
        if (unlikely(gs_sel != host->gs_sel)) {
                if (!(gs_sel & 7))
                        vmcs_write16(HOST_GS_SELECTOR, gs_sel);
                else
                        vmcs_write16(HOST_GS_SELECTOR, 0);
                host->gs_sel = gs_sel;
        }
        if (unlikely(fs_base != host->fs_base)) {
                vmcs_writel(HOST_FS_BASE, fs_base);
                host->fs_base = fs_base;
        }
        if (unlikely(gs_base != host->gs_base)) {
                vmcs_writel(HOST_GS_BASE, gs_base);
                host->gs_base = gs_base;
        }
}

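/*
 * Save host segment and base registers and load the guest's user-return
 * MSRs ahead of VM-Entry.  Host state is restored lazily by
 * vmx_prepare_switch_to_host().
 */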
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vcpu_vt *vt = to_vt(vcpu);
        struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
        int cpu = raw_smp_processor_id();
#endif
        unsigned long fs_base, gs_base;
        u16 fs_sel, gs_sel;
        int i;

        /*
         * Note that guest MSRs to be saved/restored can also be changed
         * when guest state is loaded. This happens when guest transitions
         * to/from long-mode by setting MSR_EFER.LMA.
         */
        if (!vmx->guest_uret_msrs_loaded) {
                vmx->guest_uret_msrs_loaded = true;
                for (i = 0; i < kvm_nr_uret_msrs; ++i) {
                        if (!vmx->guest_uret_msrs[i].load_into_hardware)
                                continue;

                        kvm_set_user_return_msr(i,
                                                vmx->guest_uret_msrs[i].data,
                                                vmx->guest_uret_msrs[i].mask);
                }
        }

        if (vmx->nested.need_vmcs12_to_shadow_sync)
                nested_sync_vmcs12_to_shadow(vcpu);

        if (vt->guest_state_loaded)
                return;

        host_state = &vmx->loaded_vmcs->host_state;

        /*
         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
         * allow segment selectors with cpl > 0 or ti == 1.
         */
        host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
        savesegment(ds, host_state->ds_sel);
        savesegment(es, host_state->es_sel);

        gs_base = cpu_kernelmode_gs_base(cpu);
        if (likely(is_64bit_mm(current->mm))) {
                current_save_fsgs();
                fs_sel = current->thread.fsindex;
                gs_sel = current->thread.gsindex;
                fs_base = current->thread.fsbase;
                vt->msr_host_kernel_gs_base = current->thread.gsbase;
        } else {
                savesegment(fs, fs_sel);
                savesegment(gs, gs_sel);
                fs_base = read_msr(MSR_FS_BASE);
                vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
        }

        wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
        fs_base = segment_base(fs_sel);
        gs_base = segment_base(gs_sel);
#endif

        vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
        vt->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
        struct vmcs_host_state *host_state;

        if (!vmx->vt.guest_state_loaded)
                return;

        host_state = &vmx->loaded_vmcs->host_state;

        ++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
        rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
        if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
                kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
                load_gs_index(host_state->gs_sel);
#else
                loadsegment(gs, host_state->gs_sel);
#endif
        }
        if (host_state->fs_sel & 7)
                loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
        if (unlikely(host_state->ds_sel | host_state->es_sel)) {
                loadsegment(ds, host_state->ds_sel);
                loadsegment(es, host_state->es_sel);
        }
#endif
        invalidate_tss_limit();
#ifdef CONFIG_X86_64
        wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
        load_fixmap_gdt(raw_smp_processor_id());
        vmx->vt.guest_state_loaded = false;
        vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
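/*
 * Access an MSR that is context-switched along with guest state, e.g.
 * MSR_KERNEL_GS_BASE: touch the hardware MSR only while the guest's value
 * is loaded, and service the access from the cached copy otherwise.
 */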
static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
{
        preempt_disable();
        if (vmx->vt.guest_state_loaded)
                *cache = read_msr(msr);
        preempt_enable();
        return *cache;
}

static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
                                     u64 *cache)
{
        preempt_disable();
        if (vmx->vt.guest_state_loaded)
                wrmsrns(msr, data);
        preempt_enable();
        *cache = data;
}

static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
        return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
                                       &vmx->msr_guest_kernel_gs_base);
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
        vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
                                 &vmx->msr_guest_kernel_gs_base);
}
#endif

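/*
 * Grow the vCPU's PLE window after a PAUSE-loop exit to reduce the exit
 * rate for a vCPU that is legitimately spinning.
 */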
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned int old = vmx->ple_window;

        vmx->ple_window = __grow_ple_window(old, ple_window,
                                            ple_window_grow,
                                            ple_window_max);

        if (vmx->ple_window != old) {
                vmx->ple_window_dirty = true;
                trace_kvm_ple_window_update(vcpu->vcpu_id,
                                            vmx->ple_window, old);
        }
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned int old = vmx->ple_window;

        vmx->ple_window = __shrink_ple_window(old, ple_window,
                                              ple_window_shrink,
                                              ple_window);

        if (vmx->ple_window != old) {
                vmx->ple_window_dirty = true;
                trace_kvm_ple_window_update(vcpu->vcpu_id,
                                            vmx->ple_window, old);
        }
}

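/*
 * Load the vCPU's VMCS on @cpu, migrating it off its previous pCPU if
 * necessary and refreshing the host state fields that are per-CPU.
 */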
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
        struct vmcs *prev;

        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();

                /*
                 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
                 * this cpu's percpu list, otherwise it may not yet be deleted
                 * from its previous cpu's percpu list.  Pairs with the
                 * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();

                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                local_irq_enable();
        }

        prev = per_cpu(current_vmcs, cpu);
        if (prev != vmx->loaded_vmcs->vmcs) {
                per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
                vmcs_load(vmx->loaded_vmcs->vmcs);
        }

        if (!already_loaded) {
                void *gdt = get_current_gdt_ro();

                /*
                 * Flush all EPTP/VPID contexts, the new pCPU may have stale
                 * TLB entries from its previous association with the vCPU.
                 */
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
                            (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

                if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
                        /* 22.2.3 */
                        vmcs_writel(HOST_IA32_SYSENTER_ESP,
                                    (unsigned long)(cpu_entry_stack(cpu) + 1));
                }

                vmx->loaded_vmcs->cpu = cpu;
        }
}

/*
 * Switches to the specified vcpu, until a matching vcpu_put(), but assumes
 * the vcpu mutex is already taken.
 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
                shrink_ple_window(vcpu);

        vmx_vcpu_load_vmcs(vcpu, cpu);

        vmx_vcpu_pi_load(vcpu, cpu);
}

void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
        vmx_vcpu_pi_put(vcpu);

        vmx_prepare_switch_to_host(to_vmx(vcpu));
}

static void vmx_switch_loaded_vmcs(struct kvm_vcpu *vcpu,
                                   struct loaded_vmcs *vmcs)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int cpu;

        cpu = get_cpu();
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_load_vmcs(vcpu, cpu);
        put_cpu();
}

static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!is_guest_mode(vcpu)) {
                WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
                return;
        }

        WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->nested.vmcs02);
        vmx_switch_loaded_vmcs(vcpu, &vmx->vmcs01);
}

static void vmx_put_vmcs01(struct kvm_vcpu *vcpu)
{
        if (!is_guest_mode(vcpu))
                return;

        vmx_switch_loaded_vmcs(vcpu, &to_vmx(vcpu)->nested.vmcs02);
}
DEFINE_GUARD(vmx_vmcs01, struct kvm_vcpu *,
             vmx_load_vmcs01(_T), vmx_put_vmcs01(_T))

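/*
 * Returns true if the guest's state is invalid for hardware VM-Entry and
 * must instead be handled by the emulator.
 */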
bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
        return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long rflags, save_rflags;

        if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
                kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
                rflags = vmcs_readl(GUEST_RFLAGS);
                if (vmx->rmode.vm86_active) {
                        rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
                        save_rflags = vmx->rmode.save_rflags;
                        rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
                }
                vmx->rflags = rflags;
        }
        return vmx->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long old_rflags;

        /*
         * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
         * is an unrestricted guest in order to mark L2 as needing emulation
         * if L1 runs L2 as a restricted guest.
         */
        if (is_unrestricted_guest(vcpu)) {
                kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
                vmx->rflags = rflags;
                vmcs_writel(GUEST_RFLAGS, rflags);
                return;
        }

        old_rflags = vmx_get_rflags(vcpu);
        vmx->rflags = rflags;
        if (vmx->rmode.vm86_active) {
                vmx->rmode.save_rflags = rflags;
                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
        }
        vmcs_writel(GUEST_RFLAGS, rflags);

        if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
                vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}

bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
        return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

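/*
 * Translate the VMCS interruptibility-state bits (STI and MOV-SS blocking)
 * into KVM's generic interrupt shadow flags.
 */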
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
        u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
        int ret = 0;

        if (interruptibility & GUEST_INTR_STATE_STI)
                ret |= KVM_X86_SHADOW_INT_STI;
        if (interruptibility & GUEST_INTR_STATE_MOV_SS)
                ret |= KVM_X86_SHADOW_INT_MOV_SS;

        return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
        u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
        u32 interruptibility = interruptibility_old;

        interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

        if (mask & KVM_X86_SHADOW_INT_MOV_SS)
                interruptibility |= GUEST_INTR_STATE_MOV_SS;
        else if (mask & KVM_X86_SHADOW_INT_STI)
                interruptibility |= GUEST_INTR_STATE_STI;

        if (interruptibility != interruptibility_old)
                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long value;

        /*
         * Any MSR write that attempts to change bits marked reserved will
         * cause a #GP fault.
         */
        if (data & vmx->pt_desc.ctl_bitmask)
                return 1;

        /*
         * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
         * result in a #GP unless the same write also clears TraceEn.
         */
        if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
            (data & RTIT_CTL_TRACEEN) &&
            data != vmx->pt_desc.guest.ctl)
                return 1;

        /*
         * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA and
         * FabricEn causes a #GP if single-range output is unsupported, i.e.
         * if CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
         */
        if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
                !(data & RTIT_CTL_FABRIC_EN) &&
                !intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_single_range_output))
                return 1;

        /*
         * Check the MTCFreq, CycThresh and PSBFreq encodings; any MSR write
         * that uses an encoding marked reserved will cause a #GP fault.
         */
        value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
                        !test_bit((data & RTIT_CTL_MTC_RANGE) >>
                        RTIT_CTL_MTC_RANGE_OFFSET, &value))
                return 1;
        value = intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cycle_thresholds);
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
                        !test_bit((data & RTIT_CTL_CYC_THRESH) >>
                        RTIT_CTL_CYC_THRESH_OFFSET, &value))
                return 1;
        value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
                        !test_bit((data & RTIT_CTL_PSB_FREQ) >>
                        RTIT_CTL_PSB_FREQ_OFFSET, &value))
                return 1;

        /*
         * An ADDRx_CFG encoding that is reserved (the address range doesn't
         * exist) or greater than 2 will cause a #GP fault.
         */
        value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
                return 1;
        value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
                return 1;
        value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
                return 1;
        value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
        if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
                return 1;

        return 0;
}

int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
                                  void *insn, int insn_len)
{
        /*
         * Emulation of instructions in SGX enclaves is impossible as RIP does
         * not point at the failing instruction, and even if it did, the code
         * stream is inaccessible.  Inject #UD instead of exiting to userspace
         * so that guest userspace can't DoS the guest simply by triggering
         * emulation (enclaves are CPL3 only).
         */
        if (vmx_get_exit_reason(vcpu).enclave_mode) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return X86EMUL_PROPAGATE_FAULT;
        }

        /* Check that emulation is possible during event vectoring */
        if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
            !kvm_can_emulate_event_vectoring(emul_type))
                return X86EMUL_UNHANDLEABLE_VECTORING;

        return X86EMUL_CONTINUE;
}

static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
        unsigned long rip, orig_rip;
        u32 instr_len;

        /*
         * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
         * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
         * set when EPT misconfig occurs.  In practice, real hardware updates
         * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
         * (namely Hyper-V) don't set it precisely because the behavior is
         * undefined, i.e. relying on it could advance RIP by some random
         * value.
         */
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
            exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
                instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

                /*
                 * Emulating an enclave's instructions isn't supported as KVM
                 * cannot access the enclave's memory or its true RIP, e.g. the
                 * vmcs.GUEST_RIP points at the exit point of the enclave, not
                 * the RIP that actually triggered the VM-Exit.  But, because
                 * most instructions that cause VM-Exit will #UD in an enclave,
                 * most instruction-based VM-Exits simply do not occur.
                 *
                 * There are a few exceptions, notably the debug instructions
                 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
                 * and generate #DB/#BP as expected, which KVM might intercept.
                 * But again, the CPU does the dirty work and saves an instr
                 * length of zero so VMMs don't shoot themselves in the foot.
                 * WARN if KVM tries to skip a non-zero length instruction on
                 * a VM-Exit from an enclave.
                 */
                if (!instr_len)
                        goto rip_updated;

                WARN_ONCE(exit_reason.enclave_mode,
                          "skipping instruction after SGX enclave VM-Exit");

                orig_rip = kvm_rip_read(vcpu);
                rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
                /*
                 * We need to mask out the high 32 bits of RIP if not in 64-bit
                 * mode, but just finding out that we are in 64-bit mode is
                 * quite expensive.  Only do it if there was a carry.
                 */
                if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
                        rip = (u32)rip;
#endif
                kvm_rip_write(vcpu, rip);
        } else {
                if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
                        return 0;
        }

rip_updated:
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);

        return 1;
}

/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.
 */
void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!is_guest_mode(vcpu))
                return;

        /*
         * Per the SDM, MTF takes priority over debug-trap exceptions besides
         * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
         * or ICEBP (in the emulator proper), and skipping of ICEBP after an
         * intercepted #DB deliberately avoids single-step #DB and MTF updates
         * as ICEBP is higher priority than both.  As instruction emulation is
         * completed at this point (i.e. KVM is at the instruction boundary),
         * any #DB exception pending delivery must be a debug-trap of lower
         * priority than MTF.  Record the pending MTF state to be delivered in
         * vmx_check_nested_events().
         */
        if (nested_cpu_has_mtf(vmcs12) &&
            (!vcpu->arch.exception.pending ||
             vcpu->arch.exception.vector == DB_VECTOR) &&
            (!vcpu->arch.exception_vmexit.pending ||
             vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
                vmx->nested.mtf_pending = true;
                kvm_make_request(KVM_REQ_EVENT, vcpu);
        } else {
                vmx->nested.mtf_pending = false;
        }
}

int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        vmx_update_emulated_instruction(vcpu);
        return skip_emulated_instruction(vcpu);
}

static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
        /*
         * Ensure that we clear the HLT state in the VMCS.  We don't need to
         * explicitly skip the instruction because if the HLT state is set,
         * then the instruction is already executing and RIP has already been
         * advanced.
         */
        if (kvm_hlt_in_guest(vcpu->kvm) &&
                        vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
                vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
        struct kvm_queued_exception *ex = &vcpu->arch.exception;
        u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        kvm_deliver_exception_payload(vcpu, ex);

        if (ex->has_error_code) {
                /*
                 * Despite the error code being architecturally defined as 32
                 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
                 * VMX don't actually support setting bits 31:16.  Hardware
                 * will (should) never provide a bogus error code, but AMD CPUs
                 * do generate error codes with bits 31:16 set, and so KVM's
                 * ABI lets userspace shove in arbitrary 32-bit values.  Drop
                 * the upper bits to avoid VM-Fail; losing information that
                 * doesn't really exist is preferable to killing the VM.
                 */
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
        }

        if (vmx->rmode.vm86_active) {
                int inc_eip = 0;
                if (kvm_exception_is_soft(ex->vector))
                        inc_eip = vcpu->arch.event_exit_inst_len;
                kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
                return;
        }

        WARN_ON_ONCE(vmx->vt.emulation_required);

        if (kvm_exception_is_soft(ex->vector)) {
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                             vmx->vcpu.arch.event_exit_inst_len);
                intr_info |= INTR_TYPE_SOFT_EXCEPTION;
        } else
                intr_info |= INTR_TYPE_HARD_EXCEPTION;

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

        vmx_clear_hlt(vcpu);
}

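/*
 * Mark whether the given user-return MSR should be loaded into hardware on
 * VM-Enter; silently does nothing if the MSR isn't in the uret MSR list.
 */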
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
                               bool load_into_hardware)
{
        struct vmx_uret_msr *uret_msr;

        uret_msr = vmx_find_uret_msr(vmx, msr);
        if (!uret_msr)
                return;

        uret_msr->load_into_hardware = load_into_hardware;
}

/*
 * Configure the user-return MSRs to automatically save, load, and restore
 * MSRs that need to be shoved into hardware when running the guest.  Note,
 * omitting an MSR here does _NOT_ mean it's not emulated, only that it will
 * not be loaded into hardware when running the guest.
 */
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
        bool load_syscall_msrs;

        /*
         * The SYSCALL MSRs are only needed on long mode guests, and only
         * when EFER.SCE is set.
         */
        load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
                            (vmx->vcpu.arch.efer & EFER_SCE);

        vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
        vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
        vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
        vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

        vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
                           guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
                           guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));

        /*
         * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
         * kernel and old userspace.  If those guests run on a tsx=off host, do
         * allow guests to use TSX_CTRL, but don't change the value in hardware
         * so that TSX always remains disabled.
         */
        vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));

        /*
         * The set of MSRs to load may have changed, reload MSRs before the
         * next VM-Enter.
         */
        vmx->guest_uret_msrs_loaded = false;
}

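/*
 * Return the TSC offset that L1 applies to L2, i.e. vmcs12's TSC_OFFSET if
 * L1 enabled TSC offsetting, else 0.
 */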
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

        if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
                return vmcs12->tsc_offset;

        return 0;
}

u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

        if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
            nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
                return vmcs12->tsc_multiplier;

        return kvm_caps.default_tsc_scaling_ratio;
}

void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}

void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
        vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}

/*
 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
 * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
 * backwards compatibility even though KVM doesn't support emulating SMX.  And
 * because userspace can set "VMX in SMX", the guest must also be allowed to
 * set it, e.g. if the MSR is left unlocked and the guest does a RMW operation.
 */
#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED                  | \
                                        FEAT_CTL_VMX_ENABLED_INSIDE_SMX  | \
                                        FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
                                        FEAT_CTL_SGX_LC_ENABLED          | \
                                        FEAT_CTL_SGX_ENABLED             | \
                                        FEAT_CTL_LMCE_ENABLED)

static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
                                                    struct msr_data *msr)
{
        uint64_t valid_bits;

        /*
         * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
         * exposed to the guest.
         */
        WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
                     ~KVM_SUPPORTED_FEATURE_CONTROL);

        if (!msr->host_initiated &&
            (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
                return false;

        if (msr->host_initiated)
                valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
        else
                valid_bits = vmx->msr_ia32_feature_control_valid_bits;

        return !(msr->data & ~valid_bits);
}

int vmx_get_feature_msr(u32 msr, u64 *data)
{
        switch (msr) {
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!nested)
                        return 1;
                return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
        default:
                return KVM_MSR_RET_UNSUPPORTED;
        }
}

/*
 * Reads an MSR value (of 'msr_info->index') into 'msr_info->data'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmx_uret_msr *msr;
        u32 index;

        switch (msr_info->index) {
#ifdef CONFIG_X86_64
        case MSR_FS_BASE:
                msr_info->data = vmcs_readl(GUEST_FS_BASE);
                break;
        case MSR_GS_BASE:
                msr_info->data = vmcs_readl(GUEST_GS_BASE);
                break;
        case MSR_KERNEL_GS_BASE:
                msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
                break;
#endif
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_info);
        case MSR_IA32_TSX_CTRL:
                if (!msr_info->host_initiated &&
                    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
                        return 1;
                goto find_uret_msr;
        case MSR_IA32_UMWAIT_CONTROL:
                if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
                        return 1;

                msr_info->data = vmx->msr_ia32_umwait_control;
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;

                msr_info->data = to_vmx(vcpu)->spec_ctrl;
                break;
        case MSR_IA32_SYSENTER_CS:
                msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
                break;
        case MSR_IA32_SYSENTER_EIP:
                msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
                break;
        case MSR_IA32_SYSENTER_ESP:
                msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                break;
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
                    (!msr_info->host_initiated &&
                     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
        case MSR_IA32_MCG_EXT_CTL:
                if (!msr_info->host_initiated &&
                    !(vmx->msr_ia32_feature_control &
                      FEAT_CTL_LMCE_ENABLED))
                        return 1;
                msr_info->data = vcpu->arch.mcg_ext_ctl;
                break;
        case MSR_IA32_FEAT_CTL:
                msr_info->data = vmx->msr_ia32_feature_control;
                break;
        case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
                if (!msr_info->host_initiated &&
                    !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
                        return 1;
                msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
                        [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
                break;
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
                        return 1;
                if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
                                    &msr_info->data))
                        return 1;
#ifdef CONFIG_KVM_HYPERV
                /*
                 * Enlightened VMCS v1 doesn't have certain VMCS fields, but
                 * instead of just ignoring the missing features, different
                 * Hyper-V versions either try to use them and fail, or do
                 * sanity checking and refuse to boot.  Filter out all
                 * unsupported features.
                 */
                if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
                        nested_evmcs_filter_control_msr(vcpu, msr_info->index,
                                                        &msr_info->data);
#endif
                break;
        case MSR_IA32_RTIT_CTL:
                if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.ctl;
                break;
        case MSR_IA32_RTIT_STATUS:
                if (!vmx_pt_mode_is_host_guest())
                        return 1;
                msr_info->data = vmx->pt_desc.guest.status;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
                if (!vmx_pt_mode_is_host_guest() ||
                        !intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_cr3_filtering))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.cr3_match;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
                if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_single_range_output)))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.output_base;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
                if (!vmx_pt_mode_is_host_guest() ||
                        (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_topa_output) &&
                         !intel_pt_validate_cap(vmx->pt_desc.caps,
                                        PT_CAP_single_range_output)))
                        return 1;
                msr_info->data = vmx->pt_desc.guest.output_mask;
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
                if (!vmx_pt_mode_is_host_guest() ||
                    (index >= 2 * vmx->pt_desc.num_address_ranges))
                        return 1;
                if (index % 2)
                        msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
                else
                        msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
                break;
        case MSR_IA32_S_CET:
                msr_info->data = vmcs_readl(GUEST_S_CET);
                break;
        case MSR_KVM_INTERNAL_GUEST_SSP:
                msr_info->data = vmcs_readl(GUEST_SSP);
                break;
        case MSR_IA32_INT_SSP_TAB:
                msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
                break;
        case MSR_IA32_DEBUGCTLMSR:
                msr_info->data = vmx_guest_debugctl_read();
                break;
        default:
        find_uret_msr:
                msr = vmx_find_uret_msr(vmx, msr_info->index);
                if (msr) {
                        msr_info->data = msr->data;
                        break;
                }
                return kvm_get_msr_common(vcpu, msr_info);
        }

        return 0;
}

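/*
 * SYSENTER_EIP/ESP are 32-bit MSRs on CPUs that don't support Intel 64;
 * emulate that by truncating writes for guests without X86_FEATURE_LM.
 */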
static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
                                                    u64 data)
{
#ifdef CONFIG_X86_64
        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
                return (u32)data;
#endif
        return (unsigned long)data;
}

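/*
 * Compute the DEBUGCTL bits the guest (or host userspace) is allowed to
 * set: BUS_LOCK_DETECT, LBR/FREEZE_LBRS_ON_PMI and RTM_DEBUG, each gated on
 * the corresponding hardware and guest capabilities.
 */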
u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
        u64 debugctl = 0;

        if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
            (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
                debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;

        if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
            (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
                debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;

        if (boot_cpu_has(X86_FEATURE_RTM) &&
            (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
                debugctl |= DEBUGCTLMSR_RTM_DEBUG;

        return debugctl;
}

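/*
 * Validate a DEBUGCTL write against the supported set.  Unsupported BTF and
 * LBR bits are logged but tolerated; the caller masks them off rather than
 * injecting #GP.
 */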
bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
{
        u64 invalid;

        invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
        if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
                kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
                invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
        }
        return !invalid;
}

/*
 * Writes an MSR value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmx_uret_msr *msr;
        int ret = 0;
        u32 msr_index = msr_info->index;
        u64 data = msr_info->data;
        u32 index;

        switch (msr_index) {
        case MSR_EFER:
                ret = kvm_set_msr_common(vcpu, msr_info);
                break;
#ifdef CONFIG_X86_64
        case MSR_FS_BASE:
                vmx_segment_cache_clear(vmx);
                vmcs_writel(GUEST_FS_BASE, data);
                break;
        case MSR_GS_BASE:
                vmx_segment_cache_clear(vmx);
                vmcs_writel(GUEST_GS_BASE, data);
                break;
        case MSR_KERNEL_GS_BASE:
                vmx_write_guest_kernel_gs_base(vmx, data);
                break;
        case MSR_IA32_XFD:
                ret = kvm_set_msr_common(vcpu, msr_info);
                /*
                 * Always intercepting WRMSR could incur non-negligible
                 * overhead given that XFD might be changed frequently on
                 * guest context switches.  Disable write interception upon
                 * the first write with a non-zero value (indicating potential
                 * use of dynamic xfeatures).  Also update the exception
                 * bitmap to trap #NM for proper virtualization of guest
                 * xfd_err.
                 */
                if (!ret && data) {
                        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
                                                      MSR_TYPE_RW);
                        vcpu->arch.xfd_no_write_intercept = true;
                        vmx_update_exception_bitmap(vcpu);
                }
                break;
#endif
        case MSR_IA32_SYSENTER_CS:
                if (is_guest_mode(vcpu))
                        get_vmcs12(vcpu)->guest_sysenter_cs = data;
                vmcs_write32(GUEST_SYSENTER_CS, data);
                break;
        case MSR_IA32_SYSENTER_EIP:
                if (is_guest_mode(vcpu)) {
                        data = nested_vmx_truncate_sysenter_addr(vcpu, data);
                        get_vmcs12(vcpu)->guest_sysenter_eip = data;
                }
                vmcs_writel(GUEST_SYSENTER_EIP, data);
                break;
        case MSR_IA32_SYSENTER_ESP:
                if (is_guest_mode(vcpu)) {
                        data = nested_vmx_truncate_sysenter_addr(vcpu, data);
                        get_vmcs12(vcpu)->guest_sysenter_esp = data;
                }
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
                        return 1;

                data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);

                if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
                                                VM_EXIT_SAVE_DEBUG_CONTROLS)
                        get_vmcs12(vcpu)->guest_ia32_debugctl = data;

                vmx_guest_debugctl_write(vcpu, data);

                if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
                    (data & DEBUGCTLMSR_LBR))
                        intel_pmu_create_guest_lbr_event(vcpu);
                return 0;
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
                    (!msr_info->host_initiated &&
                     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
                        return 1;
                if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
                    (data & MSR_IA32_BNDCFGS_RSVD))
                        return 1;

                if (is_guest_mode(vcpu) &&
                    ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
                     (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
                        get_vmcs12(vcpu)->guest_bndcfgs = data;

                vmcs_write64(GUEST_BNDCFGS, data);
                break;
        case MSR_IA32_UMWAIT_CONTROL:
                if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
                        return 1;

                /* Reserved bit 1 and the upper 32 bits [63:32] must be zero. */
                if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
                        return 1;

                vmx->msr_ia32_umwait_control = data;
                break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_has_spec_ctrl_msr(vcpu))
                        return 1;

                if (kvm_spec_ctrl_test_value(data))
                        return 1;

                vmx->spec_ctrl = data;
                if (!data)
                        break;

                /*
                 * For non-nested:
                 * When it's written (to non-zero) for the first time, pass
                 * it through.
                 *
                 * For nested:
                 * The handling of the MSR bitmap for L2 guests is done in
                 * nested_vmx_prepare_msr_bitmap. We should not touch the
                 * vmcs02.msr_bitmap here since it gets completely overwritten
                 * in the merging. We update the vmcs01 here for L1 as well
                 * since it will end up touching the MSR anyway now.
                 */
                vmx_disable_intercept_for_msr(vcpu,
                                              MSR_IA32_SPEC_CTRL,
                                              MSR_TYPE_RW);
                break;
        case MSR_IA32_TSX_CTRL:
                if (!msr_info->host_initiated &&
                    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
                        return 1;
                if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
                        return 1;
                goto find_uret_msr;
        case MSR_IA32_CR_PAT:
                ret = kvm_set_msr_common(vcpu, msr_info);
                if (ret)
                        break;

                if (is_guest_mode(vcpu) &&
                    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                        get_vmcs12(vcpu)->guest_ia32_pat = data;

                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                        vmcs_write64(GUEST_IA32_PAT, data);
                break;
        case MSR_IA32_MCG_EXT_CTL:
                if ((!msr_info->host_initiated &&
                     !(to_vmx(vcpu)->msr_ia32_feature_control &
                       FEAT_CTL_LMCE_ENABLED)) ||
                    (data & ~MCG_EXT_CTL_LMCE_EN))
                        return 1;
                vcpu->arch.mcg_ext_ctl = data;
                break;
        case MSR_IA32_FEAT_CTL:
                if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
                        return 1;

                vmx->msr_ia32_feature_control = data;
                if (msr_info->host_initiated && data == 0)
                        vmx_leave_nested(vcpu);

                /* SGX may be enabled/disabled by guest's firmware */
                vmx_write_encls_bitmap(vcpu, NULL);
                break;
        case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
                /*
                 * On real hardware, the LE hash MSRs are writable before
                 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
                 * at which point SGX related bits in IA32_FEATURE_CONTROL
                 * become writable.
                 *
                 * KVM does not emulate SGX activation for simplicity, so
                 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
                 * is unlocked.  This is technically not architectural
                 * behavior, but it's close enough.
                 */
                if (!msr_info->host_initiated &&
                    (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
                    ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
                    !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
                        return 1;
                vmx->msr_ia32_sgxlepubkeyhash
                        [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
                break;
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!msr_info->host_initiated)
                        return 1; /* they are read-only */
                if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
                if (!vmx_pt_mode_is_host_guest() ||
                        vmx_rtit_ctl_check(vcpu, data) ||
                        vmx->nested.vmxon)
                        return 1;
                vmcs_write64(GUEST_IA32_RTIT_CTL, data);
                vmx->pt_desc.guest.ctl = data;
                pt_update_intercept_for_msr(vcpu);
                break;
        case MSR_IA32_RTIT_STATUS:
                if (!pt_can_write_msr(vmx))
                        return 1;
                if (data & MSR_IA32_RTIT_STATUS_MASK)
                        return 1;
                vmx->pt_desc.guest.status = data;
                break;
        case MSR_IA32_RTIT_CR3_MATCH:
                if (!pt_can_write_msr(vmx))
                        return 1;
                if (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_cr3_filtering))
                        return 1;
                vmx->pt_desc.guest.cr3_match = data;
                break;
        case MSR_IA32_RTIT_OUTPUT_BASE:
                if (!pt_can_write_msr(vmx))
                        return 1;
                if (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_topa_output) &&
                    !intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_single_range_output))
                        return 1;
                if (!pt_output_base_valid(vcpu, data))
                        return 1;
                vmx->pt_desc.guest.output_base = data;
                break;
        case MSR_IA32_RTIT_OUTPUT_MASK:
                if (!pt_can_write_msr(vmx))
                        return 1;
                if (!intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_topa_output) &&
                    !intel_pt_validate_cap(vmx->pt_desc.caps,
                                           PT_CAP_single_range_output))
                        return 1;
                vmx->pt_desc.guest.output_mask = data;
                break;
        case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
                if (!pt_can_write_msr(vmx))
                        return 1;
                index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
                if (index >= 2 * vmx->pt_desc.num_address_ranges)
                        return 1;
                if (is_noncanonical_msr_address(data, vcpu))
                        return 1;
                if (index % 2)
                        vmx->pt_desc.guest.addr_b[index / 2] = data;
                else
                        vmx->pt_desc.guest.addr_a[index / 2] = data;
                break;
        case MSR_IA32_S_CET:
                vmcs_writel(GUEST_S_CET, data);
                break;
        case MSR_KVM_INTERNAL_GUEST_SSP:
                vmcs_writel(GUEST_SSP, data);
                break;
        case MSR_IA32_INT_SSP_TAB:
                vmcs_writel(GUEST_INTR_SSP_TABLE, data);
                break;
        case MSR_IA32_PERF_CAPABILITIES:
                if (data & PERF_CAP_LBR_FMT) {
                        if ((data & PERF_CAP_LBR_FMT) !=
                            (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
                                return 1;
                        if (!cpuid_model_is_consistent(vcpu))
                                return 1;
                }
                if (data & PERF_CAP_PEBS_FORMAT) {
                        if ((data & PERF_CAP_PEBS_MASK) !=
                            (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
                                return 1;
                        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
                                return 1;
                        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
                                return 1;
                        if (!cpuid_model_is_consistent(vcpu))
                                return 1;
                }
                ret = kvm_set_msr_common(vcpu, msr_info);
                break;

        default:
        find_uret_msr:
                msr = vmx_find_uret_msr(vmx, msr_index);
                if (msr)
                        ret = vmx_set_guest_uret_msr(vmx, msr, data);
                else
                        ret = kvm_set_msr_common(vcpu, msr_info);
        }

        /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
        if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
                vmx_update_fb_clear_dis(vcpu, vmx);

        return ret;
}

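/*
 * Read the given register or CR/PDPTR state out of the VMCS into KVM's
 * software cache, first marking the register as available.
 */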
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
        unsigned long guest_owned_bits;

        kvm_register_mark_available(vcpu, reg);

        switch (reg) {
        case VCPU_REGS_RSP:
                vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
                break;
        case VCPU_REGS_RIP:
                vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
                break;
        case VCPU_EXREG_PDPTR:
                if (enable_ept)
                        ept_save_pdptrs(vcpu);
                break;
        case VCPU_EXREG_CR0:
                guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

                vcpu->arch.cr0 &= ~guest_owned_bits;
                vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
                break;
        case VCPU_EXREG_CR3:
                /*
                 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
                 * CR3 is loaded into hardware, not the guest's CR3.
                 */
                if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
                        vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
                break;
        case VCPU_EXREG_CR4:
                guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

                vcpu->arch.cr4 &= ~guest_owned_bits;
                vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
                break;
        default:
                KVM_BUG_ON(1, vcpu->kvm);
                break;
        }
}

/*
 * There is no X86_FEATURE for SGX yet, so query CPUID directly instead of
 * going through cpu_has() to ensure KVM traps ENCLS whenever it's supported
 * in hardware.  It does not matter whether the host OS supports or has
 * enabled SGX.
 */
static bool cpu_has_sgx(void)
{
        return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}

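/*
 * Adjust the desired controls against the capability MSR: a 0 in the high
 * word means the control must be 0 (the allowed-1 settings), a 1 in the low
 * word means the control must be 1 (the allowed-0 settings).  Fail if a
 * required (ctl_min) control ends up forced to 0.
 */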
static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
        u32 vmx_msr_low, vmx_msr_high;
        u32 ctl = ctl_min | ctl_opt;

        rdmsr(msr, vmx_msr_low, vmx_msr_high);

        ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
        ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

        /* Ensure minimum (required) set of control bits are supported. */
        if (ctl_min & ~ctl)
                return -EIO;

        *result = ctl;
        return 0;
}

static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
        u64 allowed;

        rdmsrq(msr, allowed);

        return  ctl_opt & allowed;
}

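/*
 * Verify each VM-Entry control in @pairs is consistent with its VM-Exit
 * counterpart, i.e. both set or both clear.  On a mismatch, warn and clear
 * the offending pair, or fail if error_on_inconsistent_vmcs_config is set.
 */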
#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls)        \
({                                                                              \
        int i, r = 0;                                                           \
                                                                                \
        BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
        BUILD_BUG_ON(sizeof(pairs[0].exit_control)  != sizeof(exit_controls));  \
                                                                                \
        for (i = 0; i < ARRAY_SIZE(pairs); i++) {                               \
                typeof(entry_controls) n_ctrl = pairs[i].entry_control;         \
                typeof(exit_controls) x_ctrl = pairs[i].exit_control;           \
                                                                                \
                if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl))    \
                        continue;                                               \
                                                                                \
                pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, "             \
                             "entry = %llx (%llx), exit = %llx (%llx)\n",       \
                             (u64)(entry_controls & n_ctrl), (u64)n_ctrl,       \
                             (u64)(exit_controls & x_ctrl), (u64)x_ctrl);       \
                                                                                \
                if (error_on_inconsistent_vmcs_config)                          \
                        r = -EIO;                                               \
                                                                                \
                entry_controls &= ~n_ctrl;                                      \
                exit_controls &= ~x_ctrl;                                       \
        }                                                                       \
        r;                                                                      \
})

static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
                             struct vmx_capability *vmx_cap)
{
        u32 _pin_based_exec_control = 0;
        u32 _cpu_based_exec_control = 0;
        u32 _cpu_based_2nd_exec_control = 0;
        u64 _cpu_based_3rd_exec_control = 0;
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
        u64 basic_msr;
        u64 misc_msr;

        /*
         * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
         * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
         * intercepts writes to PAT and EFER, i.e. never enables those controls.
         */
        struct {
                u32 entry_control;
                u32 exit_control;
        } const vmcs_entry_exit_pairs[] = {
                { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
                { VM_ENTRY_LOAD_IA32_PAT,               VM_EXIT_LOAD_IA32_PAT },
                { VM_ENTRY_LOAD_IA32_EFER,              VM_EXIT_LOAD_IA32_EFER },
                { VM_ENTRY_LOAD_BNDCFGS,                VM_EXIT_CLEAR_BNDCFGS },
                { VM_ENTRY_LOAD_IA32_RTIT_CTL,          VM_EXIT_CLEAR_IA32_RTIT_CTL },
                { VM_ENTRY_LOAD_CET_STATE,              VM_EXIT_LOAD_CET_STATE },
        };

        memset(vmcs_conf, 0, sizeof(*vmcs_conf));

        if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
                                KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
                                MSR_IA32_VMX_PROCBASED_CTLS,
                                &_cpu_based_exec_control))
                return -EIO;
        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
                if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
                                        KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control))
                        return -EIO;
        }
        if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
                _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;

#ifndef CONFIG_X86_64
        if (!(_cpu_based_2nd_exec_control &
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

        if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
                _cpu_based_2nd_exec_control &= ~(
                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

        rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
                &vmx_cap->ept, &vmx_cap->vpid);

        if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
            vmx_cap->ept) {
                pr_warn_once("EPT capabilities exist even though the 1-setting "
                             "of the enable-EPT VM-execution control isn't supported\n");

                if (error_on_inconsistent_vmcs_config)
                        return -EIO;

                vmx_cap->ept = 0;
                _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
        }
        if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
            vmx_cap->vpid) {
                pr_warn_once("VPID capabilities exist even though the 1-setting "
                             "of the enable-VPID VM-execution control isn't supported\n");

                if (error_on_inconsistent_vmcs_config)
                        return -EIO;

                vmx_cap->vpid = 0;
        }

        if (!cpu_has_sgx())
                _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;

        if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
                _cpu_based_3rd_exec_control =
                        adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
                                              MSR_IA32_VMX_PROCBASED_CTLS3);

        if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
                                KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
                                MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control))
                return -EIO;

        if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
                                KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
                                MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control))
                return -EIO;

        if (cpu_has_broken_vmx_preemption_timer())
                _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        if (!(_cpu_based_2nd_exec_control &
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
                _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

        if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
                                KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
                                MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control))
                return -EIO;

        if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
                                       _vmentry_control, _vmexit_control))
                return -EIO;

        /*
         * Some CPUs support VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL, but
         * the controls can't be used due to an erratum where a VM-Exit may
         * incorrectly clear IA32_PERF_GLOBAL_CTRL[34:32].  Work around the
         * erratum by using the MSR load mechanism to switch
         * IA32_PERF_GLOBAL_CTRL.
         */
        switch (boot_cpu_data.x86_vfm) {
        case INTEL_NEHALEM_EP:  /* AAK155 */
        case INTEL_NEHALEM:     /* AAP115 */
        case INTEL_WESTMERE:    /* AAT100 */
        case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
        case INTEL_NEHALEM_EX:  /* BA97 */
                _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
                _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
                pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
                             "does not work properly. Using workaround\n");
                break;
        default:
                break;
        }

        rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);

        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
        if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
                return -EIO;

#ifdef CONFIG_X86_64
        /*
         * KVM expects to be able to shove all legal physical addresses into
         * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
         * 0 for processors that support Intel 64 architecture".
         */
        if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
                return -EIO;
#endif

        /* Require Write-Back (WB) memory type for VMCS accesses. */
        if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
                return -EIO;

        rdmsrq(MSR_IA32_VMX_MISC, misc_msr);

        vmcs_conf->basic = basic_msr;
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
        vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
        vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
        vmcs_conf->vmexit_ctrl         = _vmexit_control;
        vmcs_conf->vmentry_ctrl        = _vmentry_control;
        vmcs_conf->misc = misc_msr;

#if IS_ENABLED(CONFIG_HYPERV)
        if (enlightened_vmcs)
                evmcs_sanitize_exec_ctrls(vmcs_conf);
#endif

        return 0;
}

static bool __kvm_is_vmx_supported(void)
{
        int cpu = smp_processor_id();

        if (!(cpuid_ecx(1) & feature_bit(VMX))) {
                pr_err("VMX not supported by CPU %d\n", cpu);
                return false;
        }

        if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
            !this_cpu_has(X86_FEATURE_VMX)) {
                pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
                return false;
        }

        return true;
}

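/*
 * Disable migration across the check so that smp_processor_id() in
 * __kvm_is_vmx_supported(), and any error messages it prints, refer to a
 * single, stable CPU.
 */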
static bool kvm_is_vmx_supported(void)
{
        bool supported;

        migrate_disable();
        supported = __kvm_is_vmx_supported();
        migrate_enable();

        return supported;
}

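/*
 * Verify this CPU's VMX capabilities match the reference vmcs_config
 * captured at module load; dump any mismatching 32-bit chunks so divergent
 * (e.g. hybrid or mismatched) CPUs can be identified.
 */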
int vmx_check_processor_compat(void)
{
        int cpu = raw_smp_processor_id();
        struct vmcs_config vmcs_conf;
        struct vmx_capability vmx_cap;

        if (!__kvm_is_vmx_supported())
                return -EIO;

        if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
                pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
                return -EIO;
        }
        if (nested)
                nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);

        if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
                u32 *gold = (void *)&vmcs_config;
                u32 *mine = (void *)&vmcs_conf;
                int i;

                BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32));

                pr_err("VMCS config on CPU %d doesn't match reference config:", cpu);
                for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) {
                        if (gold[i] == mine[i])
                                continue;

                        pr_cont("\n  Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x",
                                i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]);
                }
                pr_cont("\n");
                return -EIO;
        }
        return 0;
}

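/*
 * Enable VMX via CR4.VMXE and execute VMXON.  A faulting VMXON, e.g. due to
 * a misconfigured MSR_IA32_FEAT_CTL, is caught via the exception table;
 * CR4.VMXE is rolled back and -EFAULT returned instead of crashing.
 */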
static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
        u64 msr;

        cr4_set_bits(X86_CR4_VMXE);

        asm goto("1: vmxon %[vmxon_pointer]\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
                          : : [vmxon_pointer] "m"(vmxon_pointer)
                          : : fault);
        return 0;

fault:
        WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
                  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
        cr4_clear_bits(X86_CR4_VMXE);

        return -EFAULT;
}

int vmx_enable_virtualization_cpu(void)
{
        int cpu = raw_smp_processor_id();
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
        int r;

        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;

        /*
         * This can happen if we hot-added a CPU but failed to allocate
         * the VP assist page for it.
         */
        if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
                return -EFAULT;

        intel_pt_handle_vmx(1);

        r = kvm_cpu_vmxon(phys_addr);
        if (r) {
                intel_pt_handle_vmx(0);
                return r;
        }

        return 0;
}

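/*
 * VMCLEAR all VMCSes loaded on this CPU so it holds no active VMCS state,
 * e.g. before VMXOFF or CPU offlining.
 */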
static void vmclear_local_loaded_vmcss(void)
{
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v, *n;

        list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
                                 loaded_vmcss_on_cpu_link)
                __loaded_vmcs_clear(v);
}

void vmx_disable_virtualization_cpu(void)
{
        vmclear_local_loaded_vmcss();

        if (kvm_cpu_vmxoff())
                kvm_spurious_fault();

        hv_reset_evmcs();

        intel_pt_handle_vmx(0);
}

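/*
 * Allocate a zeroed, page-sized VMCS on @cpu's NUMA node, stamped with the
 * appropriate revision ID: KVM_EVMCS_VERSION when using an enlightened
 * VMCS, otherwise the ID reported by MSR_IA32_VMX_BASIC.
 */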
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
        int node = cpu_to_node(cpu);
        struct page *pages;
        struct vmcs *vmcs;

        pages = __alloc_pages_node(node, flags, 0);
        if (!pages)
                return NULL;
        vmcs = page_address(pages);
        memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));

        /* KVM supports Enlightened VMCS v1 only */
        if (kvm_is_using_evmcs())
                vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
        else
                vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);

        if (shadow)
                vmcs->hdr.shadow_vmcs = 1;
        return vmcs;
}

void free_vmcs(struct vmcs *vmcs)
{
        free_page((unsigned long)vmcs);
}

/*
 * Free a VMCS, but first VMCLEAR it on the CPU where it was last loaded.
 */
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
        if (!loaded_vmcs->vmcs)
                return;
        loaded_vmcs_clear(loaded_vmcs);
        free_vmcs(loaded_vmcs->vmcs);
        loaded_vmcs->vmcs = NULL;
        if (loaded_vmcs->msr_bitmap)
                free_page((unsigned long)loaded_vmcs->msr_bitmap);
        WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

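/*
 * Allocate and initialize a loaded_vmcs: the VMCS itself, plus an MSR
 * bitmap (initialized to intercept everything) when supported, with host
 * state and the controls shadow zeroed.
 */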
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
        loaded_vmcs->vmcs = alloc_vmcs(false);
        if (!loaded_vmcs->vmcs)
                return -ENOMEM;

        vmcs_clear(loaded_vmcs->vmcs);

        loaded_vmcs->shadow_vmcs = NULL;
        loaded_vmcs->hv_timer_soft_disabled = false;
        loaded_vmcs->cpu = -1;
        loaded_vmcs->launched = 0;

        if (cpu_has_vmx_msr_bitmap()) {
                loaded_vmcs->msr_bitmap = (unsigned long *)
                                __get_free_page(GFP_KERNEL_ACCOUNT);
                if (!loaded_vmcs->msr_bitmap)
                        goto out_vmcs;
                memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
        }

        memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
        memset(&loaded_vmcs->controls_shadow, 0,
                sizeof(struct vmcs_controls_shadow));

        return 0;

out_vmcs:
        free_loaded_vmcs(loaded_vmcs);
        return -ENOMEM;
}

static void free_kvm_area(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                free_vmcs(per_cpu(vmxarea, cpu));
                per_cpu(vmxarea, cpu) = NULL;
        }
}

static __init int alloc_kvm_area(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct vmcs *vmcs;

                vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
                if (!vmcs) {
                        free_kvm_area();
                        return -ENOMEM;
                }

                /*
                 * When eVMCS is enabled, alloc_vmcs_cpu() sets
                 * vmcs->revision_id to KVM_EVMCS_VERSION instead of the
                 * revision_id reported by MSR_IA32_VMX_BASIC.
                 *
                 * However, even though not explicitly documented by the
                 * TLFS, the VMXON region passed as the VMXON argument
                 * should still be marked with the revision_id reported
                 * by the physical CPU.
                 */
                if (kvm_is_using_evmcs())
                        vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);

                per_cpu(vmxarea, cpu) = vmcs;
        }
        return 0;
}

static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
                struct kvm_segment *save)
{
        if (!emulate_invalid_guest_state) {
                /*
                 * CS and SS RPL should be equal during guest entry according
                 * to the VMX spec, but in reality it is not always so. Since
                 * the vCPU is in the middle of the transition from real mode
                 * to protected mode, it is safe to assume that RPL 0 is a
                 * good default value.
                 */
                if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
                        save->selector &= ~SEGMENT_RPL_MASK;
                save->dpl = save->selector & SEGMENT_RPL_MASK;
                save->s = 1;
        }
        __vmx_set_segment(vcpu, save, seg);
}

static void enter_pmode(struct kvm_vcpu *vcpu)
{
        unsigned long flags;
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * Update the real mode segment cache.  It may not be up-to-date if a
         * segment register was written while the vCPU was in guest mode.
         */
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

        vmx->rmode.vm86_active = 0;

        __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

        flags = vmcs_readl(GUEST_RFLAGS);
        flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
        flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
        vmcs_writel(GUEST_RFLAGS, flags);

        vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
                        (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

        vmx_update_exception_bitmap(vcpu);

        fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
        fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
        fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
        fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
        fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
        fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}

static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        struct kvm_segment var = *save;

        var.dpl = 0x3;
        if (seg == VCPU_SREG_CS)
                var.type = 0x3;

        if (!emulate_invalid_guest_state) {
                var.selector = var.base >> 4;
                var.base = var.base & 0xffff0;
                var.limit = 0xffff;
                var.g = 0;
                var.db = 0;
                var.present = 1;
                var.s = 1;
                var.l = 0;
                var.unusable = 0;
                var.type = 0x3;
                var.avl = 0;
                if (save->base & 0xf)
                        pr_warn_once("segment base is not paragraph aligned "
                                     "when entering protected mode (seg=%d)", seg);
        }

        vmcs_write16(sf->selector, var.selector);
        vmcs_writel(sf->base, var.base);
        vmcs_write32(sf->limit, var.limit);
        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
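
/*
 * Real-mode sketch: hardware forms the linear address as
 * (selector << 4) + offset, so the synthesized selector above is just
 * base >> 4, e.g. a cached base of 0x12340 yields selector 0x1234.  Any
 * low-nibble residue (base & 0xf) cannot be represented, hence the
 * one-time warning about paragraph alignment.
 */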

static void enter_rmode(struct kvm_vcpu *vcpu)
{
        unsigned long flags;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);

        /*
         * KVM should never use VM86 to virtualize Real Mode when L2 is active,
         * as using VM86 is unnecessary if unrestricted guest is enabled, and
         * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
         * should VM-Fail and KVM should reject userspace attempts to stuff
         * CR0.PG=0 when L2 is active.
         */
        WARN_ON_ONCE(is_guest_mode(vcpu));

        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

        vmx->rmode.vm86_active = 1;

        vmx_segment_cache_clear(vmx);

        vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

        flags = vmcs_readl(GUEST_RFLAGS);
        vmx->rmode.save_rflags = flags;

        flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

        vmcs_writel(GUEST_RFLAGS, flags);
        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
        vmx_update_exception_bitmap(vcpu);

        fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
        fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
        fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
        fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
        fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
        fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
}

int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /* Nothing to do if hardware doesn't support EFER. */
        if (!vmx_find_uret_msr(vmx, MSR_EFER))
                return 0;

        vcpu->arch.efer = efer;
#ifdef CONFIG_X86_64
        if (efer & EFER_LMA)
                vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
        else
                vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
#else
        if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
                return 1;
#endif

        vmx_setup_uret_msrs(vmx);
        return 0;
}

#ifdef CONFIG_X86_64

static void enter_lmode(struct kvm_vcpu *vcpu)
{
        u32 guest_tr_ar;

        vmx_segment_cache_clear(to_vmx(vcpu));

        guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
        if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
                pr_debug_ratelimited("%s: tss fixup for long mode.\n",
                                     __func__);
                vmcs_write32(GUEST_TR_AR_BYTES,
                             (guest_tr_ar & ~VMX_AR_TYPE_MASK)
                             | VMX_AR_TYPE_BUSY_64_TSS);
        }
        vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}

static void exit_lmode(struct kvm_vcpu *vcpu)
{
        vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}

#endif

void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
         * the CPU is not required to invalidate guest-physical mappings on
         * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
         * associated with the root EPT structure and not any particular VPID
         * (INVVPID also isn't required to invalidate guest-physical mappings).
         */
        if (enable_ept) {
                ept_sync_global();
        } else if (enable_vpid) {
                if (cpu_has_vmx_invvpid_global()) {
                        vpid_sync_vcpu_global();
                } else {
                        vpid_sync_vcpu_single(vmx->vpid);
                        vpid_sync_vcpu_single(vmx->nested.vpid02);
                }
        }
}

static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
        if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
                return nested_get_vpid02(vcpu);
        return to_vmx(vcpu)->vpid;
}

static u64 construct_eptp(hpa_t root_hpa)
{
        u64 eptp = root_hpa | VMX_EPTP_MT_WB;
        struct kvm_mmu_page *root;

        if (kvm_mmu_is_dummy_root(root_hpa))
                return eptp | VMX_EPTP_PWL_4;

        /*
         * EPT roots should always have an associated MMU page.  Return a "bad"
         * EPTP to induce VM-Fail instead of continuing on in an unknown state.
         */
        root = root_to_sp(root_hpa);
        if (WARN_ON_ONCE(!root))
                return INVALID_PAGE;

        eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

        if (enable_ept_ad_bits && !root->role.ad_disabled)
                eptp |= VMX_EPTP_AD_ENABLE_BIT;

        return eptp;
}
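
/*
 * For reference (SDM "Extended-Page-Table Pointer"), the EPTP assembled
 * above is laid out as:
 *
 *   bits  2:0  memory type (6 == write-back, VMX_EPTP_MT_WB)
 *   bits  5:3  page-walk length minus one (3 == 4-level, 4 == 5-level)
 *   bit     6  enable accessed/dirty flags
 *   bits 51:12 physical address of the root EPT table
 *
 * e.g. a 4-level, A/D-enabled root at 0x1234000 yields EPTP 0x123405e.
 */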

static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
{
        u64 eptp = construct_eptp(root_hpa);

        if (VALID_PAGE(eptp))
                ept_sync_context(eptp);
        else
                ept_sync_global();
}

void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        u64 root_hpa = mmu->root.hpa;

        /* No flush required if the current context is invalid. */
        if (!VALID_PAGE(root_hpa))
                return;

        if (enable_ept)
                vmx_flush_tlb_ept_root(root_hpa);
        else
                vpid_sync_context(vmx_get_current_vpid(vcpu));
}

void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
        /*
         * vpid_sync_vcpu_addr() is a nop if vpid==0; see the comment in
         * vmx_flush_tlb_guest() for an explanation of why this is ok.
         */
        vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}

void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
        /*
         * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
         * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
         * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
         * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
         * i.e. no explicit INVVPID is necessary.
         */
        vpid_sync_context(vmx_get_current_vpid(vcpu));
}

void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

        if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
                return;

        if (is_pae_paging(vcpu)) {
                vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
                vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
                vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
                vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
        }
}

void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

        if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
                return;

        mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
        mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
        mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
        mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);

        kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}

#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
                          CPU_BASED_CR3_STORE_EXITING)

bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        if (is_guest_mode(vcpu))
                return nested_guest_cr0_valid(vcpu, cr0);

        if (to_vmx(vcpu)->nested.vmxon)
                return nested_host_cr0_valid(vcpu, cr0);

        return true;
}

void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr0, old_cr0_pg;
        u32 tmp;

        old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);

        hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
        if (enable_unrestricted_guest)
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
                if (!enable_ept)
                        hw_cr0 |= X86_CR0_WP;

                if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
                        enter_pmode(vcpu);

                if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
                        enter_rmode(vcpu);
        }

        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0, hw_cr0);
        vcpu->arch.cr0 = cr0;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

#ifdef CONFIG_X86_64
        if (vcpu->arch.efer & EFER_LME) {
                if (!old_cr0_pg && (cr0 & X86_CR0_PG))
                        enter_lmode(vcpu);
                else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
                        exit_lmode(vcpu);
        }
#endif

        if (enable_ept && !enable_unrestricted_guest) {
                /*
                 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
                 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
                 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
                 * KVM's CR3 is installed.
                 */
                if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
                        vmx_cache_reg(vcpu, VCPU_EXREG_CR3);

                /*
                 * When running with EPT but not unrestricted guest, KVM must
                 * intercept CR3 accesses when paging is _disabled_.  This is
                 * necessary because restricted guests can't actually run with
                 * paging disabled, and so KVM stuffs its own CR3 in order to
                 * run the guest with identity mapped page tables.
                 *
                 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
                 * update, it may be stale with respect to CR3 interception,
                 * e.g. after nested VM-Enter.
                 *
                 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
                 * stores to forward them to L1, even if KVM does not need to
                 * intercept them to preserve its identity mapped page tables.
                 */
                if (!(cr0 & X86_CR0_PG)) {
                        exec_controls_setbit(vmx, CR3_EXITING_BITS);
                } else if (!is_guest_mode(vcpu)) {
                        exec_controls_clearbit(vmx, CR3_EXITING_BITS);
                } else {
                        tmp = exec_controls_get(vmx);
                        tmp &= ~CR3_EXITING_BITS;
                        tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
                        exec_controls_set(vmx, tmp);
                }

                /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
                if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
                        vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

                /*
                 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
                 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
                 */
                if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
                        kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
        }

        /* depends on vcpu->arch.cr0 being set to the new value */
        vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}

static int vmx_get_max_ept_level(void)
{
        if (cpu_has_vmx_ept_5levels())
                return 5;
        return 4;
}

void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
        struct kvm *kvm = vcpu->kvm;
        bool update_guest_cr3 = true;
        unsigned long guest_cr3;

        if (enable_ept) {
                KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
                                root_level != root_to_sp(root_hpa)->role.level);
                vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));

                hv_track_root_tdp(vcpu, root_hpa);

                if (!enable_unrestricted_guest && !is_paging(vcpu))
                        guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
                else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
                        guest_cr3 = vcpu->arch.cr3;
                else /* vmcs.GUEST_CR3 is already up-to-date. */
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
                guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
                            kvm_get_active_cr3_lam_bits(vcpu);
        }

        if (update_guest_cr3)
                vmcs_writel(GUEST_CR3, guest_cr3);
}

bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        /*
         * We operate under the default treatment of SMM, so VMX cannot be
         * enabled under SMM.  Note, whether or not VMXE is allowed at all,
         * i.e. is a reserved bit, is handled by common x86 code.
         */
        if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
                return false;

        if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
                return false;

        return true;
}

void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr4;

        /*
         * Pass through host's Machine Check Enable value to hw_cr4, which
         * is in force while we are in guest mode.  Do not let guests control
         * this bit, even if host CR4.MCE == 0.
         */
        hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
        if (enable_unrestricted_guest)
                hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
        else if (vmx->rmode.vm86_active)
                hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
        else
                hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

        if (vmx_umip_emulated()) {
                if (cr4 & X86_CR4_UMIP) {
                        secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
                        hw_cr4 &= ~X86_CR4_UMIP;
                } else if (!is_guest_mode(vcpu) ||
                        !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
                        secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
                }
        }

        vcpu->arch.cr4 = cr4;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

        if (!enable_unrestricted_guest) {
                if (enable_ept) {
                        if (!is_paging(vcpu)) {
                                hw_cr4 &= ~X86_CR4_PAE;
                                hw_cr4 |= X86_CR4_PSE;
                        } else if (!(cr4 & X86_CR4_PAE)) {
                                hw_cr4 &= ~X86_CR4_PAE;
                        }
                }

                /*
                 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
                 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
                 * to be manually disabled when guest switches to non-paging
                 * mode.
                 *
                 * If !enable_unrestricted_guest, the CPU is always running
                 * with CR0.PG=1 and CR4 needs to be modified.
                 * If enable_unrestricted_guest, the CPU automatically
                 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
                 */
                if (!is_paging(vcpu))
                        hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
        }

        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_CR4, hw_cr4);

        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                vcpu->arch.cpuid_dynamic_bits_dirty = true;
}

void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 ar;

        if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
                *var = vmx->rmode.segs[seg];
                if (seg == VCPU_SREG_TR
                    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
                        return;
                var->base = vmx_read_guest_seg_base(vmx, seg);
                var->selector = vmx_read_guest_seg_selector(vmx, seg);
                return;
        }
        var->base = vmx_read_guest_seg_base(vmx, seg);
        var->limit = vmx_read_guest_seg_limit(vmx, seg);
        var->selector = vmx_read_guest_seg_selector(vmx, seg);
        ar = vmx_read_guest_seg_ar(vmx, seg);
        var->unusable = (ar >> 16) & 1;
        var->type = ar & 15;
        var->s = (ar >> 4) & 1;
        var->dpl = (ar >> 5) & 3;
        /*
         * Some userspaces do not preserve the unusable property.  Since a
         * usable segment has to be present according to the VMX spec, we can
         * use the present property to work around this userspace bug by
         * making an unusable segment always nonpresent.
         * vmx_segment_access_rights() already marks a nonpresent segment as
         * unusable.
         */
        var->present = !var->unusable;
        var->avl = (ar >> 12) & 1;
        var->l = (ar >> 13) & 1;
        var->db = (ar >> 14) & 1;
        var->g = (ar >> 15) & 1;
}

u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
        struct kvm_segment s;

        if (to_vmx(vcpu)->rmode.vm86_active) {
                vmx_get_segment(vcpu, &s, seg);
                return s.base;
        }
        return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}

static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int ar;

        if (unlikely(vmx->rmode.vm86_active))
                return 0;

        if (no_cache)
                ar = vmcs_read32(GUEST_SS_AR_BYTES);
        else
                ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
        return VMX_AR_DPL(ar);
}

int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
        return __vmx_get_cpl(vcpu, false);
}

int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
{
        return __vmx_get_cpl(vcpu, true);
}

static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
        u32 ar;

        ar = var->type & 15;
        ar |= (var->s & 1) << 4;
        ar |= (var->dpl & 3) << 5;
        ar |= (var->present & 1) << 7;
        ar |= (var->avl & 1) << 12;
        ar |= (var->l & 1) << 13;
        ar |= (var->db & 1) << 14;
        ar |= (var->g & 1) << 15;
        ar |= (var->unusable || !var->present) << 16;

        return ar;
}
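
/*
 * For reference, the encoding above mirrors the VMCS access-rights field
 * (SDM "Guest Segment Registers"):
 *
 *   bits  3:0  segment type
 *   bit     4  S (0 == system, 1 == code/data)
 *   bits  6:5  DPL
 *   bit     7  P (present)
 *   bit    12  AVL
 *   bit    13  L (64-bit code segment)
 *   bit    14  D/B
 *   bit    15  G (granularity)
 *   bit    16  segment unusable
 */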

void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

        vmx_segment_cache_clear(vmx);

        if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
                vmx->rmode.segs[seg] = *var;
                if (seg == VCPU_SREG_TR)
                        vmcs_write16(sf->selector, var->selector);
                else if (var->s)
                        fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
                return;
        }

        vmcs_writel(sf->base, var->base);
        vmcs_write32(sf->limit, var->limit);
        vmcs_write16(sf->selector, var->selector);

        /*
         * Fix the "Accessed" bit in the AR field of segment registers for
         * older qemu binaries.
         *
         * The IA-32 architecture specifies that, at the time of processor
         * reset, the "Accessed" bit in the AR field of segment registers is
         * 1, but qemu sets it to 0 in its userland code.  This causes an
         * invalid guest state VM-Exit when "unrestricted guest" mode is
         * turned on.  A fix for this setup issue in cpu_reset has been
         * pushed to the qemu tree; newer qemu binaries with that fix do not
         * need this KVM hack.
         */
        if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
                var->type |= 0x1; /* Accessed */

        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}

void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
        __vmx_set_segment(vcpu, var, seg);

        to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}

void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
        u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);

        *db = (ar >> 14) & 1;
        *l = (ar >> 13) & 1;
}

void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
        dt->address = vmcs_readl(GUEST_IDTR_BASE);
}

void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
        vmcs_writel(GUEST_IDTR_BASE, dt->address);
}

void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
        dt->address = vmcs_readl(GUEST_GDTR_BASE);
}

void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
        vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
        vmcs_writel(GUEST_GDTR_BASE, dt->address);
}

static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
        struct kvm_segment var;
        u32 ar;

        vmx_get_segment(vcpu, &var, seg);
        var.dpl = 0x3;
        if (seg == VCPU_SREG_CS)
                var.type = 0x3;
        ar = vmx_segment_access_rights(&var);

        if (var.base != (var.selector << 4))
                return false;
        if (var.limit != 0xffff)
                return false;
        if (ar != 0xf3)
                return false;

        return true;
}

static bool code_segment_valid(struct kvm_vcpu *vcpu)
{
        struct kvm_segment cs;
        unsigned int cs_rpl;

        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
        cs_rpl = cs.selector & SEGMENT_RPL_MASK;

        if (cs.unusable)
                return false;
        if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
                return false;
        if (!cs.s)
                return false;
        if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
                if (cs.dpl > cs_rpl)
                        return false;
        } else {
                if (cs.dpl != cs_rpl)
                        return false;
        }
        if (!cs.present)
                return false;

        /*
         * TODO: Add a reserved-field check; this will require a new member
         * in the kvm_segment_field structure.
         */
        return true;
}

static bool stack_segment_valid(struct kvm_vcpu *vcpu)
{
        struct kvm_segment ss;
        unsigned int ss_rpl;

        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
        ss_rpl = ss.selector & SEGMENT_RPL_MASK;

        if (ss.unusable)
                return true;
        if (ss.type != 3 && ss.type != 7)
                return false;
        if (!ss.s)
                return false;
        if (ss.dpl != ss_rpl) /* DPL != RPL */
                return false;
        if (!ss.present)
                return false;

        return true;
}

static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
        struct kvm_segment var;
        unsigned int rpl;

        vmx_get_segment(vcpu, &var, seg);
        rpl = var.selector & SEGMENT_RPL_MASK;

        if (var.unusable)
                return true;
        if (!var.s)
                return false;
        if (!var.present)
                return false;
        if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
                if (var.dpl < rpl) /* DPL < RPL */
                        return false;
        }

        /*
         * TODO: Add other members to kvm_segment_field to allow checking for
         * other access-rights flags.
         */
        return true;
}

static bool tr_valid(struct kvm_vcpu *vcpu)
{
        struct kvm_segment tr;

        vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);

        if (tr.unusable)
                return false;
        if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
                return false;
        if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
                return false;
        if (!tr.present)
                return false;

        return true;
}

static bool ldtr_valid(struct kvm_vcpu *vcpu)
{
        struct kvm_segment ldtr;

        vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);

        if (ldtr.unusable)
                return true;
        if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
                return false;
        if (ldtr.type != 2)
                return false;
        if (!ldtr.present)
                return false;

        return true;
}

static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
{
        struct kvm_segment cs, ss;

        vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
        vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);

        return ((cs.selector & SEGMENT_RPL_MASK) ==
                 (ss.selector & SEGMENT_RPL_MASK));
}

/*
 * Check if the guest state is valid.  Returns true if valid, false if not.
 * We assume that registers are always usable.
 */
bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{
        /* real mode guest state checks */
        if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
                        return false;
                if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
                        return false;
                if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
                        return false;
                if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
                        return false;
                if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
                        return false;
                if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
                        return false;
        } else {
                /* protected mode guest state checks */
                if (!cs_ss_rpl_check(vcpu))
                        return false;
                if (!code_segment_valid(vcpu))
                        return false;
                if (!stack_segment_valid(vcpu))
                        return false;
                if (!data_segment_valid(vcpu, VCPU_SREG_DS))
                        return false;
                if (!data_segment_valid(vcpu, VCPU_SREG_ES))
                        return false;
                if (!data_segment_valid(vcpu, VCPU_SREG_FS))
                        return false;
                if (!data_segment_valid(vcpu, VCPU_SREG_GS))
                        return false;
                if (!tr_valid(vcpu))
                        return false;
                if (!ldtr_valid(vcpu))
                        return false;
        }
        /* TODO:
         * - Add checks on RIP
         * - Add checks on RFLAGS
         */

        return true;
}

static int init_rmode_tss(struct kvm *kvm, void __user *ua)
{
        const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
        u16 data;
        int i;

        for (i = 0; i < 3; i++) {
                if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
                        return -EFAULT;
        }

        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
        if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
                return -EFAULT;

        data = ~0;
        if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
                return -EFAULT;

        return 0;
}
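
/*
 * Layout sketch of the three-page TSS initialized above: the pages are
 * zeroed, the I/O bitmap base field is pointed at TSS_BASE_SIZE +
 * TSS_REDIRECTION_SIZE, and the final byte is set to 0xff because the SDM
 * requires the I/O permission bitmap to be terminated by a byte with all
 * bits set.
 */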

static int init_rmode_identity_map(struct kvm *kvm)
{
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
        int i, r = 0;
        void __user *uaddr;
        u32 tmp;

        /* Protect kvm_vmx->ept_identity_pagetable_done. */
        mutex_lock(&kvm->slots_lock);

        if (likely(kvm_vmx->ept_identity_pagetable_done))
                goto out;

        if (!kvm_vmx->ept_identity_map_addr)
                kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;

        uaddr = __x86_set_memory_region(kvm,
                                        IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
                                        kvm_vmx->ept_identity_map_addr,
                                        PAGE_SIZE);
        if (IS_ERR(uaddr)) {
                r = PTR_ERR(uaddr);
                goto out;
        }

        /* Set up identity-mapping pagetable for EPT in real mode */
        for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
                tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                        _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
                if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
                        r = -EFAULT;
                        goto out;
                }
        }
        kvm_vmx->ept_identity_pagetable_done = true;

out:
        mutex_unlock(&kvm->slots_lock);
        return r;
}
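
/*
 * Worked example of one PDE written above: for i == 1,
 * tmp == (1 << 22) | 0xe7 == 0x4000e7, a present/rw/user/accessed/dirty
 * 4 MiB PSE page mapping GPA 0x400000 to itself.  The 1024 entries of this
 * single page-directory page identity-map the full 4 GiB of guest-physical
 * space for the non-PAE paging used while emulating real mode.
 */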

static void seg_setup(int seg)
{
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
        unsigned int ar;

        vmcs_write16(sf->selector, 0);
        vmcs_writel(sf->base, 0);
        vmcs_write32(sf->limit, 0xffff);
        ar = 0x93;
        if (seg == VCPU_SREG_CS)
                ar |= 0x08; /* code segment */

        vmcs_write32(sf->ar_bytes, ar);
}

int allocate_vpid(void)
{
        int vpid;

        if (!enable_vpid)
                return 0;
        spin_lock(&vmx_vpid_lock);
        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
        if (vpid < VMX_NR_VPIDS)
                __set_bit(vpid, vmx_vpid_bitmap);
        else
                vpid = 0;
        spin_unlock(&vmx_vpid_lock);
        return vpid;
}
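
/*
 * Note that vpid 0 doubles as the error value: VPID 0 is architecturally
 * reserved for VMX root operation (VM-Entry fails with "enable VPID" set
 * and vpid == 0), and bit 0 of vmx_vpid_bitmap is reserved at init so it
 * is never allocated.  Callers treat a 0 return as "no vpid" and fall back
 * to flushing on every VM-Entry.
 */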

void free_vpid(int vpid)
{
        if (!enable_vpid || vpid == 0)
                return;
        spin_lock(&vmx_vpid_lock);
        __clear_bit(vpid, vmx_vpid_bitmap);
        spin_unlock(&vmx_vpid_lock);
}

static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
        /*
         * When KVM is a nested hypervisor on top of Hyper-V and uses the
         * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
         * bitmap has changed.
         */
        if (kvm_is_using_evmcs()) {
                struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

                if (evmcs->hv_enlightenments_control.msr_bitmap)
                        evmcs->hv_clean_fields &=
                                ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
        }

        vmx->nested.force_msr_bitmap_recalc = true;
}

void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;

        if (!cpu_has_vmx_msr_bitmap())
                return;

        vmx_msr_bitmap_l01_changed(vmx);

        if (type & MSR_TYPE_R) {
                if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
                        vmx_clear_msr_bitmap_read(msr_bitmap, msr);
                else
                        vmx_set_msr_bitmap_read(msr_bitmap, msr);
        }

        if (type & MSR_TYPE_W) {
                if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
                        vmx_clear_msr_bitmap_write(msr_bitmap, msr);
                else
                        vmx_set_msr_bitmap_write(msr_bitmap, msr);
        }
}

static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
        /*
         * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
         * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
         * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
         */
        const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
        const int write_idx = read_idx + (0x800 / sizeof(u64));
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
        u8 mode;

        if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
                return;

        if (cpu_has_secondary_exec_ctrls() &&
            (secondary_exec_controls_get(vmx) &
             SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
                mode = MSR_BITMAP_MODE_X2APIC;
                if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
                        mode |= MSR_BITMAP_MODE_X2APIC_APICV;
        } else {
                mode = 0;
        }

        if (mode == vmx->x2apic_msr_bitmap_mode)
                return;

        vmx->x2apic_msr_bitmap_mode = mode;

        /*
         * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
         * registers (0x840 and above) intercepted, as KVM doesn't support
         * them.  Intercept all writes by default and poke holes as needed.
         * Pass through reads for all valid registers by default in
         * x2APIC+APICv mode; only the current timer count needs on-demand
         * emulation by KVM.
         */
        if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
                msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
        else
                msr_bitmap[read_idx] = ~0ull;
        msr_bitmap[write_idx] = ~0ull;

        /*
         * TPR reads and writes can be virtualized even if virtual interrupt
         * delivery is not in use.
         */
        vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
                                  !(mode & MSR_BITMAP_MODE_X2APIC));

        if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
                vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
                vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
                vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
                if (enable_ipiv)
                        vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
        }
}
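
/*
 * For reference, the 4 KiB MSR bitmap manipulated above is split into four
 * 1 KiB quarters: read-low (MSRs 0x0 - 0x1fff), read-high (0xc0000000 -
 * 0xc0001fff), write-low, and write-high.  The x2APIC MSRs 0x800 - 0x8ff
 * live in the low range, so read_idx lands in the first quarter and
 * write_idx, 0x800 bytes (256 u64s) later, in the third.
 */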

void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
        u32 i;

        vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
        vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
        vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
        vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
        for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
        }
}

static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
{
        u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
                                    VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;
        bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu);
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool intercept = !has_mediated_pmu;
        int i;

        if (!enable_mediated_pmu)
                return;

        if (!cpu_has_save_perf_global_ctrl()) {
                vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL;

                if (has_mediated_pmu)
                        vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
                else
                        vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL);
        }

        vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
                                    has_mediated_pmu);

        vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu);

        for (i = 0; i < pmu->nr_arch_gp_counters; i++) {
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i,
                                          MSR_TYPE_RW, intercept);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW,
                                          intercept || !fw_writes_is_enabled(vcpu));
        }
        for ( ; i < kvm_pmu_cap.num_counters_gp; i++) {
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i,
                                          MSR_TYPE_RW, true);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i,
                                          MSR_TYPE_RW, true);
        }

        for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
                vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i,
                                          MSR_TYPE_RW, intercept);
        for ( ; i < kvm_pmu_cap.num_counters_fixed; i++)
                vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i,
                                          MSR_TYPE_RW, true);

        intercept = kvm_need_perf_global_ctrl_intercept(vcpu);
        vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS,
                                  MSR_TYPE_RW, intercept);
        vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                  MSR_TYPE_RW, intercept);
        vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
                                  MSR_TYPE_RW, intercept);
}

static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
        bool intercept;

        if (!cpu_has_vmx_msr_bitmap())
                return;

        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
#ifdef CONFIG_X86_64
        vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
        vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
        if (kvm_cstate_in_guest(vcpu->kvm)) {
                vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
        }
        if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
                vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
                vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
        }

        /* PT MSRs can be passed through iff PT is exposed to the guest. */
        if (vmx_pt_mode_is_host_guest())
                pt_update_intercept_for_msr(vcpu);

        if (vcpu->arch.xfd_no_write_intercept)
                vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);

        vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
                                  !to_vmx(vcpu)->spec_ctrl);

        if (kvm_cpu_cap_has(X86_FEATURE_XFD))
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
                                          !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));

        if (cpu_feature_enabled(X86_FEATURE_IBPB))
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
                                          !guest_has_pred_cmd_msr(vcpu));

        if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
                                          !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

        if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
                intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);

                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
        }

        if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
                intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
                            !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);

                vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
                vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
        }

        vmx_recalc_pmu_msr_intercepts(vcpu);

        /*
         * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
         * filtered by userspace.
         */
}

static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
        exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING,
                                kvm_need_rdpmc_intercept(vcpu));
}

void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
{
        vmx_recalc_instruction_intercepts(vcpu);
        vmx_recalc_msr_intercepts(vcpu);
}

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
                                                int vector)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
         * and freed, and must not be accessed outside of vcpu->mutex.  The
         * vCPU's cached PI NV is valid if and only if posted interrupts are
         * enabled in its vmcs12, i.e. checking the vector also checks that
         * L1 has enabled posted interrupts for L2.
         */
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
                /*
                 * If the posted interrupt is not recognized by hardware,
                 * it will be delivered on the next VM-Entry.
                 */
                vmx->nested.pi_pending = true;
                kvm_make_request(KVM_REQ_EVENT, vcpu);

                /*
                 * This pairs with the smp_mb_*() after setting vcpu->mode in
                 * vcpu_enter_guest() to guarantee the vCPU sees the event
                 * request if triggering a posted interrupt "fails" because
                 * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
                 * the smp_wmb() in kvm_make_request() only ensures everything
                 * done before making the request is visible when the request
                 * is visible, it doesn't ensure ordering between the store to
                 * vcpu->requests and the load from vcpu->mode.
                 */
                smp_mb__after_atomic();

                /* the PIR and ON have been set by L1. */
                kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
                return 0;
        }
        return -1;
}
/*
 * Send an interrupt to the vcpu via posted interrupt.
 * 1. If the target vcpu is running (non-root mode), send a posted interrupt
 * notification to the vcpu and hardware will sync the PIR to the vIRR
 * atomically.
 * 2. If the target vcpu isn't running (root mode), kick it to pick up the
 * interrupt from the PIR on the next vmentry.
 */
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
        struct vcpu_vt *vt = to_vt(vcpu);
        int r;

        r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
        if (!r)
                return 0;

        /* Note, this is called iff the local APIC is in-kernel. */
        if (!vcpu->arch.apic->apicv_active)
                return -1;

        __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
        return 0;
}

void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
                           int trig_mode, int vector)
{
        struct kvm_vcpu *vcpu = apic->vcpu;

        if (vmx_deliver_posted_interrupt(vcpu, vector)) {
                kvm_lapic_set_irr(vector, apic);
                kvm_make_request(KVM_REQ_EVENT, vcpu);
                kvm_vcpu_kick(vcpu);
        } else {
                trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
                                           trig_mode, vector);
        }
}

/*
 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 * will not change in the lifetime of the guest.
 * Note that host-state that does change is set elsewhere. E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
        u32 low32, high32;
        unsigned long tmpl;
        unsigned long cr0, cr3, cr4;

        cr0 = read_cr0();
        WARN_ON(cr0 & X86_CR0_TS);
        vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

        /*
         * Save the most likely value for this task's CR3 in the VMCS.
         * We can't use __get_current_cr3_fast() because we're not atomic.
         */
        cr3 = __read_cr3();
        vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
        vmx->loaded_vmcs->host_state.cr3 = cr3;

        /* Save the most likely value for this task's CR4 in the VMCS. */
        cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
        vmx->loaded_vmcs->host_state.cr4 = cr4;

        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
        /*
         * Load null selectors, so we can avoid reloading them in
         * vmx_prepare_switch_to_host(), in case userspace uses
         * the null selectors too (the expected case).
         */
        vmcs_write16(HOST_DS_SELECTOR, 0);
        vmcs_write16(HOST_ES_SELECTOR, 0);
#else
        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

        vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */

        vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

        rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
        vmcs_write32(HOST_IA32_SYSENTER_CS, low32);

        /*
         * SYSENTER is used for 32-bit system calls on either 32-bit or
         * 64-bit kernels.  It is always zero if neither is allowed, otherwise
         * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
         * have already done so!).
         */
        if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
                vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);

        rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

        if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
        }

        if (cpu_has_load_ia32_efer())
                vmcs_write64(HOST_IA32_EFER, kvm_host.efer);

        /*
         * Supervisor shadow stack is not enabled on the host side, i.e.
         * the host IA32_S_CET.SHSTK_EN bit is guaranteed to be 0 now.  Per
         * the SDM description of the RDSSP instruction, SSP is not readable
         * in CPL0, so resetting the two registers to 0 at VM-Exit does no
         * harm to kernel execution.  When the execution flow exits to
         * userspace, SSP is reloaded from IA32_PL3_SSP.  See SDM Vol. 2A/B,
         * Chapters 3 and 4, for details.
         */
        if (cpu_has_load_cet_ctrl()) {
                vmcs_writel(HOST_S_CET, kvm_host.s_cet);
                vmcs_writel(HOST_SSP, 0);
                vmcs_writel(HOST_INTR_SSP_TABLE, 0);
        }

        /*
         * When running a guest with a mediated PMU, guest state is resident in
         * hardware after VM-Exit.  Zero PERF_GLOBAL_CTRL on exit so that host
         * activity doesn't bleed into the guest counters.  When running with
         * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every
         * entry/exit to merge guest and host PMU usage.
         */
        if (enable_mediated_pmu)
                vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0);
}

void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
        struct kvm_vcpu *vcpu = &vmx->vcpu;

        vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
                                          ~vcpu->arch.cr4_guest_rsvd_bits;
        if (!enable_ept) {
                vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
                vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
        }
        if (is_guest_mode(&vmx->vcpu))
                vcpu->arch.cr4_guest_owned_bits &=
                        ~get_vmcs12(vcpu)->cr4_guest_host_mask;
        vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}

static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
        u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

        if (!kvm_vcpu_apicv_active(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;

        if (!enable_vnmi)
                pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

        if (!enable_preemption_timer)
                pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;

        return pin_based_exec_ctrl;
}

static u32 vmx_get_initial_vmentry_ctrl(void)
{
        u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;

        if (vmx_pt_mode_is_system())
                vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
                                  VM_ENTRY_LOAD_IA32_RTIT_CTL);
        /*
         * IA32e mode and the loading of EFER and PERF_GLOBAL_CTRL are
         * toggled dynamically.
         */
        vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
                          VM_ENTRY_LOAD_IA32_EFER |
                          VM_ENTRY_IA32E_MODE);

        return vmentry_ctrl;
}

static u32 vmx_get_initial_vmexit_ctrl(void)
{
        u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;

        /*
         * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
         * nested virtualization and thus allowed to be set in vmcs12.
         */
        vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
                         VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);

        if (vmx_pt_mode_is_system())
                vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
                                 VM_EXIT_CLEAR_IA32_RTIT_CTL);
        /* Loading of EFER and PERF_GLOBAL_CTRL is toggled dynamically */
        return vmexit_ctrl &
                ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER |
                  VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL);
}

void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        guard(vmx_vmcs01)(vcpu);

        pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

        secondary_exec_controls_changebit(vmx,
                                          SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                          SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
                                          kvm_vcpu_apicv_active(vcpu));
        if (enable_ipiv)
                tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
                                                 kvm_vcpu_apicv_active(vcpu));

        vmx_update_msr_bitmap_x2apic(vcpu);
}

static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

        /*
         * Not used by KVM, but fully supported for nesting, i.e. allowed in
         * vmcs12 and propagated to vmcs02 when set in vmcs12.
         */
        exec_control &= ~(CPU_BASED_RDTSC_EXITING |
                          CPU_BASED_USE_IO_BITMAPS |
                          CPU_BASED_MONITOR_TRAP_FLAG |
                          CPU_BASED_PAUSE_EXITING);

        /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
        exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
                          CPU_BASED_NMI_WINDOW_EXITING);

        if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
                exec_control &= ~CPU_BASED_MOV_DR_EXITING;

        if (!cpu_need_tpr_shadow(&vmx->vcpu))
                exec_control &= ~CPU_BASED_TPR_SHADOW;

#ifdef CONFIG_X86_64
        if (exec_control & CPU_BASED_TPR_SHADOW)
                exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
                                  CPU_BASED_CR8_STORE_EXITING);
        else
                exec_control |= CPU_BASED_CR8_STORE_EXITING |
                                CPU_BASED_CR8_LOAD_EXITING;
#endif
        /* No need to intercept CR3 access or INVLPG when using EPT. */
        if (enable_ept)
                exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
                                  CPU_BASED_CR3_STORE_EXITING |
                                  CPU_BASED_INVLPG_EXITING);
        if (kvm_mwait_in_guest(vmx->vcpu.kvm))
                exec_control &= ~(CPU_BASED_MWAIT_EXITING |
                                CPU_BASED_MONITOR_EXITING);
        if (kvm_hlt_in_guest(vmx->vcpu.kvm))
                exec_control &= ~CPU_BASED_HLT_EXITING;
        return exec_control;
}

static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
{
        u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;

        /*
         * IPI virtualization relies on APICv. Disable IPI virtualization if
         * APICv is inhibited.
         */
        if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
                exec_control &= ~TERTIARY_EXEC_IPI_VIRT;

        return exec_control;
}

/*
 * Adjust a single secondary execution control bit to intercept/allow an
 * instruction in the guest.  This is usually done based on whether or not a
 * feature has been exposed to the guest in order to correctly emulate faults.
 */
static inline void
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
                                  u32 control, bool enabled, bool exiting)
{
        /*
         * If the control is for an opt-in feature, clear the control if the
         * feature is not exposed to the guest, i.e. not enabled.  If the
         * control is opt-out, i.e. an exiting control, clear the control if
         * the feature _is_ exposed to the guest, i.e. exiting/interception is
         * disabled for the associated instruction.  Note, the caller is
         * responsible for presetting exec_control to set all supported bits.
         */
        if (enabled == exiting)
                *exec_control &= ~control;

        /*
         * Update the nested MSR settings so that a nested VMM can/can't set
         * controls for features that are/aren't exposed to the guest.
         */
        if (nested &&
            kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
                /*
                 * All features that can be added to or removed from VMX MSRs must
                 * be supported in the first place for nested virtualization.
                 */
                if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
                        enabled = false;

                if (enabled)
                        vmx->nested.msrs.secondary_ctls_high |= control;
                else
                        vmx->nested.msrs.secondary_ctls_high &= ~control;
        }
}

/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting)     \
({                                                                                              \
        struct kvm_vcpu *__vcpu = &(vmx)->vcpu;                                                 \
        bool __enabled;                                                                         \
                                                                                                \
        if (cpu_has_vmx_##name()) {                                                             \
                __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name);                 \
                vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
                                                  __enabled, exiting);                          \
        }                                                                                       \
})

/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
        vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
        vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)

static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
        struct kvm_vcpu *vcpu = &vmx->vcpu;

        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

        if (vmx_pt_mode_is_system())
                exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
        if (!cpu_need_virtualize_apic_accesses(vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
        if (vmx->vpid == 0)
                exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
        if (!enable_ept) {
                exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
                exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
                enable_unrestricted_guest = 0;
        }
        if (!enable_unrestricted_guest)
                exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
        if (kvm_pause_in_guest(vmx->vcpu.kvm))
                exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
        if (!kvm_vcpu_apicv_active(vcpu))
                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

        /*
         * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
         * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
         */
        exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;

        /*
         * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP in
         * vmx_set_cr4().
         */
        exec_control &= ~SECONDARY_EXEC_DESC;

        /*
         * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
         * (handle_vmptrld).  Shadow VMCS can't be enabled here because there
         * is no current VMCS12 yet.
         */
        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

        /*
         * PML is enabled/disabled when dirty logging of memslots changes, but
         * it needs to be set here when dirty logging is already active, e.g.
         * if this vCPU was created after dirty logging was enabled.
         */
        if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

        vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);

        /*
         * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
         * feature is exposed to the guest.  This creates a virtualization hole
         * if both are supported in hardware but only one is exposed to the
         * guest, but letting the guest execute RDTSCP or RDPID when either one
         * is advertised is preferable to emulating the advertised instruction
         * in KVM on #UD, and obviously better than incorrectly injecting #UD.
         */
        if (cpu_has_vmx_rdtscp()) {
                bool rdpid_or_rdtscp_enabled =
                        guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
                        guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);

                vmx_adjust_secondary_exec_control(vmx, &exec_control,
                                                  SECONDARY_EXEC_ENABLE_RDTSCP,
                                                  rdpid_or_rdtscp_enabled, false);
        }

        vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);

        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

        vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
                                    ENABLE_USR_WAIT_PAUSE, false);

        if (!vcpu->kvm->arch.bus_lock_detection_enabled)
                exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;

        if (!kvm_notify_vmexit_enabled(vcpu->kvm))
                exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;

        return exec_control;
}

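/*
 * The IPI virtualization PID-pointer table holds one entry per vCPU ID;
 * compute the page allocation order needed to cover max_vcpu_ids entries.
 */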
static inline int vmx_get_pid_table_order(struct kvm *kvm)
{
        return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
}

static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
{
        struct page *pages;
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

        if (!irqchip_in_kernel(kvm) || !enable_ipiv)
                return 0;

        if (kvm_vmx->pid_table)
                return 0;

        pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
                            vmx_get_pid_table_order(kvm));
        if (!pages)
                return -ENOMEM;

        kvm_vmx->pid_table = (void *)page_address(pages);
        return 0;
}

int vmx_vcpu_precreate(struct kvm *kvm)
{
        return vmx_alloc_ipiv_pid_table(kvm);
}

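/*
 * An XSS-exiting bitmap of 0 means XSAVES/XRSTORS never VM-Exit based on
 * XSS bits.
 */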
#define VMX_XSS_EXIT_BITMAP 0

static void init_vmcs(struct vcpu_vmx *vmx)
{
        struct kvm *kvm = vmx->vcpu.kvm;
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

        if (nested)
                nested_vmx_set_vmcs_shadowing_bitmap();

        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

        vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */

        /* Control */
        pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

        exec_controls_set(vmx, vmx_exec_control(vmx));

        if (cpu_has_secondary_exec_ctrls()) {
                secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
                if (vmx->ve_info)
                        vmcs_write64(VE_INFORMATION_ADDRESS,
                                     __pa(vmx->ve_info));
        }

        if (cpu_has_tertiary_exec_ctrls())
                tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));

        if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
                vmcs_write64(EOI_EXIT_BITMAP3, 0);

                vmcs_write16(GUEST_INTR_STATUS, 0);

                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
                vmcs_write64(POSTED_INTR_DESC_ADDR, __pa(&vmx->vt.pi_desc));
        }

        if (vmx_can_use_ipiv(&vmx->vcpu)) {
                vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
                vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
        }

        if (!kvm_pause_in_guest(kvm)) {
                vmcs_write32(PLE_GAP, ple_gap);
                vmx->ple_window = ple_window;
                vmx->ple_window_dirty = true;
        }

        if (kvm_notify_vmexit_enabled(kvm))
                vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
        vmx_set_constant_host_state(vmx);
        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);

        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val));
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

        vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());

        /* 22.2.1, 20.8.1 */
        vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());

        vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

        set_cr4_guest_host_mask(vmx);

        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

        if (cpu_has_vmx_xsaves())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

        if (enable_pml) {
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
        }

        vmx_write_encls_bitmap(&vmx->vcpu, NULL);

        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bits 6:0 are forced to 1; writes to them are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
                vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
        }

        vmcs_write32(GUEST_SYSENTER_CS, 0);
        vmcs_writel(GUEST_SYSENTER_ESP, 0);
        vmcs_writel(GUEST_SYSENTER_EIP, 0);

        vmx_guest_debugctl_write(&vmx->vcpu, 0);

        if (cpu_has_vmx_tpr_shadow()) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
                if (cpu_need_tpr_shadow(&vmx->vcpu))
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
                                     __pa(vmx->vcpu.arch.apic->regs));
                vmcs_write32(TPR_THRESHOLD, 0);
        }

        vmx_setup_uret_msrs(vmx);
}

static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        init_vmcs(vmx);

        if (nested &&
            kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
                memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));

        vcpu_setup_sgx_lepubkeyhash(vcpu);

        vmx->nested.posted_intr_nv = -1;
        vmx->nested.vmxon_ptr = INVALID_GPA;
        vmx->nested.current_vmptr = INVALID_GPA;

#ifdef CONFIG_KVM_HYPERV
        vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
#endif

        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
                vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

        /*
         * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
         * or POSTED_INTR_WAKEUP_VECTOR.
         */
        vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
        __pi_set_sn(&vmx->vt.pi_desc);
}

void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!init_event)
                __vmx_vcpu_reset(vcpu);

        vmx->rmode.vm86_active = 0;
        vmx->spec_ctrl = 0;

        vmx->msr_ia32_umwait_control = 0;

        vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);

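        /*
         * Architectural RESET state: CS.selector = 0xf000 with CS.base =
         * 0xffff0000, so the first instruction fetch (RIP = 0xfff0) targets
         * physical address 0xfffffff0.
         */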
        seg_setup(VCPU_SREG_CS);
        vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
        vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

        seg_setup(VCPU_SREG_DS);
        seg_setup(VCPU_SREG_ES);
        seg_setup(VCPU_SREG_FS);
        seg_setup(VCPU_SREG_GS);
        seg_setup(VCPU_SREG_SS);

        vmcs_write16(GUEST_TR_SELECTOR, 0);
        vmcs_writel(GUEST_TR_BASE, 0);
        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
        vmcs_writel(GUEST_LDTR_BASE, 0);
        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

        vmcs_writel(GUEST_IDTR_BASE, 0);
        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

        vmx_segment_cache_clear(vmx);
        kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);

        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
        if (kvm_mpx_supported())
                vmcs_write64(GUEST_BNDCFGS, 0);

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

        if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
                vmcs_writel(GUEST_SSP, 0);
                vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
        }
        if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
            kvm_cpu_cap_has(X86_FEATURE_SHSTK))
                vmcs_writel(GUEST_S_CET, 0);

        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

        vpid_sync_context(vmx->vpid);

        vmx_update_fb_clear_dis(vcpu, vmx);
}

void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}

void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
        if (!enable_vnmi ||
            vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
                vmx_enable_irq_window(vcpu);
                return;
        }

        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}

void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        uint32_t intr;
        int irq = vcpu->arch.interrupt.nr;

        trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);

        ++vcpu->stat.irq_injections;
        if (vmx->rmode.vm86_active) {
                int inc_eip = 0;
                if (vcpu->arch.interrupt.soft)
                        inc_eip = vcpu->arch.event_exit_inst_len;
                kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
                return;
        }
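        /*
         * Build the VM-Entry interruption-information field: vector in bits
         * 7:0, interruption type in bits 10:8, and the valid bit (bit 31).
         */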
        intr = irq | INTR_INFO_VALID_MASK;
        if (vcpu->arch.interrupt.soft) {
                intr |= INTR_TYPE_SOFT_INTR;
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                             vmx->vcpu.arch.event_exit_inst_len);
        } else
                intr |= INTR_TYPE_EXT_INTR;
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

        vmx_clear_hlt(vcpu);
}

void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!enable_vnmi) {
                /*
                 * Tracking the NMI-blocked state in software is built upon
                 * finding the next open IRQ window. This, in turn, depends on
                 * well-behaving guests: They have to keep IRQs disabled at
                 * least as long as the NMI handler runs. Otherwise we may
                 * cause NMI nesting, maybe breaking the guest. But as this is
                 * highly unlikely, we can live with the residual risk.
                 */
                vmx->loaded_vmcs->soft_vnmi_blocked = 1;
                vmx->loaded_vmcs->vnmi_blocked_time = 0;
        }

        ++vcpu->stat.nmi_injections;
        vmx->loaded_vmcs->nmi_known_unmasked = false;

        if (vmx->rmode.vm86_active) {
                kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
                return;
        }

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

        vmx_clear_hlt(vcpu);
}

bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool masked;

        if (!enable_vnmi)
                return vmx->loaded_vmcs->soft_vnmi_blocked;
        if (vmx->loaded_vmcs->nmi_known_unmasked)
                return false;
        masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
        vmx->loaded_vmcs->nmi_known_unmasked = !masked;
        return masked;
}

void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!enable_vnmi) {
                if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
                        vmx->loaded_vmcs->soft_vnmi_blocked = masked;
                        vmx->loaded_vmcs->vnmi_blocked_time = 0;
                }
        } else {
                vmx->loaded_vmcs->nmi_known_unmasked = !masked;
                if (masked)
                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                      GUEST_INTR_STATE_NMI);
                else
                        vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
                                        GUEST_INTR_STATE_NMI);
        }
}

bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
{
        if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
                return false;

        if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
                return true;

        return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
                 GUEST_INTR_STATE_NMI));
}

int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return -EBUSY;

        /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
                return -EBUSY;

        return !vmx_nmi_blocked(vcpu);
}

bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
        return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
               (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}

bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
        if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
                return false;

        return __vmx_interrupt_blocked(vcpu);
}

int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return -EBUSY;

        /*
         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
         * e.g. if the IRQ arrived asynchronously after checking nested events.
         */
        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
                return -EBUSY;

        return !vmx_interrupt_blocked(vcpu);
}

int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
        void __user *ret;

        if (enable_unrestricted_guest)
                return 0;

        mutex_lock(&kvm->slots_lock);
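        /* The TSS used for real-mode emulation spans three pages. */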
        ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
                                      PAGE_SIZE * 3);
        mutex_unlock(&kvm->slots_lock);

        if (IS_ERR(ret))
                return PTR_ERR(ret);

        to_kvm_vmx(kvm)->tss_addr = addr;

        return init_rmode_tss(kvm, ret);
}

int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
        to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
        return 0;
}

static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
        switch (vec) {
        case BP_VECTOR:
                /*
                 * Update instruction length as we may reinject the exception
                 * from user space while in guest debugging mode.
                 */
                to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
                        return false;
                fallthrough;
        case DB_VECTOR:
                return !(vcpu->guest_debug &
                        (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
        case DE_VECTOR:
        case OF_VECTOR:
        case BR_VECTOR:
        case UD_VECTOR:
        case DF_VECTOR:
        case SS_VECTOR:
        case GP_VECTOR:
        case MF_VECTOR:
                return true;
        }
        return false;
}

static int handle_rmode_exception(struct kvm_vcpu *vcpu,
                                  int vec, u32 err_code)
{
        /*
         * Instructions with the address-size override prefix (opcode 0x67)
         * cause a #SS fault with error code 0 in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
                if (kvm_emulate_instruction(vcpu, 0)) {
                        if (vcpu->arch.halt_request) {
                                vcpu->arch.halt_request = 0;
                                return kvm_emulate_halt_noskip(vcpu);
                        }
                        return 1;
                }
                return 0;
        }

        /*
         * Forward all other exceptions that are valid in real mode.
         * FIXME: Breaks guest debugging in real mode, needs to be fixed with
         *        the required debugging infrastructure rework.
         */
        kvm_queue_exception(vcpu, vec);
        return 1;
}

static int handle_machine_check(struct kvm_vcpu *vcpu)
{
        /* handled by vmx_vcpu_run() */
        return 1;
}

/*
 * If the host has split lock detection disabled, then #AC is
 * unconditionally injected into the guest, which is the pre split lock
 * detection behaviour.
 *
 * If the host has split lock detection enabled then #AC is
 * only injected into the guest when:
 *  - Guest CPL == 3 (user mode)
 *  - Guest has #AC detection enabled in CR0
 *  - Guest EFLAGS has AC bit set
 */
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{
        if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
                return true;

        return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
               (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}

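/*
 * An #NM is due to XFD (rather than CR0.TS) if the guest has a non-zero XFD
 * and CR0.TS is clear; only then is XFD_ERR relevant to the fault.
 */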
static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
{
        return vcpu->arch.guest_fpu.fpstate->xfd &&
               !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
}

static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code)
{
        unsigned long cr2 = vmx_get_exit_qual(vcpu);

        if (vcpu->arch.apf.host_apf_flags)
                goto handle_pf;

        /* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */
        WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr);

        /*
         * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX
         * flag set in the error code (SGX1 hardware generates #GP(0)).  EPCM
         * violations have nothing to do with shadow paging and can never be
         * resolved by KVM; always reflect them into the guest.
         */
        if (error_code & PFERR_SGX_MASK) {
                WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) ||
                             !cpu_feature_enabled(X86_FEATURE_SGX2));

                if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
                        kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
                else
                        kvm_inject_gp(vcpu, 0);
                return 1;
        }

        /*
         * If EPT is enabled, fixup and inject the #PF.  KVM intercepts #PFs
         * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due
         * to the GPA being legal with respect to host.MAXPHYADDR).
         */
        if (enable_ept) {
                kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
                return 1;
        }

handle_pf:
        return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}

static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_run *kvm_run = vcpu->run;
        u32 intr_info, ex_no, error_code;
        unsigned long dr6;
        u32 vect_info;

        vect_info = vmx->idt_vectoring_info;
        intr_info = vmx_get_intr_info(vcpu);

        /*
         * Machine checks are handled by handle_exception_irqoff(), or by
         * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
         * vmx_vcpu_enter_exit().
         */
        if (is_machine_check(intr_info) || is_nmi(intr_info))
                return 1;

        /*
         * Queue the exception here instead of in handle_nm_fault_irqoff().
         * This ensures the nested_vmx check is not skipped, so that the VM-Exit
         * can be reflected to L1 (when it intercepts #NM) before reaching this
         * point.
         */
        if (is_nm_fault(intr_info)) {
                kvm_queue_exception_p(vcpu, NM_VECTOR,
                                      is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
                return 1;
        }

        if (is_invalid_opcode(intr_info))
                return handle_ud(vcpu);

        if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
                struct vmx_ve_information *ve_info = vmx->ve_info;

                WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
                          "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
                dump_vmcs(vcpu);
                kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
                return 1;
        }

        error_code = 0;
        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

        if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
                WARN_ON_ONCE(!enable_vmware_backdoor);

                /*
                 * VMware backdoor emulation on #GP interception only handles
                 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
                 * error code on #GP.
                 */
                if (error_code) {
                        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
                        return 1;
                }
                return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
        }

        /*
         * A #PF with PFEC.RSVD = 1 indicates the guest is accessing MMIO;
         * it is better to report an internal error.
         * See the comments in vmx_handle_exit.
         */
        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
            !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
                vcpu->run->internal.ndata = 4;
                vcpu->run->internal.data[0] = vect_info;
                vcpu->run->internal.data[1] = intr_info;
                vcpu->run->internal.data[2] = error_code;
                vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        if (is_page_fault(intr_info))
                return vmx_handle_page_fault(vcpu, error_code);

        ex_no = intr_info & INTR_INFO_VECTOR_MASK;

        if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
                return handle_rmode_exception(vcpu, ex_no, error_code);

        switch (ex_no) {
        case DB_VECTOR:
                dr6 = vmx_get_exit_qual(vcpu);
                if (!(vcpu->guest_debug &
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
                        /*
                         * If the #DB was due to ICEBP, a.k.a. INT1, skip the
                         * instruction.  ICEBP generates a trap-like #DB, but
                         * despite its interception control being tied to #DB,
                         * is an instruction intercept, i.e. the VM-Exit occurs
                         * on the ICEBP itself.  Use the inner "skip" helper to
                         * avoid single-step #DB and MTF updates, as ICEBP is
                         * higher priority.  Note, skipping ICEBP still clears
                         * STI and MOVSS blocking.
                         *
                         * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
                         * if single-step is enabled in RFLAGS and STI or MOVSS
                         * blocking is active, as the CPU doesn't set the bit
                         * on VM-Exit due to #DB interception.  VM-Entry has a
                         * consistency check that a single-step #DB is pending
                         * in this scenario as the previous instruction cannot
                         * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
                         * don't modify RFLAGS), therefore the one instruction
                         * delay when activating single-step breakpoints must
                         * have already expired.  Note, the CPU sets/clears BS
                         * as appropriate for all other VM-Exits types.
                         */
                        if (is_icebp(intr_info))
                                WARN_ON(!skip_emulated_instruction(vcpu));
                        else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
                                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                                  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
                                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                                            vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);

                        kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                        return 1;
                }
                kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
                fallthrough;
        case BP_VECTOR:
                /*
                 * Update instruction length as we may reinject #BP from
                 * user space while in guest debugging mode. Reading it for
                 * #DB as well causes no harm, it is not used in that case.
                 */
                vmx->vcpu.arch.event_exit_inst_len =
                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                kvm_run->debug.arch.exception = ex_no;
                break;
        case AC_VECTOR:
                if (vmx_guest_inject_ac(vcpu)) {
                        kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
                        return 1;
                }

                /*
                 * Handle split lock. Depending on detection mode this will
                 * either warn and disable split lock detection for this
                 * task or force SIGBUS on it.
                 */
                if (handle_guest_split_lock(kvm_rip_read(vcpu)))
                        return 1;
                fallthrough;
        default:
                kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
                kvm_run->ex.exception = ex_no;
                kvm_run->ex.error_code = error_code;
                break;
        }
        return 0;
}

static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
        ++vcpu->stat.irq_exits;
        return 1;
}

static int handle_triple_fault(struct kvm_vcpu *vcpu)
{
        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
        vcpu->mmio_needed = 0;
        return 0;
}

static int handle_io(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification;
        int size, in, string;
        unsigned port;

        exit_qualification = vmx_get_exit_qual(vcpu);
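        /*
         * Decode the I/O-instruction exit qualification: bits 2:0 hold the
         * access size minus one, bit 3 is set for IN/INS, bit 4 for string
         * instructions, and bits 31:16 hold the port number.
         */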
        string = (exit_qualification & 16) != 0;

        ++vcpu->stat.io_exits;

        if (string)
                return kvm_emulate_instruction(vcpu, 0);

        port = exit_qualification >> 16;
        size = (exit_qualification & 7) + 1;
        in = (exit_qualification & 8) != 0;

        return kvm_fast_pio(vcpu, size, port, in);
}

void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        /* Patch in the VMCALL instruction (opcode 0f 01 c1). */
        hypercall[0] = 0x0f;
        hypercall[1] = 0x01;
        hypercall[2] = 0xc1;
}

/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
        if (is_guest_mode(vcpu)) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
                unsigned long orig_val = val;

                /*
                 * We get here when L2 changed cr0 in a way that did not change
                 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
                 * but did change L0 shadowed bits. So we first calculate the
                 * effective cr0 value that L1 would like to write into the
                 * hardware. It consists of the L2-owned bits from the new
                 * value combined with the L1-owned bits from L1's guest_cr0.
                 */
                val = (val & ~vmcs12->cr0_guest_host_mask) |
                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);

                if (kvm_set_cr0(vcpu, val))
                        return 1;
                vmcs_writel(CR0_READ_SHADOW, orig_val);
                return 0;
        } else {
                return kvm_set_cr0(vcpu, val);
        }
}

static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
{
        if (is_guest_mode(vcpu)) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
                unsigned long orig_val = val;

                /* analogously to handle_set_cr0 */
                val = (val & ~vmcs12->cr4_guest_host_mask) |
                        (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
                if (kvm_set_cr4(vcpu, val))
                        return 1;
                vmcs_writel(CR4_READ_SHADOW, orig_val);
                return 0;
        } else
                return kvm_set_cr4(vcpu, val);
}

static int handle_desc(struct kvm_vcpu *vcpu)
{
        /*
         * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
         * and other code needs to be updated if UMIP can be guest owned.
         */
        BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);

        WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
        return kvm_emulate_instruction(vcpu, 0);
}

static int handle_cr(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification, val;
        int cr;
        int reg;
        int err;
        int ret;

        exit_qualification = vmx_get_exit_qual(vcpu);
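        /*
         * Decode the CR-access exit qualification: bits 3:0 hold the control
         * register number, bits 5:4 the access type (0 = MOV to CR, 1 = MOV
         * from CR, 2 = CLTS, 3 = LMSW), and bits 11:8 the GPR operand.
         */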
        cr = exit_qualification & 15;
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
                val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
                        err = handle_set_cr0(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);

                        err = kvm_set_cr3(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 4:
                        err = handle_set_cr4(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 8: {
                                u8 cr8_prev = kvm_get_cr8(vcpu);
                                u8 cr8 = (u8)val;
                                err = kvm_set_cr8(vcpu, cr8);
                                ret = kvm_complete_insn_gp(vcpu, err);
                                if (lapic_in_kernel(vcpu))
                                        return ret;
                                if (cr8_prev <= cr8)
                                        return ret;
                                /*
                                 * TODO: we might be squashing a
                                 * KVM_GUESTDBG_SINGLESTEP-triggered
                                 * KVM_EXIT_DEBUG here.
                                 */
                                vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
                                return 0;
                        }
                }
                break;
        case 2: /* clts */
                KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
                return -EIO;
        case 1: /*mov from cr*/
                switch (cr) {
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);

                        val = kvm_read_cr3(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
                        return kvm_skip_emulated_instruction(vcpu);
                case 8:
                        val = kvm_get_cr8(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
                        return kvm_skip_emulated_instruction(vcpu);
                }
                break;
        case 3: /* lmsw */
                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
                trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
                kvm_lmsw(vcpu, val);

                return kvm_skip_emulated_instruction(vcpu);
        default:
                break;
        }
        vcpu->run->exit_reason = 0;
        vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
               (int)(exit_qualification >> 4) & 3, cr);
        return 0;
}

static int handle_dr(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification;
        int dr, dr7, reg;
        int err = 1;

        exit_qualification = vmx_get_exit_qual(vcpu);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

        /* First, if DR does not exist, trigger UD */
        if (!kvm_require_dr(vcpu, dr))
                return 1;

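        /*
         * MOV DR is privileged; leaving err == 1 injects #GP(0) at "out" when
         * the guest is at CPL > 0.
         */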
        if (vmx_get_cpl(vcpu) > 0)
                goto out;

        dr7 = vmcs_readl(GUEST_DR7);
        if (dr7 & DR7_GD) {
                /*
                 * As the vm-exit takes precedence over the debug trap, we
                 * need to emulate the latter, either for the host or the
                 * guest debugging itself.
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                        vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
                        vcpu->run->debug.arch.dr7 = dr7;
                        vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                        vcpu->run->debug.arch.exception = DB_VECTOR;
                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
                        kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
                        return 1;
                }
        }

        if (vcpu->guest_debug == 0) {
                exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }

        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
        if (exit_qualification & TYPE_MOV_FROM_DR) {
                kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
                err = 0;
        } else {
                err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
        }

out:
        return kvm_complete_insn_gp(vcpu, err);
}

void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        get_debugreg(vcpu->arch.db[3], 3);
        get_debugreg(vcpu->arch.dr6, 6);
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

        /*
         * exc_debug expects dr6 to be cleared after it runs; don't let it see
         * a stale dr6 from the guest.
         */
        set_debugreg(DR6_RESERVED, 6);
}

void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
        vmcs_writel(GUEST_DR7, val);
}

static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
        kvm_apic_update_ppr(vcpu);
        return 1;
}

static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
        exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);

        kvm_make_request(KVM_REQ_EVENT, vcpu);

        ++vcpu->stat.irq_window_exits;
        return 1;
}

static int handle_invlpg(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

        kvm_mmu_invlpg(vcpu, exit_qualification);
        return kvm_skip_emulated_instruction(vcpu);
}

static int handle_apic_access(struct kvm_vcpu *vcpu)
{
        if (likely(fasteoi)) {
                unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
                int access_type, offset;

                access_type = exit_qualification & APIC_ACCESS_TYPE;
                offset = exit_qualification & APIC_ACCESS_OFFSET;
                /*
                 * A sane guest uses MOV to write the EOI register and doesn't
                 * care about the written value, so short-circuit here to
                 * avoid heavy instruction emulation.
                 */
                if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
                    (offset == APIC_EOI)) {
                        kvm_lapic_set_eoi(vcpu);
                        return kvm_skip_emulated_instruction(vcpu);
                }
        }
        return kvm_emulate_instruction(vcpu, 0);
}

static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        int vector = exit_qualification & 0xff;

        /* EOI-induced VM-Exit is trap-like, thus there's no need to adjust RIP. */
        kvm_apic_set_eoi_accelerated(vcpu, vector);
        return 1;
}

static int handle_apic_write(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);

        /*
         * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
         * hardware has done any necessary aliasing, offset adjustments, etc...
         * for the access.  I.e. the correct value has already been written to
         * the vAPIC page for the correct 16-byte chunk.  KVM needs only to
         * retrieve the register value and emulate the access.
         */
        u32 offset = exit_qualification & 0xff0;

        kvm_apic_write_nodecode(vcpu, offset);
        return 1;
}

static int handle_task_switch(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification;
        bool has_error_code = false;
        u32 error_code = 0;
        u16 tss_selector;
        int reason, type, idt_v, idt_index;

        idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
        idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

        exit_qualification = vmx_get_exit_qual(vcpu);

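        /*
         * Bits 31:30 of the exit qualification encode the task-switch source
         * (0 = CALL, 1 = IRET, 2 = JMP, 3 = task gate in the IDT); bits 15:0
         * hold the new TSS selector.
         */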
        reason = (u32)exit_qualification >> 30;
        if (reason == TASK_SWITCH_GATE && idt_v) {
                switch (type) {
                case INTR_TYPE_NMI_INTR:
                        vcpu->arch.nmi_injected = false;
                        vmx_set_nmi_mask(vcpu, true);
                        break;
                case INTR_TYPE_EXT_INTR:
                case INTR_TYPE_SOFT_INTR:
                        kvm_clear_interrupt_queue(vcpu);
                        break;
                case INTR_TYPE_HARD_EXCEPTION:
                        if (vmx->idt_vectoring_info &
                            VECTORING_INFO_DELIVER_CODE_MASK) {
                                has_error_code = true;
                                error_code =
                                        vmcs_read32(IDT_VECTORING_ERROR_CODE);
                        }
                        fallthrough;
                case INTR_TYPE_SOFT_EXCEPTION:
                        kvm_clear_exception_queue(vcpu);
                        break;
                default:
                        break;
                }
        }
        tss_selector = exit_qualification;

        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
                       type != INTR_TYPE_EXT_INTR &&
                       type != INTR_TYPE_NMI_INTR))
                WARN_ON(!skip_emulated_instruction(vcpu));

        /*
         * TODO: What about debug traps on tss switch?
         *       Are we supposed to inject them and update dr6?
         */
        return kvm_task_switch(vcpu, tss_selector,
                               type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
                               reason, has_error_code, error_code);
}

static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
        gpa_t gpa;

        /*
         * If the EPT violation happened while executing IRET from an NMI,
         * the "blocked by NMI" bit has to be set before the next VM-Entry.
         * There are errata that may cause this bit to not be set: AAK134,
         * BY25.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        enable_vnmi &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(vcpu, gpa, exit_qualification);

        /*
         * Check that the GPA doesn't exceed physical memory limits, as that is
         * a guest page fault.  We have to emulate the instruction here, because
         * if the illegal address is that of a paging structure, then
         * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
         * would also use advanced VM-exit information for EPT violations to
         * reconstruct the page fault error code.
         */
        if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
                return kvm_emulate_instruction(vcpu, 0);

        return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}

static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
        gpa_t gpa;

        if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
                return 1;

        /*
         * A nested guest cannot optimize MMIO vmexits, because we have an
         * nGPA here instead of the required GPA.
         */
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        if (!is_guest_mode(vcpu) &&
            !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
                return kvm_skip_emulated_instruction(vcpu);
        }

        return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}

static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
        if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
                return -EIO;

        exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);

        return 1;
}

/*
 * Returns true if emulation is required (due to the vCPU having invalid state
 * with unrestricted guest mode disabled) and KVM can't faithfully emulate the
 * current vCPU state.
 */
static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!vmx->vt.emulation_required)
                return false;

        /*
         * It is architecturally impossible for emulation to be required when a
         * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
         * guest state is invalid and unrestricted guest is disabled, i.e. KVM
         * should synthesize VM-Fail instead of emulating L2 code.  This path is
         * only reachable if userspace modifies L2 guest state after KVM has
         * performed the nested VM-Enter consistency checks.
         */
        if (vmx->nested.nested_run_pending)
                return true;

        /*
         * KVM only supports emulating exceptions if the vCPU is in Real Mode.
         * If emulation is required, KVM can't perform a successful VM-Enter to
         * inject the exception.
         */
        return !vmx->rmode.vm86_active &&
               (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}

static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool intr_window_requested;
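        /* Arbitrary cap on the number of instructions emulated per call. */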
        unsigned count = 130;

        intr_window_requested = exec_controls_get(vmx) &
                                CPU_BASED_INTR_WINDOW_EXITING;

        while (vmx->vt.emulation_required && count-- != 0) {
                if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
                        return handle_interrupt_window(&vmx->vcpu);

                if (kvm_test_request(KVM_REQ_EVENT, vcpu))
                        return 1;

                /*
                 * Ensure that any updates to kvm->buses[] observed by the
                 * previous instruction (emulated or otherwise) are also
                 * visible to the instruction KVM is about to emulate.
                 */
                smp_rmb();

                if (!kvm_emulate_instruction(vcpu, 0))
                        return 0;

                if (vmx_unhandleable_emulation_required(vcpu)) {
                        kvm_prepare_emulation_failure_exit(vcpu);
                        return 0;
                }

                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
                        return kvm_emulate_halt_noskip(vcpu);
                }

                /*
                 * Note, return 1 and not 0, vcpu_run() will invoke
                 * xfer_to_guest_mode() which will create a proper return
                 * code.
                 */
                if (__xfer_to_guest_mode_work_pending())
                        return 1;
        }

        return 1;
}

int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
        if (vmx_unhandleable_emulation_required(vcpu)) {
                kvm_prepare_emulation_failure_exit(vcpu);
                return 0;
        }

        return 1;
}

/*
 * Indicate a busy-waiting vCPU in a spinlock.  KVM doesn't enable PAUSE
 * exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
        if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);

        /*
         * Intel SDM Vol 3, Section 25.1.3 says: The "PAUSE-loop exiting"
         * VM-execution control is ignored if CPL > 0.  OTOH, KVM never sets
         * PAUSE_EXITING and only sets PLE if supported, so the vCPU must be
         * at CPL=0 if it gets a PAUSE exit.
         */
        kvm_vcpu_on_spin(vcpu, true);
        return kvm_skip_emulated_instruction(vcpu);
}

static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
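        /* Nothing to do on a Monitor Trap Flag VM-Exit; resume the guest. */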
        return 1;
}

static int handle_invpcid(struct kvm_vcpu *vcpu)
{
        u32 vmx_instruction_info;
        unsigned long type;
        gva_t gva;
        struct {
                u64 pcid;
                u64 gla;
        } operand;
        int gpr_index;

        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }

        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
        type = kvm_register_read(vcpu, gpr_index);

        /*
         * According to the Intel instruction reference, the memory operand
         * is read even if it isn't needed (e.g., for type == all).
         */
        if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
                                vmx_instruction_info, false,
                                sizeof(operand), &gva))
                return 1;

        return kvm_handle_invpcid(vcpu, type, gva);
}

static int handle_pml_full(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification;

        trace_kvm_pml_full(vcpu->vcpu_id);

        exit_qualification = vmx_get_exit_qual(vcpu);

        /*
         * If the PML-buffer-full exit occurred while executing IRET from an
         * NMI, the "blocked by NMI" bit has to be set before the next VM entry.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        enable_vnmi &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                GUEST_INTR_STATE_NMI);

        /*
         * The PML buffer was already flushed at the beginning of the VM-Exit;
         * nothing to do here, and there's no userspace involvement needed for PML.
         */
        return 1;
}

static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
                                                   bool force_immediate_exit)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * In the *extremely* unlikely scenario that this is a spurious VM-Exit
         * due to the timer expiring while it was "soft" disabled, just eat the
         * exit and re-enter the guest.
         */
        if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
                return EXIT_FASTPATH_REENTER_GUEST;

        /*
         * If the timer expired because KVM used it to force an immediate exit,
         * then mission accomplished.
         */
        if (force_immediate_exit)
                return EXIT_FASTPATH_EXIT_HANDLED;

        /*
         * If L2 is active, go down the slow path as emulating the guest timer
         * expiration likely requires synthesizing a nested VM-Exit.
         */
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;

        kvm_lapic_expired_hv_timer(vcpu);
        return EXIT_FASTPATH_REENTER_GUEST;
}

static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
        /*
         * This non-fastpath handler is reached if and only if the preemption
         * timer was being used to emulate a guest timer while L2 is active.
         * All other scenarios are supposed to be handled in the fastpath.
         */
        WARN_ON_ONCE(!is_guest_mode(vcpu));
        kvm_lapic_expired_hv_timer(vcpu);
        return 1;
}

/*
 * When nested=0, all VMX instruction VM-Exits land here.  The handlers
 * are overwritten by nested_vmx_hardware_setup() when nested=1.
 */
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}

static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}

#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
        /*
         * SGX virtualization is disabled.  There is no software enable bit for
         * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
         * the guest from executing ENCLS (when SGX is supported by hardware).
         */
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}
#endif /* CONFIG_X86_SGX_KVM */

static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
        /*
         * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
         * VM-Exits. Unconditionally set the flag here and leave the handling to
         * vmx_handle_exit().
         */
        to_vt(vcpu)->exit_reason.bus_lock_detected = true;
        return 1;
}

static int handle_notify(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qual = vmx_get_exit_qual(vcpu);
        bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;

        ++vcpu->stat.notify_window_exits;

        /*
         * If the Notify VM-Exit occurred while executing IRET from an NMI,
         * the "blocked by NMI" bit has to be set before the next VM entry.
         */
        if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                              GUEST_INTR_STATE_NMI);

        if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
            context_invalid) {
                vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
                vcpu->run->notify.flags = context_invalid ?
                                          KVM_NOTIFY_CONTEXT_INVALID : 0;
                return 0;
        }

        return 1;
}

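/*
 * For the immediate forms of RDMSR and WRMSR, the MSR index is provided via
 * the exit qualification and the GPR operand is encoded in the VM-Exit
 * instruction information field.
 */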
static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
{
        return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
}

static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
                                     vmx_get_msr_imm_reg(vcpu));
}

static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
{
        return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
                                     vmx_get_msr_imm_reg(vcpu));
}

/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
        [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
        [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
        [EXIT_REASON_CR_ACCESS]               = handle_cr,
        [EXIT_REASON_DR_ACCESS]               = handle_dr,
        [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
        [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
        [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
        [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
        [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
        [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
        [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
        [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
        [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
        [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
        [EXIT_REASON_VMON]                    = handle_vmx_instruction,
        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
        [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
        [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
        [EXIT_REASON_LDTR_TR]                 = handle_desc,
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
        [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
        [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
        [EXIT_REASON_ENCLS]                   = handle_encls,
        [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
        [EXIT_REASON_NOTIFY]                  = handle_notify,
        [EXIT_REASON_SEAMCALL]                = handle_tdx_instruction,
        [EXIT_REASON_TDCALL]                  = handle_tdx_instruction,
        [EXIT_REASON_MSR_READ_IMM]            = handle_rdmsr_imm,
        [EXIT_REASON_MSR_WRITE_IMM]           = handle_wrmsr_imm,
};

static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);

void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
                       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        *reason = vmx->vt.exit_reason.full;
        *info1 = vmx_get_exit_qual(vcpu);
        if (!(vmx->vt.exit_reason.failed_vmentry)) {
                *info2 = vmx->idt_vectoring_info;
                *intr_info = vmx_get_intr_info(vcpu);
                if (is_exception_with_error_code(*intr_info))
                        *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
                else
                        *error_code = 0;
        } else {
                *info2 = 0;
                *intr_info = 0;
                *error_code = 0;
        }
}

void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
{
        *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
        if (is_exception_with_error_code(*intr_info))
                *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
        else
                *error_code = 0;
}

static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
        if (vmx->pml_pg) {
                __free_page(vmx->pml_pg);
                vmx->pml_pg = NULL;
        }
}

static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u16 pml_idx, pml_tail_index;
        u64 *pml_buf;
        int i;

        pml_idx = vmcs_read16(GUEST_PML_INDEX);

        /* Do nothing if PML buffer is empty */
        if (pml_idx == PML_HEAD_INDEX)
                return;
        /*
         * The PML index always points to the next available PML buffer entry,
         * unless the PML log has just overflowed.
         */
        pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;

        /*
         * The PML log is written backwards: the CPU first writes entry 511,
         * then entry 510, and so on.
         *
         * Read the entries in the same order they were written, to ensure that
         * the dirty ring is filled in the same order the CPU wrote them.
         */
        pml_buf = page_address(vmx->pml_pg);

        for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
                u64 gpa;

                gpa = pml_buf[i];
                WARN_ON(gpa & (PAGE_SIZE - 1));
                kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
        }

        /* reset PML index */
        vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}

static void nested_vmx_mark_all_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.apic_access_page_map);
        kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map);
        kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map);
}

static void vmx_dump_sel(char *name, uint32_t sel)
{
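        /*
         * The VMCS encodings of a segment's AR bytes, limit, and base fields
         * sit at fixed offsets from the encoding of its selector field, so
         * derive them from 'sel' using the ES fields as the reference.
         */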
        pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
               name, vmcs_read16(sel),
               vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
               vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
               vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
}

static void vmx_dump_dtsel(char *name, uint32_t limit)
{
        pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
               name, vmcs_read32(limit),
               vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}

static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
{
        unsigned int i;
        struct vmx_msr_entry *e;

        pr_err("MSR %s:\n", name);
        for (i = 0, e = m->val; i < m->nr; ++i, ++e)
                pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
}

void dump_vmcs(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmentry_ctl, vmexit_ctl;
        u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
        u64 tertiary_exec_control;
        unsigned long cr4;
        int efer_slot;

        if (!dump_invalid_vmcs) {
                pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
                return;
        }

        vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
        vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
        cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        cr4 = vmcs_readl(GUEST_CR4);

        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
        else
                secondary_exec_control = 0;

        if (cpu_has_tertiary_exec_ctrls())
                tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
        else
                tertiary_exec_control = 0;

        pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
               vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
        pr_err("*** Guest State ***\n");
        pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
               vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
               vmcs_readl(CR0_GUEST_HOST_MASK));
        pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
               cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
        pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
        if (cpu_has_vmx_ept()) {
                pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
                       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
                pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
                       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
        }
        pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
               vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
        pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
               vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
        pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
               vmcs_readl(GUEST_SYSENTER_ESP),
               vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
        vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
        vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
        vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
        vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
        vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
        vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
        vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
        vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
        vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
        vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
        efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
        if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
                pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
        else if (efer_slot >= 0)
                pr_err("EFER= 0x%016llx (autoload)\n",
                       vmx->msr_autoload.guest.val[efer_slot].value);
        else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
                pr_err("EFER= 0x%016llx (effective)\n",
                       vcpu->arch.efer | (EFER_LMA | EFER_LME));
        else
                pr_err("EFER= 0x%016llx (effective)\n",
                       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
        if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
                pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
        pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
               vmcs_read64(GUEST_IA32_DEBUGCTL),
               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
        if (cpu_has_load_perf_global_ctrl() &&
            vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
                pr_err("PerfGlobCtl = 0x%016llx\n",
                       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
        if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
                pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
        pr_err("Interruptibility = %08x  ActivityState = %08x\n",
               vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
               vmcs_read32(GUEST_ACTIVITY_STATE));
        if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
                pr_err("InterruptStatus = %04x\n",
                       vmcs_read16(GUEST_INTR_STATUS));
        if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
                vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
        if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
                vmx_dump_msrs("autostore", &vmx->msr_autostore);

        if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
                pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
                       vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
                       vmcs_readl(GUEST_INTR_SSP_TABLE));
        pr_err("*** Host State ***\n");
        pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
               vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
        pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
               vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
               vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
               vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
               vmcs_read16(HOST_TR_SELECTOR));
        pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
               vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
               vmcs_readl(HOST_TR_BASE));
        pr_err("GDTBase=%016lx IDTBase=%016lx\n",
               vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
        pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
               vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
               vmcs_readl(HOST_CR4));
        pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
               vmcs_readl(HOST_IA32_SYSENTER_ESP),
               vmcs_read32(HOST_IA32_SYSENTER_CS),
               vmcs_readl(HOST_IA32_SYSENTER_EIP));
        if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
                pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
        if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
                pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
        if (cpu_has_load_perf_global_ctrl() &&
            vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                pr_err("PerfGlobCtl = 0x%016llx\n",
                       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
        if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
                vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
        if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
                pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
                       vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
                       vmcs_readl(HOST_INTR_SSP_TABLE));

        pr_err("*** Control State ***\n");
        pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
               cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
        pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
               pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
        pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
               vmcs_read32(EXCEPTION_BITMAP),
               vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
               vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
        pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
               vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
               vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
               vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
        pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
               vmcs_read32(VM_EXIT_INTR_INFO),
               vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
               vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
        pr_err("        reason=%08x qualification=%016lx\n",
               vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
        pr_err("IDTVectoring: info=%08x errcode=%08x\n",
               vmcs_read32(IDT_VECTORING_INFO_FIELD),
               vmcs_read32(IDT_VECTORING_ERROR_CODE));
        pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
        if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
                pr_err("TSC Multiplier = 0x%016llx\n",
                       vmcs_read64(TSC_MULTIPLIER));
        if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
                if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
                        u16 status = vmcs_read16(GUEST_INTR_STATUS);
                        pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
                }
                pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
                if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
                        pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
                pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
        }
        if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
                pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
        if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
                pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
        if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
                pr_err("PLE Gap=%08x Window=%08x\n",
                       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
        if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
                pr_err("Virtual processor ID = 0x%04x\n",
                       vmcs_read16(VIRTUAL_PROCESSOR_ID));
        if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
                struct vmx_ve_information *ve_info = vmx->ve_info;
                u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);

                /*
                 * If KVM is dumping the VMCS, then something has gone wrong
                 * already.  Dereferencing an address from the VMCS, which could
                 * very well be corrupted, is a terrible idea.  The virtual
                 * address is known so use it.
                 */
                pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
                       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
                pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
                       ve_info->exit_reason, ve_info->delivery,
                       ve_info->exit_qualification,
                       ve_info->guest_linear_address,
                       ve_info->guest_physical_address, ve_info->eptp_index);
        }
}

/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
        u32 vectoring_info = vmx->idt_vectoring_info;
        u16 exit_handler_index;

        /*
         * Flush the PML buffer of logged GPAs so that dirty_bitmap is up to
         * date.  As a bonus, kvm_vm_ioctl_get_dirty_log only needs to kick
         * all vCPUs out of guest mode before querying dirty_bitmap, as a
         * vCPU in root mode has necessarily flushed its PML buffer already.
         * Note, PML is never enabled in hardware while running L2.
         */
        if (enable_pml && !is_guest_mode(vcpu))
                vmx_flush_pml_buffer(vcpu);

        /*
         * KVM should never reach this point with a pending nested VM-Enter.
         * More specifically, short-circuiting VM-Entry to emulate L2 due to
         * invalid guest state should never happen as that means KVM knowingly
         * allowed a nested VM-Enter with an invalid vmcs12.  More below.
         */
        if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
                return -EIO;

        if (is_guest_mode(vcpu)) {
                /*
                 * PML is never enabled when running L2, bail immediately if a
                 * PML full exit occurs as something is horribly wrong.
                 */
                if (exit_reason.basic == EXIT_REASON_PML_FULL)
                        goto unexpected_vmexit;

                /*
                 * The host physical addresses of some pages of guest memory
                 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
                 * Page). The CPU may write to these pages via their host
                 * physical address while L2 is running, bypassing any
                 * address-translation-based dirty tracking (e.g. EPT write
                 * protection).
                 *
                 * Mark them dirty on every exit from L2 to prevent them from
                 * getting out of sync with dirty tracking.
                 */
                nested_vmx_mark_all_vmcs12_pages_dirty(vcpu);

                /*
                 * Synthesize a triple fault if L2 state is invalid.  In normal
                 * operation, nested VM-Enter rejects any attempt to enter L2
                 * with invalid state.  However, those checks are skipped if
                 * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
                 * L2 state is invalid, it means either L1 modified SMRAM state
                 * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
                 * doing so is architecturally allowed in the RSM case, and is
                 * the least awful solution for the userspace case without
                 * risking false positives.
                 */
                if (vmx->vt.emulation_required) {
                        nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
                        return 1;
                }

                if (nested_vmx_reflect_vmexit(vcpu))
                        return 1;
        }

        /* If guest state is invalid, start emulating.  L2 is handled above. */
        if (vmx->vt.emulation_required)
                return handle_invalid_guest_state(vcpu);

        if (exit_reason.failed_vmentry) {
                dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = exit_reason.full;
                vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        if (unlikely(vmx->fail)) {
                dump_vmcs(vcpu);
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason
                        = vmcs_read32(VM_INSTRUCTION_ERROR);
                vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
            (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
             exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
             exit_reason.basic != EXIT_REASON_PML_FULL &&
             exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
             exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
             exit_reason.basic != EXIT_REASON_NOTIFY &&
             exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
                kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
                return 0;
        }

        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked)) {
                if (!vmx_interrupt_blocked(vcpu)) {
                        vmx->loaded_vmcs->soft_vnmi_blocked = 0;
                } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
                           vcpu->arch.nmi_pending) {
                        /*
                         * The CPU can't help us find the end of an NMI-blocked
                         * window if the guest runs with IRQs disabled, so pull
                         * the trigger after 1 s of futile waiting and inform
                         * the user about it.
                         */
                        pr_warn("%s: Breaking out of NMI-blocked state on VCPU %d after 1 s timeout\n",
                                __func__, vcpu->vcpu_id);
                        vmx->loaded_vmcs->soft_vnmi_blocked = 0;
                }
        }

        if (exit_fastpath != EXIT_FASTPATH_NONE)
                return 1;

        if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
                goto unexpected_vmexit;
#ifdef CONFIG_MITIGATION_RETPOLINE
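        /*
         * When retpolines are enabled, dispatch the hottest exit reasons
         * directly to avoid a retpolined indirect call through
         * kvm_vmx_exit_handlers.
         */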
        if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
                return kvm_emulate_wrmsr(vcpu);
        else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
                return handle_wrmsr_imm(vcpu);
        else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
                return handle_preemption_timer(vcpu);
        else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
                return handle_interrupt_window(vcpu);
        else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
                return handle_external_interrupt(vcpu);
        else if (exit_reason.basic == EXIT_REASON_HLT)
                return kvm_emulate_halt(vcpu);
        else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
                return handle_ept_misconfig(vcpu);
#endif

        exit_handler_index = array_index_nospec((u16)exit_reason.basic,
                                                kvm_vmx_max_exit_handlers);
        if (!kvm_vmx_exit_handlers[exit_handler_index])
                goto unexpected_vmexit;

        return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

unexpected_vmexit:
        dump_vmcs(vcpu);
        kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
        return 0;
}

int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
        int ret = __vmx_handle_exit(vcpu, exit_fastpath);

        /*
         * Exit to userspace when a bus lock is detected, to inform userspace
         * that there was a bus lock in the guest.
         */
        if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
                if (ret > 0)
                        vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

                vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
                return 0;
        }
        return ret;
}

void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        int tpr_threshold;

        if (is_guest_mode(vcpu) &&
                nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
                return;

        guard(vmx_vmcs01)(vcpu);

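        /*
         * Arm the threshold so that a VM-Exit triggers once the guest lowers
         * its TPR below the priority class of the highest pending interrupt
         * (irr); no exit is needed if no interrupt is pending or if the
         * pending interrupt is already deliverable (tpr < irr).
         */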
        tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
        vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}

void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 sec_exec_control;

        if (!lapic_in_kernel(vcpu))
                return;

        if (!flexpriority_enabled &&
            !cpu_has_vmx_virtualize_x2apic_mode())
                return;

        guard(vmx_vmcs01)(vcpu);

        sec_exec_control = secondary_exec_controls_get(vmx);
        sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

        switch (kvm_get_apic_mode(vcpu)) {
        case LAPIC_MODE_INVALID:
                WARN_ONCE(true, "Invalid local APIC state");
                break;
        case LAPIC_MODE_DISABLED:
                break;
        case LAPIC_MODE_XAPIC:
                if (flexpriority_enabled) {
                        sec_exec_control |=
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

                        /*
                         * Flush the TLB; reloading the APIC access page will
                         * only do so if its physical address has changed, but
                         * the guest may have inserted a non-APIC mapping into
                         * the TLB while the APIC access page was disabled.
                         *
                         * If L2 is active, immediately flush L1's TLB instead
                         * of requesting a flush of the current TLB, because
                         * the current TLB context is L2's.
                         */
                        if (!is_guest_mode(vcpu))
                                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                        else if (!enable_ept)
                                vpid_sync_context(vmx->vpid);
                        else if (VALID_PAGE(vcpu->arch.root_mmu.root.hpa))
                                vmx_flush_tlb_ept_root(vcpu->arch.root_mmu.root.hpa);
                }
                break;
        case LAPIC_MODE_X2APIC:
                if (cpu_has_vmx_virtualize_x2apic_mode())
                        sec_exec_control |=
                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
                break;
        }
        secondary_exec_controls_set(vmx, sec_exec_control);

        vmx_update_msr_bitmap_x2apic(vcpu);
}

void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
        const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_memslots *slots = kvm_memslots(kvm);
        struct kvm_memory_slot *slot;
        struct page *refcounted_page;
        unsigned long mmu_seq;
        kvm_pfn_t pfn;
        bool writable;

        /* Note, the VIRTUALIZE_APIC_ACCESSES check needs to query vmcs01. */
        guard(vmx_vmcs01)(vcpu);

        if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
            SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                return;

        /*
         * Explicitly grab the memslot using KVM's internal slot ID to ensure
         * KVM doesn't unintentionally grab a userspace memslot.  It _should_
         * be impossible for userspace to create a memslot for the APIC when
         * APICv is enabled, but paranoia won't hurt in this case.
         */
        slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return;

        /*
         * Ensure that the mmu_notifier sequence count is read before KVM
         * retrieves the pfn from the primary MMU.  Note, the memslot is
         * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
         * in kvm_mmu_invalidate_end().
         */
        mmu_seq = kvm->mmu_invalidate_seq;
        smp_rmb();

        /*
         * No need to retry if the memslot does not exist or is invalid.  KVM
         * controls the APIC-access page memslot, and only deletes the memslot
         * if APICv is permanently inhibited, i.e. the memslot won't reappear.
         */
        pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
        if (is_error_noslot_pfn(pfn))
                return;

        read_lock(&vcpu->kvm->mmu_lock);
        if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
                kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
        else
                vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));

        /*
         * Do not pin the APIC access page in memory so that it can be freely
         * migrated, the MMU notifier will call us again if it is migrated or
         * swapped out.  KVM backs the memslot with anonymous memory, the pfn
         * should always point at a refcounted page (if the pfn is valid).
         */
        if (!WARN_ON_ONCE(!refcounted_page))
                kvm_release_page_clean(refcounted_page);

        /*
         * No need for a manual TLB flush at this point, KVM has already done a
         * flush if there were SPTEs pointing at the previous page.
         */
        read_unlock(&vcpu->kvm->mmu_lock);
}

void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
        u16 status;
        u8 old;

        if (max_isr == -1)
                max_isr = 0;

        /*
         * Always update SVI in vmcs01, as SVI is relevant for L2 if and only
         * if Virtual Interrupt Delivery is enabled in vmcs12, and if VID is
         * enabled then L2 EOIs affect L2's vAPIC, not L1's vAPIC.
         */
        guard(vmx_vmcs01)(vcpu);

        status = vmcs_read16(GUEST_INTR_STATUS);
        old = status >> 8;
        if (max_isr != old) {
                status &= 0xff;
                status |= max_isr << 8;
                vmcs_write16(GUEST_INTR_STATUS, status);
        }
}

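/*
 * RVI, the Requesting Virtual Interrupt, is the low byte of the guest
 * interrupt status; the CPU evaluates it to deliver pending virtual
 * interrupts without a VM-Exit.
 */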
static void vmx_set_rvi(int vector)
{
        u16 status;
        u8 old;

        if (vector == -1)
                vector = 0;

        status = vmcs_read16(GUEST_INTR_STATUS);
        old = (u8)status & 0xff;
        if ((u8)vector != old) {
                status &= ~0xff;
                status |= (u8)vector;
                vmcs_write16(GUEST_INTR_STATUS, status);
        }
}

int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
        struct vcpu_vt *vt = to_vt(vcpu);
        int max_irr;
        bool got_posted_interrupt;

        if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
                return -EIO;

        if (pi_test_on(&vt->pi_desc)) {
                pi_clear_on(&vt->pi_desc);
                /*
                 * IOMMU can write to PID.ON, so the barrier matters even on UP.
                 * But on x86 this is just a compiler barrier anyway.
                 */
                smp_mb__after_atomic();
                got_posted_interrupt =
                        kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
        } else {
                max_irr = kvm_lapic_find_highest_irr(vcpu);
                got_posted_interrupt = false;
        }

        /*
         * Newly recognized interrupts are injected via either virtual interrupt
         * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
         * disabled in two cases:
         *
         * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
         * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
         * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
         * into L2, but KVM doesn't use virtual interrupt delivery to inject
         * interrupts into L2, and so KVM_REQ_EVENT is again needed.
         *
         * 2) If APICv is disabled for this vCPU, assigned devices may still
         * attempt to post interrupts.  The posted interrupt vector will cause
         * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
         */
        if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
                vmx_set_rvi(max_irr);
        else if (got_posted_interrupt)
                kvm_make_request(KVM_REQ_EVENT, vcpu);

        return max_irr;
}

void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
        if (!kvm_vcpu_apicv_active(vcpu))
                return;

        vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
        vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
        vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
        vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}

void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);

static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
        /*
         * Save xfd_err to guest_fpu before interrupts are enabled, so that
         * the MSR value is not clobbered by host activity before the guest
         * has a chance to consume it.
         *
         * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
         * interception may have been caused by L1 interception.  Per the SDM,
         * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
         *
         * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
         * unlike CR2 and DR6, the value is not a payload that is attached to
         * the #NM exception.
         */
        if (is_xfd_nm_fault(vcpu))
                rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}

static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
        /* if exit due to PF check for async PF */
        if (is_page_fault(intr_info))
                vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
        /* if exit due to NM, handle before interrupts are enabled */
        else if (is_nm_fault(intr_info))
                handle_nm_fault_irqoff(vcpu);
        /* Handle machine checks before interrupts are enabled */
        else if (is_machine_check(intr_info))
                kvm_machine_check();
}

static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
                                             u32 intr_info)
{
        unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;

        if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
            "unexpected VM-Exit interrupt info: 0x%x", intr_info))
                return;

        /*
         * Invoke the kernel's IRQ handler for the vector.  Use the FRED path
         * when it's available even if FRED isn't fully enabled, e.g. even if
         * FRED isn't supported in hardware, in order to avoid the indirect
         * CALL in the non-FRED path.
         */
        kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
        if (IS_ENABLED(CONFIG_X86_FRED))
                fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
        else
                vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
        kvm_after_interrupt(vcpu);

        vcpu->arch.at_instruction_boundary = true;
}

void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
        if (to_vt(vcpu)->emulation_required)
                return;

        switch (vmx_get_exit_reason(vcpu).basic) {
        case EXIT_REASON_EXTERNAL_INTERRUPT:
                handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
                break;
        case EXIT_REASON_EXCEPTION_NMI:
                handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
                break;
        case EXIT_REASON_MCE_DURING_VMENTRY:
                kvm_machine_check();
                break;
        default:
                break;
        }
}

/*
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation). Be sure to check the kvm parameter before using it.
 */
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
        switch (index) {
        case MSR_IA32_SMBASE:
                if (!IS_ENABLED(CONFIG_KVM_SMM))
                        return false;
                /*
                 * We cannot do SMM unless we can run the guest in big
                 * real mode.
                 */
                return enable_unrestricted_guest || emulate_invalid_guest_state;
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                return nested;
        case MSR_AMD64_VIRT_SPEC_CTRL:
        case MSR_AMD64_TSC_RATIO:
                /* This is AMD only.  */
                return false;
        default:
                return true;
        }
}

static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
        u32 exit_intr_info;
        bool unblock_nmi;
        u8 vector;
        bool idtv_info_valid;

        idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

        if (enable_vnmi) {
                if (vmx->loaded_vmcs->nmi_known_unmasked)
                        return;

                exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
                unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
                vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
                /*
                 * SDM 3: 27.7.1.2 (September 2008)
                 * Re-set bit "block by NMI" before VM entry if vmexit caused by
                 * a guest IRET fault.
                 * SDM 3: 23.2.2 (September 2008)
                 * Bit 12 is undefined in any of the following cases:
                 *  If the VM exit sets the valid bit in the IDT-vectoring
                 *   information field.
                 *  If the VM exit is due to a double fault.
                 */
                if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
                    vector != DF_VECTOR && !idtv_info_valid)
                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                      GUEST_INTR_STATE_NMI);
                else
                        vmx->loaded_vmcs->nmi_known_unmasked =
                                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
                                  & GUEST_INTR_STATE_NMI);
        } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
                vmx->loaded_vmcs->vnmi_blocked_time +=
                        ktime_to_ns(ktime_sub(ktime_get(),
                                              vmx->loaded_vmcs->entry_time));
}

static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
                                      u32 idt_vectoring_info,
                                      int instr_len_field,
                                      int error_code_field)
{
        u8 vector;
        int type;
        bool idtv_info_valid;

        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

        vcpu->arch.nmi_injected = false;
        kvm_clear_exception_queue(vcpu);
        kvm_clear_interrupt_queue(vcpu);

        if (!idtv_info_valid)
                return;

        kvm_make_request(KVM_REQ_EVENT, vcpu);

        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

        switch (type) {
        case INTR_TYPE_NMI_INTR:
                vcpu->arch.nmi_injected = true;
                /*
                 * SDM 3: 27.7.1.2 (September 2008)
                 * Clear the "block by NMI" bit before VM entry if an NMI
                 * delivery faulted.
                 */
                vmx_set_nmi_mask(vcpu, false);
                break;
        case INTR_TYPE_SOFT_EXCEPTION:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
                fallthrough;
        case INTR_TYPE_HARD_EXCEPTION: {
                u32 error_code = 0;

                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
                        error_code = vmcs_read32(error_code_field);

                kvm_requeue_exception(vcpu, vector,
                                      idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
                                      error_code);
                break;
        }
        case INTR_TYPE_SOFT_INTR:
                vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
                fallthrough;
        case INTR_TYPE_EXT_INTR:
                kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
                break;
        default:
                break;
        }
}

static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
        __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
                                  VM_EXIT_INSTRUCTION_LEN,
                                  IDT_VECTORING_ERROR_CODE);
}

void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
        __vmx_complete_interrupts(vcpu,
                                  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
                                  VM_ENTRY_INSTRUCTION_LEN,
                                  VM_ENTRY_EXCEPTION_ERROR_CODE);

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

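/*
 * Use the atomic MSR load/store lists to switch perf MSRs whose host and
 * guest values differ across VM-Enter/VM-Exit; drop entries whose values
 * match to keep the lists as short as possible.
 */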
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
        int i, nr_msrs;
        struct perf_guest_switch_msr *msrs;
        struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);

        if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu))
                return;

        pmu->host_cross_mapped_mask = 0;
        if (pmu->pebs_enable & pmu->global_ctrl)
                intel_pmu_cross_mapped_check(pmu);

        /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
        msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
        if (!msrs)
                return;

        for (i = 0; i < nr_msrs; i++)
                if (msrs[i].host == msrs[i].guest)
                        clear_atomic_switch_msr(vmx, msrs[i].msr);
                else
                        add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
                                              msrs[i].host);
}

static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL))
                return;

        if (!cpu_has_save_perf_global_ctrl()) {
                int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore,
                                                       MSR_CORE_PERF_GLOBAL_CTRL);

                if (WARN_ON_ONCE(slot < 0))
                        return;

                pmu->global_ctrl = vmx->msr_autostore.val[slot].value;
                vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl);
                return;
        }

        pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
}

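/*
 * Program the VMX preemption timer: '0' to force an immediate exit, the
 * (scaled) TSC ticks remaining until the deadline if a timer is armed, and
 * '-1' to "soft disable" the timer otherwise, which avoids toggling the
 * timer's enable bit in the pin-based execution controls.
 */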
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 tscl;
        u32 delta_tsc;

        if (force_immediate_exit) {
                vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
                vmx->loaded_vmcs->hv_timer_soft_disabled = false;
        } else if (vmx->hv_deadline_tsc != -1) {
                tscl = rdtsc();
                if (vmx->hv_deadline_tsc > tscl)
                        /* set_hv_timer ensures the delta fits in 32-bits */
                        delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
                                cpu_preemption_timer_multi);
                else
                        delta_tsc = 0;

                vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
                vmx->loaded_vmcs->hv_timer_soft_disabled = false;
        } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
                vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
                vmx->loaded_vmcs->hv_timer_soft_disabled = true;
        }
}

void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
        if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
                vmx->loaded_vmcs->host_state.rsp = host_rsp;
                vmcs_writel(HOST_RSP, host_rsp);
        }
}

void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
                                        unsigned int flags)
{
        u64 hostval = this_cpu_read(x86_spec_ctrl_current);

        if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
                return;

        if (flags & VMX_RUN_SAVE_SPEC_CTRL)
                vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);

        /*
         * If the guest/host SPEC_CTRL values differ, restore the host value.
         *
         * For legacy IBRS, the IBRS bit always needs to be written after
         * transitioning from a less privileged predictor mode, regardless of
         * whether the guest/host values differ.
         */
        if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
            vmx->spec_ctrl != hostval)
                native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);

        barrier_nospec();
}

static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
                                             bool force_immediate_exit)
{
        /*
         * If L2 is active, only VMX preemption timer exits can be handled in
         * the fastpath; all other exits must use the slow path.
         */
        if (is_guest_mode(vcpu) &&
            vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
                return EXIT_FASTPATH_NONE;

        switch (vmx_get_exit_reason(vcpu).basic) {
        case EXIT_REASON_MSR_WRITE:
                return handle_fastpath_wrmsr(vcpu);
        case EXIT_REASON_MSR_WRITE_IMM:
                return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
                                                 vmx_get_msr_imm_reg(vcpu));
        case EXIT_REASON_PREEMPTION_TIMER:
                return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
        case EXIT_REASON_HLT:
                return handle_fastpath_hlt(vcpu);
        case EXIT_REASON_INVD:
                return handle_fastpath_invd(vcpu);
        default:
                return EXIT_FASTPATH_NONE;
        }
}

noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
{
        if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
            !is_nmi(vmx_get_intr_info(vcpu)))
                return;

        kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
        if (cpu_feature_enabled(X86_FEATURE_FRED))
                fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
        else
                vmx_do_nmi_irqoff();
        kvm_after_interrupt(vcpu);
}

static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                        unsigned int flags)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        guest_state_enter_irqoff();

        vmx_l1d_flush(vcpu);

        vmx_disable_fb_clear(vmx);

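        /* Writing CR2 is more expensive than reading it; write only if it's stale. */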
        if (vcpu->arch.cr2 != native_read_cr2())
                native_write_cr2(vcpu->arch.cr2);

        vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
                                   flags);

        vcpu->arch.cr2 = native_read_cr2();
        vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;

        vmx->idt_vectoring_info = 0;

        vmx_enable_fb_clear(vmx);

        if (unlikely(vmx->fail)) {
                vmx->vt.exit_reason.full = 0xdead;
                goto out;
        }

        vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
        if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
                vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

        vmx_handle_nmi(vcpu);

out:
        guest_state_exit_irqoff();
}

fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
        bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long cr3, cr4;

        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!enable_vnmi &&
                     vmx->loaded_vmcs->soft_vnmi_blocked))
                vmx->loaded_vmcs->entry_time = ktime_get();

        /*
         * Don't enter VMX if guest state is invalid; let the exit handler
         * start emulation until we arrive back at a valid state.  Synthesize a
         * consistency check VM-Exit due to invalid guest state and bail.
         */
        if (unlikely(vmx->vt.emulation_required)) {
                vmx->fail = 0;

                vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
                vmx->vt.exit_reason.failed_vmentry = 1;
                kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
                vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
                kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
                vmx->vt.exit_intr_info = 0;
                return EXIT_FASTPATH_NONE;
        }

        trace_kvm_entry(vcpu, force_immediate_exit);

        if (vmx->ple_window_dirty) {
                vmx->ple_window_dirty = false;
                vmcs_write32(PLE_WINDOW, vmx->ple_window);
        }

        /*
         * We did this in prepare_switch_to_guest() because it needs to be
         * within srcu_read_lock.
         */
        WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

        if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
        if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
        vcpu->arch.regs_dirty = 0;

        if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
                set_debugreg(vcpu->arch.dr6, 6);

        if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
                vmx_reload_guest_debugctl(vcpu);

        /*
         * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
         * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
         * it switches back to the current->mm, which can occur in KVM context
         * when switching to a temporary mm to patch kernel code, e.g. if KVM
         * toggles a static key while handling a VM-Exit.
         */
        cr3 = __get_current_cr3_fast();
        if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
                vmcs_writel(HOST_CR3, cr3);
                vmx->loaded_vmcs->host_state.cr3 = cr3;
        }

        cr4 = cr4_read_shadow();
        if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
                vmcs_writel(HOST_CR4, cr4);
                vmx->loaded_vmcs->host_state.cr4 = cr4;
        }

        /*
         * When single-stepping over STI and MOV SS, we must clear the
         * corresponding interruptibility bits in the guest state.  Otherwise
         * vmentry fails as it then expects bit 14 (BS) in pending debug
         * exceptions being set, but that's not correct for the guest debugging
         * case.
         */
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vmx_set_interrupt_shadow(vcpu, 0);

        pt_guest_enter(vmx);

        atomic_switch_perf_msrs(vmx);
        if (intel_pmu_lbr_is_enabled(vcpu))
                vmx_passthrough_lbr_msrs(vcpu);

        if (enable_preemption_timer)
                vmx_update_hv_timer(vcpu, force_immediate_exit);
        else if (force_immediate_exit)
                /* The pending IPI forces an exit right after VM-Enter. */
                smp_send_reschedule(vcpu->cpu);

        kvm_wait_lapic_expire(vcpu);

        /* The actual VMENTER/EXIT is in the .noinstr.text section. */
        vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));

        /* All fields are clean at this point */
        if (kvm_is_using_evmcs()) {
                current_evmcs->hv_clean_fields |=
                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

                current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
        }

        /* MSR_IA32_DEBUGCTLMSR is zeroed on VM-Exit.  Restore it if needed. */
        if (vcpu->arch.host_debugctl)
                update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
        /*
         * The sysexit path does not restore ds/es, so we must set them to
         * a reasonable value ourselves.
         *
         * We can't defer this to vmx_prepare_switch_to_host() since that
         * function may be executed in interrupt context, which saves and
         * restores segments around it, nullifying its effect.
         */
        loadsegment(ds, __USER_DS);
        loadsegment(es, __USER_DS);
#endif

        pt_guest_exit(vmx);

        if (is_guest_mode(vcpu)) {
                /*
                 * Track VMLAUNCH/VMRESUME that have made it past guest state
                 * checking.
                 */
                if (vmx->nested.nested_run_pending &&
                    !vmx_get_exit_reason(vcpu).failed_vmentry)
                        ++vcpu->stat.nested_run;

                vmx->nested.nested_run_pending = 0;
        }

        if (unlikely(vmx->fail))
                return EXIT_FASTPATH_NONE;

        trace_kvm_exit(vcpu, KVM_ISA_VMX);

        if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
                return EXIT_FASTPATH_NONE;

        vmx->loaded_vmcs->launched = 1;

        vmx_refresh_guest_perf_global_control(vcpu);

        vmx_recover_nmi_blocking(vmx);
        vmx_complete_interrupts(vmx);

        return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}

void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (enable_pml)
                vmx_destroy_pml_buffer(vmx);
        free_vpid(vmx->vpid);
        nested_vmx_free_vcpu(vcpu);
        free_loaded_vmcs(vmx->loaded_vmcs);
        free_page((unsigned long)vmx->ve_info);
}

int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
        struct vmx_uret_msr *tsx_ctrl;
        struct vcpu_vmx *vmx;
        int i, err;

        BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
        vmx = to_vmx(vcpu);

        INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);

        err = -ENOMEM;

        vmx->vpid = allocate_vpid();

        /*
         * If PML is turned on, failure to enable PML simply results in failure
         * to create the vCPU.  This lets us simplify the PML logic by avoiding
         * corner cases such as PML being enabled on only some of a guest's
         * vCPUs.
         */
        if (enable_pml) {
                vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
                if (!vmx->pml_pg)
                        goto free_vpid;
        }

        /* All bits of the guest value are honored by default; TSX_CTRL is
         * restricted below. */
        for (i = 0; i < kvm_nr_uret_msrs; ++i)
                vmx->guest_uret_msrs[i].mask = -1ull;
        if (boot_cpu_has(X86_FEATURE_RTM)) {
                /*
                 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
                 * Keep the host value unchanged to avoid changing CPUID bits
                 * under the host kernel's feet.
                 */
                tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
                if (tsx_ctrl)
                        tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
        }

        err = alloc_loaded_vmcs(&vmx->vmcs01);
        if (err < 0)
                goto free_pml;

        /*
         * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
         * nested (L1) hypervisor and Hyper-V in L0 supports it.  Enable the
         * feature only for vmcs01; KVM currently isn't equipped to realize any
         * performance benefits from enabling it for vmcs02.
         */
        if (kvm_is_using_evmcs() &&
            (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
                struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

                evmcs->hv_enlightenments_control.msr_bitmap = 1;
        }

        vmx->loaded_vmcs = &vmx->vmcs01;

        if (cpu_need_virtualize_apic_accesses(vcpu)) {
                err = kvm_alloc_apic_access_page(vcpu->kvm);
                if (err)
                        goto free_vmcs;
        }

        if (enable_ept && !enable_unrestricted_guest) {
                err = init_rmode_identity_map(vcpu->kvm);
                if (err)
                        goto free_vmcs;
        }

        err = -ENOMEM;
        if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
                struct page *page;

                BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);

                /* ve_info must be page aligned. */
                page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
                if (!page)
                        goto free_vmcs;

                vmx->ve_info = page_to_virt(page);
        }

        /* Publish the vCPU's posted-interrupt descriptor in the PID table. */
        if (vmx_can_use_ipiv(vcpu))
                WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
                           __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);

        return 0;

free_vmcs:
        free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
        vmx_destroy_pml_buffer(vmx);
free_vpid:
        free_vpid(vmx->vpid);
        return err;
}

#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

int vmx_vm_init(struct kvm *kvm)
{
        if (!ple_gap)
                kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);

        if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
                switch (l1tf_mitigation) {
                case L1TF_MITIGATION_OFF:
                case L1TF_MITIGATION_FLUSH_NOWARN:
                        /* 'I explicitly don't care' is set */
                        break;
                case L1TF_MITIGATION_AUTO:
                case L1TF_MITIGATION_FLUSH:
                case L1TF_MITIGATION_FLUSH_NOSMT:
                case L1TF_MITIGATION_FULL:
                        /*
                         * Warn upon starting the first VM in a potentially
                         * insecure environment.
                         */
                        if (sched_smt_active())
                                pr_warn_once(L1TF_MSG_SMT);
                        if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
                                pr_warn_once(L1TF_MSG_L1D);
                        break;
                case L1TF_MITIGATION_FULL_FORCE:
                        /* Flush is enforced */
                        break;
                }
        }

        if (enable_pml)
                kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
        return 0;
}

static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
{
        /*
         * Non-coherent DMA devices need the guest to flush CPU caches
         * properly.  In that case it is not possible to map all guest RAM as
         * WB, so always trust guest PAT.
         */
        return !kvm_arch_has_noncoherent_dma(kvm) &&
               kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
}

u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
        /*
         * Force UC for host MMIO regions, as allowing the guest to access MMIO
         * with cacheable accesses will result in Machine Checks.
         */
        if (is_mmio)
                return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;

        /* Force WB if ignoring guest PAT */
        if (vmx_ignore_guest_pat(vcpu->kvm))
                return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;

        return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}

static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
        /*
         * These bits in the secondary execution controls field
         * are dynamic; the others are mostly based on the hypervisor
         * architecture and the guest's CPUID.  Do not touch the
         * dynamic bits.
         */
        u32 mask =
                SECONDARY_EXEC_SHADOW_VMCS |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_DESC;

        u32 cur_ctl = secondary_exec_controls_get(vmx);

        secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
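 * E.g. CR4.SMEP is marked allowed-1 below if and only if the guest's CPUID
 * advertises SMEP (CPUID.(EAX=7,ECX=0):EBX[bit 7]).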
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_cpuid_entry2 *entry;

        vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
        vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
        if (entry && (entry->_reg & (_cpuid_mask)))                     \
                vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);             \
} while (0)

        entry = kvm_find_cpuid_entry(vcpu, 0x1);
        cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
        cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
        cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
        cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
        cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
        cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
        cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
        cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
        cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
        cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
        cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
        cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
        cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
        cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

        entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
        cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
        cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
        cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
        cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
        cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
        cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
        cr4_fixed1_update(X86_CR4_CET,        ecx, feature_bit(SHSTK));
        cr4_fixed1_update(X86_CR4_CET,        edx, feature_bit(IBT));

        entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
        cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));

#undef cr4_fixed1_update
}

static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_cpuid_entry2 *best = NULL;
        int i;

        for (i = 0; i < PT_CPUID_LEAVES; i++) {
                best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
                if (!best)
                        return;
                vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
                vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
                vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
                vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
        }

        /* Get the number of configurable Address Ranges for filtering */
        vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
                                                PT_CAP_num_address_ranges);

        /* Mask everything except the bits that have no CPUID dependency. */
        vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
                        RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
                        RTIT_CTL_BRANCH_EN);

        /*
         * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
         * setting it will inject a #GP.
         */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
                vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

        /*
         * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
         * PSBFreq can be set
         */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
                vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
                                RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

        /*
         * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
         */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
                vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
                                              RTIT_CTL_MTC_RANGE);

        /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
                vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
                                                        RTIT_CTL_PTW_EN);

        /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
                vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

        /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
                vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

        /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
        if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
                vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

        /* Unmask the 4-bit configure field of each supported address range. */
        for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
                vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}

void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * XSAVES is effectively enabled if and only if XSAVE is also exposed
         * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
         * set if and only if XSAVE is supported.
         */
        if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
                guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);

        vmx_setup_uret_msrs(vmx);

        if (cpu_has_secondary_exec_ctrls())
                vmcs_set_secondary_exec_control(vmx,
                                                vmx_secondary_exec_control(vmx));

        if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
                vmx->msr_ia32_feature_control_valid_bits |=
                        FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                        FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
        else
                vmx->msr_ia32_feature_control_valid_bits &=
                        ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                          FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

        if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
                nested_vmx_cr_fixed1_bits_update(vcpu);

        if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
                        guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
                update_intel_pt_cfg(vcpu);

        if (boot_cpu_has(X86_FEATURE_RTM)) {
                struct vmx_uret_msr *msr;
                msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
                if (msr) {
                        bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
                        vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
                }
        }

        set_cr4_guest_host_mask(vmx);

        vmx_write_encls_bitmap(vcpu, NULL);
        if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
                vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
        else
                vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;

        if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
                vmx->msr_ia32_feature_control_valid_bits |=
                        FEAT_CTL_SGX_LC_ENABLED;
        else
                vmx->msr_ia32_feature_control_valid_bits &=
                        ~FEAT_CTL_SGX_LC_ENABLED;

        /* Refresh #PF interception to account for MAXPHYADDR changes. */
        vmx_update_exception_bitmap(vcpu);
}

static __init u64 vmx_get_perf_capabilities(void)
{
        u64 perf_cap = PERF_CAP_FW_WRITES;
        u64 host_perf_cap = 0;

        if (!enable_pmu)
                return 0;

        if (boot_cpu_has(X86_FEATURE_PDCM))
                rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);

        if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) &&
            !enable_mediated_pmu) {
                x86_perf_get_lbr(&vmx_lbr_caps);

                /*
                 * KVM requires LBR callstack support, as the overhead due to
                 * context switching LBRs without said support is too high.
                 * See intel_pmu_create_guest_lbr_event() for more info.
                 */
                if (!vmx_lbr_caps.has_callstack)
                        memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
                else if (vmx_lbr_caps.nr)
                        perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
        }

        if (vmx_pebs_supported()) {
                perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;

                /*
                 * Disallow adaptive PEBS as it is functionally broken, can be
                 * used by the guest to read *host* LBRs, and can be used to
                 * bypass userspace event filters.  To correctly and safely
                 * support adaptive PEBS, KVM needs to:
                 *
                 * 1. Account for the ADAPTIVE flag when (re)programming fixed
                 *    counters.
                 *
                 * 2. Gain support from perf (or take direct control of counter
                 *    programming) to support events without adaptive PEBS
                 *    enabled for the hardware counter.
                 *
                 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
                 *    adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
                 *
                 * 4. Document which PMU events are effectively exposed to the
                 *    guest via adaptive PEBS, and make adaptive PEBS mutually
                 *    exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
                 */
                perf_cap &= ~PERF_CAP_PEBS_BASELINE;
        }

        return perf_cap;
}

static __init void vmx_set_cpu_caps(void)
{
        kvm_initialize_cpu_caps();

        /* CPUID 0x1 */
        if (nested)
                kvm_cpu_cap_set(X86_FEATURE_VMX);

        /* CPUID 0x7 */
        if (kvm_mpx_supported())
                kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
        if (!cpu_has_vmx_invpcid())
                kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
        if (vmx_pt_mode_is_host_guest())
                kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
        if (vmx_pebs_supported()) {
                kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
                kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
        }

        if (!enable_pmu)
                kvm_cpu_cap_clear(X86_FEATURE_PDCM);
        kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();

        if (!enable_sgx) {
                kvm_cpu_cap_clear(X86_FEATURE_SGX);
                kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
                kvm_cpu_cap_clear(X86_FEATURE_SGX1);
                kvm_cpu_cap_clear(X86_FEATURE_SGX2);
                kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
        }

        if (vmx_umip_emulated())
                kvm_cpu_cap_set(X86_FEATURE_UMIP);

        /* CPUID 0xD.1 */
        if (!cpu_has_vmx_xsaves())
                kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

        /* CPUID 0x80000001 and 0x7 (RDPID) */
        if (!cpu_has_vmx_rdtscp()) {
                kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
                kvm_cpu_cap_clear(X86_FEATURE_RDPID);
        }

        if (cpu_has_vmx_waitpkg())
                kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);

        /*
         * Disable CET if unrestricted_guest is unsupported, as KVM doesn't
         * enforce CET HW behaviors in the emulator.  On platforms with
         * VMX_BASIC[bit56] == 0, injecting #CP with an error code at VM-Entry
         * fails, so disable CET in that case too.
         */
        if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
            !cpu_has_vmx_basic_no_hw_errcode_cc()) {
                kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
                kvm_cpu_cap_clear(X86_FEATURE_IBT);
        }

        kvm_setup_xss_caps();
        kvm_finalize_cpu_caps();
}

static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
                                  struct x86_instruction_info *info,
                                  unsigned long *exit_qualification)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned short port;
        int size;
        bool imm;

        /*
         * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
         * VM-exits depend on the 'unconditional IO exiting' VM-execution
         * control.
         *
         * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
         */
        if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
                return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

        if (info->intercept == x86_intercept_in ||
            info->intercept == x86_intercept_ins) {
                port = info->src_val;
                size = info->dst_bytes;
                imm  = info->src_type == OP_IMM;
        } else {
                port = info->dst_val;
                size = info->src_bytes;
                imm  = info->dst_type == OP_IMM;
        }

        /*
         * Assemble the exit qualification: bits 2:0 = size - 1, bit 4 =
         * string instruction, bit 5 = REP prefixed, bit 6 = immediate
         * operand, bits 31:16 = port number.
         */
        *exit_qualification = ((unsigned long)port << 16) | (size - 1);

        if (info->intercept == x86_intercept_ins ||
            info->intercept == x86_intercept_outs)
                *exit_qualification |= BIT(4);

        if (info->rep_prefix)
                *exit_qualification |= BIT(5);

        if (imm)
                *exit_qualification |= BIT(6);

        return nested_vmx_check_io_bitmaps(vcpu, port, size);
}

int vmx_check_intercept(struct kvm_vcpu *vcpu,
                        struct x86_instruction_info *info,
                        enum x86_intercept_stage stage,
                        struct x86_exception *exception)
{
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned long exit_qualification = 0;
        u32 vm_exit_reason;
        u64 exit_insn_len;

        switch (info->intercept) {
        case x86_intercept_rdpid:
                /*
                 * RDPID causes #UD if not enabled through secondary execution
                 * controls (ENABLE_RDTSCP).  Note, the implicit MSR access to
                 * TSC_AUX is NOT subject to interception, i.e. checking only
                 * the dedicated execution control is architecturally correct.
                 */
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                        exception->vector = UD_VECTOR;
                        exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                return X86EMUL_CONTINUE;

        case x86_intercept_in:
        case x86_intercept_ins:
        case x86_intercept_out:
        case x86_intercept_outs:
                if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
                        return X86EMUL_CONTINUE;

                vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
                break;

        case x86_intercept_lgdt:
        case x86_intercept_lidt:
        case x86_intercept_lldt:
        case x86_intercept_ltr:
        case x86_intercept_sgdt:
        case x86_intercept_sidt:
        case x86_intercept_sldt:
        case x86_intercept_str:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
                        return X86EMUL_CONTINUE;

                if (info->intercept == x86_intercept_lldt ||
                    info->intercept == x86_intercept_ltr ||
                    info->intercept == x86_intercept_sldt ||
                    info->intercept == x86_intercept_str)
                        vm_exit_reason = EXIT_REASON_LDTR_TR;
                else
                        vm_exit_reason = EXIT_REASON_GDTR_IDTR;
                /*
                 * FIXME: Decode the ModR/M to generate the correct exit
                 *        qualification for memory operands.
                 */
                break;

        case x86_intercept_hlt:
                if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
                        return X86EMUL_CONTINUE;

                vm_exit_reason = EXIT_REASON_HLT;
                break;

        case x86_intercept_pause:
                /*
                 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
                 * with vanilla NOPs in the emulator.  Apply the interception
                 * check only to actual PAUSE instructions.  Don't check
                 * PAUSE-loop-exiting; software can't expect a given PAUSE to
                 * exit, i.e. KVM is within its rights to allow L2 to execute
                 * the PAUSE.
                 */
                if ((info->rep_prefix != REPE_PREFIX) ||
                    !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
                        return X86EMUL_CONTINUE;

                vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
                break;

        /* TODO: check more intercepts... */
        default:
                return X86EMUL_UNHANDLEABLE;
        }

        exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
        if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
                return X86EMUL_UNHANDLEABLE;

        __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
                            exit_insn_len);
        return X86EMUL_INTERCEPTED;
}

#ifdef CONFIG_X86_64
/* Compute (a << shift) / divisor; return 1 on overflow, otherwise 0. */
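/*
 * The shifted value is held as a 128-bit quantity in high:low (RDX:RAX);
 * divq divides it by the divisor in one shot, but raises #DE if the quotient
 * overflows 64 bits, which is exactly the high >= divisor case guarded below.
 */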
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
                                  u64 divisor, u64 *result)
{
        u64 low = a << shift, high = a >> (64 - shift);

        /* Avoid a divide error (#DE) on divq due to quotient overflow. */
        if (high >= divisor)
                return 1;

        /* low holds the quotient; high holds the remainder, which is discarded. */
        asm("divq %2\n\t" : "=a" (low), "=d" (high) :
            "rm" (divisor), "0" (low), "1" (high));
        *result = low;

        return 0;
}

int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
                     bool *expired)
{
        struct vcpu_vmx *vmx;
        u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
        struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;

        vmx = to_vmx(vcpu);
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
        lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
                                                    ktimer->timer_advance_ns);

        if (delta_tsc > lapic_timer_advance_cycles)
                delta_tsc -= lapic_timer_advance_cycles;
        else
                delta_tsc = 0;

        /* Convert to host delta tsc if tsc scaling is enabled */
        if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
            delta_tsc && u64_shl_div_u64(delta_tsc,
                                kvm_caps.tsc_scaling_ratio_frac_bits,
                                vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
                return -ERANGE;

        /*
         * If the delta tsc can't fit in 32 bits after being shifted by the
         * timer rate multiplier, we can't use the preemption timer.  It might
         * fit on later vmentries, but checking on every vmentry is costly, so
         * just use an hrtimer.
         */
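        /*
         * I.e. the value ultimately programmed into the 32-bit preemption
         * timer is roughly delta_tsc >> cpu_preemption_timer_multi; any bits
         * set above bit 31 of that value mean the deadline can't be reached.
         */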
        if (delta_tsc >> (cpu_preemption_timer_multi + 32))
                return -ERANGE;

        vmx->hv_deadline_tsc = tscl + delta_tsc;
        *expired = !delta_tsc;
        return 0;
}

void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
        to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif

void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (WARN_ON_ONCE(!enable_pml))
                return;

        guard(vmx_vmcs01)(vcpu);

        /*
         * Note, nr_memslots_dirty_logging can be changed concurrently with
         * this code, but in that case another update request will be made and
         * so the guest will never run with a stale PML value.
         */
        if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
                secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
        else
                secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}

void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
        if (vcpu->arch.mcg_cap & MCG_LMCE_P)
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
                        FEAT_CTL_LMCE_ENABLED;
        else
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
                        ~FEAT_CTL_LMCE_ENABLED;
}

#ifdef CONFIG_KVM_SMM
int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        /* We need a nested VM-Exit to enter SMM; postpone if a run is pending. */
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return -EBUSY;
        return !is_smm(vcpu);
}

int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
         * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong;
         * SMI and RSM only modify state that is saved and restored via SMRAM.
         * E.g. most MSRs are left untouched, but many are modified by VM-Exit
         * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
         */
        vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
        if (vmx->nested.smm.guest_mode)
                nested_vmx_vmexit(vcpu, -1, 0, 0);

        vmx->nested.smm.vmxon = vmx->nested.vmxon;
        vmx->nested.vmxon = false;
        vmx_clear_hlt(vcpu);
        return 0;
}

int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int ret;

        if (vmx->nested.smm.vmxon) {
                vmx->nested.vmxon = true;
                vmx->nested.smm.vmxon = false;
        }

        if (vmx->nested.smm.guest_mode) {
                /* Triple fault if the state is invalid.  */
                if (nested_vmx_check_restored_vmcs12(vcpu) < 0)
                        return 1;

                ret = nested_vmx_enter_non_root_mode(vcpu, false);
                if (ret != NVMX_VMENTRY_SUCCESS)
                        return 1;

                vmx->nested.nested_run_pending = 1;
                vmx->nested.smm.guest_mode = false;
        }
        return 0;
}

void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
        /* RSM will cause a vmexit anyway.  */
}
#endif

bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
        return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}

void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
        if (is_guest_mode(vcpu)) {
                struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;

                /* Requeue on the current CPU if the timer was pending. */
                if (hrtimer_try_to_cancel(timer) == 1)
                        hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
        }
}

void vmx_hardware_unsetup(void)
{
        kvm_set_posted_intr_wakeup_handler(NULL);

        if (nested)
                nested_vmx_hardware_unsetup();

        free_kvm_area();
}

void vmx_vm_destroy(struct kvm *kvm)
{
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

        free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
}

/*
 * Note, the SDM states that the linear address is masked *after* the modified
 * canonicality check, whereas KVM masks (untags) the address and then performs
 * a "normal" canonicality check.  Functionally, the two methods are identical,
 * and when the masking occurs relative to the canonicality check isn't visible
 * to software, i.e. KVM's behavior doesn't violate the SDM.
 */
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
{
        int lam_bit;
        unsigned long cr3_bits;

        if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
                return gva;

        if (!is_64_bit_mode(vcpu))
                return gva;

        /*
         * Bit 63 determines whether the address should be treated as a user
         * address or a supervisor address.
         */
        if (!(gva & BIT_ULL(63))) {
                cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
                if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
                        return gva;

                /* LAM_U48 is ignored if LAM_U57 is set. */
                lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
        } else {
                if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
                        return gva;

                lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
        }

        /*
         * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
         * Bit 63 is retained from the raw virtual address so that untagging
         * doesn't change a user access to a supervisor access, and vice versa.
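         * E.g. with LAM_U57 active (lam_bit == 56), the tag bits are 62:57:
         * 0x40ff888800001234 untags to 0x00ff888800001234 for a user (bit 63
         * clear) address.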
         */
        return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}

static unsigned int vmx_handle_intel_pt_intr(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

        /* '0' on failure so that the !PT case can use a RET0 static call. */
        if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
                return 0;

        kvm_make_request(KVM_REQ_PMI, vcpu);
        __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
                  (unsigned long *)&vcpu->arch.pmu.global_status);
        return 1;
}

static __init void vmx_setup_user_return_msrs(void)
{
        /*
         * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, KVM
         * will emulate SYSCALL in legacy mode if the vendor string in guest
         * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!".  To
         * support this emulation, MSR_STAR is included in the list for i386,
         * but is never loaded into hardware.  MSR_CSTAR is also never loaded
         * into hardware and is here purely for emulation purposes.
         */
        const u32 vmx_uret_msrs_list[] = {
        #ifdef CONFIG_X86_64
                MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
        #endif
                MSR_EFER, MSR_TSC_AUX, MSR_STAR,
                MSR_IA32_TSX_CTRL,
        };
        int i;

        BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);

        for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
                kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}

static void __init vmx_setup_me_spte_mask(void)
{
        u64 me_mask = 0;

        /*
         * On pre-MKTME systems, boot_cpu_data.x86_phys_bits equals
         * kvm_host.maxphyaddr.  On MKTME and/or TDX capable systems,
         * boot_cpu_data.x86_phys_bits holds the number of physical address
         * bits w/o the KeyID bits, while kvm_host.maxphyaddr equals
         * MAXPHYADDR as reported by CPUID.  The bits in between are KeyID
         * bits.
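         * E.g. with 6 KeyID bits carved out of a 52-bit MAXPHYADDR,
         * x86_phys_bits is 46 and bits 51:46 become the KeyID mask.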
         */
        if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
                me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
                                    kvm_host.maxphyaddr - 1);

        /*
         * Unlike SME, the host kernel doesn't support setting up any MKTME
         * KeyID on Intel platforms, so no memory encryption bits should be
         * included in the SPTE.
         */
        kvm_mmu_set_me_spte_mask(0, me_mask);
}

__init int vmx_hardware_setup(void)
{
        unsigned long host_bndcfgs;
        struct desc_ptr dt;
        int r;

        store_idt(&dt);
        host_idt_base = dt.address;

        vmx_setup_user_return_msrs();

        if (boot_cpu_has(X86_FEATURE_NX))
                kvm_enable_efer_bits(EFER_NX);

        if (boot_cpu_has(X86_FEATURE_MPX)) {
                rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
                WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
        }

        if (!cpu_has_vmx_mpx())
                kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
                                             XFEATURE_MASK_BNDCSR);

        if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
            !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;

        if (!cpu_has_vmx_ept() ||
            !cpu_has_vmx_ept_4levels() ||
            !cpu_has_vmx_ept_mt_wb() ||
            !cpu_has_vmx_invept_global())
                enable_ept = 0;

        /* NX support is required for shadow paging. */
        if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
                pr_err_ratelimited("NX (Execute Disable) not supported\n");
                return -EOPNOTSUPP;
        }

        /*
         * Shadow paging doesn't have a (further) performance penalty from
         * GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable it by default.
         */
        if (!enable_ept)
                allow_smaller_maxphyaddr = true;

        if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
                enable_ept_ad_bits = 0;

        if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
                enable_unrestricted_guest = 0;

        if (!cpu_has_vmx_flexpriority())
                flexpriority_enabled = 0;

        if (!cpu_has_virtual_nmis())
                enable_vnmi = 0;

#ifdef CONFIG_X86_SGX_KVM
        if (!cpu_has_vmx_encls_vmexit())
                enable_sgx = false;
#endif

        /*
         * set_apic_access_page_addr() is used to reload the APIC access page
         * upon invalidation.  No need to do anything if KVM is not using the
         * APIC_ACCESS_ADDR VMCS field.
         */
        if (!flexpriority_enabled)
                vt_x86_ops.set_apic_access_page_addr = NULL;

        if (!cpu_has_vmx_tpr_shadow())
                vt_x86_ops.update_cr8_intercept = NULL;

#if IS_ENABLED(CONFIG_HYPERV)
        if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
            && enable_ept) {
                vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
                vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
        }
#endif

        if (!cpu_has_vmx_ple()) {
                ple_gap = 0;
                ple_window = 0;
                ple_window_grow = 0;
                ple_window_max = 0;
                ple_window_shrink = 0;
        }

        if (!cpu_has_vmx_apicv())
                enable_apicv = 0;
        if (!enable_apicv)
                vt_x86_ops.sync_pir_to_irr = NULL;

        if (!enable_apicv || !cpu_has_vmx_ipiv())
                enable_ipiv = false;

        if (cpu_has_vmx_tsc_scaling())
                kvm_caps.has_tsc_control = true;

        kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
        kvm_caps.tsc_scaling_ratio_frac_bits = 48;
        kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
        kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();

        set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

        if (enable_ept)
                kvm_mmu_set_ept_masks(enable_ept_ad_bits,
                                      cpu_has_vmx_ept_execute_only());
        else
                vt_x86_ops.get_mt_mask = NULL;

        /*
         * Set up shadow_me_value/shadow_me_mask so that the MKTME KeyID bits
         * are included in shadow_zero_check.
         */
        vmx_setup_me_spte_mask();

        kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
                          ept_caps_to_lpage_level(vmx_capability.ept));

        /*
         * Only enable PML when hardware supports the PML feature, and both EPT
         * and EPT A/D bit features are enabled -- PML depends on them to work.
         */
        if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
                enable_pml = 0;

        if (!cpu_has_vmx_preemption_timer())
                enable_preemption_timer = false;

        if (enable_preemption_timer) {
                u64 use_timer_freq = 5000ULL * 1000 * 1000;

                cpu_preemption_timer_multi =
                        vmx_misc_preemption_timer_rate(vmcs_config.misc);

                if (tsc_khz)
                        use_timer_freq = (u64)tsc_khz * 1000;
                use_timer_freq >>= cpu_preemption_timer_multi;

                /*
                 * KVM "disables" the preemption timer by setting it to its max
                 * value.  Don't use the timer if it might cause spurious exits
                 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
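                 * E.g. a 4.3 GHz TSC with the rate divided by 2^3 yields an
                 * effective ~537 MHz timer that wraps every ~8 seconds, which
                 * would be rejected here.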
                 */
                if (use_timer_freq > 0xffffffffu / 10)
                        enable_preemption_timer = false;
        }

        if (!enable_preemption_timer) {
                vt_x86_ops.set_hv_timer = NULL;
                vt_x86_ops.cancel_hv_timer = NULL;
        }

        kvm_caps.supported_mce_cap |= MCG_LMCE_P;
        kvm_caps.supported_mce_cap |= MCG_CMCI_P;

        if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
                return -EINVAL;
        if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
                pt_mode = PT_MODE_SYSTEM;
        if (pt_mode == PT_MODE_HOST_GUEST)
                vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
        else
                vt_init_ops.handle_intel_pt_intr = NULL;

        setup_default_sgx_lepubkeyhash();

        vmx_set_cpu_caps();

        /*
         * Configure nested capabilities after core CPU capabilities so that
         * nested support can be conditional on base support, e.g. so that KVM
         * can hide/show features based on kvm_cpu_cap_has().
         */
        if (nested) {
                r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
                if (r)
                        return r;
        }

        r = alloc_kvm_area();
        if (r)
                goto err_kvm_area;

        kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);

        /*
         * On Intel CPUs that lack the self-snoop feature, letting the guest
         * control memory types may result in unexpected behavior.  So always
         * ignore guest PAT on those CPUs and map VM memory as writeback, not
         * allowing userspace to disable the quirk.
         *
         * On certain Intel CPUs (e.g. SPR, ICX), though the self-snoop feature
         * is supported, UC is slow enough to cause issues with some older
         * guests (e.g. an old version of the bochs driver uses ioremap()
         * instead of ioremap_wc() to map the video RAM, causing the Wayland
         * desktop to fail to start correctly).  To avoid breaking those older
         * guests that rely on KVM to force memory types to WB, provide
         * KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the safer (for
         * performance) default behavior.
         *
         * On top of this, non-coherent DMA devices need the guest to flush CPU
         * caches properly.  This also requires honoring guest PAT, and is
         * forced independent of the quirk in vmx_ignore_guest_pat().
         */
        if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
                kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;

        kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;

        return 0;

err_kvm_area:
        if (nested)
                nested_vmx_hardware_unsetup();
        return r;
}

void vmx_exit(void)
{
        allow_smaller_maxphyaddr = false;

        vmx_cleanup_l1d_flush();

        kvm_x86_vendor_exit();
}

int __init vmx_init(void)
{
        int r, cpu;

        KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);

        if (!kvm_is_vmx_supported())
                return -EOPNOTSUPP;

        /*
         * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
         * i.e. there's nothing to unwind if a later step fails.
         */
        hv_init_evmcs();

        /*
         * Parse the VMCS config and VMX capabilities before anything else, so
         * that the information is available to all setup flows.
         */
        if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
                return -EIO;

        r = kvm_x86_vendor_init(&vt_init_ops);
        if (r)
                return r;

        /* Must be called after common x86 init so enable_ept is properly set up. */
        r = vmx_setup_l1d_flush();
        if (r)
                goto err_l1d_flush;

        for_each_possible_cpu(cpu) {
                INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));

                pi_init_cpu(cpu);
        }

        vmx_check_vmcs12_offsets();

        return 0;

err_l1d_flush:
        kvm_x86_vendor_exit();
        return r;
}