// SPDX-License-Identifier: GPL-2.0
#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <asm/cpufeature.h>
#include <asm/fpu/xcr.h>
#include <linux/misc_cgroup.h>
#include <linux/mmu_context.h>
#include <asm/tdx.h>
#include "capabilities.h"
#include "mmu.h"
#include "x86_ops.h"
#include "lapic.h"
#include "tdx.h"
#include "vmx.h"
#include "mmu/spte.h"
#include "common.h"
#include "posted_intr.h"
#include "irq.h"
#include <trace/events/kvm.h>
#include "trace.h"

#pragma GCC poison to_vmx

#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...)                       \
({                                                                              \
        struct kvm *_kvm = (__kvm);                                             \
        bool __ret = !!(__err);                                                 \
                                                                                \
        if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) {               \
                if (_kvm)                                                       \
                        kvm_vm_bugged(_kvm);                                    \
                pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
                                   __err,  __args);                             \
        }                                                                       \
        unlikely(__ret);                                                        \
})

#define TDX_BUG_ON(__err, __fn, __kvm)                          \
        __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")

#define TDX_BUG_ON_1(__err, __fn, a1, __kvm)                    \
        __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)

#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm)        \
        __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)

#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm)    \
        __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx, " #a3 " 0x%llx", \
                     a1, a2, a3)


bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);

#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))

static enum cpuhp_state tdx_cpuhp_state;

static const struct tdx_sys_info *tdx_sysinfo;

void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
{
        KVM_BUG_ON(1, tdx->vcpu.kvm);
        pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
}

void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
                      u64 val, u64 err)
{
        KVM_BUG_ON(1, tdx->vcpu.kvm);
        pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
}

#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)

static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
{
        return container_of(kvm, struct kvm_tdx, kvm);
}

static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
{
        return container_of(vcpu, struct vcpu_tdx, vcpu);
}

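/*
 * Compute the set of TD attributes KVM can expose to userspace: start from
 * the attributes KVM supports, fail if the TDX module mandates (fixed1) an
 * attribute KVM doesn't support, and clear attributes the module forces to
 * zero (fixed0).
 */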
static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
{
        u64 val = KVM_SUPPORTED_TD_ATTRS;

        if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
                return 0;

        val &= td_conf->attributes_fixed0;

        return val;
}

static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
{
        u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;

        if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
                return 0;

        val &= td_conf->xfam_fixed0;

        return val;
}

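/*
 * Helpers for the guest physical address bits field, CPUID.0x80000008:EAX[23:16],
 * which KVM repurposes as the userspace interface for configuring the TD's
 * GPAW (see td_init_cpuid_entry2() below).
 */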
static int tdx_get_guest_phys_addr_bits(const u32 eax)
{
        return (eax & GENMASK(23, 16)) >> 16;
}

static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
{
        return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
}

#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))

static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
{
        return entry->function == 7 && entry->index == 0 &&
               (entry->ebx & TDX_FEATURE_TSX);
}

static void clear_tsx(struct kvm_cpuid_entry2 *entry)
{
        entry->ebx &= ~TDX_FEATURE_TSX;
}

static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
{
        return entry->function == 7 && entry->index == 0 &&
               (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
}

static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
{
        entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
}

static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
{
        if (has_tsx(entry))
                clear_tsx(entry);

        if (has_waitpkg(entry))
                clear_waitpkg(entry);
}

static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
{
        return has_tsx(entry) || has_waitpkg(entry);
}

#define KVM_TDX_CPUID_NO_SUBLEAF        ((__u32)-1)

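/*
 * Build a kvm_cpuid_entry2 from the TDX module's directly-configurable CPUID
 * metadata: the leaf/subleaf are packed into cpuid_config_leaves[idx], and
 * EAX/EBX and ECX/EDX are packed into the two cpuid_config_values[idx] words.
 */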
static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
{
        const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;

        entry->function = (u32)td_conf->cpuid_config_leaves[idx];
        entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
        entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
        entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
        entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
        entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;

        if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
                entry->index = 0;

        /*
         * The TDX module doesn't allow configuring the guest phys addr bits
         * (EAX[23:16]).  However, KVM uses this field as an interface for
         * userspace to configure the GPAW.  Report these bits as configurable.
         */
        if (entry->function == 0x80000008)
                entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);

        tdx_clear_unsupported_cpuid(entry);
}

#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT       BIT(1)

static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
                             struct kvm_tdx_capabilities *caps)
{
        int i;

        caps->supported_attrs = tdx_get_supported_attrs(td_conf);
        if (!caps->supported_attrs)
                return -EIO;

        caps->supported_xfam = tdx_get_supported_xfam(td_conf);
        if (!caps->supported_xfam)
                return -EIO;

        caps->cpuid.nent = td_conf->num_cpuid_config;

        caps->user_tdvmcallinfo_1_r11 =
                TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;

        for (i = 0; i < td_conf->num_cpuid_config; i++)
                td_init_cpuid_entry2(&caps->cpuid.entries[i], i);

        return 0;
}

/*
 * Some SEAMCALLs acquire the TDX module's global lock, and can fail with
 * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
 */
static DEFINE_MUTEX(tdx_lock);

static atomic_t nr_configured_hkid;

static bool tdx_operand_busy(u64 err)
{
        return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}


/*
 * A per-CPU list of TD vCPUs associated with a given CPU.
 * Protected by disabling interrupts.  Only manipulated by the CPU that owns
 * this per-CPU list.
 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
 *   the old CPU during the IPI callback running on the old CPU, and then added
 *   to the per-CPU list of the new CPU.
 * - When a TD is tearing down, all vCPUs are disassociated from their current
 *   running CPUs and removed from the per-CPU list during the IPI callback
 *   running on those CPUs.
 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
 *   associated TD vCPUs and remove them from the per-CPU list.
 */
static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);

static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
{
        return to_tdx(vcpu)->vp_enter_args.r10;
}

static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
{
        return to_tdx(vcpu)->vp_enter_args.r11;
}

static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
                                                     long val)
{
        to_tdx(vcpu)->vp_enter_args.r10 = val;
}

static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
                                                    unsigned long val)
{
        to_tdx(vcpu)->vp_enter_args.r11 = val;
}

static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
{
        tdx_guest_keyid_free(kvm_tdx->hkid);
        kvm_tdx->hkid = -1;
        atomic_dec(&nr_configured_hkid);
        misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
        put_misc_cg(kvm_tdx->misc_cg);
        kvm_tdx->misc_cg = NULL;
}

static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
{
        return kvm_tdx->hkid > 0;
}

static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
{
        lockdep_assert_irqs_disabled();

        list_del(&to_tdx(vcpu)->cpu_list);

        /*
         * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
         * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
         * to its list before it's deleted from this CPU's list.
         */
        smp_wmb();

        vcpu->cpu = -1;
}

/*
 * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
 * retry (if necessary) after forcing vCPUs to exit and wait for the operation
 * to complete.  All flows that remove/block S-EPT entries run with mmu_lock
 * held for write, i.e. are mutually exclusive with each other, but they aren't
 * mutually exclusive with running vCPUs, and so can fail with "operand busy"
 * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
 *
 * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
 */
#define tdh_do_no_vcpus(tdh_func, kvm, args...)                                 \
({                                                                              \
        struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm);                            \
        u64 __err;                                                              \
                                                                                \
        lockdep_assert_held_write(&kvm->mmu_lock);                              \
                                                                                \
        __err = tdh_func(args);                                                 \
        if (unlikely(tdx_operand_busy(__err))) {                                \
                WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true);                 \
                kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);     \
                                                                                \
                __err = tdh_func(args);                                         \
                                                                                \
                WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false);                \
        }                                                                       \
        __err;                                                                  \
})

/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
{
        u64 err, rcx, rdx, r8;

        err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);

        /*
         * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
         * before the HKID is released and control pages have also been
         * released at this point, so there is no possibility of contention.
         */
        if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
                return -EIO;

        return 0;
}

static int tdx_reclaim_page(struct page *page)
{
        int r;

        r = __tdx_reclaim_page(page);
        if (!r)
                tdx_quirk_reset_page(page);
        return r;
}


/*
 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
 * private KeyID.  Assume the cache associated with the TDX private KeyID has
 * been flushed.
 */
static void tdx_reclaim_control_page(struct page *ctrl_page)
{
        /*
         * Leak the page if the kernel failed to reclaim the page.
         * The kernel cannot use it safely anymore.
         */
        if (tdx_reclaim_page(ctrl_page))
                return;

        __free_page(ctrl_page);
}

struct tdx_flush_vp_arg {
        struct kvm_vcpu *vcpu;
        u64 err;
};

static void tdx_flush_vp(void *_arg)
{
        struct tdx_flush_vp_arg *arg = _arg;
        struct kvm_vcpu *vcpu = arg->vcpu;
        u64 err;

        arg->err = 0;
        lockdep_assert_irqs_disabled();

        /* Task migration can race with CPU offlining. */
        if (unlikely(vcpu->cpu != raw_smp_processor_id()))
                return;

        /*
         * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
         * list tracking still needs to be updated so that it's correct if/when
         * the vCPU does get initialized.
         */
        if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
                /*
                 * No need to retry.  The TDX resources needed for TDH.VP.FLUSH
                 * are: TDVPR as exclusive, TDR as shared, and TDCS as shared.
                 * This flush is done only when destroying a vCPU/TD or during
                 * vCPU migration, and no other thread uses TDVPR in those
                 * cases.
                 */
                err = tdh_vp_flush(&to_tdx(vcpu)->vp);
                if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
                        /*
                         * This function is called in IPI context. Do not use
                         * printk to avoid console semaphore.
                         * The caller prints out the error message, instead.
                         */
                        if (err)
                                arg->err = err;
                }
        }

        tdx_disassociate_vp(vcpu);
}

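/*
 * IPI the CPU the vCPU is currently associated with to run TDH.VP.FLUSH and
 * drop the association.  No-op if the vCPU isn't associated with any CPU.
 */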
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
        struct tdx_flush_vp_arg arg = {
                .vcpu = vcpu,
        };
        int cpu = vcpu->cpu;

        if (unlikely(cpu == -1))
                return;

        smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);

        TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}

void tdx_disable_virtualization_cpu(void)
{
        int cpu = raw_smp_processor_id();
        struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
        struct tdx_flush_vp_arg arg;
        struct vcpu_tdx *tdx, *tmp;
        unsigned long flags;

        local_irq_save(flags);
        /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
        list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
                arg.vcpu = &tdx->vcpu;
                tdx_flush_vp(&arg);
        }
        local_irq_restore(flags);

        /*
         * Flush cache now if kexec is possible: this is necessary to avoid
         * having dirty private memory cachelines when the new kernel boots,
         * but WBINVD is a relatively expensive operation and doing it during
         * kexec can exacerbate races in native_stop_other_cpus().  Do it
         * now, since this is a safe moment and there is going to be no more
         * TDX activity on this CPU from this point on.
         */
        tdx_cpu_flush_cache_for_kexec();
}

#define TDX_SEAMCALL_RETRIES 10000

static void smp_func_do_phymem_cache_wb(void *unused)
{
        u64 err = 0;
        bool resume;
        int i;

        /*
         * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
         * KeyID on the package or core.  The TDX module may not finish the
         * cache flush but return TDX_INTERRUPTED_RESUMABLE instead.  The
         * kernel should retry until it returns success, without rescheduling.
         */
        for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
                resume = !!err;
                err = tdh_phymem_cache_wb(resume);
                switch (err) {
                case TDX_INTERRUPTED_RESUMABLE:
                        continue;
                case TDX_NO_HKID_READY_TO_WBCACHE:
                        err = TDX_SUCCESS; /* Already done by other thread */
                        fallthrough;
                default:
                        goto out;
                }
        }

out:
        TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}

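/*
 * Release the TD's HKID: flush all vCPU associations, mark the flushes done
 * via TDH.MNG.VPFLUSHDONE, write back caches with TDH.PHYMEM.CACHE.WB (once
 * per package, or on every CPU if the cpumask allocation failed), and finally
 * free the HKID with TDH.MNG.KEY.FREEID.
 */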
void tdx_mmu_release_hkid(struct kvm *kvm)
{
        bool packages_allocated, targets_allocated;
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        cpumask_var_t packages, targets;
        struct kvm_vcpu *vcpu;
        unsigned long j;
        int i;
        u64 err;

        if (!is_hkid_assigned(kvm_tdx))
                return;

        packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
        targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
        cpus_read_lock();

        kvm_for_each_vcpu(j, vcpu, kvm)
                tdx_flush_vp_on_cpu(vcpu);

        /*
         * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
         * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
         * Multiple TDX guests can be destroyed simultaneously.  Take the
         * mutex so that concurrent destructions don't trip that error.
         */
        mutex_lock(&tdx_lock);

        /*
         * Releasing HKID is in vm_destroy().
         * After flushing the vPs above, there should be no more vCPU
         * associations, as all vCPU fds have been released at this stage.
         */
        err = tdh_mng_vpflushdone(&kvm_tdx->td);
        if (err == TDX_FLUSHVP_NOT_DONE)
                goto out;
        if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
                pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
                       kvm_tdx->hkid);
                goto out;
        }

        for_each_online_cpu(i) {
                if (packages_allocated &&
                    cpumask_test_and_set_cpu(topology_physical_package_id(i),
                                             packages))
                        continue;
                if (targets_allocated)
                        cpumask_set_cpu(i, targets);
        }
        if (targets_allocated)
                on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
        else
                on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
        /*
         * In the case of error in smp_func_do_phymem_cache_wb(), the following
         * tdh_mng_key_freeid() will fail.
         */
        err = tdh_mng_key_freeid(&kvm_tdx->td);
        if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
                pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
                       kvm_tdx->hkid);
        } else {
                tdx_hkid_free(kvm_tdx);
        }

out:
        mutex_unlock(&tdx_lock);
        cpus_read_unlock();
        free_cpumask_var(targets);
        free_cpumask_var(packages);
}

static void tdx_reclaim_td_control_pages(struct kvm *kvm)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        u64 err;
        int i;

        /*
         * tdx_mmu_release_hkid() failed to reclaim the HKID, meaning something
         * has gone badly wrong with the TDX module.  Give up freeing the TD
         * pages.  tdx_mmu_release_hkid() already warned, so don't warn again.
         */
        if (is_hkid_assigned(kvm_tdx))
                return;

        if (kvm_tdx->td.tdcs_pages) {
                for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
                        if (!kvm_tdx->td.tdcs_pages[i])
                                continue;

                        tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
                }
                kfree(kvm_tdx->td.tdcs_pages);
                kvm_tdx->td.tdcs_pages = NULL;
        }

        if (!kvm_tdx->td.tdr_page)
                return;

        if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
                return;

        /*
         * Use a SEAMCALL to ask the TDX module to flush the cache of the TDR
         * page, based on the KeyID.  The TDX module may access the TDR while
         * operating on the TD (especially when it is reclaiming the TDCS).
         */
        err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
        if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
                return;

        tdx_quirk_reset_page(kvm_tdx->td.tdr_page);

        __free_page(kvm_tdx->td.tdr_page);
        kvm_tdx->td.tdr_page = NULL;
}

void tdx_vm_destroy(struct kvm *kvm)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

        tdx_reclaim_td_control_pages(kvm);

        kvm_tdx->state = TD_STATE_UNINITIALIZED;
}

static int tdx_do_tdh_mng_key_config(void *param)
{
        struct kvm_tdx *kvm_tdx = param;
        u64 err;

        /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
        err = tdh_mng_key_config(&kvm_tdx->td);
        if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
                return -EIO;

        return 0;
}

int tdx_vm_init(struct kvm *kvm)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

        kvm->arch.has_protected_state = true;
        /*
         * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap,
         * i.e. all EOIs are accelerated and never trigger exits.
         */
        kvm->arch.has_protected_eoi = true;
        kvm->arch.has_private_mem = true;
        kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;

        /*
         * Because the guest TD is protected, the VMM can't parse instructions
         * executed in the TD.  Instead, the guest uses the MMIO hypercall.  For
         * unmodified device drivers, a #VE needs to be injected for MMIO so
         * that the #VE handler in the TD converts the MMIO instruction into an
         * MMIO hypercall.
         *
         * The SPTE value for MMIO needs to be set up so that #VE is injected
         * into the TD instead of triggering EPT MISCONFIG:
         * - RWX=0 so that an EPT violation is triggered.
         * - the suppress #VE bit is cleared to inject #VE.
         */
        kvm_mmu_set_mmio_spte_value(kvm, 0);

        /*
         * TDX has its own limit of maximum vCPUs it can support for all
         * TDX guests in addition to KVM_MAX_VCPUS.  TDX module reports
         * such limit via the MAX_VCPU_PER_TD global metadata.  In
         * practice, it reflects the number of logical CPUs that ALL
         * platforms that the TDX module supports can possibly have.
         *
         * Limit TDX guest's maximum vCPUs to the number of logical CPUs
         * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
         * userspace would result in an unpredictable ABI.
         */
        kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());

        kvm_tdx->state = TD_STATE_UNINITIALIZED;

        return 0;
}

int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        if (kvm_tdx->state != TD_STATE_INITIALIZED)
                return -EIO;

        /*
         * TDX module mandates APICv, which requires an in-kernel local APIC.
         * Disallow an in-kernel I/O APIC, because level-triggered interrupts
         * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
         */
        if (!irqchip_split(vcpu->kvm))
                return -EINVAL;

        fpstate_set_confidential(&vcpu->arch.guest_fpu);
        vcpu->arch.apic->guest_apic_protected = true;
        INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);

        vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;

        vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
        vcpu->arch.cr0_guest_owned_bits = -1ul;
        vcpu->arch.cr4_guest_owned_bits = -1ul;

        /* KVM can't change TSC offset/multiplier as TDX module manages them. */
        vcpu->arch.guest_tsc_protected = true;
        vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
        vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
        vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
        vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;

        vcpu->arch.guest_state_protected =
                !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);

        if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
                vcpu->arch.xfd_no_write_intercept = true;

        tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
        __pi_set_sn(&tdx->vt.pi_desc);

        tdx->state = VCPU_TD_STATE_UNINITIALIZED;

        return 0;
}

void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        vmx_vcpu_pi_load(vcpu, cpu);
        if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
                return;

        tdx_flush_vp_on_cpu(vcpu);

        KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
        local_irq_disable();
        /*
         * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
         * vcpu->cpu is read before tdx->cpu_list.
         */
        smp_rmb();

        list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
        local_irq_enable();
}

bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
        /*
         * KVM can't get the interrupt status of a TDX guest, so it assumes
         * interrupts are always allowed unless the guest calls TDVMCALL with
         * HLT, which passes the interrupt-blocked flag.
         */
        return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
               !to_tdx(vcpu)->vp_enter_args.r12;
}

static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
        u64 vcpu_state_details;

        if (pi_has_pending_interrupt(vcpu))
                return true;

        /*
         * Only check RVI pending for HALTED case with IRQ enabled.
         * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
         * interrupt was pending before TD exit, then it _must_ be blocked,
         * otherwise the interrupt would have been serviced at the instruction
         * boundary.
         */
        if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
            to_tdx(vcpu)->vp_enter_args.r12)
                return false;

        vcpu_state_details =
                td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);

        return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}

struct tdx_uret_msr {
        u32 msr;
        unsigned int slot;
        u64 defval;
};

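/*
 * User-return MSRs clobbered by TDH.VP.ENTER.  .defval is the value the TDX
 * module is expected to leave in the MSR on TD-Exit (see
 * tdx_prepare_switch_to_guest()); .slot is presumably filled in via the
 * common user-return MSR machinery during TDX setup (not shown here).
 */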
static struct tdx_uret_msr tdx_uret_msrs[] = {
        {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
        {.msr = MSR_STAR,},
        {.msr = MSR_LSTAR,},
        {.msr = MSR_TSC_AUX,},
};

void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
        struct vcpu_vt *vt = to_vt(vcpu);
        int i;

        if (vt->guest_state_loaded)
                return;

        if (likely(is_64bit_mm(current->mm)))
                vt->msr_host_kernel_gs_base = current->thread.gsbase;
        else
                vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);

        vt->guest_state_loaded = true;

        /*
         * Explicitly set user-return MSRs that are clobbered by the TDX-Module
         * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
         * written by the TDX-Module.  Don't rely on the TDX-Module to actually
         * clobber the MSRs, as the contract is poorly defined and not upheld.
         * E.g. the TDX-Module will synthesize an EPT Violation without doing
         * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
         * state.
         */
        for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
                kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
                                        tdx_uret_msrs[i].defval, -1ull);
}

static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
        struct vcpu_vt *vt = to_vt(vcpu);

        if (!vt->guest_state_loaded)
                return;

        ++vcpu->stat.host_state_reload;
        wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);

        vt->guest_state_loaded = false;
}

void tdx_vcpu_put(struct kvm_vcpu *vcpu)
{
        vmx_vcpu_pi_put(vcpu);
        tdx_prepare_switch_to_host(vcpu);
}

/*
 * Life cycles for a TD and a vCPU:
 * 1. KVM_CREATE_VM ioctl.
 *    TD state is TD_STATE_UNINITIALIZED.
 *    hkid is not assigned at this stage.
 * 2. KVM_TDX_INIT_VM ioctl.
 *    TD transitions to TD_STATE_INITIALIZED.
 *    hkid is assigned after this stage.
 * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
 *    3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
 *    3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
 *    3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
 *        kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
 * 4. KVM_TDX_INIT_VCPU ioctl.
 *    tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
 *    vCPU control structures are allocated at this stage.
 * 5. kvm_destroy_vm().
 *    5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
 *                                (2) puts hkid to !assigned state.
 *    5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
 *        transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
 *    5.3 tdx_vm_destroy()
 *        transitions TD to TD_STATE_UNINITIALIZED state.
 *
 * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
 * - If at 3.3, hkid is still assigned, but the vCPU must be in
 *   VCPU_TD_STATE_UNINITIALIZED state.
 * - if at 5.2, hkid must be !assigned and all vCPUs must be in
 *   VCPU_TD_STATE_INITIALIZED state and have been dissociated.
 */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        int i;

        if (vcpu->cpu != -1) {
                KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
                tdx_flush_vp_on_cpu(vcpu);
                return;
        }

        /*
         * It is not possible to reclaim pages while hkid is assigned. It might
         * be assigned if the TD VM is being destroyed but freeing hkid failed,
         * in which case the pages are leaked.
         */
        if (is_hkid_assigned(kvm_tdx))
                return;

        if (tdx->vp.tdcx_pages) {
                for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
                        if (tdx->vp.tdcx_pages[i])
                                tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
                }
                kfree(tdx->vp.tdcx_pages);
                tdx->vp.tdcx_pages = NULL;
        }
        if (tdx->vp.tdvpr_page) {
                tdx_reclaim_control_page(tdx->vp.tdvpr_page);
                tdx->vp.tdvpr_page = NULL;
                tdx->vp.tdvpr_pa = 0;
        }

        tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}

int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
        if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
                     to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
                return -EINVAL;

        return 1;
}

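/*
 * Map a TDVMCALL leaf to a synthetic VMX exit reason so that the request can
 * be routed to the common VMX exit handlers.  The EPT_VIOLATION leaf (guest
 * MMIO request) is reported as EPT_MISCONFIG; leaves with no VMX equivalent
 * are left as EXIT_REASON_TDCALL and handled via handle_tdvmcall().
 */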
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
        switch (tdvmcall_leaf(vcpu)) {
        case EXIT_REASON_CPUID:
        case EXIT_REASON_HLT:
        case EXIT_REASON_IO_INSTRUCTION:
        case EXIT_REASON_MSR_READ:
        case EXIT_REASON_MSR_WRITE:
                return tdvmcall_leaf(vcpu);
        case EXIT_REASON_EPT_VIOLATION:
                return EXIT_REASON_EPT_MISCONFIG;
        default:
                break;
        }

        return EXIT_REASON_TDCALL;
}

static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        u32 exit_reason;

        switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
        case TDX_SUCCESS:
        case TDX_NON_RECOVERABLE_VCPU:
        case TDX_NON_RECOVERABLE_TD:
        case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
        case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
                break;
        default:
                return -1u;
        }

        exit_reason = tdx->vp_enter_ret;

        switch (exit_reason) {
        case EXIT_REASON_TDCALL:
                if (tdvmcall_exit_type(vcpu))
                        return EXIT_REASON_VMCALL;

                return tdcall_to_vmx_exit_reason(vcpu);
        case EXIT_REASON_EPT_MISCONFIG:
                /*
                 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
                 * non-instrumentable code with interrupts disabled.
                 */
                return -1u;
        default:
                break;
        }

        return exit_reason;
}

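/*
 * Enter the TD via TDH.VP.ENTER and, on return, derive the synthetic VMX exit
 * reason and pull the exit qualification, extended qualification, exit GPA
 * and interrupt info out of the VP.ENTER output registers.
 */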
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        struct vcpu_vt *vt = to_vt(vcpu);

        guest_state_enter_irqoff();

        tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);

        vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);

        vt->exit_qualification = tdx->vp_enter_args.rcx;
        tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
        tdx->exit_gpa = tdx->vp_enter_args.r8;
        vt->exit_intr_info = tdx->vp_enter_args.r9;

        vmx_handle_nmi(vcpu);

        guest_state_exit_irqoff();
}

static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
{
        return vmx_get_exit_reason(vcpu).failed_vmentry &&
               vmx_get_exit_reason(vcpu).full != -1u;
}

static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
        u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;

        /*
         * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
         * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
         *
         * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
         * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
         * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
         * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
         * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
         * requester may be blocked endlessly.
         */
        if (unlikely(tdx_operand_busy(vp_enter_ret)))
                return EXIT_FASTPATH_EXIT_HANDLED;

        return EXIT_FASTPATH_NONE;
}

#define TDX_REGS_AVAIL_SET      (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
                                 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
                                 BIT_ULL(VCPU_REGS_RAX) | \
                                 BIT_ULL(VCPU_REGS_RBX) | \
                                 BIT_ULL(VCPU_REGS_RCX) | \
                                 BIT_ULL(VCPU_REGS_RDX) | \
                                 BIT_ULL(VCPU_REGS_RBP) | \
                                 BIT_ULL(VCPU_REGS_RSI) | \
                                 BIT_ULL(VCPU_REGS_RDI) | \
                                 BIT_ULL(VCPU_REGS_R8) | \
                                 BIT_ULL(VCPU_REGS_R9) | \
                                 BIT_ULL(VCPU_REGS_R10) | \
                                 BIT_ULL(VCPU_REGS_R11) | \
                                 BIT_ULL(VCPU_REGS_R12) | \
                                 BIT_ULL(VCPU_REGS_R13) | \
                                 BIT_ULL(VCPU_REGS_R14) | \
                                 BIT_ULL(VCPU_REGS_R15))

static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

        /*
         * All TDX hosts support PKRU; but even if they didn't,
         * vcpu->arch.host_pkru would be 0 and the wrpkru would be
         * skipped.
         */
        if (vcpu->arch.host_pkru != 0)
                wrpkru(vcpu->arch.host_pkru);

        if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
                xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

        /*
         * Likewise, even if a TDX host didn't support XSS, both arms of the
         * comparison would be 0 and the wrmsrl would be skipped.
         */
        if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
                wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}

#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
                                DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
                                DEBUGCTLMSR_FREEZE_IN_SMM)

fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        struct vcpu_vt *vt = to_vt(vcpu);

        /*
         * WARN if KVM wants to force an immediate exit, as the TDX module does
         * not guarantee entry into the guest, i.e. it's possible for KVM to
         * _think_ it completed entry to the guest and forced an immediate exit
         * without actually having done so.  Luckily, KVM never needs to force
         * an immediate exit for TDX (KVM can't do direct event injection), so
         * just WARN and continue on.
         */
        WARN_ON_ONCE(run_flags);

        /*
         * Wait until retry of SEPT-zap-related SEAMCALL completes before
         * allowing vCPU entry to avoid contention with tdh_vp_enter() and
         * TDCALLs.
         */
        if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
                return EXIT_FASTPATH_EXIT_HANDLED;

        trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);

        if (pi_test_on(&vt->pi_desc)) {
                apic->send_IPI_self(POSTED_INTR_VECTOR);

                if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
                               APIC_VECTOR_MASK, &vt->pi_desc))
                        kvm_wait_lapic_expire(vcpu);
        }

        tdx_vcpu_enter_exit(vcpu);

        if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
                update_debugctlmsr(vcpu->arch.host_debugctl);

        tdx_load_host_xsave_state(vcpu);

        vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;

        if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
                return EXIT_FASTPATH_NONE;

        if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
                return EXIT_FASTPATH_NONE;

        trace_kvm_exit(vcpu, KVM_ISA_VMX);

        if (unlikely(tdx_failed_vmentry(vcpu)))
                return EXIT_FASTPATH_NONE;

        return tdx_exit_handlers_fastpath(vcpu);
}

void tdx_inject_nmi(struct kvm_vcpu *vcpu)
{
        ++vcpu->stat.nmi_injections;
        td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
        /*
         * From KVM's perspective, NMI injection is completed right after
         * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
         * the TDX module or not.
         */
        vcpu->arch.nmi_injected = false;
        /*
         * TDX doesn't allow KVM to request an NMI-window exit.  If there is
         * still a pending vNMI, KVM is not able to inject it along with the
         * one pending in the TDX module in a back-to-back way.  Since the
         * previous vNMI is still pending in the TDX module, i.e. it has not
         * been delivered to the TDX guest yet, it's OK to collapse the pending
         * vNMI into the previous one.  The guest is expected to handle all the
         * NMI sources when handling the first vNMI.
         */
        vcpu->arch.nmi_pending = 0;
}

static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
{
        u32 intr_info = vmx_get_intr_info(vcpu);

        /*
         * Machine checks are handled by handle_exception_irqoff(), or by
         * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
         * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
         */
        if (is_nmi(intr_info) || is_machine_check(intr_info))
                return 1;

        vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
        vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
        vcpu->run->ex.error_code = 0;

        return 0;
}

static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
{
        tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
        return 1;
}

static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
{
        kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
        kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
        kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
        kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
        kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);

        return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
}

/*
 * Split the request into chunks and check for pending interrupts between
 * chunks.  This allows timely injection of interrupts and prevents issues
 * with guest lockup detection.
 */
#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
static void __tdx_map_gpa(struct vcpu_tdx *tdx);

static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        if (vcpu->run->hypercall.ret) {
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
                tdx->vp_enter_args.r11 = tdx->map_gpa_next;
                return 1;
        }

        tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
        if (tdx->map_gpa_next >= tdx->map_gpa_end)
                return 1;

        /*
         * Stop processing the remaining part if there is a pending interrupt
         * that could be delivered.  Skip checking pending RVI for
         * TDVMCALL_MAP_GPA; see the comments in
         * tdx_protected_apic_has_interrupt().
         */
        if (kvm_vcpu_has_events(vcpu)) {
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
                tdx->vp_enter_args.r11 = tdx->map_gpa_next;
                return 1;
        }

        __tdx_map_gpa(tdx);
        return 0;
}

static void __tdx_map_gpa(struct vcpu_tdx *tdx)
{
        u64 gpa = tdx->map_gpa_next;
        u64 size = tdx->map_gpa_end - tdx->map_gpa_next;

        if (size > TDX_MAP_GPA_MAX_LEN)
                size = TDX_MAP_GPA_MAX_LEN;

        tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
        tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
        /*
         * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
         * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
         * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
         * vcpu->run->hypercall.ret, ensure that it is zero to avoid breaking QEMU.
         */
        tdx->vcpu.run->hypercall.ret = 0;
        tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
        tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
        tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
                                           KVM_MAP_GPA_RANGE_ENCRYPTED :
                                           KVM_MAP_GPA_RANGE_DECRYPTED;
        tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;

        tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
}

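/*
 * Handle TDVMCALL_MAP_GPA (GPA in R12, size in R13): validate the range, then
 * forward it to userspace as KVM_HC_MAP_GPA_RANGE in TDX_MAP_GPA_MAX_LEN
 * chunks via __tdx_map_gpa().
 */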
static int tdx_map_gpa(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        u64 gpa = tdx->vp_enter_args.r12;
        u64 size = tdx->vp_enter_args.r13;
        u64 ret;

        /*
         * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
         * userspace to enable KVM_CAP_EXIT_HYPERCALL with the
         * KVM_HC_MAP_GPA_RANGE bit set.  This is a base call so it should
         * always be supported, but KVM has no way to ensure that userspace
         * implements the GHCI correctly.  So if KVM_HC_MAP_GPA_RANGE does not
         * cause an exit to userspace, return an error to the guest.
         */
        if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
                ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
                goto error;
        }

        if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
            !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
            (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
             vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
                ret = TDVMCALL_STATUS_INVALID_OPERAND;
                goto error;
        }

        if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
                ret = TDVMCALL_STATUS_ALIGN_ERROR;
                goto error;
        }

        tdx->map_gpa_end = gpa + size;
        tdx->map_gpa_next = gpa;

        __tdx_map_gpa(tdx);
        return 0;

error:
        tdvmcall_set_return_code(vcpu, ret);
        tdx->vp_enter_args.r11 = gpa;
        return 1;
}

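/*
 * Handle TDVMCALL_REPORT_FATAL_ERROR: exit to userspace with
 * KVM_SYSTEM_EVENT_TDX_FATAL, dumping the TDVMCALL registers in GPR order
 * (the RSP and RBP slots are reported as zero, and the RAX slot holds the
 * TDH.VP.ENTER return code).
 */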
static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        u64 *regs = vcpu->run->system_event.data;
        u64 *module_regs = &tdx->vp_enter_args.r8;
        int index = VCPU_REGS_RAX;

        vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
        vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
        vcpu->run->system_event.ndata = 16;

        /* Dump 16 general-purpose registers to userspace in ascending order. */
        regs[index++] = tdx->vp_enter_ret;
        regs[index++] = tdx->vp_enter_args.rcx;
        regs[index++] = tdx->vp_enter_args.rdx;
        regs[index++] = tdx->vp_enter_args.rbx;
        regs[index++] = 0;
        regs[index++] = 0;
        regs[index++] = tdx->vp_enter_args.rsi;
        regs[index] = tdx->vp_enter_args.rdi;
        for (index = 0; index < 8; index++)
                regs[VCPU_REGS_R8 + index] = module_regs[index];

        return 0;
}

static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
{
        u32 eax, ebx, ecx, edx;
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        /* EAX and ECX for CPUID are stored in R12 and R13. */
        eax = tdx->vp_enter_args.r12;
        ecx = tdx->vp_enter_args.r13;

        kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);

        tdx->vp_enter_args.r12 = eax;
        tdx->vp_enter_args.r13 = ebx;
        tdx->vp_enter_args.r14 = ecx;
        tdx->vp_enter_args.r15 = edx;

        return 1;
}

static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
{
        vcpu->arch.pio.count = 0;
        return 1;
}

static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
{
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        unsigned long val = 0;
        int ret;

        ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
                                         vcpu->arch.pio.port, &val, 1);

        WARN_ON_ONCE(!ret);

        tdvmcall_set_return_val(vcpu, val);

        return 1;
}

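/*
 * Handle TDVMCALL<Instruction.IO>: R12 holds the access size (1/2/4 bytes),
 * R13 the direction (1 = write), R14 the port, and R15 the data for writes.
 * Emulation that can't be completed in the kernel is punted to userspace via
 * the PIO machinery, with the result reported back to the guest in R11.
 */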
static int tdx_emulate_io(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        unsigned long val = 0;
        unsigned int port;
        u64 size, write;
        int ret;

        ++vcpu->stat.io_exits;

        size = tdx->vp_enter_args.r12;
        write = tdx->vp_enter_args.r13;
        port = tdx->vp_enter_args.r14;

        if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
                return 1;
        }

        if (write) {
                val = tdx->vp_enter_args.r15;
                ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
        } else {
                ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
        }

        if (!ret)
                vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
                                                           tdx_complete_pio_in;
        else if (!write)
                tdvmcall_set_return_val(vcpu, val);

        return ret;
}

static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
{
        unsigned long val = 0;
        gpa_t gpa;
        int size;

        gpa = vcpu->mmio_fragments[0].gpa;
        size = vcpu->mmio_fragments[0].len;

        memcpy(&val, vcpu->run->mmio.data, size);
        tdvmcall_set_return_val(vcpu, val);
        trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
        return 1;
}

static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
                                 unsigned long val)
{
        if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
                return 0;
        }

        trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
        if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
                return -EOPNOTSUPP;

        return 0;
}

static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
{
        unsigned long val;

        if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
                return -EOPNOTSUPP;

        tdvmcall_set_return_val(vcpu, val);
        trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
        return 0;
}

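/*
 * Handle TDVMCALL<#VE.RequestMMIO>: R12 holds the access size, R13 the
 * direction (1 = write), R14 the MMIO GPA, and R15 the data for writes.  Try
 * in-kernel device emulation first, and fall back to a KVM_EXIT_MMIO exit to
 * the userspace device model.
 */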
static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        int size, write, r;
        unsigned long val;
        gpa_t gpa;

        size = tdx->vp_enter_args.r12;
        write = tdx->vp_enter_args.r13;
        gpa = tdx->vp_enter_args.r14;
        val = write ? tdx->vp_enter_args.r15 : 0;

        if (size != 1 && size != 2 && size != 4 && size != 8)
                goto error;
        if (write != 0 && write != 1)
                goto error;

        /*
         * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
         * do MMIO emulation for a private GPA.
         */
        if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
            vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
                goto error;

        gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));

        if (write)
                r = tdx_mmio_write(vcpu, gpa, size, val);
        else
                r = tdx_mmio_read(vcpu, gpa, size);
        if (!r)
                /* Kernel completed device emulation. */
                return 1;

        /* Request the device emulation to userspace device model. */
        vcpu->mmio_is_write = write;
        if (!write)
                vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;

        vcpu->run->mmio.phys_addr = gpa;
        vcpu->run->mmio.len = size;
        vcpu->run->mmio.is_write = write;
        vcpu->run->exit_reason = KVM_EXIT_MMIO;

        if (write) {
                memcpy(vcpu->run->mmio.data, &val, size);
        } else {
                vcpu->mmio_fragments[0].gpa = gpa;
                vcpu->mmio_fragments[0].len = size;
                trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
        }
        return 0;

error:
        tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
        return 1;
}

static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);

        /*
         * For now, KVM doesn't directly support any TDVMCALL beyond the GHCI
         * base API without help from userspace, so just set the values
         * returned from userspace.
         */
        tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
        tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
        tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
        tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;

        return 1;
}

static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        switch (tdx->vp_enter_args.r12) {
        case 0:
                tdx->vp_enter_args.r11 = 0;
                tdx->vp_enter_args.r12 = 0;
                tdx->vp_enter_args.r13 = 0;
                tdx->vp_enter_args.r14 = 0;
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
                return 1;
        case 1:
                vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
                vcpu->run->exit_reason = KVM_EXIT_TDX;
                vcpu->run->tdx.flags = 0;
                vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
                vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
                vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
                vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
                vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
                vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
                vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
                return 0;
        default:
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
                return 1;
        }
}

static int tdx_complete_simple(struct kvm_vcpu *vcpu)
{
        tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
        return 1;
}

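/*
 * Handle TDVMCALL_GET_QUOTE (shared buffer GPA in R12, size in R13) by
 * forwarding the request to userspace via KVM_EXIT_TDX; userspace provides
 * the final return code via the get_quote.ret field.
 */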
static int tdx_get_quote(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        u64 gpa = tdx->vp_enter_args.r12;
        u64 size = tdx->vp_enter_args.r13;

        /* The GPA of the buffer must have the shared bit set. */
        if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
                return 1;
        }

        vcpu->run->exit_reason = KVM_EXIT_TDX;
        vcpu->run->tdx.flags = 0;
        vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
        vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
        vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
        vcpu->run->tdx.get_quote.size = size;

        vcpu->arch.complete_userspace_io = tdx_complete_simple;

        return 0;
}

static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        u64 vector = tdx->vp_enter_args.r12;

        if (vector < 32 || vector > 255) {
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
                return 1;
        }

        vcpu->run->exit_reason = KVM_EXIT_TDX;
        vcpu->run->tdx.flags = 0;
        vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
        vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
        vcpu->run->tdx.setup_event_notify.vector = vector;

        vcpu->arch.complete_userspace_io = tdx_complete_simple;

        return 0;
}

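/*
 * Dispatch TDVMCALL leaves that aren't remapped to VMX exit reasons by
 * tdcall_to_vmx_exit_reason().  Unknown leaves are failed with
 * TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED.
 */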
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
        switch (tdvmcall_leaf(vcpu)) {
        case TDVMCALL_MAP_GPA:
                return tdx_map_gpa(vcpu);
        case TDVMCALL_REPORT_FATAL_ERROR:
                return tdx_report_fatal_error(vcpu);
        case TDVMCALL_GET_TD_VM_CALL_INFO:
                return tdx_get_td_vm_call_info(vcpu);
        case TDVMCALL_GET_QUOTE:
                return tdx_get_quote(vcpu);
        case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
                return tdx_setup_event_notify_interrupt(vcpu);
        default:
                break;
        }

        tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
        return 1;
}

void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
{
        u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
                          TDX_SHARED_BIT_PWL_4;

        if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
                return;

        td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}

static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
                            kvm_pfn_t pfn)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        u64 err, entry, level_state;
        gpa_t gpa = gfn_to_gpa(gfn);

        lockdep_assert_held(&kvm->slots_lock);

        if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
            KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
                return -EIO;

        err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
                               kvm_tdx->page_add_src, &entry, &level_state);
        if (unlikely(tdx_operand_busy(err)))
                return -EBUSY;

        if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
                return -EIO;

        return 0;
}

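/*
 * Map a private page into a running (finalized) TD via TDH.MEM.PAGE.AUG.
 * Unlike TDH.MEM.PAGE.ADD, the new page is pending until the guest accepts it
 * with TDG.MEM.PAGE.ACCEPT.
 */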
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
                            enum pg_level level, kvm_pfn_t pfn)
{
        int tdx_level = pg_level_to_tdx_sept_level(level);
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        struct page *page = pfn_to_page(pfn);
        gpa_t gpa = gfn_to_gpa(gfn);
        u64 entry, level_state;
        u64 err;

        err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
        if (unlikely(tdx_operand_busy(err)))
                return -EBUSY;

        if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
                return -EIO;

        return 0;
}

static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
                                     enum pg_level level, u64 mirror_spte)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        kvm_pfn_t pfn = spte_to_pfn(mirror_spte);

        /* TODO: handle large pages. */
        if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
                return -EIO;

        WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
                     (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);

        /*
         * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
         * before kvm_tdx->state.  Userspace must not be allowed to pre-fault
         * arbitrary memory until the initial memory image is finalized.  Pairs
         * with the smp_wmb() in tdx_td_finalize().
         */
        smp_rmb();

        /*
         * If the TD isn't finalized/runnable, then userspace is initializing
         * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
         */
        if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
                return tdx_mem_page_add(kvm, gfn, level, pfn);

        return tdx_mem_page_aug(kvm, gfn, level, pfn);
}

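/*
 * Add a non-leaf S-EPT page table page via TDH.MEM.SEPT.ADD, mirroring the
 * non-leaf page KVM just linked into the mirror page table.
 */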
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
                                     enum pg_level level, void *private_spt)
{
        int tdx_level = pg_level_to_tdx_sept_level(level);
        gpa_t gpa = gfn_to_gpa(gfn);
        struct page *page = virt_to_page(private_spt);
        u64 err, entry, level_state;

        err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
                               &level_state);
        if (unlikely(tdx_operand_busy(err)))
                return -EBUSY;

        if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
                return -EIO;

        return 0;
}

/*
 * Ensure shared and private EPTs are flushed on all vCPUs.
 *
 * tdh_mem_track() is the only caller that increases the TD epoch. An increase
 * of the TD epoch (e.g., to value "N + 1") succeeds only if no vCPU is running
 * in guest mode with the value "N - 1".
 *
 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
 * has been increased to "N + 1".
 *
 * Kicking all vCPUs out of guest mode after that further ensures that no vCPU
 * can run in guest mode with TD epoch value "N", which unblocks the next
 * tdh_mem_track() (e.g. to increase the TD epoch to "N + 2").
 *
 * The TDX module flushes the EPT on the next TD enter and makes vCPUs run in
 * guest mode with TD epoch value "N + 1".
 *
 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
 * waiting for the empty IPI handler ack_kick().
 *
 * No action is required of the vCPUs being kicked out, since the kick always
 * happens after the TD epoch increment and before the next tdh_mem_track().
 */
static void tdx_track(struct kvm *kvm)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        u64 err;

        /* If the TD isn't finalized, no vCPU has run yet; nothing to track. */
        if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
                return;

        /*
         * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
         * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
         * tracking epoch hasn't completed.
         */
        lockdep_assert_held_write(&kvm->mmu_lock);

        err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
        TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);

        kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
                                     enum pg_level level, void *private_spt)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

        /*
         * free_external_spt() is called only after the HKID has been freed,
         * i.e. when the TD is being torn down.
         * KVM doesn't (yet) zap page table pages in the mirror page table
         * while the TD is active, though guest pages mapped in the mirror page
         * table can be zapped while the TD is active, e.g. for shared <->
         * private conversion and slot move/deletion.
         */
        if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
                return -EIO;

        /*
         * The HKID assigned to this TD was already freed and cache was
         * already flushed. We don't have to flush again.
         */
        return tdx_reclaim_page(virt_to_page(private_spt));
}

static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
                                         enum pg_level level, u64 mirror_spte)
{
        struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
        int tdx_level = pg_level_to_tdx_sept_level(level);
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        gpa_t gpa = gfn_to_gpa(gfn);
        u64 err, entry, level_state;

        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * HKID is released after all private pages have been removed, and set
         * before any might be populated. Warn if zapping is attempted when
         * there can't be anything populated in the private EPT.
         */
        if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
                return;

        /* TODO: handle large pages. */
        if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
                return;

        err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
                              tdx_level, &entry, &level_state);
        if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
                return;

        /*
         * TDX requires TLB tracking before dropping a private page.  Do it
         * here, although it is also done later.
         */
        tdx_track(kvm);

        /*
         * Private pages are zapped with mmu_lock held for write, so there is
         * no race with S-EPT operations on other vCPUs.  Races with
         * TDH.VP.ENTER (due to zero-step mitigation) and guest TDCALLs are
         * still possible.
         */
        err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
                              tdx_level, &entry, &level_state);
        if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
                return;

        err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
        if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
                return;

        tdx_quirk_reset_page(page);
}

void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
                           int trig_mode, int vector)
{
        struct kvm_vcpu *vcpu = apic->vcpu;
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        /* TDX supports only posted interrupts; there is no local APIC emulation. */
        __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);

        trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
}

static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
{
        u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
        u64 eq = vmx_get_exit_qual(vcpu);

        if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
                return false;

        return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
}

static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qual;
        gpa_t gpa = to_tdx(vcpu)->exit_gpa;
        bool local_retry = false;
        int ret;

        if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
                if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
                        pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
                                gpa, vcpu->vcpu_id);
                        kvm_vm_dead(vcpu->kvm);
                        return -EIO;
                }
                /*
                 * Always treat SEPT violations as write faults.  Ignore the
                 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
                 * TD private pages are always RWX in the SEPT tables,
                 * i.e. they're always mapped writable.  Just as importantly,
                 * treating SEPT violations as write faults is necessary to
                 * avoid COW allocations, which will cause TDAUGPAGE failures
                 * due to aliasing a single HPA to multiple GPAs.
                 */
                exit_qual = EPT_VIOLATION_ACC_WRITE;

                /* Only private GPA triggers zero-step mitigation */
                local_retry = true;
        } else {
                exit_qual = vmx_get_exit_qual(vcpu);
                /*
                 * EPT violation due to instruction fetch should never be
                 * triggered from shared memory in TDX guest.  If such EPT
                 * violation occurs, treat it as broken hardware.
                 */
                if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
                        return -EIO;
        }

        trace_kvm_page_fault(vcpu, gpa, exit_qual);

        /*
         * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
         * mapping in TDX.
         *
         * KVM may return RET_PF_RETRY for private GPA due to
         * - contentions when atomically updating SPTEs of the mirror page table
         * - in-progress GFN invalidation or memslot removal.
         * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
         *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
         *   or certain TDCALLs.
         *
         * If TDH.VP.ENTER is invoked more times than the threshold set by the
         * TDX module before KVM resolves the private GPA mapping, the TDX
         * module will activate zero-step mitigation during TDH.VP.ENTER. This
         * process acquires an SEPT tree lock in the TDX module, leading to
         * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
         * operations on other vCPUs.
         *
         * Breaking out of local retries for kvm_vcpu_has_events() is for
         * interrupt injection. kvm_vcpu_has_events() should not see pending
         * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
         * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
         * the guest even if the IRQ/NMI can't be delivered.
         *
         * Note: even without breaking out of local retries, zero-step
         * mitigation may still occur due to
         * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
         * - a single RIP causing EPT violations for more GFNs than the
         *   threshold count.
         * This is safe, as triggering zero-step mitigation only introduces
         * contentions to page installation SEAMCALLs on other vCPUs, which will
         * handle retries locally in their EPT violation handlers.
         */
        while (1) {
                struct kvm_memory_slot *slot;

                ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);

                if (ret != RET_PF_RETRY || !local_retry)
                        break;

                if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
                        break;

                if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
                        ret = -EIO;
                        break;
                }

                /*
                 * Bail if the memslot is invalid, i.e. is being deleted, as
                 * faulting in will never succeed and this task needs to drop
                 * SRCU in order to let memslot deletion complete.
                 */
                slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(gpa));
                if (slot && slot->flags & KVM_MEMSLOT_INVALID)
                        break;

                cond_resched();
        }
        return ret;
}

int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
        if (err) {
                tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
                return 1;
        }

        if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
                tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));

        return 1;
}


int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        u64 vp_enter_ret = tdx->vp_enter_ret;
        union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);

        if (fastpath != EXIT_FASTPATH_NONE)
                return 1;

        if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
                KVM_BUG_ON(1, vcpu->kvm);
                return -EIO;
        }

        /*
         * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
         * TDX_SEAMCALL_VMFAILINVALID.
         */
        if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
                KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
                goto unhandled_exit;
        }

        if (unlikely(tdx_failed_vmentry(vcpu))) {
                /*
                 * If the guest state is protected, that means off-TD debug is
                 * not enabled, TDX_NON_RECOVERABLE must be set.
                 */
                WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
                                !(vp_enter_ret & TDX_NON_RECOVERABLE));
                vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
                vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
                exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
                kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
                goto unhandled_exit;
        }

        WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
                     (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);

        switch (exit_reason.basic) {
        case EXIT_REASON_TRIPLE_FAULT:
                vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                vcpu->mmio_needed = 0;
                return 0;
        case EXIT_REASON_EXCEPTION_NMI:
                return tdx_handle_exception_nmi(vcpu);
        case EXIT_REASON_EXTERNAL_INTERRUPT:
                ++vcpu->stat.irq_exits;
                return 1;
        case EXIT_REASON_CPUID:
                return tdx_emulate_cpuid(vcpu);
        case EXIT_REASON_HLT:
                return kvm_emulate_halt_noskip(vcpu);
        case EXIT_REASON_TDCALL:
                return handle_tdvmcall(vcpu);
        case EXIT_REASON_VMCALL:
                return tdx_emulate_vmcall(vcpu);
        case EXIT_REASON_IO_INSTRUCTION:
                return tdx_emulate_io(vcpu);
        case EXIT_REASON_MSR_READ:
                kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
                return kvm_emulate_rdmsr(vcpu);
        case EXIT_REASON_MSR_WRITE:
                kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
                kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
                kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
                return kvm_emulate_wrmsr(vcpu);
        case EXIT_REASON_EPT_MISCONFIG:
                return tdx_emulate_mmio(vcpu);
        case EXIT_REASON_EPT_VIOLATION:
                return tdx_handle_ept_violation(vcpu);
        case EXIT_REASON_OTHER_SMI:
                /*
                 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
                 * TD guest vCPU is running) will cause VM exit to TDX module,
                 * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
                 * and handled by kernel handler right away.
                 *
                 * The Other SMI exit can also be caused by a SEAM non-root
                 * machine check delivered via a Machine Check System Management
                 * Interrupt (MSMI), but that has already been handled by the
                 * kernel machine check handler, i.e., the memory page has been
                 * marked as poisoned and won't be returned to the free list
                 * when the TDX guest is terminated (the TDX module marks the
                 * guest as dead and prevents it from running further when a
                 * machine check happens in SEAM non-root mode).
                 *
                 * - An MSMI will not reach here; it's handled as a
                 *   non-recoverable case above.
                 * - If it's not an MSMI, there is nothing to do here.
                 */
                return 1;
        default:
                break;
        }

unhandled_exit:
        kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
        return 0;
}

void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
                u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);

        *reason = tdx->vt.exit_reason.full;
        if (*reason != -1u) {
                *info1 = vmx_get_exit_qual(vcpu);
                *info2 = tdx->ext_exit_qualification;
                *intr_info = vmx_get_intr_info(vcpu);
        } else {
                *info1 = 0;
                *info2 = 0;
                *intr_info = 0;
        }

        *error_code = 0;
}

bool tdx_has_emulated_msr(u32 index)
{
        switch (index) {
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_ARCH_CAPABILITIES:
        case MSR_IA32_POWER_CTL:
        case MSR_IA32_CR_PAT:
        case MSR_MTRRcap:
        case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
        case MSR_MTRRdefType:
        case MSR_IA32_TSC_DEADLINE:
        case MSR_IA32_MISC_ENABLE:
        case MSR_PLATFORM_INFO:
        case MSR_MISC_FEATURES_ENABLES:
        case MSR_IA32_APICBASE:
        case MSR_EFER:
        case MSR_IA32_FEAT_CTL:
        case MSR_IA32_MCG_CAP:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_EXT_CTL:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
        case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
                /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
        case MSR_KVM_POLL_CONTROL:
                return true;
        case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
                /*
                 * x2APIC registers that are virtualized by the CPU can't be
                 * emulated, KVM doesn't have access to the virtual APIC page.
                 */
                switch (index) {
                case X2APIC_MSR(APIC_TASKPRI):
                case X2APIC_MSR(APIC_PROCPRI):
                case X2APIC_MSR(APIC_EOI):
                case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
                case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
                case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
                        return false;
                default:
                        return true;
                }
        default:
                return false;
        }
}

static bool tdx_is_read_only_msr(u32 index)
{
        return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
                index == MSR_IA32_FEAT_CTL;
}

int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
        switch (msr->index) {
        case MSR_IA32_FEAT_CTL:
                /*
                 * MCE and MCA are advertised via cpuid. Guest kernel could
                 * check if LMCE is enabled or not.
                 */
                msr->data = FEAT_CTL_LOCKED;
                if (vcpu->arch.mcg_cap & MCG_LMCE_P)
                        msr->data |= FEAT_CTL_LMCE_ENABLED;
                return 0;
        case MSR_IA32_MCG_EXT_CTL:
                if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
                        return 1;
                msr->data = vcpu->arch.mcg_ext_ctl;
                return 0;
        default:
                if (!tdx_has_emulated_msr(msr->index))
                        return 1;

                return kvm_get_msr_common(vcpu, msr);
        }
}

int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
        switch (msr->index) {
        case MSR_IA32_MCG_EXT_CTL:
                if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
                    (msr->data & ~MCG_EXT_CTL_LMCE_EN))
                        return 1;
                vcpu->arch.mcg_ext_ctl = msr->data;
                return 0;
        default:
                if (tdx_is_read_only_msr(msr->index))
                        return 1;

                if (!tdx_has_emulated_msr(msr->index))
                        return 1;

                return kvm_set_msr_common(vcpu, msr);
        }
}

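/*
 * Handle KVM_TDX_CAPABILITIES: report the TD configuration supported by the
 * TDX module (attributes, XFAM, directly configurable CPUID leaves) to
 * userspace.  Fails with -E2BIG if the user buffer has fewer CPUID entries
 * than the module provides.
 */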
static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
{
        const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
        struct kvm_tdx_capabilities __user *user_caps;
        struct kvm_tdx_capabilities *caps = NULL;
        u32 nr_user_entries;
        int ret = 0;

        /* flags is reserved for future use */
        if (cmd->flags)
                return -EINVAL;

        user_caps = u64_to_user_ptr(cmd->data);
        if (get_user(nr_user_entries, &user_caps->cpuid.nent))
                return -EFAULT;

        if (nr_user_entries < td_conf->num_cpuid_config)
                return -E2BIG;

        caps = kzalloc_flex(*caps, cpuid.entries, td_conf->num_cpuid_config);
        if (!caps)
                return -ENOMEM;

        ret = init_kvm_tdx_caps(td_conf, caps);
        if (ret)
                goto out;

        if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
                                                      caps->cpuid.nent))) {
                ret = -EFAULT;
                goto out;
        }

out:
        /* kfree() accepts NULL. */
        kfree(caps);
        return ret;
}

/*
 * KVM reports the guest physical address bits in CPUID.0x80000008.EAX[23:16],
 * which is similar to TDX's GPAW. Use this field as the interface for
 * userspace to configure the GPAW and EPT level for TDs.
 *
 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level 5,
 * value 48 means GPAW-48 and EPT level 4. GPAW-48 is always supported; value
 * 52 is only supported when the platform supports 5-level EPT.
 */
static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
                                        struct td_params *td_params)
{
        const struct kvm_cpuid_entry2 *entry;
        int guest_pa;

        entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
        if (!entry)
                return -EINVAL;

        guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);

        if (guest_pa != 48 && guest_pa != 52)
                return -EINVAL;

        if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
                return -EINVAL;

        td_params->eptp_controls = VMX_EPTP_MT_WB;
        if (guest_pa == 52) {
                td_params->eptp_controls |= VMX_EPTP_PWL_5;
                td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
        } else {
                td_params->eptp_controls |= VMX_EPTP_PWL_4;
        }

        return 0;
}

static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
                                 struct td_params *td_params)
{
        const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
        const struct kvm_cpuid_entry2 *entry;
        struct tdx_cpuid_value *value;
        int i, copy_cnt = 0;

        /*
         * td_params.cpuid_values: the number and order of the entries must
         * match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
         * td_params is assumed to have been zeroed.
         */
        for (i = 0; i < td_conf->num_cpuid_config; i++) {
                struct kvm_cpuid_entry2 tmp;

                td_init_cpuid_entry2(&tmp, i);

                entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
                                              tmp.function, tmp.index);
                if (!entry)
                        continue;

                if (tdx_unsupported_cpuid(entry))
                        return -EINVAL;

                copy_cnt++;

                value = &td_params->cpuid_values[i];
                value->eax = entry->eax;
                value->ebx = entry->ebx;
                value->ecx = entry->ecx;
                value->edx = entry->edx;

                /*
                 * TDX module does not accept nonzero bits 16..23 for the
                 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
                 */
                if (tmp.function == 0x80000008)
                        value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
        }

        /*
         * Rely on the TDX module to reject invalid configuration, but it can't
         * check leaves that don't have a proper slot in td_params->cpuid_values
         * to stick them in.  So fail if there were entries that didn't get
         * copied to td_params.
         */
        if (copy_cnt != cpuid->nent)
                return -EINVAL;

        return 0;
}

static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
                        struct kvm_tdx_init_vm *init_vm)
{
        const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
        struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
        int ret;

        if (kvm->created_vcpus)
                return -EBUSY;

        if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
                return -EINVAL;

        if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
                return -EINVAL;

        td_params->max_vcpus = kvm->max_vcpus;
        td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
        td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;

        td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
        td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);

        ret = setup_tdparams_eptp_controls(cpuid, td_params);
        if (ret)
                return ret;

        ret = setup_tdparams_cpuids(cpuid, td_params);
        if (ret)
                return ret;

#define MEMCPY_SAME_SIZE(dst, src)                              \
        do {                                                    \
                BUILD_BUG_ON(sizeof(dst) != sizeof(src));       \
                memcpy((dst), (src), sizeof(dst));              \
        } while (0)

        MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
        MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
        MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);

        return 0;
}

static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
                         u64 *seamcall_err)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        cpumask_var_t packages;
        struct page **tdcs_pages = NULL;
        struct page *tdr_page;
        int ret, i;
        u64 err, rcx;

        *seamcall_err = 0;
        ret = tdx_guest_keyid_alloc();
        if (ret < 0)
                return ret;
        kvm_tdx->hkid = ret;
        kvm_tdx->misc_cg = get_current_misc_cg();
        ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
        if (ret)
                goto free_hkid;

        ret = -ENOMEM;

        atomic_inc(&nr_configured_hkid);

        tdr_page = alloc_page(GFP_KERNEL);
        if (!tdr_page)
                goto free_hkid;

        kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
        /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
        kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
        tdcs_pages = kzalloc_objs(*kvm_tdx->td.tdcs_pages,
                                  kvm_tdx->td.tdcs_nr_pages);
        if (!tdcs_pages)
                goto free_tdr;

        for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
                tdcs_pages[i] = alloc_page(GFP_KERNEL);
                if (!tdcs_pages[i])
                        goto free_tdcs;
        }

        if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
                goto free_tdcs;

        cpus_read_lock();

        /*
         * At least one CPU in each package must be online in order to program
         * the host key ID on all packages.  Verify that here.
         */
        for_each_present_cpu(i)
                cpumask_set_cpu(topology_physical_package_id(i), packages);
        for_each_online_cpu(i)
                cpumask_clear_cpu(topology_physical_package_id(i), packages);
        if (!cpumask_empty(packages)) {
                ret = -EIO;
                /*
                 * Because it's hard for a human operator to figure out the
                 * reason, emit a warning.
                 */
#define MSG_ALLPKG      "All packages need to have online CPU to create TD. Online CPU and retry.\n"
                pr_warn_ratelimited(MSG_ALLPKG);
                goto free_packages;
        }

        /*
         * TDH.MNG.CREATE tries to grab the global TDX module lock and fails
         * with TDX_OPERAND_BUSY when it can't.  Take the global lock to
         * prevent that failure.
         */
        mutex_lock(&tdx_lock);
        kvm_tdx->td.tdr_page = tdr_page;
        err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
        mutex_unlock(&tdx_lock);

        if (err == TDX_RND_NO_ENTROPY) {
                ret = -EAGAIN;
                goto free_packages;
        }

        if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
                ret = -EIO;
                goto free_packages;
        }

        for_each_online_cpu(i) {
                int pkg = topology_physical_package_id(i);

                if (cpumask_test_and_set_cpu(pkg, packages))
                        continue;

                /*
                 * Program the memory controller in the package with the
                 * encryption key associated with the TDX private host key ID
                 * assigned to this TDR.  Concurrent operations on the same
                 * memory controller result in TDX_OPERAND_BUSY.  No locking is
                 * needed beyond the cpus_read_lock() above as it serializes
                 * against hotplug and the first online CPU of the package is
                 * always used.  We never have two CPUs in the same socket
                 * trying to program the key.
                 */
                ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
                                      kvm_tdx, true);
                if (ret)
                        break;
        }
        cpus_read_unlock();
        free_cpumask_var(packages);
        if (ret) {
                i = 0;
                goto teardown;
        }

        kvm_tdx->td.tdcs_pages = tdcs_pages;
        for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
                err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
                if (err == TDX_RND_NO_ENTROPY) {
                        /* Here it's hard to allow userspace to retry. */
                        ret = -EAGAIN;
                        goto teardown;
                }
                if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
                        ret = -EIO;
                        goto teardown;
                }
        }

        err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
        if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
                /*
                 * Because the operands come from the user, don't warn.
                 * Return a hint to the user because it's sometimes hard to
                 * figure out which operand is invalid.  The SEAMCALL status
                 * code encodes which operand caused the invalid-operand error.
                 */
                *seamcall_err = err;
                ret = -EINVAL;
                goto teardown;
        } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
                ret = -EIO;
                goto teardown;
        }

        return 0;

        /*
         * The sequence for freeing resources from a partially initialized TD
         * varies based on where in the initialization flow failure occurred.
         * Simply use the full teardown and destroy, which naturally play nice
         * with partial initialization.
         */
teardown:
        /* Only free pages not yet added, so start at 'i' */
        for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
                if (tdcs_pages[i]) {
                        __free_page(tdcs_pages[i]);
                        tdcs_pages[i] = NULL;
                }
        }
        if (!kvm_tdx->td.tdcs_pages)
                kfree(tdcs_pages);

        tdx_mmu_release_hkid(kvm);
        tdx_reclaim_td_control_pages(kvm);

        return ret;

free_packages:
        cpus_read_unlock();
        free_cpumask_var(packages);

free_tdcs:
        for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
                if (tdcs_pages[i])
                        __free_page(tdcs_pages[i]);
        }
        kfree(tdcs_pages);
        kvm_tdx->td.tdcs_pages = NULL;

free_tdr:
        if (tdr_page)
                __free_page(tdr_page);
        kvm_tdx->td.tdr_page = NULL;

free_hkid:
        tdx_hkid_free(kvm_tdx);

        return ret;
}

static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
                                      u64 *data)
{
        u64 err;

        err = tdh_mng_rd(&tdx->td, field_id, data);

        return err;
}

#define TDX_MD_UNREADABLE_LEAF_MASK     GENMASK(30, 7)
#define TDX_MD_UNREADABLE_SUBLEAF_MASK  GENMASK(31, 7)

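/*
 * Read one virtualized CPUID leaf/subleaf from the TDX module via the
 * CPUID_VALUES TD metadata fields.  EBX:EAX and EDX:ECX are read as two
 * separate 64-bit metadata elements.
 */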
static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
                          bool sub_leaf_set, int *entry_index,
                          struct kvm_cpuid_entry2 *out)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
        u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
        u64 ebx_eax, edx_ecx;
        u64 err = 0;

        if (sub_leaf > 0b1111111)
                return -EINVAL;

        if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
                return -EINVAL;

        if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
            sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
                return -EINVAL;

        /*
         * bit 23:17, RESERVED: reserved, must be 0;
         * bit 16,    LEAF_31: leaf number bit 31;
         * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
         *                      implicitly 0;
         * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
         * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
         *                         the SUBLEAF_6_0 is all-1.
         *                         sub-leaf bits 31:7 are implicitly 0;
         * bit 0,     ELEMENT_I: Element index within field;
         */
        field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
        field_id |= (leaf & 0x7f) << 9;
        if (sub_leaf_set)
                field_id |= (sub_leaf & 0x7f) << 1;
        else
                field_id |= 0x1fe;

        err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
        if (err) //TODO check for specific errors
                goto err_out;

        out->eax = (u32) ebx_eax;
        out->ebx = (u32) (ebx_eax >> 32);

        field_id++;
        err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
        /*
         * It's weird that reading edx_ecx fails while reading ebx_eax
         * succeeded.
         */
        if (WARN_ON_ONCE(err))
                goto err_out;

        out->ecx = (u32) edx_ecx;
        out->edx = (u32) (edx_ecx >> 32);

        out->function = leaf;
        out->index = sub_leaf;
        out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;

        /*
         * Work around missing support on old TDX modules, fetch
         * guest maxpa from gfn_direct_bits.
         */
        if (leaf == 0x80000008) {
                gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
                unsigned int g_maxpa = __ffs(gpa_bits) + 1;

                out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
        }

        (*entry_index)++;

        return 0;

err_out:
        out->eax = 0;
        out->ebx = 0;
        out->ecx = 0;
        out->edx = 0;

        return -EIO;
}

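/*
 * TD-scoped state transitions (KVM_TDX_INIT_VM, KVM_TDX_FINALIZE_VM) and the
 * vCPU setup ioctls (KVM_TDX_INIT_VCPU, KVM_TDX_INIT_MEM_REGION) must be
 * mutually exclusive with vCPU creation and with all other vCPU ioctls.
 * Serialize them by taking kvm->lock, every vcpu->mutex, and kvm->slots_lock.
 */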
typedef void *tdx_vm_state_guard_t;

static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
{
        int r;

        mutex_lock(&kvm->lock);

        if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
                r = -EBUSY;
                goto out_err;
        }

        r = kvm_lock_all_vcpus(kvm);
        if (r)
                goto out_err;

        /*
         * Note the unintuitive ordering!  vcpu->mutex must be taken outside
         * kvm->slots_lock!
         */
        mutex_lock(&kvm->slots_lock);
        return kvm;

out_err:
        mutex_unlock(&kvm->lock);
        return ERR_PTR(r);
}

static void tdx_release_vm_state_locks(struct kvm *kvm)
{
        mutex_unlock(&kvm->slots_lock);
        kvm_unlock_all_vcpus(kvm);
        mutex_unlock(&kvm->lock);
}

DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
             if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
             tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);

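/*
 * Handle KVM_TDX_INIT_VM: validate the userspace-provided attributes, XFAM and
 * CPUID configuration, translate them into TD_PARAMS, and create and
 * initialize the TD via the TDH.MNG.* sequence in __tdx_td_init().
 */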
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
        struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        struct kvm_tdx_init_vm *init_vm;
        struct td_params *td_params = NULL;
        u32 nr_user_entries;
        int ret;

        BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
        BUILD_BUG_ON(sizeof(struct td_params) != 1024);

        if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
                return -EINVAL;

        if (cmd->flags)
                return -EINVAL;

        if (get_user(nr_user_entries, &user_data->cpuid.nent))
                return -EFAULT;

        if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
                return -E2BIG;

        init_vm = memdup_user(user_data,
                              struct_size(user_data, cpuid.entries, nr_user_entries));
        if (IS_ERR(init_vm))
                return PTR_ERR(init_vm);

        if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
                ret = -EINVAL;
                goto out;
        }

        if (init_vm->cpuid.padding) {
                ret = -EINVAL;
                goto out;
        }

        td_params = kzalloc_obj(struct td_params);
        if (!td_params) {
                ret = -ENOMEM;
                goto out;
        }

        ret = setup_tdparams(kvm, td_params, init_vm);
        if (ret)
                goto out;

        ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
        if (ret)
                goto out;

        kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
        kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
        kvm_tdx->attributes = td_params->attributes;
        kvm_tdx->xfam = td_params->xfam;

        if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
                kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
        else
                kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;

        kvm_tdx->state = TD_STATE_INITIALIZED;
out:
        /* kfree() accepts NULL. */
        kfree(init_vm);
        kfree(td_params);

        return ret;
}

void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
        /*
         * flush_tlb_current() is invoked the first time the vCPU runs or when
         * the root of the shared EPT is invalidated.
         * KVM only needs to flush the shared EPT because the TDX module handles
         * TLB invalidation for the private EPT in tdh_vp_enter().
         *
         * A single context invalidation for shared EPT can be performed here.
         * However, this single context invalidation requires the private EPTP
         * rather than the shared EPTP to flush shared EPT, as shared EPT uses
         * private EPTP as its ASID for TLB invalidation.
         *
         * To avoid reading back private EPTP, perform a global invalidation for
         * shared EPT instead to keep this function simple.
         */
        ept_sync_global();
}

void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
        /*
         * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
         * ensure that private EPT will be flushed on the next TD enter. No need
         * to call tdx_track() here again even when this callback is a result of
         * zapping private EPT.
         *
         * Due to the lack of the context to determine which EPT has been
         * affected by zapping, invoke invept() directly here for both shared
         * EPT and private EPT for simplicity, though it's not necessary for
         * private EPT.
         */
        ept_sync_global();
}

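/*
 * Handle KVM_TDX_FINALIZE_VM: TDH.MR.FINALIZE completes the TD's initial
 * measurement.  Afterwards the TD is runnable, pre-faulting is allowed, and no
 * more pages can be added to the initial image.
 */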
static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

        if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
                return -EINVAL;

        cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
        if (tdx_operand_busy(cmd->hw_error))
                return -EBUSY;
        if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
                return -EIO;

        kvm_tdx->state = TD_STATE_RUNNABLE;
        /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
        smp_wmb();
        kvm->arch.pre_fault_allowed = true;
        return 0;
}

static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
{
        if (copy_from_user(cmd, argp, sizeof(*cmd)))
                return -EFAULT;

        /*
         * Userspace should never set hw_error.  KVM writes hw_error to report
         * hardware-defined error back to userspace.
         */
        if (cmd->hw_error)
                return -EINVAL;

        return 0;
}

int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
        struct kvm_tdx_cmd tdx_cmd;
        int r;

        r = tdx_get_cmd(argp, &tdx_cmd);
        if (r)
                return r;

        if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
                return tdx_get_capabilities(&tdx_cmd);

        CLASS(tdx_vm_state_guard, guard)(kvm);
        if (IS_ERR(guard))
                return PTR_ERR(guard);

        switch (tdx_cmd.id) {
        case KVM_TDX_INIT_VM:
                r = tdx_td_init(kvm, &tdx_cmd);
                break;
        case KVM_TDX_FINALIZE_VM:
                r = tdx_td_finalize(kvm, &tdx_cmd);
                break;
        default:
                return -EINVAL;
        }

        if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
                return -EFAULT;

        return r;
}

/* The VMM can pass 64 bits of auxiliary data to the vCPU via RCX for the guest BIOS. */
static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        struct page *page;
        int ret, i;
        u64 err;

        page = alloc_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;
        tdx->vp.tdvpr_page = page;

        /*
         * page_to_phys() does not work in 'noinstr' code, like guest
         * entry via tdh_vp_enter(). Precalculate and store it instead
         * of doing it at runtime later.
         */
        tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);

        tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
                                     GFP_KERNEL);
        if (!tdx->vp.tdcx_pages) {
                ret = -ENOMEM;
                goto free_tdvpr;
        }

        for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
                page = alloc_page(GFP_KERNEL);
                if (!page) {
                        ret = -ENOMEM;
                        goto free_tdcx;
                }
                tdx->vp.tdcx_pages[i] = page;
        }

        err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
        if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
                ret = -EIO;
                goto free_tdcx;
        }

        for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
                err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
                if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
                        /*
                         * Pages already added are reclaimed by the vcpu_free
                         * method, but the rest are freed here.
                         */
                        for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
                                __free_page(tdx->vp.tdcx_pages[i]);
                                tdx->vp.tdcx_pages[i] = NULL;
                        }
                        return -EIO;
                }
        }

        /*
         * tdh_vp_init() can take an exclusive lock of the TDR resource inside
         * the TDX-Module.  The TDR resource is also taken as shared in several
         * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
         * (TDX-Module locks are try-lock implementations with no slow path).
         * Take mmu_lock for write to reflect the nature of the lock taken by
         * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
         * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
         */
        scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
                err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
                if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
                        return -EIO;
        }

        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

        return 0;

free_tdcx:
        for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
                if (tdx->vp.tdcx_pages[i])
                        __free_page(tdx->vp.tdcx_pages[i]);
                tdx->vp.tdcx_pages[i] = NULL;
        }
        kfree(tdx->vp.tdcx_pages);
        tdx->vp.tdcx_pages = NULL;

free_tdvpr:
        if (tdx->vp.tdvpr_page)
                __free_page(tdx->vp.tdvpr_page);
        tdx->vp.tdvpr_page = NULL;
        tdx->vp.tdvpr_pa = 0;

        return ret;
}

/*
 * May read multiple subleaves.  Returns nonzero on failure; the number of
 * entries written is tracked via @entry_index.
 */
static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
                                   struct kvm_cpuid_entry2 *output_e)
{
        int sub_leaf = 0;
        int ret;

        /* First try without a subleaf */
        ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);

        /* If success, or invalid leaf, just give up */
        if (ret != -EIO)
                return ret;

        /*
         * If the try without a subleaf failed, try reading subleafs until
         * failure. The TDX module only supports 6 bits of subleaf index.
         */
        while (1) {
                /* Keep reading subleafs until there is a failure. */
                if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
                        return !sub_leaf;

                sub_leaf++;
                output_e++;
        }

        return 0;
}

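/*
 * Handle KVM_TDX_GET_CPUID: dump the CPUID values virtualized by the TDX
 * module for this vCPU into the userspace kvm_cpuid2 array, walking both the
 * basic and extended leaf ranges.
 */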
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
        struct kvm_cpuid2 __user *output;
        struct kvm_cpuid2 *td_cpuid;
        int r = 0, i = 0, leaf;
        u32 level;

        output = u64_to_user_ptr(cmd->data);
        td_cpuid = kzalloc(sizeof(*td_cpuid) +
                        sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
                        GFP_KERNEL);
        if (!td_cpuid)
                return -ENOMEM;

        if (copy_from_user(td_cpuid, output, sizeof(*output))) {
                r = -EFAULT;
                goto out;
        }

        /* Read max CPUID for normal range */
        if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
                r = -EIO;
                goto out;
        }
        level = td_cpuid->entries[0].eax;

        for (leaf = 1; leaf <= level; leaf++)
                tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);

        /* Read max CPUID for extended range */
        if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
                r = -EIO;
                goto out;
        }
        level = td_cpuid->entries[i - 1].eax;

        for (leaf = 0x80000001; leaf <= level; leaf++)
                tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);

        if (td_cpuid->nent < i)
                r = -E2BIG;
        td_cpuid->nent = i;

        if (copy_to_user(output, td_cpuid, sizeof(*output))) {
                r = -EFAULT;
                goto out;
        }

        if (r == -E2BIG)
                goto out;

        if (copy_to_user(output->entries, td_cpuid->entries,
                         td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                r = -EFAULT;

out:
        kfree(td_cpuid);

        return r;
}

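/*
 * Handle KVM_TDX_INIT_VCPU: force the x2APIC APIC base, create the TDX vCPU
 * control structures (TDVPR/TDCX pages), and configure posted interrupts.
 */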
static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
        u64 apic_base;
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        int ret;

        if (cmd->flags)
                return -EINVAL;

        if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
                return -EINVAL;

        /*
         * TDX requires X2APIC, userspace is responsible for configuring guest
         * CPUID accordingly.
         */
        apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
                (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
        if (kvm_apic_set_base(vcpu, apic_base, true))
                return -EINVAL;

        ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
        if (ret)
                return ret;

        td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
        td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
        td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);

        tdx->state = VCPU_TD_STATE_INITIALIZED;

        return 0;
}

void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        /*
         * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
         * INIT events.
         *
         * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
         * userspace needs to define the vCPU model before KVM can initialize
         * vCPU state, e.g. to enable x2APIC.
         */
        WARN_ON_ONCE(init_event);
}

struct tdx_gmem_post_populate_arg {
        struct kvm_vcpu *vcpu;
        __u32 flags;
};

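/*
 * Callback from kvm_gmem_populate(), with slots_lock and the guest_memfd
 * invalidate lock held.  Stash the source page so tdx_mem_page_add() can copy
 * from it when the private PFN is mapped, then optionally extend the TD
 * measurement over the new page via TDH.MR.EXTEND.
 */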
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                  struct page *src_page, void *_arg)
{
        struct tdx_gmem_post_populate_arg *arg = _arg;
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        u64 err, entry, level_state;
        gpa_t gpa = gfn_to_gpa(gfn);
        int ret, i;

        if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
                return -EIO;

        if (!src_page)
                return -EOPNOTSUPP;

        kvm_tdx->page_add_src = src_page;
        ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
        kvm_tdx->page_add_src = NULL;

        if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
                return ret;

        /*
         * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
         * between mapping the pfn and now, but slots_lock prevents memslot
         * updates, filemap_invalidate_lock() prevents guest_memfd updates,
         * mmu_notifier events can't reach S-EPT entries, and KVM's internal
         * zapping flows are mutually exclusive with S-EPT mappings.
         */
        for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
                err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
                if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
                        return -EIO;
        }

        return 0;
}

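/*
 * Handle KVM_TDX_INIT_MEM_REGION: populate the TD's initial memory image one
 * page at a time via kvm_gmem_populate().  Progress is copied back to
 * userspace so the ioctl can be restarted if interrupted by a signal.
 */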
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
        struct vcpu_tdx *tdx = to_tdx(vcpu);
        struct kvm *kvm = vcpu->kvm;
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        struct kvm_tdx_init_mem_region region;
        struct tdx_gmem_post_populate_arg arg;
        long gmem_ret;
        int ret;

        if (tdx->state != VCPU_TD_STATE_INITIALIZED)
                return -EINVAL;

        /* Once TD is finalized, the initial guest memory is fixed. */
        if (kvm_tdx->state == TD_STATE_RUNNABLE)
                return -EINVAL;

        if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
                return -EINVAL;

        if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
                return -EFAULT;

        if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
            !region.nr_pages ||
            region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
            !vt_is_tdx_private_gpa(kvm, region.gpa) ||
            !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
                return -EINVAL;

        ret = 0;
        while (region.nr_pages) {
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                arg = (struct tdx_gmem_post_populate_arg) {
                        .vcpu = vcpu,
                        .flags = cmd->flags,
                };
                gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
                                             u64_to_user_ptr(region.source_addr),
                                             1, tdx_gmem_post_populate, &arg);
                if (gmem_ret < 0) {
                        ret = gmem_ret;
                        break;
                }

                if (gmem_ret != 1) {
                        ret = -EIO;
                        break;
                }

                region.source_addr += PAGE_SIZE;
                region.gpa += PAGE_SIZE;
                region.nr_pages--;

                cond_resched();
        }

        if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
                ret = -EFAULT;
        return ret;
}

int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        struct kvm_tdx_cmd cmd;
        int r;

        r = tdx_get_cmd(argp, &cmd);
        if (r)
                return r;

        CLASS(tdx_vm_state_guard, guard)(kvm);
        if (IS_ERR(guard))
                return PTR_ERR(guard);

        if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
                return -EINVAL;

        vcpu_load(vcpu);

        switch (cmd.id) {
        case KVM_TDX_INIT_MEM_REGION:
                r = tdx_vcpu_init_mem_region(vcpu, &cmd);
                break;
        case KVM_TDX_INIT_VCPU:
                r = tdx_vcpu_init(vcpu, &cmd);
                break;
        default:
                r = -ENOIOCTLCMD;
                break;
        }

        vcpu_put(vcpu);

        return r;
}

int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
        struct kvm_tdx_cmd cmd;
        int ret;

        if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
                return -EINVAL;

        ret = tdx_get_cmd(argp, &cmd);
        if (ret)
                return ret;

        switch (cmd.id) {
        case KVM_TDX_GET_CPUID:
                ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

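/*
 * Private GPAs are currently restricted to 4KB mappings.  Returning 0 for
 * shared GPAs imposes no TDX-specific limit on the mapping level.
 */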
int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
        if (!is_private)
                return 0;

        return PG_LEVEL_4K;
}

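/*
 * cpuhp online callback: do the per-cpu TDX initialization so that the CPU
 * is ready to make SEAMCALLs.  tdx_cpu_enable() must be called with IRQs
 * disabled.
 */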
static int tdx_online_cpu(unsigned int cpu)
{
        unsigned long flags;
        int r;

        /* Sanity check CPU is already in post-VMXON */
        WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));

        local_irq_save(flags);
        r = tdx_cpu_enable();
        local_irq_restore(flags);

        return r;
}

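/*
 * cpuhp offline callback: refuse to offline the last online CPU of a package
 * while any TDX HKID is configured (see below).
 */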
static int tdx_offline_cpu(unsigned int cpu)
{
        int i;

        /* No TD is running.  Allow any cpu to be offlined. */
        if (!atomic_read(&nr_configured_hkid))
                return 0;

        /*
         * In order to reclaim a TDX HKID (i.e. when deleting a guest TD),
         * TDH.PHYMEM.PAGE.WBINVD needs to be called on all packages to
         * program all memory controllers with PCONFIG.  If there are active
         * TDX HKIDs, refuse to offline the last online cpu of a package.
         */
        for_each_online_cpu(i) {
                /*
                 * Found another online cpu on the same package.
                 * Allow the offline.
                 */
                if (i != cpu && topology_physical_package_id(i) ==
                                topology_physical_package_id(cpu))
                        return 0;
        }

        /*
         * This is the last online cpu of this package.  Don't offline it.
         *
         * Warn the user, as the reason would otherwise be hard for a human
         * operator to figure out.
         */
#define MSG_ALLPKG_ONLINE \
        "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
        pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
        return -EBUSY;
}

static void __do_tdx_cleanup(void)
{
        /*
         * Once the TDX module is initialized, it cannot be disabled or
         * re-initialized without a runtime update (which the kernel doesn't
         * support).  Only the cpuhp state needs to be removed here.  The TDX
         * host core code tracks the TDX status and can handle the 'multiple
         * enabling' scenario.
         */
        WARN_ON_ONCE(!tdx_cpuhp_state);
        cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
        tdx_cpuhp_state = 0;
}

static void __tdx_cleanup(void)
{
        cpus_read_lock();
        __do_tdx_cleanup();
        cpus_read_unlock();
}

static int __init __do_tdx_bringup(void)
{
        int r;

        /*
         * Register a TDX-specific cpuhp callback to call tdx_cpu_enable() on
         * all online CPUs before calling tdx_enable(), and on any CPU going
         * online later, to make sure it is ready to run TDX guests.
         */
        r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
                                         "kvm/cpu/tdx:online",
                                         tdx_online_cpu, tdx_offline_cpu);
        if (r < 0)
                return r;

        tdx_cpuhp_state = r;

        r = tdx_enable();
        if (r)
                __do_tdx_cleanup();

        return r;
}

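/*
 * Bring up the TDX module and check that its reported capabilities are
 * usable by KVM: supported attributes and XFAM, CPU topology enumeration,
 * and a MAX_VCPU_PER_TD covering all present logical CPUs.
 */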
static int __init __tdx_bringup(void)
{
        const struct tdx_sys_info_td_conf *td_conf;
        int r, i;

        for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
                /*
                 * Check if MSRs (tdx_uret_msrs) can be saved/restored
                 * before returning to user space.
                 */
                tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
                if (tdx_uret_msrs[i].slot == -1) {
                        /* If any MSR isn't supported, it is a KVM bug */
                        pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
                                tdx_uret_msrs[i].msr);
                        return -EIO;
                }
        }

        /*
         * Enabling TDX requires enabling hardware virtualization first, as
         * making SEAMCALLs requires the CPU to be in post-VMXON state.
         */
        r = kvm_enable_virtualization();
        if (r)
                return r;

        cpus_read_lock();
        r = __do_tdx_bringup();
        cpus_read_unlock();

        if (r)
                goto tdx_bringup_err;

        r = -EINVAL;
        /* Get TDX global information for later use */
        tdx_sysinfo = tdx_get_sysinfo();
        if (WARN_ON_ONCE(!tdx_sysinfo))
                goto get_sysinfo_err;

        /* Check TDX module and KVM capabilities */
        if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
            !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
                goto get_sysinfo_err;

        if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
                goto get_sysinfo_err;

        /*
         * TDX has its own limit on the maximum number of vCPUs it can
         * support for all TDX guests, in addition to KVM_MAX_VCPUS.
         * Userspace needs to query a TDX guest's maximum vCPUs by checking
         * the KVM_CAP_MAX_VCPUS extension on a per-VM basis.
         *
         * The TDX module reports this limit via the MAX_VCPU_PER_TD global
         * metadata.  Different modules may report different values.  Some
         * old modules may also not support this metadata (in which case the
         * limit is U16_MAX).
         *
         * In practice, the reported value reflects the maximum number of
         * logical CPUs that any platform the module supports can possibly
         * have.
         *
         * Simply forwarding MAX_VCPU_PER_TD to userspace could result in an
         * unpredictable ABI.  KVM instead always advertises the number of
         * logical CPUs the platform has as the maximum number of vCPUs for
         * TDX guests.
         *
         * Make sure the MAX_VCPU_PER_TD reported by the TDX module is not
         * smaller than the number of logical CPUs, otherwise KVM would
         * report an unsupported value to userspace.
         *
         * Note, a platform with TDX enabled in the BIOS cannot support
         * physical CPU hotplug, and TDX requires the BIOS to have marked all
         * logical CPUs in the MADT table as enabled.  Just use
         * num_present_cpus() for the number of logical CPUs.
         */
        td_conf = &tdx_sysinfo->td_conf;
        if (td_conf->max_vcpus_per_td < num_present_cpus()) {
                pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
                                td_conf->max_vcpus_per_td, num_present_cpus());
                goto get_sysinfo_err;
        }

        if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids()))
                goto get_sysinfo_err;

        /*
         * Leave hardware virtualization enabled after TDX is enabled
         * successfully.  TDX CPU hotplug depends on this.
         */
        return 0;

get_sysinfo_err:
        __tdx_cleanup();
tdx_bringup_err:
        kvm_disable_virtualization();
        return r;
}

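/*
 * Undo tdx_bringup(): reset the TDX misc cgroup capacity, remove the cpuhp
 * state and drop the virtualization-enabled reference taken at bringup.
 */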
void tdx_cleanup(void)
{
        if (enable_tdx) {
                misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
                __tdx_cleanup();
                kvm_disable_virtualization();
        }
}

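/*
 * Check the KVM and platform prerequisites for TDX and, if they are all met,
 * bring up the TDX module.  A missing prerequisite or an unloaded TDX module
 * only disables TDX; KVM itself still loads successfully.
 */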
int __init tdx_bringup(void)
{
        int r, i;

        /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
        for_each_possible_cpu(i)
                INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));

        if (!enable_tdx)
                return 0;

        if (!enable_ept) {
                pr_err("EPT is required for TDX\n");
                goto success_disable_tdx;
        }

        if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
                pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
                goto success_disable_tdx;
        }

        if (!enable_apicv) {
                pr_err("APICv is required for TDX\n");
                goto success_disable_tdx;
        }

        if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
                pr_err("tdx: OSXSAVE is required for TDX\n");
                goto success_disable_tdx;
        }

        if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
                pr_err("tdx: MOVDIR64B is required for TDX\n");
                goto success_disable_tdx;
        }

        if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
                pr_err("Self-snoop is required for TDX\n");
                goto success_disable_tdx;
        }

        if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
                pr_err("tdx: no TDX private KeyIDs available\n");
                goto success_disable_tdx;
        }

        if (!enable_virt_at_load) {
                pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
                goto success_disable_tdx;
        }

        /*
         * Ideally KVM should probe whether the TDX module has been loaded
         * first and then try to bring it up.  But TDX needs to use SEAMCALL
         * to probe whether the module is loaded (there is no CPUID or MSR
         * for that), and making a SEAMCALL requires enabling virtualization
         * first, just like the rest of the steps of bringing up the TDX
         * module.
         *
         * So, for simplicity do everything in __tdx_bringup(); the first
         * SEAMCALL will return -ENODEV when the module is not loaded.  The
         * only complication is having to make sure that initialization
         * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
         * cases.
         */
        r = __tdx_bringup();
        if (r) {
                /*
                 * If the TDX module could not be loaded, only disable TDX;
                 * don't fail loading the KVM module.  No need to print a
                 * message saying "module is not loaded" because one was
                 * printed when the first SEAMCALL failed.  Don't bother
                 * unwinding the S-EPT hooks or vm_size, as kvm_x86_ops have
                 * already been finalized (and are intentionally not
                 * exported).  The S-EPT code is unreachable, and allocating
                 * a few more bytes per VM in a should-be-rare failure
                 * scenario is a non-issue.
                 */
                if (r == -ENODEV)
                        goto success_disable_tdx;

                enable_tdx = 0;
        }

        return r;

success_disable_tdx:
        enable_tdx = 0;
        return 0;
}

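/*
 * Wire the TDX-specific pieces into vt_x86_ops: the VM size and the hooks
 * for managing the private S-EPT and for protected-APIC interrupt checks.
 */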
void __init tdx_hardware_setup(void)
{
        KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);

        /*
         * Note, if the TDX module can't be loaded, KVM TDX support will be
         * disabled but KVM will continue loading (see tdx_bringup()).
         */
        vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));

        vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
        vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
        vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
        vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
        vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}