// SPDX-License-Identifier: GPL-2.0
/*  Copyright(c) 2021 Intel Corporation. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/msr.h>
#include <asm/sgx.h>

#include "x86.h"
#include "kvm_cache_regs.h"
#include "nested.h"
#include "sgx.h"
#include "vmx.h"

bool __read_mostly enable_sgx = 1;
module_param_named(sgx, enable_sgx, bool, 0444);

/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */
static u64 sgx_pubkey_hash[4] __ro_after_init;

/*
 * ENCLS's memory operands use a fixed segment (DS) and a fixed
 * address size based on the mode.  Related prefixes are ignored.
 */
static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
                             int size, int alignment, gva_t *gva)
{
        struct kvm_segment s;
        bool fault;

        /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
        *gva = offset;
        if (!is_64_bit_mode(vcpu)) {
                vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
                *gva += s.base;
        }

        if (!IS_ALIGNED(*gva, alignment)) {
                fault = true;
        } else if (likely(is_64_bit_mode(vcpu))) {
                *gva = vmx_get_untagged_addr(vcpu, *gva, 0);
                fault = is_noncanonical_address(*gva, vcpu, 0);
        } else {
                *gva &= 0xffffffff;
                fault = (s.unusable) ||
                        (s.type != 2 && s.type != 3) ||
                        (*gva > s.limit) ||
                        ((s.base != 0 || s.limit != 0xffffffff) &&
                        (((u64)*gva + size - 1) > s.limit + 1));
        }
        if (fault)
                kvm_inject_gp(vcpu, 0);
        return fault ? -EINVAL : 0;
}

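/*
 * Exit to userspace on an emulation failure, stashing the faulting address
 * and access size in the exit info so userspace can see what access failed.
 */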
static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
                                         unsigned int size)
{
        uint64_t data[2] = { addr, size };

        __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data));
}

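/*
 * Read guest data through a host userspace address.  A failure here means
 * the HVA backing guest memory is bad, so punt to userspace rather than
 * injecting a fault into the guest.
 */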
static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
                        unsigned int size)
{
        if (__copy_from_user(data, (void __user *)hva, size)) {
                sgx_handle_emulation_failure(vcpu, hva, size);
                return -EFAULT;
        }

        return 0;
}

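/*
 * Translate a guest virtual address to a guest physical address by walking
 * the guest's page tables.  Inject an emulated #PF and return -EFAULT if the
 * walk fails.
 */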
static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write,
                          gpa_t *gpa)
{
        struct x86_exception ex;

        if (write)
                *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex);
        else
                *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex);

        if (*gpa == INVALID_GPA) {
                kvm_inject_emulated_page_fault(vcpu, &ex);
                return -EFAULT;
        }

        return 0;
}

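/*
 * Translate a guest physical address to a host virtual address via the
 * memslots, preserving the offset within the page.  Exit to userspace if the
 * GPA isn't covered by a valid memslot.
 */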
static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva)
{
        *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa));
        if (kvm_is_error_hva(*hva)) {
                sgx_handle_emulation_failure(vcpu, gpa, 1);
                return -EFAULT;
        }

        *hva |= gpa & ~PAGE_MASK;

        return 0;
}

static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
{
        struct x86_exception ex;

        /*
         * A non-EPCM #PF indicates a bad userspace HVA.  This *should* check
         * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC,
         * but the error code isn't (yet) plumbed through the ENCLS helpers.
         */
        if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
                kvm_prepare_emulation_failure_exit(vcpu);
                return 0;
        }

        /*
         * If the guest thinks it's running on SGX2 hardware, inject an SGX
         * #PF if the fault matches an EPCM fault signature (#GP on SGX1,
         * #PF on SGX2).  The assumption is that EPCM faults are much more
         * likely than a bad userspace address.
         */
        if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
            guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) {
                memset(&ex, 0, sizeof(ex));
                ex.vector = PF_VECTOR;
                ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
                                PFERR_SGX_MASK;
                ex.address = gva;
                ex.error_code_valid = true;
                ex.nested_page_fault = false;
                kvm_inject_emulated_page_fault(vcpu, &ex);
        } else {
                kvm_inject_gp(vcpu, 0);
        }
        return 1;
}

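/*
 * Emulate ENCLS[ECREATE] for the guest: validate the SECS against the guest's
 * CPUID-enumerated MISCSELECT, ATTRIBUTES, XFRM and enclave size limits, then
 * execute ECREATE in the host on the guest's behalf.
 */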
static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
                                  struct sgx_pageinfo *pageinfo,
                                  unsigned long secs_hva,
                                  gva_t secs_gva)
{
        struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents;
        struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1;
        u64 attributes, xfrm, size;
        u32 miscselect;
        u8 max_size_log2;
        int trapnr, ret;

        sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
        sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
        if (!sgx_12_0 || !sgx_12_1) {
                kvm_prepare_emulation_failure_exit(vcpu);
                return 0;
        }

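        /*
         * Snapshot the fields to be validated.  @pageinfo->contents points at
         * a kernel copy of the guest's SECS (see handle_encls_ecreate()), so
         * the guest can't change the values after they've been checked.
         */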
        miscselect = contents->miscselect;
        attributes = contents->attributes;
        xfrm = contents->xfrm;
        size = contents->size;

        /* Enforce restriction of access to the PROVISIONKEY. */
        if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
            (attributes & SGX_ATTR_PROVISIONKEY)) {
                if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
                        pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
                kvm_inject_gp(vcpu, 0);
                return 1;
        }

        /*
         * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM.  Note
         * that the allowed XFRM (XFeature Request Mask) isn't strictly bound
         * by the supported XCR0.  FP+SSE *must* be set in XFRM, even if XSAVE
         * is unsupported, i.e. even if XCR0 itself is completely unsupported.
         */
        if ((u32)miscselect & ~sgx_12_0->ebx ||
            (u32)attributes & ~sgx_12_1->eax ||
            (u32)(attributes >> 32) & ~sgx_12_1->ebx ||
            (u32)xfrm & ~sgx_12_1->ecx ||
            (u32)(xfrm >> 32) & ~sgx_12_1->edx ||
            xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) ||
            (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }

        /* Enforce CPUID restriction on max enclave size. */
        max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
                                                            sgx_12_0->edx;
        if (size >= BIT_ULL(max_size_log2)) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }

        /*
         * sgx_virt_ecreate() returns:
         *  1) 0:       ECREATE was successful
         *  2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the
         *              exception number.
         *  3) -EINVAL: access_ok() on @secs_hva failed. This should never
         *              happen as KVM checks host addresses at memslot creation.
         *              sgx_virt_ecreate() has already warned in this case.
         */
        ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr);
        if (!ret)
                return kvm_skip_emulated_instruction(vcpu);
        if (ret == -EFAULT)
                return sgx_inject_fault(vcpu, secs_gva, trapnr);

        return ret;
}

static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
{
        gva_t pageinfo_gva, secs_gva;
        gva_t metadata_gva, contents_gva;
        gpa_t metadata_gpa, contents_gpa, secs_gpa;
        unsigned long metadata_hva, contents_hva, secs_hva;
        struct sgx_pageinfo pageinfo;
        struct sgx_secs *contents;
        struct x86_exception ex;
        int r;

        if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) ||
            sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva))
                return 1;

        /*
         * Copy the PAGEINFO to local memory; its pointers need to be
         * translated, i.e. we need to do a deep copy/translate.
         */
        r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo,
                                sizeof(pageinfo), &ex);
        if (r == X86EMUL_PROPAGATE_FAULT) {
                kvm_inject_emulated_page_fault(vcpu, &ex);
                return 1;
        } else if (r != X86EMUL_CONTINUE) {
                sgx_handle_emulation_failure(vcpu, pageinfo_gva,
                                             sizeof(pageinfo));
                return 0;
        }

        if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) ||
            sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096,
                              &contents_gva))
                return 1;

        /*
         * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA.
         * If translation fails, a #PF has been injected; resume the guest.
         */
        if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) ||
            sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) ||
            sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa))
                return 1;

        /*
         * ...and then to HVA.  The order of accesses isn't architectural, i.e.
         * KVM doesn't have to fully process one address at a time.  Exit to
         * userspace if a GPA is invalid.
         */
        if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) ||
            sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) ||
            sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva))
                return 0;

        /*
         * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the
         * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and
         * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
         * enforce restriction of access to the PROVISIONKEY.
         */
        contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
        if (!contents)
                return -ENOMEM;

        /* Exit to userspace if copying from a host userspace address fails. */
        if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) {
                free_page((unsigned long)contents);
                return 0;
        }

        pageinfo.metadata = metadata_hva;
        pageinfo.contents = (u64)contents;

        r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva);

        free_page((unsigned long)contents);

        return r;
}

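/*
 * Emulate ENCLS[EINIT].  EINIT is intercepted when the host supports SGX
 * Launch Control (see vmx_write_encls_bitmap()) so that it can be executed
 * with the guest's virtual SGX_LEPUBKEYHASHn MSR values.
 */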
static int handle_encls_einit(struct kvm_vcpu *vcpu)
{
        unsigned long sig_hva, secs_hva, token_hva, rflags;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        gva_t sig_gva, secs_gva, token_gva;
        gpa_t sig_gpa, secs_gpa, token_gpa;
        int ret, trapnr;

        if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) ||
            sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) ||
            sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva))
                return 1;

        /*
         * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA.
         * If translation fails, a #PF has been injected; resume the guest.
         */
        if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) ||
            sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) ||
            sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa))
                return 1;

        /*
         * ...and then to HVA.  The order of accesses isn't architectural, i.e.
         * KVM doesn't have to fully process one address at a time.  Exit to
         * userspace if a GPA is invalid.  Note, all structures are aligned and
         * cannot split pages.
         */
        if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) ||
            sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) ||
            sgx_gpa_to_hva(vcpu, token_gpa, &token_hva))
                return 0;

        ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva,
                             (void __user *)secs_hva,
                             vmx->msr_ia32_sgxlepubkeyhash, &trapnr);

        if (ret == -EFAULT)
                return sgx_inject_fault(vcpu, secs_gva, trapnr);

        /*
         * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva,
         * @token_hva or @secs_hva. This should never happen as KVM checks host
         * addresses at memslot creation. sgx_virt_einit() has already warned
         * in this case, so just return.
         */
        if (ret < 0)
                return ret;

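        /*
         * Emulate EINIT's effect on RFLAGS and RAX: ZF is set if and only if
         * the leaf failed, CF/PF/AF/SF/OF are cleared, and the error code is
         * returned in RAX.
         */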
        rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF |
                                          X86_EFLAGS_AF | X86_EFLAGS_SF |
                                          X86_EFLAGS_OF);
        if (ret)
                rflags |= X86_EFLAGS_ZF;
        else
                rflags &= ~X86_EFLAGS_ZF;
        vmx_set_rflags(vcpu, rflags);

        kvm_rax_write(vcpu, ret);
        return kvm_skip_emulated_instruction(vcpu);
}

static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
{
        /*
         * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will
         * be reached if and only if the SGX1 leafs are enabled.
         */
        if (leaf >= ECREATE && leaf <= ETRACK)
                return true;

        if (leaf >= EAUG && leaf <= EMODT)
                return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2);

        return false;
}

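/*
 * SGX is enabled by the guest's BIOS if the guest's virtual feature control
 * MSR is locked with the SGX enable bit set.
 */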
static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu)
{
        const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED;

        return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits;
}

int handle_encls(struct kvm_vcpu *vcpu)
{
        u32 leaf = (u32)kvm_rax_read(vcpu);

        if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
            !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
        } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
                   !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
                kvm_inject_gp(vcpu, 0);
        } else {
                if (leaf == ECREATE)
                        return handle_encls_ecreate(vcpu);
                if (leaf == EINIT)
                        return handle_encls_einit(vcpu);
                WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
                vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
                return 0;
        }
        return 1;
}

void setup_default_sgx_lepubkeyhash(void)
{
        /*
         * Use Intel's default value for Skylake hardware if Launch Control is
         * not supported, i.e. Intel's hash is hardcoded into silicon, or if
         * Launch Control is supported and enabled, i.e. mimic the reset value
         * and let the guest write the MSRs at will.  If Launch Control is
         * supported but disabled, then use the current MSR values as the hash
         * MSRs exist but are read-only (locked and not writable).
         */
        if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
            rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
                sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
                sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
                sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
                sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
        } else {
                /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
                rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
                rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
                rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
        }
}

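/*
 * Seed the vCPU's virtual SGX_LEPUBKEYHASHn MSRs with the default hash
 * established by setup_default_sgx_lepubkeyhash().
 */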
void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
               sizeof(sgx_pubkey_hash));
}

/*
 * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM
 * restrictions if the guest's allowed-1 settings diverge from hardware.
 */
static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu)
{
        struct kvm_cpuid_entry2 *guest_cpuid;
        u32 eax, ebx, ecx, edx;

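        /*
         * Intercept ECREATE if userspace hasn't allowed the guest to use
         * PROVISIONKEY, as the restriction is enforced during emulation.
         */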
        if (!vcpu->kvm->arch.sgx_provisioning_allowed)
                return true;

        guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0);
        if (!guest_cpuid)
                return true;

        cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx);
        if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx)
                return true;

        guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1);
        if (!guest_cpuid)
                return true;

        cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx);
        if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx ||
            guest_cpuid->ecx != ecx || guest_cpuid->edx != edx)
                return true;

        return false;
}

void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
        /*
         * There is no software enable bit for SGX that is virtualized by
         * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the
         * guest (either by the host or by the guest's BIOS) but enabled in the
         * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate
         * the expected system behavior for ENCLS.
         */
        u64 bitmap = -1ull;

        /* Nothing to do if hardware doesn't support SGX */
        if (!cpu_has_vmx_encls_vmexit())
                return;

        if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) &&
            sgx_enabled_in_guest_bios(vcpu)) {
                if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
                        bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
                        if (sgx_intercept_encls_ecreate(vcpu))
                                bitmap |= (1 << ECREATE);
                }

                if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
                        bitmap &= ~GENMASK_ULL(EMODT, EAUG);

                /*
                 * Trap and execute EINIT if launch control is enabled in the
                 * host using the guest's values for launch control MSRs, even
                 * if the guest's values are fixed to hardware default values.
                 * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing
                 * the MSRs is extraordinarily expensive.
                 */
                if (boot_cpu_has(X86_FEATURE_SGX_LC))
                        bitmap |= (1 << EINIT);

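                /*
                 * When L2 is active, honor L1's ENCLS-exiting bitmap by also
                 * trapping any leaf that L1 wants to intercept.
                 */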
                if (!vmcs12 && is_guest_mode(vcpu))
                        vmcs12 = get_vmcs12(vcpu);
                if (vmcs12 && nested_cpu_has_encls_exit(vmcs12))
                        bitmap |= vmcs12->encls_exiting_bitmap;
        }
        vmcs_write64(ENCLS_EXITING_BITMAP, bitmap);
}