root/arch/x86/kernel/cpu/sgx/virt.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Device driver to expose SGX enclave memory to KVM guests.
 *
 * Copyright(c) 2021 Intel Corporation.
 */

#include <linux/kvm_types.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/xarray.h>
#include <asm/sgx.h>
#include <uapi/asm/sgx.h>

#include "encls.h"
#include "sgx.h"

/*
 * State of one virtual EPC instance.  Allocated in __sgx_vepc_open(), stored
 * in file->private_data and freed in sgx_vepc_release().
 */
struct sgx_vepc {
        /*
         * EPC pages handed out to this instance, indexed by
         * vma->vm_pgoff + the page offset of the faulting address in the VMA.
         */
        struct xarray page_array;
        /* Held by sgx_vepc_fault() while __sgx_vepc_fault() runs */
        struct mutex lock;
};

/*
 * SECS pages that could not be EREMOVE'd when their virtual EPC instance was
 * released because they still have child pages in other virtual EPC
 * instances, and the lock that protects the list.  Both are initialized in
 * sgx_vepc_init(); sgx_vepc_release() retries EREMOVE on every entry.
 */
static struct mutex zombie_secs_pages_lock;
static struct list_head zombie_secs_pages;

/*
 * Back the faulting address @addr of @vma with an EPC page.
 *
 * The caller must hold vepc->lock.  If the instance already tracks a page at
 * the computed index nothing is done.  Otherwise a new EPC page is allocated,
 * recorded in vepc->page_array and its PFN inserted into @vma.
 *
 * Return: 0 on success or if a page was already present, the error from
 * sgx_alloc_epc_page() or xa_store() if either fails, or -EFAULT if the PFN
 * could not be inserted.  On failure the new page is untracked and freed.
 */
static int __sgx_vepc_fault(struct sgx_vepc *vepc,
                            struct vm_area_struct *vma, unsigned long addr)
{
        /* Slot in page_array: the VMA's page offset plus the page offset of @addr */
        unsigned long slot = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
        struct sgx_epc_page *page;
        unsigned long pfn;
        int err;

        WARN_ON(!mutex_is_locked(&vepc->lock));

        /* Already populated by an earlier fault: nothing left to do */
        if (xa_load(&vepc->page_array, slot))
                return 0;

        page = sgx_alloc_epc_page(vepc, false);
        if (IS_ERR(page))
                return PTR_ERR(page);

        err = xa_err(xa_store(&vepc->page_array, slot, page, GFP_KERNEL));
        if (err)
                goto err_free;

        pfn = PFN_DOWN(sgx_get_epc_phys_addr(page));
        if (vmf_insert_pfn(vma, addr, pfn) != VM_FAULT_NOPAGE) {
                err = -EFAULT;
                goto err_delete;
        }

        return 0;

err_delete:
        /* Stop tracking the page before giving it back */
        xa_erase(&vepc->page_array, slot);
err_free:
        sgx_free_epc_page(page);
        return err;
}

/*
 * Page fault handler for virtual EPC mappings.
 *
 * Runs __sgx_vepc_fault() under vepc->lock and translates its result:
 * 0 becomes VM_FAULT_NOPAGE; -EBUSY becomes VM_FAULT_RETRY (after dropping
 * the mmap read lock) when the fault allows a retry; anything else becomes
 * VM_FAULT_SIGBUS.
 */
static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct sgx_vepc *vepc = vma->vm_private_data;
        int err;

        mutex_lock(&vepc->lock);
        err = __sgx_vepc_fault(vepc, vma, vmf->address);
        mutex_unlock(&vepc->lock);

        if (err == 0)
                return VM_FAULT_NOPAGE;

        /* Only -EBUSY is retried, and only if the fault permits a retry */
        if (err != -EBUSY || !(vmf->flags & FAULT_FLAG_ALLOW_RETRY))
                return VM_FAULT_SIGBUS;

        mmap_read_unlock(vma->vm_mm);
        return VM_FAULT_RETRY;
}

/*
 * VMA operations installed by sgx_vepc_mmap().  Only a fault handler is
 * provided; it allocates the backing EPC page on first access.
 */
static const struct vm_operations_struct sgx_vepc_vm_ops = {
        .fault = sgx_vepc_fault,
};

/*
 * mmap() handler for the virtual EPC device.
 *
 * Only shared mappings are accepted.  The VMA is wired to the fault handler
 * in sgx_vepc_vm_ops and to the instance stored in file->private_data.
 *
 * Return: 0 on success, -EINVAL if the mapping is not VM_SHARED.
 */
static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
{
        if ((vma->vm_flags & VM_SHARED) == 0)
                return -EINVAL;

        vma->vm_private_data = file->private_data;
        vma->vm_ops = &sgx_vepc_vm_ops;

        /* VM_DONTCOPY: don't copy the VMA in fork() */
        vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);

        return 0;
}

/*
 * Run EREMOVE on a previously guest-owned EPC page so that it can go back
 * to the general EPC page pool.
 *
 * The guest cannot be trusted to have left the page in a good state, so
 * EREMOVE is executed unconditionally; if the guest already EREMOVE'd the
 * page properly, the extra EREMOVE is harmless.
 *
 * Return: the result of __eremove() on the page's virtual address.
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
        void *page_va = sgx_get_epc_virt_addr(epc_page);

        return __eremove(page_va);
}

/*
 * EREMOVE @epc_page and, if that succeeds, free it.
 *
 * Return: 0 if the page was EREMOVE'd and freed, otherwise the EREMOVE
 * result; in that case the page is NOT freed and remains owned by the caller.
 */
static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
{
        int ret = sgx_vepc_remove_page(epc_page);

        if (!ret) {
                sgx_free_epc_page(epc_page);
                return 0;
        }

        /*
         * Only SGX_CHILD_PRESENT is expected here.  It results from
         * EREMOVE'ing an SECS that still has children, and is handled by
         * EREMOVE'ing the SECS again once all pages in the virtual EPC have
         * been EREMOVE'd; see sgx_vepc_release().
         *
         * The user of virtual EPC (KVM) must guarantee that no logical
         * processor is still running in the guest's enclave, otherwise
         * EREMOVE fails with SGX_ENCLAVE_ACT, which cannot be handled here.
         */
        WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
                  ret, ret);
        return ret;
}

/*
 * EREMOVE every page tracked by @vepc.  Pages stay in vepc->page_array and
 * are not freed; only their SGX state is removed.
 *
 * Return: the number of pages whose EREMOVE returned SGX_CHILD_PRESENT (so
 * userspace knows it has to retry), or -EBUSY as soon as EREMOVE fails for
 * any other reason.
 */
static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
{
        struct sgx_epc_page *entry;
        unsigned long index;
        long failures = 0;

        xa_for_each(&vepc->page_array, index, entry) {
                int ret = sgx_vepc_remove_page(entry);

                if (ret && ret != SGX_CHILD_PRESENT) {
                        /*
                         * Failures due to #GP or SGX_ENCLAVE_ACT are reported
                         * without a WARN: userspace can induce them by calling
                         * the ioctl concurrently on multiple vEPCs or while
                         * one or more CPUs is running the enclave.  Only a
                         * #PF on EREMOVE indicates a kernel/hardware issue.
                         */
                        WARN_ON_ONCE(encls_faulted(ret) &&
                                     ENCLS_TRAPNR(ret) != X86_TRAP_GP);
                        return -EBUSY;
                }

                /* SGX_CHILD_PRESENT: the page is a SECS, userspace will retry */
                if (ret)
                        failures++;

                cond_resched();
        }

        return failures;
}

static int sgx_vepc_release(struct inode *inode, struct file *file)
{
        struct sgx_vepc *vepc = file->private_data;
        struct sgx_epc_page *epc_page, *tmp, *entry;
        unsigned long index;

        LIST_HEAD(secs_pages);

        xa_for_each(&vepc->page_array, index, entry) {
                /*
                 * Remove all normal, child pages.  sgx_vepc_free_page()
                 * will fail if EREMOVE fails, but this is OK and expected on
                 * SECS pages.  Those can only be EREMOVE'd *after* all their
                 * child pages. Retries below will clean them up.
                 */
                if (sgx_vepc_free_page(entry))
                        continue;

                xa_erase(&vepc->page_array, index);
                cond_resched();
        }

        /*
         * Retry EREMOVE'ing pages.  This will clean up any SECS pages that
         * only had children in this 'epc' area.
         */
        xa_for_each(&vepc->page_array, index, entry) {
                epc_page = entry;
                /*
                 * An EREMOVE failure here means that the SECS page still
                 * has children.  But, since all children in this 'sgx_vepc'
                 * have been removed, the SECS page must have a child on
                 * another instance.
                 */
                if (sgx_vepc_free_page(epc_page))
                        list_add_tail(&epc_page->list, &secs_pages);

                xa_erase(&vepc->page_array, index);
                cond_resched();
        }

        /*
         * SECS pages are "pinned" by child pages, and "unpinned" once all
         * children have been EREMOVE'd.  A child page in this instance
         * may have pinned an SECS page encountered in an earlier release(),
         * creating a zombie.  Since some children were EREMOVE'd above,
         * try to EREMOVE all zombies in the hopes that one was unpinned.
         */
        mutex_lock(&zombie_secs_pages_lock);
        list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
                /*
                 * Speculatively remove the page from the list of zombies,
                 * if the page is successfully EREMOVE'd it will be added to
                 * the list of free pages.  If EREMOVE fails, throw the page
                 * on the local list, which will be spliced on at the end.
                 */
                list_del(&epc_page->list);

                if (sgx_vepc_free_page(epc_page))
                        list_add_tail(&epc_page->list, &secs_pages);
                cond_resched();
        }

        if (!list_empty(&secs_pages))
                list_splice_tail(&secs_pages, &zombie_secs_pages);
        mutex_unlock(&zombie_secs_pages_lock);

        xa_destroy(&vepc->page_array);
        kfree(vepc);

        sgx_dec_usage_count();
        return 0;
}

/*
 * Allocate and initialize a zeroed virtual EPC instance and attach it to
 * @file via file->private_data.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails.
 */
static int __sgx_vepc_open(struct inode *inode, struct file *file)
{
        struct sgx_vepc *vepc = kzalloc_obj(struct sgx_vepc);

        if (!vepc)
                return -ENOMEM;

        xa_init(&vepc->page_array);
        mutex_init(&vepc->lock);

        file->private_data = vepc;

        return 0;
}

/*
 * open() handler: take an SGX usage count reference, then create the
 * virtual EPC instance.  The reference is dropped again if creating the
 * instance fails; otherwise it is held until sgx_vepc_release().
 *
 * Return: 0 on success, or the error from sgx_inc_usage_count() or
 * __sgx_vepc_open().
 */
static int sgx_vepc_open(struct inode *inode, struct file *file)
{
        int err = sgx_inc_usage_count();

        if (err)
                return err;

        err = __sgx_vepc_open(inode, file);
        if (err)
                sgx_dec_usage_count();

        return err;
}

/*
 * ioctl() handler.  The only supported command is SGX_IOC_VEPC_REMOVE_ALL,
 * which takes no argument.
 *
 * Return: the result of sgx_vepc_remove_all() for SGX_IOC_VEPC_REMOVE_ALL,
 * -EINVAL if that command is given a non-zero @arg, -ENOTTY for any other
 * command.
 */
static long sgx_vepc_ioctl(struct file *file,
                           unsigned int cmd, unsigned long arg)
{
        if (cmd != SGX_IOC_VEPC_REMOVE_ALL)
                return -ENOTTY;

        if (arg)
                return -EINVAL;

        return sgx_vepc_remove_all(file->private_data);
}

/*
 * File operations of the sgx_vepc device.  The same handler serves both
 * the native and the compat ioctl entry points.
 */
static const struct file_operations sgx_vepc_fops = {
        .owner          = THIS_MODULE,
        .open           = sgx_vepc_open,
        .unlocked_ioctl = sgx_vepc_ioctl,
        .compat_ioctl   = sgx_vepc_ioctl,
        .release        = sgx_vepc_release,
        .mmap           = sgx_vepc_mmap,
};

/*
 * Misc device descriptor registered by sgx_vepc_init(); the minor number
 * is assigned dynamically.
 */
static struct miscdevice sgx_vepc_dev = {
        .minor          = MISC_DYNAMIC_MINOR,
        .name           = "sgx_vepc",
        .nodename       = "sgx_vepc",
        .fops           = &sgx_vepc_fops,
};

/*
 * Initialize the zombie SECS list and its lock, then register the sgx_vepc
 * misc device.
 *
 * Return: -ENODEV if X86_FEATURE_VMX is not enabled, otherwise the result
 * of misc_register().
 */
int __init sgx_vepc_init(void)
{
        /* SGX virtualization requires KVM to work */
        if (cpu_feature_enabled(X86_FEATURE_VMX)) {
                mutex_init(&zombie_secs_pages_lock);
                INIT_LIST_HEAD(&zombie_secs_pages);

                return misc_register(&sgx_vepc_dev);
        }

        return -ENODEV;
}

/**
 * sgx_virt_ecreate() - Run ECREATE on behalf of guest
 * @pageinfo:   Pointer to PAGEINFO structure
 * @secs:       Userspace pointer to SECS page
 * @trapnr:     Set to the trap number to inject into the guest when ECREATE
 *              faults; left untouched otherwise
 *
 * Run ECREATE on behalf of the guest after KVM traps ECREATE in order to
 * enforce policies on the guest's enclaves.
 *
 * Return:
 * -  0:        ECREATE was successful.
 * - -EINVAL:   @secs is not a valid userspace range of PAGE_SIZE bytes.
 * - -EFAULT:   ECREATE faulted; @trapnr holds the trap number.
 */
int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
                     int *trapnr)
{
        int ret;

        /*
         * @secs is an untrusted, userspace-provided address handed over by
         * KVM.  It is assumed to point somewhere in userspace, but the
         * access can still fault and end up in SGX or other fault handlers
         * if no userspace mapping exists for it.
         *
         * The caller (KVM) is expected to have rejected invalid pointers
         * already (for instance ones made up by a malicious guest), hence
         * the WARN.  All other checks, such as the alignment of @secs, are
         * left to ENCLS itself.
         */
        if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
                return -EINVAL;

        __uaccess_begin();
        ret = __ecreate(pageinfo, (void *)secs);
        __uaccess_end();

        if (!encls_faulted(ret)) {
                /* ECREATE doesn't return an error code, it faults or succeeds. */
                WARN_ON_ONCE(ret);
                return 0;
        }

        *trapnr = ENCLS_TRAPNR(ret);
        return -EFAULT;
}
EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);

/* Number of bytes of the EINITTOKEN pointer validated with access_ok() */
#define SGX_EINITTOKEN_SIZE     304

/*
 * Validate the three userspace pointers and run EINIT on them.
 *
 * Return: -EINVAL if any pointer fails access_ok(), otherwise the value
 * returned by __einit().
 */
static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
                            void __user *secs)
{
        int status;

        /*
         * The caller (KVM) must pass valid userspace pointers; everything
         * else is checked by ENCLS itself.  See the comment on @secs in
         * sgx_virt_ecreate() for the reasoning.
         */
        if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
                         !access_ok(token, SGX_EINITTOKEN_SIZE) ||
                         !access_ok(secs, PAGE_SIZE)))
                return -EINVAL;

        __uaccess_begin();
        status = __einit((void *)sigstruct, (void *)token, (void *)secs);
        __uaccess_end();

        return status;
}

/**
 * sgx_virt_einit() - Run EINIT on behalf of guest
 * @sigstruct:          Userspace pointer to SIGSTRUCT structure
 * @token:              Userspace pointer to EINITTOKEN structure
 * @secs:               Userspace pointer to SECS page
 * @lepubkeyhash:       Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
 * @trapnr:             Set to the trap number to inject into the guest when
 *                      EINIT faults; left untouched otherwise
 *
 * Run EINIT on behalf of the guest after KVM traps EINIT.  If SGX_LC is
 * available in the host, the SGX driver may rewrite the hardware values at
 * will, so the hardware values are first updated to the guest's virtual MSR
 * values to make sure EINIT executes with the expected ones.  Preemption is
 * disabled across the update and EINIT.
 *
 * Return:
 * -  0:        EINIT was successful.
 * - -EINVAL:   one of the userspace pointers is invalid.
 * - -EFAULT:   EINIT faulted; @trapnr holds the trap number.
 * - other:     the value returned by __einit(), passed through unchanged.
 *              NOTE(review): presumably a positive SGX error code - confirm
 *              against the ENCLS helpers.
 */
int sgx_virt_einit(void __user *sigstruct, void __user *token,
                   void __user *secs, u64 *lepubkeyhash, int *trapnr)
{
        int status;

        if (cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
                preempt_disable();

                sgx_update_lepubkeyhash(lepubkeyhash);

                status = __sgx_virt_einit(sigstruct, token, secs);
                preempt_enable();
        } else {
                status = __sgx_virt_einit(sigstruct, token, secs);
        }

        /*
         * -EINVAL comes from the WARN_ON_ONCE in __sgx_virt_einit() and is
         * propagated as is; it is never treated as an ENCLS fault.
         */
        if (status != -EINVAL && encls_faulted(status)) {
                *trapnr = ENCLS_TRAPNR(status);
                return -EFAULT;
        }

        return status;
}
EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);