root/drivers/vfio/container.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *
 * VFIO container (/dev/vfio/vfio)
 */
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/iommu.h>
#include <linux/miscdevice.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>

#include "vfio.h"

/*
 * State behind one open instance of the container character device.
 *
 * Allocated by vfio_fops_open() and freed by vfio_container_release() when
 * the last reference on @kref is dropped.
 */
struct vfio_container {
        /* Lifetime refcount: held by the open file and by each attached group */
        struct kref                     kref;
        /* Groups attached to this container (via vfio_group::container_next) */
        struct list_head                group_list;
        /*
         * Held for write while group_list, iommu_driver or iommu_data
         * change; held for read by the CHECK_EXTENSION path.
         */
        struct rw_semaphore             group_lock;
        /*
         * Backend bound by VFIO_SET_IOMMU.  NULL until then, and reset to
         * NULL when the last group is detached.
         */
        struct vfio_iommu_driver        *iommu_driver;
        /* Opaque cookie returned by iommu_driver->ops->open() */
        void                            *iommu_data;
        /* Set on every group attach: true if that group was VFIO_NO_IOMMU */
        bool                            noiommu;
};

/* File-global registry of the IOMMU backend drivers containers can bind to. */
static struct vfio {
        /* Registered vfio_iommu_driver entries, linked through vfio_next */
        struct list_head                iommu_drivers_list;
        /* Serializes every walk and update of iommu_drivers_list */
        struct mutex                    iommu_drivers_lock;
} vfio;

/*
 * ->open() for the no-IOMMU backend.
 *
 * @arg: IOMMU type magic passed to VFIO_SET_IOMMU.
 *
 * Returns NULL (this backend keeps no private state) on success,
 * ERR_PTR(-EINVAL) if @arg is not VFIO_NOIOMMU_IOMMU, or ERR_PTR(-EPERM)
 * if the caller lacks CAP_SYS_RAWIO.  The capability is only checked once
 * the type has been validated.
 */
static void *vfio_noiommu_open(unsigned long arg)
{
        int err = 0;

        if (arg != VFIO_NOIOMMU_IOMMU)
                err = -EINVAL;
        else if (!capable(CAP_SYS_RAWIO))
                err = -EPERM;

        return err ? ERR_PTR(err) : NULL;
}

/*
 * ->release() for the no-IOMMU backend.  Nothing to tear down:
 * vfio_noiommu_open() returns NULL rather than allocated state.
 */
static void vfio_noiommu_release(void *iommu_data)
{
}

/*
 * ->ioctl() for the no-IOMMU backend.
 *
 * Only VFIO_CHECK_EXTENSION is understood: it reports 1 when the
 * vfio_noiommu switch (declared outside this file) is set and @arg asks
 * for VFIO_NOIOMMU_IOMMU, and 0 otherwise.  Every other command yields
 * -ENOTTY.  @iommu_data is unused.
 */
static long vfio_noiommu_ioctl(void *iommu_data,
                               unsigned int cmd, unsigned long arg)
{
        if (cmd != VFIO_CHECK_EXTENSION)
                return -ENOTTY;

        if (vfio_noiommu && arg == VFIO_NOIOMMU_IOMMU)
                return 1;

        return 0;
}

/*
 * ->attach_group() for the no-IOMMU backend.  No per-group state is kept,
 * so attaching always succeeds and all arguments are ignored.
 */
static int vfio_noiommu_attach_group(void *iommu_data,
                struct iommu_group *iommu_group, enum vfio_group_type type)
{
        return 0;
}

/*
 * ->detach_group() for the no-IOMMU backend.  Counterpart of
 * vfio_noiommu_attach_group(); there is nothing to undo.
 */
static void vfio_noiommu_detach_group(void *iommu_data,
                                      struct iommu_group *iommu_group)
{
}

/*
 * Backend ops table for no-IOMMU containers.  Registered from
 * vfio_container_init() when CONFIG_VFIO_NOIOMMU is enabled, and compared
 * by address in vfio_iommu_driver_allowed() to recognise this backend.
 * The optional callbacks (register_device, unregister_device, pin_pages,
 * unpin_pages, dma_rw) are deliberately left unset.
 */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
        .name = "vfio-noiommu",
        .owner = THIS_MODULE,
        .open = vfio_noiommu_open,
        .release = vfio_noiommu_release,
        .ioctl = vfio_noiommu_ioctl,
        .attach_group = vfio_noiommu_attach_group,
        .detach_group = vfio_noiommu_detach_group,
};

/*
 * Decide whether @driver may serve @container.
 *
 * The pairing is strict in both directions: a noiommu container may only
 * be backed by vfio-noiommu, and vfio-noiommu may only back a noiommu
 * container.  When CONFIG_VFIO_NOIOMMU is disabled every driver is
 * acceptable and neither argument is dereferenced.
 */
static bool vfio_iommu_driver_allowed(struct vfio_container *container,
                                      const struct vfio_iommu_driver *driver)
{
        bool driver_is_noiommu;

        if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU))
                return true;

        driver_is_noiommu = (driver->ops == &vfio_noiommu_ops);

        return container->noiommu == driver_is_noiommu;
}

/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver, *tmp;

        if (WARN_ON(!ops->register_device != !ops->unregister_device))
                return -EINVAL;

        driver = kzalloc_obj(*driver);
        if (!driver)
                return -ENOMEM;

        driver->ops = ops;

        mutex_lock(&vfio.iommu_drivers_lock);

        /* Check for duplicates */
        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
                if (tmp->ops == ops) {
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return -EINVAL;
                }
        }

        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

        mutex_unlock(&vfio.iommu_drivers_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver;

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                if (driver->ops == ops) {
                        list_del(&driver->vfio_next);
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return;
                }
        }
        mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
/* kref release callback: the last reference is gone, free the container. */
static void vfio_container_release(struct kref *kref)
{
        kfree(container_of(kref, struct vfio_container, kref));
}

/* Take an additional reference on @container. */
static void vfio_container_get(struct vfio_container *container)
{
        kref_get(&container->kref);
}

/*
 * Drop a reference on @container; the final put frees it through
 * vfio_container_release().
 */
static void vfio_container_put(struct vfio_container *container)
{
        kref_put(&container->kref, vfio_container_release);
}

void vfio_device_container_register(struct vfio_device *device)
{
        struct vfio_iommu_driver *iommu_driver =
                device->group->container->iommu_driver;

        if (iommu_driver && iommu_driver->ops->register_device)
                iommu_driver->ops->register_device(
                        device->group->container->iommu_data, device);
}

void vfio_device_container_unregister(struct vfio_device *device)
{
        struct vfio_iommu_driver *iommu_driver =
                device->group->container->iommu_driver;

        if (iommu_driver && iommu_driver->ops->unregister_device)
                iommu_driver->ops->unregister_device(
                        device->group->container->iommu_data, device);
}

/*
 * VFIO_CHECK_EXTENSION handler.
 *
 * The container itself implements no base extensions, so every query is
 * forwarded to a backend:
 *
 *  - With a backend already bound, only that backend is asked.
 *  - Otherwise each registered backend is polled in list order and the
 *    first positive answer wins.  Once the container has groups, backends
 *    that vfio_iommu_driver_allowed() rejects are skipped, as are
 *    backends whose module reference cannot be taken.
 *
 * Returns the backend's answer.  When polling finds no positive answer,
 * the result of the last backend actually queried is returned, or 0 if
 * none was queried.
 */
static long
vfio_container_ioctl_check_extension(struct vfio_container *container,
                                     unsigned long arg)
{
        struct vfio_iommu_driver *bound, *candidate;
        long ret = 0;

        down_read(&container->group_lock);

        bound = container->iommu_driver;
        if (bound) {
                ret = bound->ops->ioctl(container->iommu_data,
                                        VFIO_CHECK_EXTENSION, arg);
                goto out_unlock;
        }

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(candidate, &vfio.iommu_drivers_list, vfio_next) {
                if (!list_empty(&container->group_list) &&
                    !vfio_iommu_driver_allowed(container, candidate))
                        continue;
                if (!try_module_get(candidate->ops->owner))
                        continue;

                /* The module reference is only needed across the query itself */
                ret = candidate->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg);
                module_put(candidate->ops->owner);

                if (ret > 0)
                        break;
        }
        mutex_unlock(&vfio.iommu_drivers_lock);

out_unlock:
        up_read(&container->group_lock);

        return ret;
}

/*
 * Attach every group already in @container to a freshly opened backend.
 *
 * Caller must hold container->group_lock for write.
 *
 * @driver: candidate backend
 * @data:   cookie returned by @driver's ->open()
 *
 * Returns 0 when all groups attached, -ENODEV when the container has no
 * groups, or the first ->attach_group() error.  On error, the groups that
 * had already been attached are detached again, most recent first.
 */
static int __vfio_container_attach_groups(struct vfio_container *container,
                                          struct vfio_iommu_driver *driver,
                                          void *data)
{
        struct vfio_group *grp;
        int err = -ENODEV;

        list_for_each_entry(grp, &container->group_list, container_next) {
                err = driver->ops->attach_group(data, grp->iommu_group,
                                                grp->type);
                if (!err)
                        continue;

                /*
                 * Roll back: walk backwards starting from the entry before
                 * the one that failed.
                 */
                list_for_each_entry_continue_reverse(grp,
                                                     &container->group_list,
                                                     container_next)
                        driver->ops->detach_group(data, grp->iommu_group);

                return err;
        }

        return err;
}

/*
 * VFIO_SET_IOMMU handler: bind an IOMMU backend to @container.
 *
 * @arg: IOMMU type magic.  It is the same value space as
 *       VFIO_CHECK_EXTENSION, so each permitted backend is first asked
 *       whether it supports @arg and, if so, opened with it.  This lets a
 *       single backend expose several interfaces.
 *
 * The first backend that opens and accepts every group in the container
 * is bound, and keeps the module reference taken here.
 *
 * Returns 0 on success; -EINVAL if the container has no groups or already
 * has a backend; otherwise the error from the last backend that got as
 * far as ->open() or group attach, or -ENODEV if none did.
 */
static long vfio_ioctl_set_iommu(struct vfio_container *container,
                                 unsigned long arg)
{
        struct vfio_iommu_driver *candidate;
        long ret;

        down_write(&container->group_lock);

        /*
         * The container is an unprivileged interface, whereas groups can
         * be handed to specific users.  Only a container that holds a
         * group may enable an IOMMU backend, since doing so can consume
         * finite resources.  There is no unset operation: removing every
         * group deprivileges the container and returns it to the unset
         * state.
         */
        if (list_empty(&container->group_list) || container->iommu_driver) {
                ret = -EINVAL;
                goto out_unlock;
        }

        ret = -ENODEV;

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(candidate, &vfio.iommu_drivers_list, vfio_next) {
                void *data;

                if (!vfio_iommu_driver_allowed(container, candidate))
                        continue;
                if (!try_module_get(candidate->ops->owner))
                        continue;

                /* Not a backend for this type; ret is left untouched */
                if (candidate->ops->ioctl(NULL, VFIO_CHECK_EXTENSION,
                                          arg) <= 0)
                        goto next_driver;

                data = candidate->ops->open(arg);
                if (IS_ERR(data)) {
                        ret = PTR_ERR(data);
                        goto next_driver;
                }

                ret = __vfio_container_attach_groups(container, candidate,
                                                     data);
                if (!ret) {
                        /* Bound: the module reference stays with the container */
                        container->iommu_driver = candidate;
                        container->iommu_data = data;
                        break;
                }

                candidate->ops->release(data);
next_driver:
                module_put(candidate->ops->owner);
        }
        mutex_unlock(&vfio.iommu_drivers_lock);

out_unlock:
        up_write(&container->group_lock);

        return ret;
}

/*
 * ioctl entry point for the container file.
 *
 * VFIO_GET_API_VERSION, VFIO_CHECK_EXTENSION and VFIO_SET_IOMMU are
 * handled here.  Any other command is passed straight through to the
 * bound IOMMU backend.  Returns -EINVAL when the file has no container or
 * when an unrecognized command arrives before a backend is bound.
 */
static long vfio_fops_unl_ioctl(struct file *filep,
                                unsigned int cmd, unsigned long arg)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        void *data;

        if (!container)
                return -EINVAL;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                return VFIO_API_VERSION;
        case VFIO_CHECK_EXTENSION:
                return vfio_container_ioctl_check_extension(container, arg);
        case VFIO_SET_IOMMU:
                return vfio_ioctl_set_iommu(container, arg);
        default:
                break;
        }

        /* Not one of ours: hand it to the backend, if there is one */
        driver = container->iommu_driver;
        data = container->iommu_data;
        if (!driver)
                return -EINVAL;

        return driver->ops->ioctl(data, cmd, arg);
}

/*
 * open() for the container device: allocate an empty container (no
 * groups, no backend) holding one reference, and attach it to the file.
 * Returns 0, or -ENOMEM if the allocation fails.
 */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_container *fresh;

        fresh = kzalloc_obj(*fresh, GFP_KERNEL_ACCOUNT);
        if (!fresh)
                return -ENOMEM;

        kref_init(&fresh->kref);
        init_rwsem(&fresh->group_lock);
        INIT_LIST_HEAD(&fresh->group_list);

        filep->private_data = fresh;

        return 0;
}

/*
 * release() for the container device: detach the container from the file
 * and drop the reference taken in vfio_fops_open().  The container itself
 * is only freed once every other reference (one per attached group) is
 * gone as well.
 */
static int vfio_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_container *container = filep->private_data;

        /* Clear the back-pointer before the put, which may free the container */
        filep->private_data = NULL;

        vfio_container_put(container);

        return 0;
}

/*
 * File operations of the container device.  The address of this table
 * also serves as the identity check in vfio_container_from_file().
 */
static const struct file_operations vfio_fops = {
        .owner          = THIS_MODULE,
        .open           = vfio_fops_open,
        .release        = vfio_fops_release,
        .unlocked_ioctl = vfio_fops_unl_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/*
 * Map a struct file to its container.
 *
 * Returns NULL if @file is not a container file, i.e. its f_op is not
 * vfio_fops.  Otherwise returns file->private_data, warning if that is
 * unexpectedly NULL.  No reference is taken on the returned container.
 */
struct vfio_container *vfio_container_from_file(struct file *file)
{
        struct vfio_container *container = NULL;

        /* Only trust private_data once we know the file is ours */
        if (file->f_op == &vfio_fops) {
                container = file->private_data;
                /* fget ensures we don't race vfio_release */
                WARN_ON(!container);
        }

        return container;
}

/*
 * Misc device backing the container node ("vfio/vfio").  The mode grants
 * read and write to everyone: opening a container is unprivileged, and
 * privilege only comes from the groups later attached to it (see the
 * comment in vfio_ioctl_set_iommu()).
 */
static struct miscdevice vfio_dev = {
        .minor = VFIO_MINOR,
        .name = "vfio",
        .fops = &vfio_fops,
        .nodename = "vfio/vfio",
        .mode = S_IRUGO | S_IWUGO,
};

int vfio_container_attach_group(struct vfio_container *container,
                                struct vfio_group *group)
{
        struct vfio_iommu_driver *driver;
        int ret = 0;

        lockdep_assert_held(&group->group_lock);

        if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
                return -EPERM;

        down_write(&container->group_lock);

        /* Real groups and fake groups cannot mix */
        if (!list_empty(&container->group_list) &&
            container->noiommu != (group->type == VFIO_NO_IOMMU)) {
                ret = -EPERM;
                goto out_unlock_container;
        }

        if (group->type == VFIO_IOMMU) {
                ret = iommu_group_claim_dma_owner(group->iommu_group, group);
                if (ret)
                        goto out_unlock_container;
        }

        driver = container->iommu_driver;
        if (driver) {
                ret = driver->ops->attach_group(container->iommu_data,
                                                group->iommu_group,
                                                group->type);
                if (ret) {
                        if (group->type == VFIO_IOMMU)
                                iommu_group_release_dma_owner(
                                        group->iommu_group);
                        goto out_unlock_container;
                }
        }

        group->container = container;
        group->container_users = 1;
        container->noiommu = (group->type == VFIO_NO_IOMMU);
        list_add(&group->container_next, &container->group_list);

        /* Get a reference on the container and mark a user within the group */
        vfio_container_get(container);

out_unlock_container:
        up_write(&container->group_lock);
        return ret;
}

/*
 * Detach @group from the container it is attached to.
 *
 * Caller must hold group->group_lock, and the group must have no users
 * beyond the attachment itself (container_users == 1, otherwise a WARN is
 * raised).  Undoes vfio_container_attach_group(): the group is detached
 * from the bound backend (if any), DMA ownership is released for
 * VFIO_IOMMU groups, and the group's reference on the container is
 * dropped.  If this was the last group, the backend is released as well
 * and the container returns to the unset state.
 */
void vfio_group_detach_container(struct vfio_group *group)
{
        struct vfio_container *container = group->container;
        struct vfio_iommu_driver *driver;

        lockdep_assert_held(&group->group_lock);
        WARN_ON(group->container_users != 1);

        down_write(&container->group_lock);

        driver = container->iommu_driver;
        if (driver)
                driver->ops->detach_group(container->iommu_data,
                                          group->iommu_group);

        if (group->type == VFIO_IOMMU)
                iommu_group_release_dma_owner(group->iommu_group);

        group->container = NULL;
        group->container_users = 0;
        list_del(&group->container_next);

        /* Detaching the last group deprivileges a container, remove iommu */
        if (driver && list_empty(&container->group_list)) {
                driver->ops->release(container->iommu_data);
                /* Drops the module reference taken in vfio_ioctl_set_iommu() */
                module_put(driver->ops->owner);
                container->iommu_driver = NULL;
                container->iommu_data = NULL;
        }

        up_write(&container->group_lock);

        /* Pairs with vfio_container_get() in vfio_container_attach_group() */
        vfio_container_put(container);
}

/*
 * Register one more user of @group's container.
 *
 * Caller must hold group->group_lock; group->container is dereferenced
 * unconditionally, so the group must be attached.  On success a reference
 * is taken on group->opened_file and container_users is incremented; undo
 * with vfio_group_unuse_container().
 *
 * Returns 0 on success, -EINVAL if no IOMMU backend has been set on the
 * container yet, or -EPERM for a VFIO_NO_IOMMU group when the caller
 * lacks CAP_SYS_RAWIO.
 */
int vfio_group_use_container(struct vfio_group *group)
{
        lockdep_assert_held(&group->group_lock);

        /*
         * The container fd has been assigned with VFIO_GROUP_SET_CONTAINER but
         * VFIO_SET_IOMMU hasn't been done yet.
         */
        if (!group->container->iommu_driver)
                return -EINVAL;

        if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO))
                return -EPERM;

        get_file(group->opened_file);
        group->container_users++;
        return 0;
}

/*
 * Drop a user registered with vfio_group_use_container(): decrement
 * container_users and release the reference on group->opened_file.
 *
 * Caller must hold group->group_lock.  Warns if container_users is not
 * above 1 on entry, i.e. if no such user is outstanding (the count of 1
 * belongs to the attachment itself).
 */
void vfio_group_unuse_container(struct vfio_group *group)
{
        lockdep_assert_held(&group->group_lock);

        WARN_ON(group->container_users <= 1);
        group->container_users--;
        fput(group->opened_file);
}

/*
 * Forward a pin request for @device to the IOMMU backend of its
 * container.
 *
 * @iova, @npage, @prot and @pages are passed to the backend's ->pin_pages()
 * unchanged, together with the device's iommu_group.
 *
 * Returns the backend's result, -E2BIG if @npage exceeds
 * VFIO_PIN_PAGES_MAX_ENTRIES, or -ENOTTY if no backend is bound or it has
 * no pin_pages callback.
 */
int vfio_device_container_pin_pages(struct vfio_device *device,
                                    dma_addr_t iova, int npage,
                                    int prot, struct page **pages)
{
        struct vfio_group *group = device->group;
        struct vfio_container *container = group->container;
        struct vfio_iommu_driver *driver = container->iommu_driver;

        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
                return -E2BIG;

        if (likely(driver && driver->ops->pin_pages))
                return driver->ops->pin_pages(container->iommu_data,
                                              group->iommu_group, iova,
                                              npage, prot, pages);

        return -ENOTTY;
}

/*
 * Forward an unpin request for @device to the IOMMU backend of its
 * container.
 *
 * Warns and does nothing if @npage is not within
 * 1..VFIO_PIN_PAGES_MAX_ENTRIES.  Unlike the pin path, neither the bound
 * backend nor its unpin_pages callback is NULL-checked here.
 * NOTE(review): presumably callers only unpin what they pinned through
 * the same backend; confirm against the callers.
 */
void vfio_device_container_unpin_pages(struct vfio_device *device,
                                       dma_addr_t iova, int npage)
{
        struct vfio_container *container = device->group->container;
        struct vfio_iommu_driver *driver;

        if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
                return;

        driver = container->iommu_driver;
        driver->ops->unpin_pages(container->iommu_data, iova, npage);
}

/*
 * Forward a DMA read or write for @device to the IOMMU backend of its
 * container.
 *
 * @iova, @data, @len and @write are passed to the backend's ->dma_rw()
 * unchanged.
 *
 * Returns the backend's result, or -ENOTTY if no backend is bound or it
 * has no dma_rw callback.
 */
int vfio_device_container_dma_rw(struct vfio_device *device,
                                 dma_addr_t iova, void *data,
                                 size_t len, bool write)
{
        struct vfio_container *container = device->group->container;
        struct vfio_iommu_driver *driver = container->iommu_driver;

        if (likely(driver && driver->ops->dma_rw))
                return driver->ops->dma_rw(container->iommu_data, iova,
                                           data, len, write);

        return -ENOTTY;
}

/*
 * Set up the container layer: initialize the backend registry, register
 * the container misc device and, when CONFIG_VFIO_NOIOMMU is enabled,
 * register the built-in no-IOMMU backend.
 *
 * Returns 0 on success or a negative errno.  If registering the no-IOMMU
 * backend fails, the misc device is deregistered again.
 */
int __init vfio_container_init(void)
{
        int ret;

        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
        mutex_init(&vfio.iommu_drivers_lock);

        ret = misc_register(&vfio_dev);
        if (ret) {
                pr_err("vfio: misc device register failed\n");
                return ret;
        }

        if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU))
                return 0;

        ret = vfio_register_iommu_driver(&vfio_noiommu_ops);
        if (ret)
                misc_deregister(&vfio_dev);

        return ret;
}

/*
 * Tear down what vfio_container_init() set up, in reverse order:
 * unregister the no-IOMMU backend (if it was compiled in), deregister the
 * misc device, then destroy the registry mutex.
 */
void vfio_container_cleanup(void)
{
        if (IS_ENABLED(CONFIG_VFIO_NOIOMMU))
                vfio_unregister_iommu_driver(&vfio_noiommu_ops);
        misc_deregister(&vfio_dev);
        mutex_destroy(&vfio.iommu_drivers_lock);
}

/*
 * Module aliases for the container device's misc minor and for its
 * "vfio/vfio" node name, matching vfio_dev above.
 * NOTE(review): presumably these allow the module to be loaded on demand
 * when the device node is first accessed; confirm.
 */
MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");