root/drivers/infiniband/core/ib_core_uverbs.c
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright 2019 Marvell. All rights reserved.
 */
#include <linux/xarray.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include "uverbs.h"
#include "core_priv.h"

MODULE_IMPORT_NS("DMA_BUF");

/**
 * rdma_umap_priv_init() - Initialize the private data of a vma
 *
 * @priv: The already allocated private data
 * @vma: The vm area struct that needs private data
 * @entry: entry into the mmap_xa that needs to be linked with
 *       this vma
 *
 * Each time we map IO memory into user space this keeps track of the
 * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space
 * to point to the zero page and allow the hot unplug to proceed.
 *
 * This is necessary for cases like PCI physical hot unplug as the actual BAR
 * memory may vanish after this and access to it from userspace could MCE.
 *
 * RDMA drivers supporting disassociation must have their user space designed
 * to cope in some way with their IO pages going to the zero page.
 *
 */
void rdma_umap_priv_init(struct rdma_umap_priv *priv,
                         struct vm_area_struct *vma,
                         struct rdma_user_mmap_entry *entry)
{
        struct ib_uverbs_file *ufile = vma->vm_file->private_data;

        priv->vma = vma;
        if (entry) {
                kref_get(&entry->ref);
                priv->entry = entry;
        }
        vma->vm_private_data = priv;
        /* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */

        mutex_lock(&ufile->umap_lock);
        list_add(&priv->list, &ufile->umaps);
        mutex_unlock(&ufile->umap_lock);
}
EXPORT_SYMBOL(rdma_umap_priv_init);

/**
 * rdma_user_mmap_io() - Map IO memory into a process
 *
 * @ucontext: associated user context
 * @vma: the vma related to the current mmap call
 * @pfn: pfn to map
 * @size: size to map
 * @prot: pgprot to use in remap call
 * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL
 *         if mmap_entry is not used by the driver
 *
 * This is to be called by drivers as part of their mmap() functions if they
 * wish to send something like PCI-E BAR memory to userspace.
 *
 * Return -EINVAL on wrong flags or size, -EAGAIN on failure to map. 0 on
 * success.
 */
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
                      unsigned long pfn, unsigned long size, pgprot_t prot,
                      struct rdma_user_mmap_entry *entry)
{
        struct ib_uverbs_file *ufile = ucontext->ufile;
        struct rdma_umap_priv *priv;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        if (vma->vm_end - vma->vm_start != size)
                return -EINVAL;

        /* Driver is using this wrong, must be called by ib_uverbs_mmap */
        if (WARN_ON(!vma->vm_file ||
                    vma->vm_file->private_data != ufile))
                return -EINVAL;
        lockdep_assert_held(&ufile->device->disassociate_srcu);

        priv = kzalloc_obj(*priv);
        if (!priv)
                return -ENOMEM;

        vma->vm_page_prot = prot;
        if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
                kfree(priv);
                return -EAGAIN;
        }

        rdma_umap_priv_init(priv, vma, entry);
        return 0;
}
EXPORT_SYMBOL(rdma_user_mmap_io);

/**
 * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa
 *
 * @ucontext: associated user context
 * @pgoff: The mmap offset >> PAGE_SHIFT
 *
 * This function is called when a user tries to mmap with an offset (returned
 * by rdma_user_mmap_get_offset()) it initially received from the driver. The
 * rdma_user_mmap_entry was created by the function
 * rdma_user_mmap_entry_insert().  This function increases the refcnt of the
 * entry so that it won't be deleted from the xarray in the meantime.
 *
 * Return an reference to an entry if exists or NULL if there is no
 * match. rdma_user_mmap_entry_put() must be called to put the reference.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
                               unsigned long pgoff)
{
        struct rdma_user_mmap_entry *entry;

        if (pgoff > U32_MAX)
                return NULL;

        xa_lock(&ucontext->mmap_xa);

        entry = xa_load(&ucontext->mmap_xa, pgoff);

        /*
         * If refcount is zero, entry is already being deleted, driver_removed
         * indicates that the no further mmaps are possible and we waiting for
         * the active VMAs to be closed.
         */
        if (!entry || entry->start_pgoff != pgoff || entry->driver_removed ||
            !kref_get_unless_zero(&entry->ref))
                goto err;

        xa_unlock(&ucontext->mmap_xa);

        ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
                  pgoff, entry->npages);

        return entry;

err:
        xa_unlock(&ucontext->mmap_xa);
        return NULL;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);

/**
 * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa
 *
 * @ucontext: associated user context
 * @vma: the vma being mmap'd into
 *
 * This function is like rdma_user_mmap_entry_get_pgoff() except that it also
 * checks that the VMA is correct.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
                         struct vm_area_struct *vma)
{
        struct rdma_user_mmap_entry *entry;

        if (!(vma->vm_flags & VM_SHARED))
                return NULL;
        entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
        if (!entry)
                return NULL;
        if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
                rdma_user_mmap_entry_put(entry);
                return NULL;
        }
        return entry;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_get);

static void rdma_user_mmap_entry_free(struct kref *kref)
{
        struct rdma_user_mmap_entry *entry =
                container_of(kref, struct rdma_user_mmap_entry, ref);
        struct ib_ucontext *ucontext = entry->ucontext;
        unsigned long i;

        /*
         * Erase all entries occupied by this single entry, this is deferred
         * until all VMA are closed so that the mmap offsets remain unique.
         */
        xa_lock(&ucontext->mmap_xa);
        for (i = 0; i < entry->npages; i++)
                __xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i);
        xa_unlock(&ucontext->mmap_xa);

        ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
                  entry->start_pgoff, entry->npages);

        if (ucontext->device->ops.mmap_free)
                ucontext->device->ops.mmap_free(entry);
}

/**
 * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
 *
 * @entry: an entry in the mmap_xa
 *
 * This function is called when the mapping is closed if it was
 * an io mapping or when the driver is done with the entry for
 * some other reason.
 * Should be called after rdma_user_mmap_entry_get was called
 * and entry is no longer needed. This function will erase the
 * entry and free it if its refcnt reaches zero.
 */
void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
{
        kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_put);

/**
 * rdma_user_mmap_entry_remove() - Drop reference to entry and
 *                                 mark it as unmmapable
 *
 * @entry: the entry to insert into the mmap_xa
 *
 * Drivers can call this to prevent userspace from creating more mappings for
 * entry, however existing mmaps continue to exist and ops->mmap_free() will
 * not be called until all user mmaps are destroyed.
 */
void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
{
        struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp;

        if (!entry)
                return;

        mutex_lock(&entry->dmabufs_lock);
        xa_lock(&entry->ucontext->mmap_xa);
        entry->driver_removed = true;
        xa_unlock(&entry->ucontext->mmap_xa);
        list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) {
                dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL);
                list_del(&uverbs_dmabuf->dmabufs_elm);
                uverbs_dmabuf->revoked = true;
                dma_buf_move_notify(uverbs_dmabuf->dmabuf);
                dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv,
                                      DMA_RESV_USAGE_BOOKKEEP, false,
                                      MAX_SCHEDULE_TIMEOUT);
                dma_resv_unlock(uverbs_dmabuf->dmabuf->resv);
                kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done);
                wait_for_completion(&uverbs_dmabuf->comp);
        }
        mutex_unlock(&entry->dmabufs_lock);

        kref_put(&entry->ref, rdma_user_mmap_entry_free);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_remove);

/**
 * rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa
 *                                       in a given range.
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 * @min_pgoff: minimum pgoff to be returned
 * @max_pgoff: maximum pgoff to be returned
 *
 * This function should be called by drivers that use the rdma_user_mmap
 * interface for implementing their mmap syscall A database of mmap offsets is
 * handled in the core and helper functions are provided to insert entries
 * into the database and extract entries when the user calls mmap with the
 * given offset. The function allocates a unique page offset in a given range
 * that should be provided to user, the user will use the offset to retrieve
 * information such as address to be mapped and how.
 *
 * Return: 0 on success and -ENOMEM on failure
 */
int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
                                      struct rdma_user_mmap_entry *entry,
                                      size_t length, u32 min_pgoff,
                                      u32 max_pgoff)
{
        struct ib_uverbs_file *ufile = ucontext->ufile;
        XA_STATE(xas, &ucontext->mmap_xa, min_pgoff);
        u32 xa_first, xa_last, npages;
        int err;
        u32 i;

        if (!entry)
                return -EINVAL;

        kref_init(&entry->ref);
        INIT_LIST_HEAD(&entry->dmabufs);
        mutex_init(&entry->dmabufs_lock);

        entry->ucontext = ucontext;

        /*
         * We want the whole allocation to be done without interruption from a
         * different thread. The allocation requires finding a free range and
         * storing. During the xa_insert the lock could be released, possibly
         * allowing another thread to choose the same range.
         */
        mutex_lock(&ufile->umap_lock);

        xa_lock(&ucontext->mmap_xa);

        /* We want to find an empty range */
        npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
        entry->npages = npages;
        while (true) {
                /* First find an empty index */
                xas_find_marked(&xas, max_pgoff, XA_FREE_MARK);
                if (xas.xa_node == XAS_RESTART)
                        goto err_unlock;

                xa_first = xas.xa_index;

                /* Is there enough room to have the range? */
                if (check_add_overflow(xa_first, npages, &xa_last))
                        goto err_unlock;

                /*
                 * Now look for the next present entry. If an entry doesn't
                 * exist, we found an empty range and can proceed.
                 */
                xas_next_entry(&xas, xa_last - 1);
                if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
                        break;
        }

        for (i = xa_first; i < xa_last; i++) {
                err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
                if (err)
                        goto err_undo;
        }

        /*
         * Internally the kernel uses a page offset, in libc this is a byte
         * offset. Drivers should not return pgoff to userspace.
         */
        entry->start_pgoff = xa_first;
        xa_unlock(&ucontext->mmap_xa);
        mutex_unlock(&ufile->umap_lock);

        ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
                  entry->start_pgoff, npages);

        return 0;

err_undo:
        for (; i > xa_first; i--)
                __xa_erase(&ucontext->mmap_xa, i - 1);

err_unlock:
        xa_unlock(&ucontext->mmap_xa);
        mutex_unlock(&ufile->umap_lock);
        return -ENOMEM;
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range);

/**
 * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa.
 *
 * @ucontext: associated user context.
 * @entry: the entry to insert into the mmap_xa
 * @length: length of the address that will be mmapped
 *
 * This function should be called by drivers that use the rdma_user_mmap
 * interface for handling user mmapped addresses. The database is handled in
 * the core and helper functions are provided to insert entries into the
 * database and extract entries when the user calls mmap with the given offset.
 * The function allocates a unique page offset that should be provided to user,
 * the user will use the offset to retrieve information such as address to
 * be mapped and how.
 *
 * Return: 0 on success and -ENOMEM on failure
 */
int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
                                struct rdma_user_mmap_entry *entry,
                                size_t length)
{
        return rdma_user_mmap_entry_insert_range(ucontext, entry, length, 0,
                                                 U32_MAX);
}
EXPORT_SYMBOL(rdma_user_mmap_entry_insert);