root/arch/powerpc/platforms/pseries/vas.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2020-21 IBM Corp.
 */

#define pr_fmt(fmt) "vas: " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/irqdomain.h>
#include <asm/machdep.h>
#include <asm/hvcall.h>
#include <asm/plpar_wrappers.h>
#include <asm/firmware.h>
#include <asm/vphn.h>
#include <asm/vas.h>
#include "vas.h"

#define VAS_INVALID_WIN_ADDRESS 0xFFFFFFFFFFFFFFFFul
#define VAS_DEFAULT_DOMAIN_ID   0xFFFFFFFFFFFFFFFFul
/* The hypervisor allows one credit per window right now */
#define DEF_WIN_CREDS           1

static struct vas_all_caps caps_all;
static bool copypaste_feat;
static struct hv_vas_cop_feat_caps hv_cop_caps;

static struct vas_caps vascaps[VAS_MAX_FEAT_TYPE];
static DEFINE_MUTEX(vas_pseries_mutex);
static bool migration_in_progress;

static long hcall_return_busy_check(long rc)
{
        /* Check if we are stalled for some time */
        if (H_IS_LONG_BUSY(rc)) {
                unsigned int ms;
                /*
                 * Allocate, Modify and Deallocate HCALLs returns
                 * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC
                 * for the long delay. So the sleep time should always
                 * be either 1 or 10msecs, but in case if the HCALL
                 * returns the long delay > 10 msecs, clamp the sleep
                 * time to 10msecs.
                 */
                ms = clamp(get_longbusy_msecs(rc), 1, 10);

                /*
                 * msleep() will often sleep at least 20 msecs even
                 * though the hypervisor suggests that the OS reissue
                 * HCALLs after 1 or 10msecs. Also the delay hint from
                 * the HCALL is just a suggestion. So OK to pause for
                 * less time than the hinted delay. Use usleep_range()
                 * to ensure we don't sleep much longer than actually
                 * needed.
                 */
                usleep_range(ms * (USEC_PER_MSEC / 10), ms * USEC_PER_MSEC);
                rc = H_BUSY;
        } else if (rc == H_BUSY) {
                cond_resched();
        }

        return rc;
}

/*
 * Allocate VAS window hcall
 */
static int h_allocate_vas_window(struct pseries_vas_window *win, u64 *domain,
                                     u8 wintype, u16 credits)
{
        long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
        long rc;

        do {
                rc = plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, wintype,
                                  credits, domain[0], domain[1], domain[2],
                                  domain[3], domain[4], domain[5]);

                rc = hcall_return_busy_check(rc);
        } while (rc == H_BUSY);

        if (rc == H_SUCCESS) {
                if (win->win_addr == VAS_INVALID_WIN_ADDRESS) {
                        pr_err("H_ALLOCATE_VAS_WINDOW: COPY/PASTE is not supported\n");
                        return -ENOTSUPP;
                }
                win->vas_win.winid = retbuf[0];
                win->win_addr = retbuf[1];
                win->complete_irq = retbuf[2];
                win->fault_irq = retbuf[3];
                return 0;
        }

        pr_err("H_ALLOCATE_VAS_WINDOW error: %ld, wintype: %u, credits: %u\n",
                rc, wintype, credits);

        return -EIO;
}

/*
 * Deallocate VAS window hcall.
 */
static int h_deallocate_vas_window(u64 winid)
{
        long rc;

        do {
                rc = plpar_hcall_norets(H_DEALLOCATE_VAS_WINDOW, winid);

                rc = hcall_return_busy_check(rc);
        } while (rc == H_BUSY);

        if (rc == H_SUCCESS)
                return 0;

        pr_err("H_DEALLOCATE_VAS_WINDOW error: %ld, winid: %llu\n",
                rc, winid);
        return -EIO;
}

/*
 * Modify VAS window.
 * After the window is opened with allocate window hcall, configure it
 * with flags and LPAR PID before using.
 */
static int h_modify_vas_window(struct pseries_vas_window *win)
{
        long rc;

        /*
         * AMR value is not supported in Linux VAS implementation.
         * The hypervisor ignores it if 0 is passed.
         */
        do {
                rc = plpar_hcall_norets(H_MODIFY_VAS_WINDOW,
                                        win->vas_win.winid, win->pid, 0,
                                        VAS_MOD_WIN_FLAGS, 0);

                rc = hcall_return_busy_check(rc);
        } while (rc == H_BUSY);

        if (rc == H_SUCCESS)
                return 0;

        pr_err("H_MODIFY_VAS_WINDOW error: %ld, winid %u pid %u\n",
                        rc, win->vas_win.winid, win->pid);
        return -EIO;
}

/*
 * This hcall is used to determine the capabilities from the hypervisor.
 * @hcall: H_QUERY_VAS_CAPABILITIES or H_QUERY_NX_CAPABILITIES
 * @query_type: If 0 is passed, the hypervisor returns the overall
 *              capabilities which provides all feature(s) that are
 *              available. Then query the hypervisor to get the
 *              corresponding capabilities for the specific feature.
 *              Example: H_QUERY_VAS_CAPABILITIES provides VAS GZIP QoS
 *                      and VAS GZIP Default capabilities.
 *                      H_QUERY_NX_CAPABILITIES provides NX GZIP
 *                      capabilities.
 * @result: Return buffer to save capabilities.
 */
int h_query_vas_capabilities(const u64 hcall, u8 query_type, u64 result)
{
        long rc;

        rc = plpar_hcall_norets(hcall, query_type, result);

        if (rc == H_SUCCESS)
                return 0;

        /* H_FUNCTION means HV does not support VAS so don't print an error */
        if (rc != H_FUNCTION) {
                pr_err("%s error %ld, query_type %u, result buffer 0x%llx\n",
                        (hcall == H_QUERY_VAS_CAPABILITIES) ?
                                "H_QUERY_VAS_CAPABILITIES" :
                                "H_QUERY_NX_CAPABILITIES",
                        rc, query_type, result);
        }

        return -EIO;
}
EXPORT_SYMBOL_GPL(h_query_vas_capabilities);

/*
 * hcall to get fault CRB from the hypervisor.
 */
static int h_get_nx_fault(u32 winid, u64 buffer)
{
        long rc;

        rc = plpar_hcall_norets(H_GET_NX_FAULT, winid, buffer);

        if (rc == H_SUCCESS)
                return 0;

        pr_err("H_GET_NX_FAULT error: %ld, winid %u, buffer 0x%llx\n",
                rc, winid, buffer);
        return -EIO;

}

/*
 * Handle the fault interrupt.
 * When the fault interrupt is received for each window, query the
 * hypervisor to get the fault CRB on the specific fault. Then
 * process the CRB by updating CSB or send signal if the user space
 * CSB is invalid.
 * Note: The hypervisor forwards an interrupt for each fault request.
 *      So one fault CRB to process for each H_GET_NX_FAULT hcall.
 */
static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data)
{
        struct pseries_vas_window *txwin = data;
        struct coprocessor_request_block crb;
        struct vas_user_win_ref *tsk_ref;
        int rc;

        while (atomic_read(&txwin->pending_faults)) {
                rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb));
                if (!rc) {
                        tsk_ref = &txwin->vas_win.task_ref;
                        vas_dump_crb(&crb);
                        vas_update_csb(&crb, tsk_ref);
                }
                atomic_dec(&txwin->pending_faults);
        }

        return IRQ_HANDLED;
}

/*
 * irq_default_primary_handler() can be used only with IRQF_ONESHOT
 * which disables IRQ before executing the thread handler and enables
 * it after. But this disabling interrupt sets the VAS IRQ OFF
 * state in the hypervisor. If the NX generates fault interrupt
 * during this window, the hypervisor will not deliver this
 * interrupt to the LPAR. So use VAS specific IRQ handler instead
 * of calling the default primary handler.
 */
static irqreturn_t pseries_vas_irq_handler(int irq, void *data)
{
        struct pseries_vas_window *txwin = data;

        /*
         * The thread handler will process this interrupt if it is
         * already running.
         */
        atomic_inc(&txwin->pending_faults);

        return IRQ_WAKE_THREAD;
}

/*
 * Allocate window and setup IRQ mapping.
 */
static int allocate_setup_window(struct pseries_vas_window *txwin,
                                 u64 *domain, u8 wintype)
{
        int rc;

        rc = h_allocate_vas_window(txwin, domain, wintype, DEF_WIN_CREDS);
        if (rc)
                return rc;
        /*
         * On PowerVM, the hypervisor setup and forwards the fault
         * interrupt per window. So the IRQ setup and fault handling
         * will be done for each open window separately.
         */
        txwin->fault_virq = irq_create_mapping(NULL, txwin->fault_irq);
        if (!txwin->fault_virq) {
                pr_err("Failed irq mapping %d\n", txwin->fault_irq);
                rc = -EINVAL;
                goto out_win;
        }

        txwin->name = kasprintf(GFP_KERNEL, "vas-win-%d",
                                txwin->vas_win.winid);
        if (!txwin->name) {
                rc = -ENOMEM;
                goto out_irq;
        }

        rc = request_threaded_irq(txwin->fault_virq,
                                  pseries_vas_irq_handler,
                                  pseries_vas_fault_thread_fn, 0,
                                  txwin->name, txwin);
        if (rc) {
                pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n",
                       txwin->vas_win.winid, txwin->fault_virq, rc);
                goto out_free;
        }

        txwin->vas_win.wcreds_max = DEF_WIN_CREDS;

        return 0;
out_free:
        kfree(txwin->name);
out_irq:
        irq_dispose_mapping(txwin->fault_virq);
out_win:
        h_deallocate_vas_window(txwin->vas_win.winid);
        return rc;
}

static inline void free_irq_setup(struct pseries_vas_window *txwin)
{
        free_irq(txwin->fault_virq, txwin);
        kfree(txwin->name);
        irq_dispose_mapping(txwin->fault_virq);
}

static struct vas_window *vas_allocate_window(int vas_id, u64 flags,
                                              enum vas_cop_type cop_type)
{
        long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
        struct vas_cop_feat_caps *cop_feat_caps;
        struct vas_caps *caps;
        struct pseries_vas_window *txwin;
        int rc;

        txwin = kzalloc_obj(*txwin);
        if (!txwin)
                return ERR_PTR(-ENOMEM);

        /*
         * A VAS window can have many credits which means that many
         * requests can be issued simultaneously. But the hypervisor
         * restricts one credit per window.
         * The hypervisor introduces 2 different types of credits:
         * Default credit type (Uses normal priority FIFO):
         *      A limited number of credits are assigned to partitions
         *      based on processor entitlement. But these credits may be
         *      over-committed on a system depends on whether the CPUs
         *      are in shared or dedicated modes - that is, more requests
         *      may be issued across the system than NX can service at
         *      once which can result in paste command failure (RMA_busy).
         *      Then the process has to resend requests or fall-back to
         *      SW compression.
         * Quality of Service (QoS) credit type (Uses high priority FIFO):
         *      To avoid NX HW contention, the system admins can assign
         *      QoS credits for each LPAR so that this partition is
         *      guaranteed access to NX resources. These credits are
         *      assigned to partitions via the HMC.
         *      Refer PAPR for more information.
         *
         * Allocate window with QoS credits if user requested. Otherwise
         * default credits are used.
         */
        if (flags & VAS_TX_WIN_FLAG_QOS_CREDIT)
                caps = &vascaps[VAS_GZIP_QOS_FEAT_TYPE];
        else
                caps = &vascaps[VAS_GZIP_DEF_FEAT_TYPE];

        cop_feat_caps = &caps->caps;

        if (atomic_inc_return(&cop_feat_caps->nr_used_credits) >
                        atomic_read(&cop_feat_caps->nr_total_credits)) {
                pr_err_ratelimited("Credits are not available to allocate window\n");
                rc = -EINVAL;
                goto out;
        }

        if (vas_id == -1) {
                /*
                 * The user space is requesting to allocate a window on
                 * a VAS instance where the process is executing.
                 * On PowerVM, domain values are passed to the hypervisor
                 * to select VAS instance. Useful if the process is
                 * affinity to NUMA node.
                 * The hypervisor selects VAS instance if
                 * VAS_DEFAULT_DOMAIN_ID (-1) is passed for domain values.
                 * The h_allocate_vas_window hcall is defined to take a
                 * domain values as specified by h_home_node_associativity,
                 * So no unpacking needs to be done.
                 */
                rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, domain,
                                  VPHN_FLAG_VCPU, hard_smp_processor_id());
                if (rc != H_SUCCESS) {
                        pr_err("H_HOME_NODE_ASSOCIATIVITY error: %d\n", rc);
                        goto out;
                }
        }

        txwin->pid = mfspr(SPRN_PID);

        /*
         * Allocate / Deallocate window hcalls and setup / free IRQs
         * have to be protected with mutex.
         * Open VAS window: Allocate window hcall and setup IRQ
         * Close VAS window: Deallocate window hcall and free IRQ
         *      The hypervisor waits until all NX requests are
         *      completed before closing the window. So expects OS
         *      to handle NX faults, means IRQ can be freed only
         *      after the deallocate window hcall is returned.
         * So once the window is closed with deallocate hcall before
         * the IRQ is freed, it can be assigned to new allocate
         * hcall with the same fault IRQ by the hypervisor. It can
         * result in setup IRQ fail for the new window since the
         * same fault IRQ is not freed by the OS before.
         */
        mutex_lock(&vas_pseries_mutex);
        if (migration_in_progress) {
                rc = -EBUSY;
        } else {
                rc = allocate_setup_window(txwin, (u64 *)&domain[0],
                                   cop_feat_caps->win_type);
                if (!rc)
                        caps->nr_open_wins_progress++;
        }

        mutex_unlock(&vas_pseries_mutex);
        if (rc)
                goto out;

        /*
         * Modify window and it is ready to use.
         */
        rc = h_modify_vas_window(txwin);
        if (!rc)
                rc = get_vas_user_win_ref(&txwin->vas_win.task_ref);
        if (rc)
                goto out_free;

        txwin->win_type = cop_feat_caps->win_type;

        /*
         * The migration SUSPEND thread sets migration_in_progress and
         * closes all open windows from the list. But the window is
         * added to the list after open and modify HCALLs. So possible
         * that migration_in_progress is set before modify HCALL which
         * may cause some windows are still open when the hypervisor
         * initiates the migration.
         * So checks the migration_in_progress flag again and close all
         * open windows.
         *
         * Possible to lose the acquired credit with DLPAR core
         * removal after the window is opened. So if there are any
         * closed windows (means with lost credits), do not give new
         * window to user space. New windows will be opened only
         * after the existing windows are reopened when credits are
         * available.
         */
        mutex_lock(&vas_pseries_mutex);
        if (!caps->nr_close_wins && !migration_in_progress) {
                list_add(&txwin->win_list, &caps->list);
                caps->nr_open_windows++;
                caps->nr_open_wins_progress--;
                mutex_unlock(&vas_pseries_mutex);
                vas_user_win_add_mm_context(&txwin->vas_win.task_ref);
                return &txwin->vas_win;
        }
        mutex_unlock(&vas_pseries_mutex);

        put_vas_user_win_ref(&txwin->vas_win.task_ref);
        rc = -EBUSY;
        pr_err_ratelimited("No credit is available to allocate window\n");

out_free:
        /*
         * Window is not operational. Free IRQ before closing
         * window so that do not have to hold mutex.
         */
        free_irq_setup(txwin);
        h_deallocate_vas_window(txwin->vas_win.winid);
        /*
         * Hold mutex and reduce nr_open_wins_progress counter.
         */
        mutex_lock(&vas_pseries_mutex);
        caps->nr_open_wins_progress--;
        mutex_unlock(&vas_pseries_mutex);
out:
        atomic_dec(&cop_feat_caps->nr_used_credits);
        kfree(txwin);
        return ERR_PTR(rc);
}

static u64 vas_paste_address(struct vas_window *vwin)
{
        struct pseries_vas_window *win;

        win = container_of(vwin, struct pseries_vas_window, vas_win);
        return win->win_addr;
}

static int deallocate_free_window(struct pseries_vas_window *win)
{
        int rc = 0;

        /*
         * The hypervisor waits for all requests including faults
         * are processed before closing the window - Means all
         * credits have to be returned. In the case of fault
         * request, a credit is returned after OS issues
         * H_GET_NX_FAULT hcall.
         * So free IRQ after executing H_DEALLOCATE_VAS_WINDOW
         * hcall.
         */
        rc = h_deallocate_vas_window(win->vas_win.winid);
        if (!rc)
                free_irq_setup(win);

        return rc;
}

static int vas_deallocate_window(struct vas_window *vwin)
{
        struct pseries_vas_window *win;
        struct vas_cop_feat_caps *caps;
        int rc = 0;

        if (!vwin)
                return -EINVAL;

        win = container_of(vwin, struct pseries_vas_window, vas_win);

        /* Should not happen */
        if (win->win_type >= VAS_MAX_FEAT_TYPE) {
                pr_err("Window (%u): Invalid window type %u\n",
                                vwin->winid, win->win_type);
                return -EINVAL;
        }

        caps = &vascaps[win->win_type].caps;
        mutex_lock(&vas_pseries_mutex);
        /*
         * VAS window is already closed in the hypervisor when
         * lost the credit or with migration. So just remove the entry
         * from the list, remove task references and free vas_window
         * struct.
         */
        if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) &&
                !(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) {
                rc = deallocate_free_window(win);
                if (rc) {
                        mutex_unlock(&vas_pseries_mutex);
                        return rc;
                }
        } else
                vascaps[win->win_type].nr_close_wins--;

        list_del(&win->win_list);
        atomic_dec(&caps->nr_used_credits);
        vascaps[win->win_type].nr_open_windows--;
        mutex_unlock(&vas_pseries_mutex);

        mm_context_remove_vas_window(vwin->task_ref.mm);
        put_vas_user_win_ref(&vwin->task_ref);

        kfree(win);
        return 0;
}

static const struct vas_user_win_ops vops_pseries = {
        .open_win       = vas_allocate_window,  /* Open and configure window */
        .paste_addr     = vas_paste_address,    /* To do copy/paste */
        .close_win      = vas_deallocate_window, /* Close window */
};

/*
 * Supporting only nx-gzip coprocessor type now, but this API code
 * extended to other coprocessor types later.
 */
int vas_register_api_pseries(struct module *mod, enum vas_cop_type cop_type,
                             const char *name)
{
        if (!copypaste_feat)
                return -ENOTSUPP;

        return vas_register_coproc_api(mod, cop_type, name, &vops_pseries);
}
EXPORT_SYMBOL_GPL(vas_register_api_pseries);

void vas_unregister_api_pseries(void)
{
        vas_unregister_coproc_api();
}
EXPORT_SYMBOL_GPL(vas_unregister_api_pseries);

/*
 * Get the specific capabilities based on the feature type.
 * Right now supports GZIP default and GZIP QoS capabilities.
 */
static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type,
                                struct hv_vas_cop_feat_caps *hv_caps)
{
        struct vas_cop_feat_caps *caps;
        struct vas_caps *vcaps;
        int rc = 0;

        vcaps = &vascaps[type];
        memset(vcaps, 0, sizeof(*vcaps));
        INIT_LIST_HEAD(&vcaps->list);

        vcaps->feat = feat;
        caps = &vcaps->caps;

        rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, feat,
                                          (u64)virt_to_phys(hv_caps));
        if (rc)
                return rc;

        caps->user_mode = hv_caps->user_mode;
        if (!(caps->user_mode & VAS_COPY_PASTE_USER_MODE)) {
                pr_err("User space COPY/PASTE is not supported\n");
                return -ENOTSUPP;
        }

        caps->descriptor = be64_to_cpu(hv_caps->descriptor);
        caps->win_type = hv_caps->win_type;
        if (caps->win_type >= VAS_MAX_FEAT_TYPE) {
                pr_err("Unsupported window type %u\n", caps->win_type);
                return -EINVAL;
        }
        caps->max_lpar_creds = be16_to_cpu(hv_caps->max_lpar_creds);
        caps->max_win_creds = be16_to_cpu(hv_caps->max_win_creds);
        atomic_set(&caps->nr_total_credits,
                   be16_to_cpu(hv_caps->target_lpar_creds));
        if (feat == VAS_GZIP_DEF_FEAT) {
                caps->def_lpar_creds = be16_to_cpu(hv_caps->def_lpar_creds);

                if (caps->max_win_creds < DEF_WIN_CREDS) {
                        pr_err("Window creds(%u) > max allowed window creds(%u)\n",
                               DEF_WIN_CREDS, caps->max_win_creds);
                        return -EINVAL;
                }
        }

        rc = sysfs_add_vas_caps(caps);
        if (rc)
                return rc;

        copypaste_feat = true;

        return 0;
}

/*
 * VAS windows can be closed due to lost credits when the core is
 * removed. So reopen them if credits are available due to DLPAR
 * core add and set the window active status. When NX sees the page
 * fault on the unmapped paste address, the kernel handles the fault
 * by setting the remapping to new paste address if the window is
 * active.
 */
static int reconfig_open_windows(struct vas_caps *vcaps, int creds,
                                 bool migrate)
{
        long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
        struct vas_cop_feat_caps *caps = &vcaps->caps;
        struct pseries_vas_window *win = NULL, *tmp;
        int rc, mv_ents = 0;
        int flag;

        /*
         * Nothing to do if there are no closed windows.
         */
        if (!vcaps->nr_close_wins)
                return 0;

        /*
         * For the core removal, the hypervisor reduces the credits
         * assigned to the LPAR and the kernel closes VAS windows
         * in the hypervisor depends on reduced credits. The kernel
         * uses LIFO (the last windows that are opened will be closed
         * first) and expects to open in the same order when credits
         * are available.
         * For example, 40 windows are closed when the LPAR lost 2 cores
         * (dedicated). If 1 core is added, this LPAR can have 20 more
         * credits. It means the kernel can reopen 20 windows. So move
         * 20 entries in the VAS windows lost and reopen next 20 windows.
         * For partition migration, reopen all windows that are closed
         * during resume.
         */
        if ((vcaps->nr_close_wins > creds) && !migrate)
                mv_ents = vcaps->nr_close_wins - creds;

        list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) {
                if (!mv_ents)
                        break;

                mv_ents--;
        }

        /*
         * Open windows if they are closed only with migration or
         * DLPAR (lost credit) before.
         */
        if (migrate)
                flag = VAS_WIN_MIGRATE_CLOSE;
        else
                flag = VAS_WIN_NO_CRED_CLOSE;

        list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) {
                /*
                 * This window is closed with DLPAR and migration events.
                 * So reopen the window with the last event.
                 * The user space is not suspended with the current
                 * migration notifier. So the user space can issue DLPAR
                 * CPU hotplug while migration in progress. In this case
                 * this window will be opened with the last event.
                 */
                if ((win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) &&
                        (win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) {
                        win->vas_win.status &= ~flag;
                        continue;
                }

                /*
                 * Nothing to do on this window if it is not closed
                 * with this flag
                 */
                if (!(win->vas_win.status & flag))
                        continue;

                rc = allocate_setup_window(win, (u64 *)&domain[0],
                                           caps->win_type);
                if (rc)
                        return rc;

                rc = h_modify_vas_window(win);
                if (rc)
                        goto out;

                mutex_lock(&win->vas_win.task_ref.mmap_mutex);
                /*
                 * Set window status to active
                 */
                win->vas_win.status &= ~flag;
                mutex_unlock(&win->vas_win.task_ref.mmap_mutex);
                win->win_type = caps->win_type;
                if (!--vcaps->nr_close_wins)
                        break;
        }

        return 0;
out:
        /*
         * Window modify HCALL failed. So close the window to the
         * hypervisor and return.
         */
        free_irq_setup(win);
        h_deallocate_vas_window(win->vas_win.winid);
        return rc;
}

/*
 * The hypervisor reduces the available credits if the LPAR lost core. It
 * means the excessive windows should not be active and the user space
 * should not be using these windows to send compression requests to NX.
 * So the kernel closes the excessive windows and unmap the paste address
 * such that the user space receives paste instruction failure. Then up to
 * the user space to fall back to SW compression and manage with the
 * existing windows.
 */
static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds,
                                                                        bool migrate)
{
        struct pseries_vas_window *win, *tmp;
        struct vas_user_win_ref *task_ref;
        struct vm_area_struct *vma;
        int rc = 0, flag;

        if (migrate)
                flag = VAS_WIN_MIGRATE_CLOSE;
        else
                flag = VAS_WIN_NO_CRED_CLOSE;

        list_for_each_entry_safe(win, tmp, &vcap->list, win_list) {
                /*
                 * This window is already closed due to lost credit
                 * or for migration before. Go for next window.
                 * For migration, nothing to do since this window
                 * closed for DLPAR and will be reopened even on
                 * the destination system with other DLPAR operation.
                 */
                if ((win->vas_win.status & VAS_WIN_MIGRATE_CLOSE) ||
                        (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE)) {
                        win->vas_win.status |= flag;
                        continue;
                }

                task_ref = &win->vas_win.task_ref;
                /*
                 * VAS mmap (coproc_mmap()) and its fault handler
                 * (vas_mmap_fault()) are called after holding mmap lock.
                 * So hold mmap mutex after mmap_lock to avoid deadlock.
                 */
                mmap_write_lock(task_ref->mm);
                mutex_lock(&task_ref->mmap_mutex);
                vma = task_ref->vma;
                /*
                 * Number of available credits are reduced, So select
                 * and close windows.
                 */
                win->vas_win.status |= flag;

                /*
                 * vma is set in the original mapping. But this mapping
                 * is done with mmap() after the window is opened with ioctl.
                 * so we may not see the original mapping if the core remove
                 * is done before the original mmap() and after the ioctl.
                 */
                if (vma)
                        zap_vma_pages(vma);

                mutex_unlock(&task_ref->mmap_mutex);
                mmap_write_unlock(task_ref->mm);
                /*
                 * Close VAS window in the hypervisor, but do not
                 * free vas_window struct since it may be reused
                 * when the credit is available later (DLPAR with
                 * adding cores). This struct will be used
                 * later when the process issued with close(FD).
                 */
                rc = deallocate_free_window(win);
                /*
                 * This failure is from the hypervisor.
                 * No way to stop migration for these failures.
                 * So ignore error and continue closing other windows.
                 */
                if (rc && !migrate)
                        return rc;

                vcap->nr_close_wins++;

                /*
                 * For migration, do not depend on lpar_creds in case if
                 * mismatch with the hypervisor value (should not happen).
                 * So close all active windows in the list and will be
                 * reopened windows based on the new lpar_creds on the
                 * destination system during resume.
                 */
                if (!migrate && !--excess_creds)
                        break;
        }

        return 0;
}

/*
 * Get new VAS capabilities when the core add/removal configuration
 * changes. Reconfig window configurations based on the credits
 * availability from this new capabilities.
 */
int vas_reconfig_capabilties(u8 type, int new_nr_creds)
{
        struct vas_cop_feat_caps *caps;
        int old_nr_creds;
        struct vas_caps *vcaps;
        int rc = 0, nr_active_wins;

        if (type >= VAS_MAX_FEAT_TYPE) {
                pr_err("Invalid credit type %d\n", type);
                return -EINVAL;
        }

        vcaps = &vascaps[type];
        caps = &vcaps->caps;

        mutex_lock(&vas_pseries_mutex);

        old_nr_creds = atomic_read(&caps->nr_total_credits);

        atomic_set(&caps->nr_total_credits, new_nr_creds);
        /*
         * The total number of available credits may be decreased or
         * increased with DLPAR operation. Means some windows have to be
         * closed / reopened. Hold the vas_pseries_mutex so that the
         * user space can not open new windows.
         */
        if (old_nr_creds <  new_nr_creds) {
                /*
                 * If the existing target credits is less than the new
                 * target, reopen windows if they are closed due to
                 * the previous DLPAR (core removal).
                 */
                rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds,
                                           false);
        } else {
                /*
                 * # active windows is more than new LPAR available
                 * credits. So close the excessive windows.
                 * On pseries, each window will have 1 credit.
                 */
                nr_active_wins = vcaps->nr_open_windows - vcaps->nr_close_wins;
                if (nr_active_wins > new_nr_creds)
                        rc = reconfig_close_windows(vcaps,
                                        nr_active_wins - new_nr_creds,
                                        false);
        }

        mutex_unlock(&vas_pseries_mutex);
        return rc;
}

int pseries_vas_dlpar_cpu(void)
{
        int new_nr_creds, rc;

        /*
         * NX-GZIP is not enabled. Nothing to do for DLPAR event
         */
        if (!copypaste_feat)
                return 0;


        rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
                                      vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat,
                                      (u64)virt_to_phys(&hv_cop_caps));
        if (!rc) {
                new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds);
                rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds);
        }

        if (rc)
                pr_err("Failed reconfig VAS capabilities with DLPAR\n");

        return rc;
}

/*
 * Total number of default credits available (target_credits)
 * in LPAR depends on number of cores configured. It varies based on
 * whether processors are in shared mode or dedicated mode.
 * Get the notifier when CPU configuration is changed with DLPAR
 * operation so that get the new target_credits (vas default capabilities)
 * and then update the existing windows usage if needed.
 */
static int pseries_vas_notifier(struct notifier_block *nb,
                                unsigned long action, void *data)
{
        struct of_reconfig_data *rd = data;
        struct device_node *dn = rd->dn;
        const __be32 *intserv = NULL;
        int len;

        /*
         * For shared CPU partition, the hypervisor assigns total credits
         * based on entitled core capacity. So updating VAS windows will
         * be called from lparcfg_write().
         */
        if (is_shared_processor())
                return NOTIFY_OK;

        if ((action == OF_RECONFIG_ATTACH_NODE) ||
                (action == OF_RECONFIG_DETACH_NODE))
                intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s",
                                          &len);
        /*
         * Processor config is not changed
         */
        if (!intserv)
                return NOTIFY_OK;

        return pseries_vas_dlpar_cpu();
}

static struct notifier_block pseries_vas_nb = {
        .notifier_call = pseries_vas_notifier,
};

/*
 * For LPM, all windows have to be closed on the source partition
 * before migration and reopen them on the destination partition
 * after migration. So closing windows during suspend and
 * reopen them during resume.
 */
int vas_migration_handler(int action)
{
        struct vas_cop_feat_caps *caps;
        int old_nr_creds, new_nr_creds = 0;
        struct vas_caps *vcaps;
        int i, rc = 0;

        pr_info("VAS migration event %d\n", action);

        /*
         * NX-GZIP is not enabled. Nothing to do for migration.
         */
        if (!copypaste_feat)
                return rc;

        if (action == VAS_SUSPEND)
                migration_in_progress = true;
        else
                migration_in_progress = false;

        for (i = 0; i < VAS_MAX_FEAT_TYPE; i++) {
                vcaps = &vascaps[i];
                caps = &vcaps->caps;
                old_nr_creds = atomic_read(&caps->nr_total_credits);

                rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
                                              vcaps->feat,
                                              (u64)virt_to_phys(&hv_cop_caps));
                if (!rc) {
                        new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds);
                        /*
                         * Should not happen. But incase print messages, close
                         * all windows in the list during suspend and reopen
                         * windows based on new lpar_creds on the destination
                         * system.
                         */
                        if (old_nr_creds != new_nr_creds) {
                                pr_err("Target credits mismatch with the hypervisor\n");
                                pr_err("state(%d): lpar creds: %d HV lpar creds: %d\n",
                                        action, old_nr_creds, new_nr_creds);
                                pr_err("Used creds: %d, Active creds: %d\n",
                                        atomic_read(&caps->nr_used_credits),
                                        vcaps->nr_open_windows - vcaps->nr_close_wins);
                        }
                } else {
                        pr_err("state(%d): Get VAS capabilities failed with %d\n",
                                action, rc);
                        /*
                         * We can not stop migration with the current lpm
                         * implementation. So continue closing all windows in
                         * the list (during suspend) and return without
                         * opening windows (during resume) if VAS capabilities
                         * HCALL failed.
                         */
                        if (action == VAS_RESUME)
                                goto out;
                }

                switch (action) {
                case VAS_SUSPEND:
                        mutex_lock(&vas_pseries_mutex);
                        rc = reconfig_close_windows(vcaps, vcaps->nr_open_windows,
                                                        true);
                        /*
                         * Windows are included in the list after successful
                         * open. So wait for closing these in-progress open
                         * windows in vas_allocate_window() which will be
                         * done if the migration_in_progress is set.
                         */
                        while (vcaps->nr_open_wins_progress) {
                                mutex_unlock(&vas_pseries_mutex);
                                msleep(10);
                                mutex_lock(&vas_pseries_mutex);
                        }
                        mutex_unlock(&vas_pseries_mutex);
                        break;
                case VAS_RESUME:
                        mutex_lock(&vas_pseries_mutex);
                        atomic_set(&caps->nr_total_credits, new_nr_creds);
                        rc = reconfig_open_windows(vcaps, new_nr_creds, true);
                        mutex_unlock(&vas_pseries_mutex);
                        break;
                default:
                        /* should not happen */
                        pr_err("Invalid migration action %d\n", action);
                        rc = -EINVAL;
                        goto out;
                }

                /*
                 * Ignore errors during suspend and return for resume.
                 */
                if (rc && (action == VAS_RESUME))
                        goto out;
        }

        pr_info("VAS migration event (%d) successful\n", action);

out:
        return rc;
}

static int __init pseries_vas_init(void)
{
        struct hv_vas_all_caps *hv_caps;
        int rc = 0;

        /*
         * Linux supports user space COPY/PASTE only with Radix
         */
        if (!radix_enabled()) {
                copypaste_feat = false;
                pr_err("API is supported only with radix page tables\n");
                return -ENOTSUPP;
        }

        hv_caps = kmalloc_obj(*hv_caps);
        if (!hv_caps)
                return -ENOMEM;
        /*
         * Get VAS overall capabilities by passing 0 to feature type.
         */
        rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, 0,
                                          (u64)virt_to_phys(hv_caps));
        if (rc)
                goto out;

        caps_all.descriptor = be64_to_cpu(hv_caps->descriptor);
        caps_all.feat_type = be64_to_cpu(hv_caps->feat_type);

        sysfs_pseries_vas_init(&caps_all);

        /*
         * QOS capabilities available
         */
        if (caps_all.feat_type & VAS_GZIP_QOS_FEAT_BIT) {
                rc = get_vas_capabilities(VAS_GZIP_QOS_FEAT,
                                          VAS_GZIP_QOS_FEAT_TYPE, &hv_cop_caps);

                if (rc)
                        goto out;
        }
        /*
         * Default capabilities available
         */
        if (caps_all.feat_type & VAS_GZIP_DEF_FEAT_BIT)
                rc = get_vas_capabilities(VAS_GZIP_DEF_FEAT,
                                          VAS_GZIP_DEF_FEAT_TYPE, &hv_cop_caps);

        if (!rc && copypaste_feat) {
                if (firmware_has_feature(FW_FEATURE_LPAR))
                        of_reconfig_notifier_register(&pseries_vas_nb);

                pr_info("GZIP feature is available\n");
        } else {
                /*
                 * Should not happen, but only when get default
                 * capabilities HCALL failed. So disable copy paste
                 * feature.
                 */
                copypaste_feat = false;
        }

out:
        kfree(hv_caps);
        return rc;
}
machine_device_initcall(pseries, pseries_vas_init);