/* root/usr/src/uts/sun4v/io/dr_mem.c */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * sun4v Memory DR Module
 */


#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>      /* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>   /* for installed_top_size() */
#include <sys/condvar_impl.h>   /* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>        /* for dump_resize() */
#include <sys/atomic.h>         /* for use in stats collection */
#include <sys/rwlock.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define SUNDDI_IMPL             /* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>

#include <sys/modctl.h>
#include <sys/sysevent/dr.h>
#include <sys/mach_descrip.h>
#include <sys/mdesc.h>
#include <sys/ds.h>
#include <sys/drctl.h>
#include <sys/dr_util.h>
#include <sys/dr_mem.h>
#include <sys/suspend.h>


/*
 * DR operations are subject to Memory Alignment restrictions
 * for both address and the size of the request.
 */
#define MA_ADDR 0x10000000      /* addr alignment 256M */
#define MA_SIZE 0x10000000      /* size alignment 256M */

/* an mblk is valid iff both its base address and size are 256M aligned */
#define MBLK_IS_VALID(m) \
        (IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))

static memhandle_t dr_mh;       /* memory handle for delete */

/* loadable module linkage; this is a miscellaneous module (no dev ops) */
static struct modlmisc modlmisc = {
        &mod_miscops,
        "sun4v memory DR"
};

static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modlmisc,
        NULL
};

/* tunable: must be set non-zero before _fini() will allow module unload */
static int dr_mem_allow_unload = 0;

/* signature shared by dr_mem_configure() and dr_mem_unconfigure() */
typedef int (*fn_t)(dr_mem_blk_t *, int *);

/*
 * Global Domain Services (DS) Handle
 *
 * Set by dr_mem_reg_handler() and reset to DS_INVALID_HDL by
 * dr_mem_unreg_handler(); used when sending responses (ds_cap_send).
 */
static ds_svc_hdl_t ds_handle;

/*
 * Supported DS Capability Versions
 */
static ds_ver_t         dr_mem_vers[] = { { 1, 0 } };
#define DR_MEM_NVERS    (sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))

/*
 * DS Capability Description
 */
static ds_capability_t dr_mem_cap = {
        DR_MEM_DS_ID,           /* svc_id */
        dr_mem_vers,            /* vers */
        DR_MEM_NVERS            /* nvers */
};

/*
 * DS Callbacks
 */
static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
static void dr_mem_unreg_handler(ds_cb_arg_t arg);
static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);

/*
 * DS Client Ops Vector
 */
static ds_clnt_ops_t dr_mem_ops = {
        dr_mem_reg_handler,     /* ds_reg_cb */
        dr_mem_unreg_handler,   /* ds_unreg_cb */
        dr_mem_data_handler,    /* ds_data_cb */
        NULL                    /* cb_arg */
};

/*
 * Operation Results
 *
 * Used internally to gather results while an operation on a
 * list of mblks is in progress. In particular, it is used to
 * keep track of which mblks have already failed so that they are
 * not processed further, and the manner in which they failed.
 */
typedef struct {
        uint64_t        addr;           /* mblk base address */
        uint64_t        size;           /* mblk size in bytes */
        uint32_t        result;         /* DR_MEM_RES_* code */
        uint32_t        status;         /* DR_MEM_STAT_* code */
        char            *string;        /* kmem_alloc'd error text, or NULL */
} dr_mem_res_t;

/* error strings indexed by DR_MEM_RES_* result code */
static char *
dr_mem_estr[] = {
        "operation succeeded",          /* DR_MEM_RES_OK */
        "operation failed",             /* DR_MEM_RES_FAILURE */
        "operation was blocked",        /* DR_MEM_RES_BLOCKED */
        "memory not defined in MD",     /* DR_MEM_RES_NOT_IN_MD */
        "memory already in use",        /* DR_MEM_RES_ESPAN */
        "memory access test failed",    /* DR_MEM_RES_EFAULT */
        "resource not available",       /* DR_MEM_RES_ERESOURCE */
        "permanent pages in span",      /* DR_MEM_RES_PERM */
        "memory span busy",             /* DR_MEM_RES_EBUSY */
        "VM viability test failed",     /* DR_MEM_RES_ENOTVIABLE */
        "no pages to unconfigure",      /* DR_MEM_RES_ENOWORK */
        "operation cancelled",          /* DR_MEM_RES_ECANCELLED */
        "operation refused",            /* DR_MEM_RES_EREFUSED */
        "memory span duplicate",        /* DR_MEM_RES_EDUP */
        "invalid argument"              /* DR_MEM_RES_EINVAL */
};

/* detail strings indexed by DR_MEM_SRES_* subresult code */
static char *
dr_mem_estr_detail[] = {
        "",                                     /* DR_MEM_SRES_NONE */
        "memory DR disabled after migration"    /* DR_MEM_SRES_OS_SUSPENDED */
};

/*
 * Condvar/mutex pair for waiting on a memory operation to complete.
 * NOTE(review): not referenced in this portion of the file; presumably
 * used by the delete completion callback further down — verify.
 */
typedef struct {
        kcondvar_t cond;
        kmutex_t lock;
        int error;
        int done;
} mem_sync_t;

/*
 * Internal Functions
 */
static int dr_mem_init(void);
static int dr_mem_fini(void);

/* request handlers: one per incoming DS message type */
static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);

/* per-mblk configure/unconfigure operations (match fn_t) */
static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
static int dr_mem_configure(dr_mem_blk_t *, int *);
static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);

/* result-array bookkeeping and response packing helpers */
static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
    dr_mem_hdr_t **respp);

/* machine description (MD) lookup helpers */
static int dr_mem_find(dr_mem_blk_t *mbp);
static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);

/* low-level add/delete of physical memory spans */
static int mem_add(pfn_t, pgcnt_t);
static int mem_del(pfn_t, pgcnt_t);

extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);

/*
 * Module load entry point.  Refuses to load when Memory DR is
 * disabled, registers the DS capability, then installs the module.
 * On a failed mod_install() the DS registration is torn down again.
 */
int
_init(void)
{
        int     rv;

        /* bail out early if Memory DR has been administratively disabled */
        if (dr_is_disabled(DR_TYPE_MEM))
                return (ENOTSUP);

        rv = dr_mem_init();
        if (rv != 0) {
                cmn_err(CE_NOTE, "Memory DR initialization failed");
                return (rv);
        }

        rv = mod_install(&modlinkage);
        if (rv != 0)
                (void) dr_mem_fini();

        return (rv);
}

/*
 * Module information entry point; standard loadable-module boilerplate.
 */
int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

/*
 * Module unload entry point.  Unload is normally refused; it is only
 * permitted when the dr_mem_allow_unload tunable has been set.
 */
int
_fini(void)
{
        int     rv;

        if (dr_mem_allow_unload == 0)
                return (EBUSY);

        rv = mod_remove(&modlinkage);
        if (rv == 0)
                (void) dr_mem_fini();

        return (rv);
}

/*
 * Register the memory DR capability with the DS framework.
 * Returns 0 on success, otherwise the ds_cap_init() error.
 */
static int
dr_mem_init(void)
{
        int status;

        status = ds_cap_init(&dr_mem_cap, &dr_mem_ops);
        if (status != 0)
                cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", status);

        return (status);
}

/*
 * Unregister the memory DR capability from the DS framework.
 * Returns the ds_cap_fini() result (0 on success).
 */
static int
dr_mem_fini(void)
{
        int status;

        status = ds_cap_fini(&dr_mem_cap);
        if (status != 0)
                cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", status);

        return (status);
}

/*
 * DS registration callback (ds_reg_cb).  Records the negotiated
 * service handle; it is used later by the data handler to send
 * responses via ds_cap_send().
 */
static void
dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
{
        DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
            ver->major, ver->minor, hdl);

        ds_handle = hdl;
}

/*
 * DS unregistration callback (ds_unreg_cb).  Resets the cached
 * service handle to DS_INVALID_HDL.
 */
static void
dr_mem_unreg_handler(ds_cb_arg_t arg)
{
        DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);

        ds_handle = DS_INVALID_HDL;
}

/*
 * DS data callback (ds_data_cb): the main dispatch point for all
 * incoming memory DR requests.  Validates the message, routes it to
 * the matching handler, and always sends back a response.  'resp'
 * initially aliases the local err_resp header; a handler that
 * succeeds replaces it with an allocated response message, which is
 * freed here after transmission.
 */
/*ARGSUSED*/
static void
dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
{
        dr_mem_hdr_t    *req = buf;
        dr_mem_hdr_t    err_resp;
        dr_mem_hdr_t    *resp = &err_resp;
        int             resp_len = 0;
        int             rv = EINVAL;

        /*
         * Sanity check the message
         */
        if (buflen < sizeof (dr_mem_hdr_t)) {
                DR_DBG_MEM("incoming message short: expected at least %ld "
                    "bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
                goto done;
        }

        if (req == NULL) {
                DR_DBG_MEM("empty message: expected at least %ld bytes\n",
                    sizeof (dr_mem_hdr_t));
                goto done;
        }

        DR_DBG_MEM("incoming request:\n");
        DR_DBG_DUMP_MSG(buf, buflen);

        /*
         * Process the command
         */
        switch (req->msg_type) {
        case DR_MEM_CONFIGURE:
        case DR_MEM_UNCONFIGURE:
                /* config/unconfig require at least one mblk in the request */
                if (req->msg_arg == 0) {
                        DR_DBG_MEM("No mblks specified for operation\n");
                        goto done;
                }
                if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
                        DR_DBG_MEM("%s failed (%d)\n",
                            (req->msg_type == DR_MEM_CONFIGURE) ?
                            "Memory configure" : "Memory unconfigure", rv);
                }
                break;

        case DR_MEM_UNCONF_STATUS:
                if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
                        DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
                break;

        case DR_MEM_UNCONF_CANCEL:
                if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
                        DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
                break;

        case DR_MEM_QUERY:
                if (req->msg_arg == 0) {
                        DR_DBG_MEM("No mblks specified for operation\n");
                        goto done;
                }
                if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
                        DR_DBG_MEM("Memory query failed (%d)\n", rv);
                break;

        default:
                cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
                    req->msg_type);
                break;
        }

done:
        /*
         * resp still pointing at err_resp means no handler produced a
         * response; synthesize a DR_MEM_ERROR reply carrying rv.
         */
        if (resp == &err_resp) {
                resp->req_num = (req) ? req->req_num : 0;
                resp->msg_type = DR_MEM_ERROR;
                resp->msg_arg = rv;
                resp_len = sizeof (dr_mem_hdr_t);
        }

        DR_DBG_MEM("outgoing response:\n");
        DR_DBG_DUMP_MSG(resp, resp_len);

        /* send back the response */
        if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
                DR_DBG_MEM("ds_send failed\n");
        }

        /* free any allocated memory (err_resp lives on the stack) */
        if (resp != &err_resp) {
                kmem_free(resp, resp_len);
        }
}

/*
 * Build an error string for the given result/subresult pair.
 * With no subresult detail, the primary error string is simply
 * duplicated; otherwise the two strings are joined with ": ".
 * The returned string is kmem_alloc'd; the caller must free it.
 */
static char *
dr_mem_get_errstr(int result, int subresult)
{
        const char *separator = ": ";
        char *str;
        size_t buflen;

        if (subresult == DR_MEM_SRES_NONE)
                return (i_ddi_strdup(dr_mem_estr[result], KM_SLEEP));

        /* measure the combined string first, then build it in place */
        buflen = snprintf(NULL, 0, "%s%s%s", dr_mem_estr[result],
            separator, dr_mem_estr_detail[subresult]) + 1;

        str = kmem_alloc(buflen, KM_SLEEP);

        (void) snprintf(str, buflen, "%s%s%s", dr_mem_estr[result],
            separator, dr_mem_estr_detail[subresult]);

        return (str);
}

/*
 * Common routine to config or unconfig multiple mblks.
 *
 * Obtains approval for the operation from drctl, performs the
 * requested per-mblk operation (unless blocked by drctl or by the
 * suspend subsystem), packs the per-mblk results into a response
 * message for the caller, and notifies drctl of the outcome.
 *
 * Note: Do not modify result buffer or length on error.
 */
static int
dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
        int             rv;
        int             idx;
        int             count;
        int             result;
        int             subresult;
        int             status;
        boolean_t       suspend_allows_dr;
        fn_t            dr_fn;
        int             se_hint;
        dr_mem_blk_t    *req_mblks;
        dr_mem_res_t    *res;
        int             drctl_cmd;
        int             drctl_flags = 0;
        drctl_rsrc_t    *drctl_req;
        size_t          drctl_req_len;
        drctl_resp_t    *drctl_resp;
        drctl_rsrc_t    *drctl_rsrc;
        size_t          drctl_resp_len = 0;
        drctl_cookie_t  drctl_res_ck;

        ASSERT((req != NULL) && (req->msg_arg != 0));

        count = req->msg_arg;

        /*
         * Extract all information that is specific
         * to the various types of operations.
         */
        switch (req->msg_type) {
        case DR_MEM_CONFIGURE:
                dr_fn = dr_mem_configure;
                drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
                se_hint = SE_HINT_INSERT;
                break;
        case DR_MEM_UNCONFIGURE:
                dr_fn = dr_mem_unconfigure;
                drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
                se_hint = SE_HINT_REMOVE;
                break;
        default:
                /* Programming error if we reach this. */
                cmn_err(CE_NOTE, "%s: bad msg_type %d\n",
                    __func__, req->msg_type);
                ASSERT(0);
                return (-1);
        }

        /* the incoming array of mblks to operate on */
        req_mblks = DR_MEM_CMD_MBLKS(req);

        /* allocate drctl request msg based on incoming resource count */
        drctl_req_len = sizeof (drctl_rsrc_t) * count;
        drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);

        /* copy the size for the drctl call from the incoming request msg */
        for (idx = 0; idx < count; idx++) {
                drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
                drctl_req[idx].res_mem_size = req_mblks[idx].size;
        }

        rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
            count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);

        ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));

        if (rv != 0) {
                DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
                    __func__, rv);
                kmem_free(drctl_resp, drctl_resp_len);
                kmem_free(drctl_req, drctl_req_len);
                return (rv);
        }

        ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);

        drctl_rsrc = drctl_resp->resp_resources;

        /* create the result scratch array */
        res = dr_mem_res_array_init(req, drctl_rsrc, count);

        /*
         * Memory DR operations are not safe if we have been suspended and
         * resumed. Until this limitation is lifted, check to see if memory
         * DR operations are permitted at this time by the suspend subsystem.
         */
        if ((suspend_allows_dr = suspend_memdr_allowed()) == B_FALSE) {
                result = DR_MEM_RES_BLOCKED;
                subresult = DR_MEM_SRES_OS_SUSPENDED;
                /*
                 * No operation will be attempted on any mblk, so report
                 * each one as remaining in its current state.  This also
                 * guarantees 'status' is initialized before it is copied
                 * into the result array below (previously it was read
                 * uninitialized on this path).
                 */
                status = (req->msg_type == DR_MEM_CONFIGURE) ?
                    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;
        } else {
                subresult = DR_MEM_SRES_NONE;
        }

        /* perform the specified operation on each of the mblks */
        for (idx = 0; idx < count; idx++) {
                /*
                 * If no action will be taken against the current
                 * mblk, update the drctl resource information to
                 * ensure that it gets recovered properly during
                 * the drctl fini() call.
                 */
                if (res[idx].result != DR_MEM_RES_OK) {
                        drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
                        continue;
                }

                /*
                 * If memory DR operations are permitted at this time by
                 * the suspend subsystem, call the function to perform the
                 * operation, otherwise return a result indicating that the
                 * operation was blocked.
                 */
                if (suspend_allows_dr)
                        result = (*dr_fn)(&req_mblks[idx], &status);

                /* save off results of the operation */
                res[idx].result = result;
                res[idx].status = status;
                res[idx].addr = req_mblks[idx].addr;    /* for partial case */
                res[idx].size = req_mblks[idx].size;    /* for partial case */
                res[idx].string = dr_mem_get_errstr(result, subresult);

                /* save result for drctl fini() reusing init() msg memory */
                drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
                    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;

                DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
                    __func__, req_mblks[idx].addr, req_mblks[idx].size,
                    drctl_req[idx].status, result,
                    (res[idx].string) ? res[idx].string : "");
        }

        if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
                DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
                    __func__, rv);

        /*
         * Operation completed without any fatal errors.
         * Pack the response for transmission.
         */
        *resp_len = dr_mem_pack_response(req, res, resp);

        /* notify interested parties about the operation */
        dr_generate_event(DR_TYPE_MEM, se_hint);

        /*
         * Deallocate any scratch memory.
         */
        kmem_free(drctl_resp, drctl_resp_len);
        kmem_free(drctl_req, drctl_req_len);

        dr_mem_res_array_fini(res, count);

        return (0);
}

/*
 * Allocate and initialize a result array based on the initial
 * drctl operation. A valid result array is always returned.
 * Entries vetoed by drctl are pre-marked as blocked, and any
 * veto reason string is copied out of the drctl message buffer.
 */
static dr_mem_res_t *
dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
{
        dr_mem_res_t    *res;
        int             i;

        /* zero fill so every field starts out initialized */
        res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);

        for (i = 0; i < nrsrc; i++) {
                res[i].addr = rsrc[i].res_mem_addr;
                res[i].size = rsrc[i].res_mem_size;
                res[i].result = DR_MEM_RES_OK;

                if (rsrc[i].status == DRCTL_STATUS_ALLOW)
                        continue;

                /* operation vetoed: mblk stays in its current state */
                res[i].result = DR_MEM_RES_BLOCKED;
                res[i].status = (req->msg_type == DR_MEM_CONFIGURE) ?
                    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;

                /*
                 * Copy any error string out of the message buffer so
                 * there is no lingering dependency on the memory
                 * allocated for the message itself.
                 */
                if (rsrc[i].offset != 0) {
                        char    *src = (char *)rsrc + rsrc[i].offset;
                        size_t  len = strlen(src) + 1;

                        res[i].string = kmem_alloc(len, KM_SLEEP);
                        bcopy(src, res[i].string, len);
                }
        }

        return (res);
}

/*
 * Release a result array created by dr_mem_res_array_init(),
 * including any per-entry error strings.
 */
static void
dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
{
        int     i;

        for (i = 0; i < nres; i++) {
                /* free the error string, if one was allocated */
                if (res[i].string != NULL)
                        kmem_free(res[i].string, strlen(res[i].string) + 1);
        }

        /* deallocate the result array itself */
        kmem_free(res, sizeof (dr_mem_res_t) * nres);
}

/*
 * Allocate and pack a response message for transmission based
 * on the specified result array. A valid response message and
 * valid size information is always returned.
 *
 * Layout of the packed message: header, then an array of
 * dr_mem_stat_t (one per requested mblk), then the error strings
 * back-to-back; each stat's string_off is the byte offset of its
 * string from the start of the message.
 */
static size_t
dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
{
        int             idx;
        dr_mem_hdr_t    *resp;
        dr_mem_stat_t   *resp_stat;
        size_t          resp_len;
        uint32_t        curr_off;
        caddr_t         curr_str;
        size_t          str_len;
        size_t          stat_len;
        int             nstat = req->msg_arg;

        /*
         * Calculate the size of the response message
         * and allocate an appropriately sized buffer.
         */
        resp_len = sizeof (dr_mem_hdr_t);

        /* add the stat array size */
        stat_len = sizeof (dr_mem_stat_t) * nstat;
        resp_len += stat_len;

        /* add the size of any error strings */
        for (idx = 0; idx < nstat; idx++) {
                if (res[idx].string != NULL) {
                        resp_len += strlen(res[idx].string) + 1;
                }
        }

        /* allocate the message buffer */
        resp = kmem_zalloc(resp_len, KM_SLEEP);

        /*
         * Fill in the header information.
         */
        resp->req_num = req->req_num;
        resp->msg_type = DR_MEM_OK;
        resp->msg_arg = nstat;

        /*
         * Fill in the stat information.
         */
        resp_stat = DR_MEM_RESP_STATS(resp);

        /* string offsets start immediately after stat array */
        curr_off = sizeof (dr_mem_hdr_t) + stat_len;
        curr_str = (char *)resp_stat + stat_len;

        for (idx = 0; idx < nstat; idx++) {
                resp_stat[idx].addr = res[idx].addr;
                resp_stat[idx].size = res[idx].size;
                resp_stat[idx].result = res[idx].result;
                resp_stat[idx].status = res[idx].status;

                if (res[idx].string != NULL) {
                        /* copy over the error string */
                        str_len = strlen(res[idx].string) + 1;
                        bcopy(res[idx].string, curr_str, str_len);
                        resp_stat[idx].string_off = curr_off;

                        /* advance both the offset and the write cursor */
                        curr_off += str_len;
                        curr_str += str_len;
                }
        }

        /* buffer should be exactly filled */
        ASSERT(curr_off == resp_len);

        *respp = resp;
        return (resp_len);
}

/*
 * Fill in *mqp with a deletability summary for the span described by
 * mbp, as reported by kphysm_del_span_query().  Page counts are
 * converted back to byte quantities.  If the span contains no
 * physical pages, *mqp is left untouched.
 */
static void
dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
{
        memquery_t mq;

        DR_DBG_MEM("dr_mem_query...\n");


        (void) kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq);

        if (!mq.phys_pages)
                return;

        mqp->addr = mbp->addr;
        mqp->mq.phys_pages = ptob(mq.phys_pages);
        mqp->mq.managed = ptob(mq.managed);
        mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
        mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
        mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
        /*
         * Set to the max byte offset within the page.
         */
        if (mqp->mq.nonrelocatable)
                mqp->mq.last_nonrelocatable += PAGESIZE - 1;
}

/*
 * Handle a DR_MEM_QUERY request.  A first mblk with addr == 0 and
 * size == 0 is treated as a wildcard: the query is run against every
 * segment of the domain's installed physical memory instead of the
 * mblks in the request.
 *
 * Do not modify result buffer or length on error.
 */
static int
dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
        int             idx;
        int             rlen;
        int             nml;
        struct memlist  *ml;
        struct memlist  *phys_copy = NULL;
        dr_mem_blk_t    *req_mblks, mb;
        dr_mem_hdr_t    *rp;
        dr_mem_query_t  *stat;

        drctl_block();

        /* the incoming array of req_mblks to configure */
        req_mblks = DR_MEM_CMD_MBLKS(req);

        /* allocate a response message, should be freed by caller */
        nml = 0;
        rlen = sizeof (dr_mem_hdr_t);
        if (req_mblks->addr == 0 && req_mblks->size == 0) {
                /*
                 * Request is for domain's full view of its memory.
                 * place a copy in phys_copy then release the memlist lock.
                 */
                memlist_read_lock();
                phys_copy = dr_memlist_dup(phys_install);
                memlist_read_unlock();

                /* one query entry per memlist segment */
                for (ml = phys_copy; ml; ml = ml->ml_next)
                        nml++;

                rlen += nml * sizeof (dr_mem_query_t);
        } else {
                rlen += req->msg_arg * sizeof (dr_mem_query_t);
        }
        rp = kmem_zalloc(rlen, KM_SLEEP);

        /* fill in the known data */
        rp->req_num = req->req_num;
        rp->msg_type = DR_MEM_OK;
        rp->msg_arg = nml ? nml : req->msg_arg;

        /* stat array for the response */
        stat = DR_MEM_RESP_QUERY(rp);

        /* get the status for each of the mblocks */
        if (nml) {
                for (idx = 0, ml = phys_copy; ml; ml = ml->ml_next, idx++) {
                        mb.addr = ml->ml_address;
                        mb.size = ml->ml_size;
                        dr_mem_query(&mb, &stat[idx]);
                }
        } else {
                for (idx = 0; idx < req->msg_arg; idx++)
                        dr_mem_query(&req_mblks[idx], &stat[idx]);
        }

        *resp = rp;
        *resp_len = rlen;
        if (phys_copy != NULL) {
                dr_memlist_delete(phys_copy);
        }
        drctl_unblock();

        return (0);
}

/*
 * Map a kphysm error code (KPHYSM_*) to the corresponding memory DR
 * result code (DR_MEM_RES_*).  Any kphysm error without a dedicated
 * mapping is reported as a generic failure.
 */
static int
cvt_err(int err)
{
        int res;

        switch (err) {
        case KPHYSM_OK:
                res = DR_MEM_RES_OK;
                break;
        case KPHYSM_ESPAN:
                res = DR_MEM_RES_ESPAN;
                break;
        case KPHYSM_EFAULT:
                res = DR_MEM_RES_EFAULT;
                break;
        case KPHYSM_ERESOURCE:
                res = DR_MEM_RES_ERESOURCE;
                break;
        case KPHYSM_ENONRELOC:
                res = DR_MEM_RES_PERM;
                break;
        case KPHYSM_EBUSY:
                res = DR_MEM_RES_EBUSY;
                break;
        case KPHYSM_ENOTVIABLE:
                res = DR_MEM_RES_ENOTVIABLE;
                break;
        case KPHYSM_ENOWORK:
                res = DR_MEM_RES_ENOWORK;
                break;
        case KPHYSM_ECANCELLED:
                res = DR_MEM_RES_ECANCELLED;
                break;
        case KPHYSM_EREFUSED:
                res = DR_MEM_RES_EREFUSED;
                break;
        case KPHYSM_EDUP:
                res = DR_MEM_RES_EDUP;
                break;
        default:
                /*
                 * KPHYSM_ENOTSUP, KPHYSM_ENOHANDLES, KPHYSM_EHANDLE,
                 * KPHYSM_ESEQUENCE, KPHYSM_ENOTFINISHED,
                 * KPHYSM_ENOTRUNNING and anything unrecognized all
                 * map to a generic failure.
                 */
                res = DR_MEM_RES_FAILURE;
                break;
        }

        return (res);
}

/*
 * Configure (add) the memory block described by mbp.  The mblk must
 * be 256M aligned in both address and size, and must be present in
 * the machine description.  On return, *status holds the resulting
 * DR_MEM_STAT_* state and the return value is a DR_MEM_RES_* code.
 */
static int
dr_mem_configure(dr_mem_blk_t *mbp, int *status)
{
        int rv;
        uint64_t addr, size;

        rv = 0;
        addr = mbp->addr;
        size = mbp->size;

        DR_DBG_MEM("dr_mem_configure...\n");

        if (!MBLK_IS_VALID(mbp)) {
                DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
                *status = DR_MEM_STAT_UNCONFIGURED;
                rv = DR_MEM_RES_EINVAL;
        } else if ((rv = dr_mem_find(mbp)) != 0) {
                DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
                    addr, size, rv);
                if (rv == EINVAL) {
                        /* mblk simply not described in the MD */
                        *status = DR_MEM_STAT_NOT_PRESENT;
                        rv = DR_MEM_RES_NOT_IN_MD;
                } else {
                        *status = DR_MEM_STAT_UNCONFIGURED;
                        rv = DR_MEM_RES_FAILURE;
                }
        } else {
                rv = mem_add(btop(addr), btop(size));
                DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, rv);
                if (rv) {
                        *status = DR_MEM_STAT_UNCONFIGURED;
                } else {
                        *status = DR_MEM_STAT_CONFIGURED;
                }
        }

        return (rv);
}

/*
 * Unconfigure (delete) the memory block described by mbp.  The mblk
 * must be 256M aligned in both address and size.  On return, *status
 * holds the resulting DR_MEM_STAT_* state and the return value is a
 * DR_MEM_RES_* code (on a mem_del() failure, whatever mem_del()
 * returned).
 */
static int
dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
{
        int rv;

        DR_DBG_MEM("dr_mem_unconfigure...\n");

        if (!MBLK_IS_VALID(mbp)) {
                DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
                    mbp->addr, mbp->size);
                *status = DR_MEM_STAT_CONFIGURED;
                rv = DR_MEM_RES_EINVAL;
        } else if ((rv = mem_del(btop(mbp->addr), btop(mbp->size))) != 0) {
                /* delete failed; the memory remains configured */
                *status = DR_MEM_STAT_CONFIGURED;
        } else {
                *status = DR_MEM_STAT_UNCONFIGURED;
                rv = DR_MEM_RES_OK;
                DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
                    mbp->addr, mbp->size);
        }
        return (rv);
}

/*
 * Handle a DR_MEM_UNCONF_STATUS request.  If a memory delete is in
 * progress, its current statistics are appended to the response;
 * msg_arg reports whether a stat payload is present.  The allocated
 * response is returned via resp/resp_len and freed by the caller.
 */
static int
dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
        dr_mem_hdr_t    *rp;
        memdelstat_t    del_stat, *stat;
        int             in_progress;
        int             rlen;

        /*
         * A delete is considered active only when we hold a memory
         * handle and kphysm can report status for it.
         */
        in_progress = (dr_mh &&
            (kphysm_del_status(dr_mh, &del_stat) == KPHYSM_OK));

        /* allocate a response message, should be freed by caller */
        rlen = sizeof (dr_mem_hdr_t) + in_progress * sizeof (memdelstat_t);
        rp = kmem_zalloc(rlen, KM_SLEEP);

        /* fill in the known data */
        rp->req_num = req->req_num;
        rp->msg_type = DR_MEM_OK;
        rp->msg_arg = in_progress;

        if (in_progress) {
                /* stat struct for the response, in byte units */
                stat = DR_MEM_RESP_DEL_STAT(rp);
                stat->phys_pages = ptob(del_stat.phys_pages);
                stat->managed = ptob(del_stat.managed);
                stat->collected = ptob(del_stat.collected);
        }

        *resp = rp;
        *resp_len = rlen;

        return (0);
}

/*
 * Handle a DR_MEM_UNCONF_CANCEL request.  Attempts to cancel any
 * in-progress memory delete; the result code is carried in msg_arg.
 * The allocated response is returned via resp/resp_len and freed by
 * the caller.
 */
static int
dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
        dr_mem_hdr_t    *rp;
        int             rlen = sizeof (dr_mem_hdr_t);

        /* allocate a response message, should be freed by caller */
        rp = kmem_zalloc(rlen, KM_SLEEP);

        /* fill in the known data */
        rp->req_num = req->req_num;
        rp->msg_type = DR_MEM_OK;
        if (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK)
                rp->msg_arg = DR_MEM_RES_EINVAL;
        else
                rp->msg_arg = DR_MEM_RES_OK;

        *resp = rp;
        *resp_len = rlen;

        return (0);
}

/*
 * Check whether the mblk described by mbp is present in the machine
 * description.  Returns 0 if found, EINVAL if not, and -1 if the MD
 * itself cannot be accessed.
 */
static int
dr_mem_find(dr_mem_blk_t *mbp)
{
        md_t            *mdp;
        mde_cookie_t    *nodes;
        mde_cookie_t    memnode;
        int             nodes_sz;
        int             nnodes;
        int             rv;

        if ((mdp = md_get_handle()) == NULL) {
                DR_DBG_MEM("unable to initialize machine description\n");
                return (-1);
        }

        nnodes = md_node_count(mdp);
        ASSERT(nnodes > 0);

        /* scratch list sized for the worst case: every node in the MD */
        nodes_sz = nnodes * sizeof (mde_cookie_t);
        nodes = kmem_zalloc(nodes_sz, KM_SLEEP);

        memnode = dr_mem_find_node_md(mbp, mdp, nodes);

        if (memnode != MDE_INVAL_ELEM_COOKIE) {
                rv = 0;
                DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size,
                    "found");
        } else {
                rv = EINVAL;
                DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size,
                    "not found");
        }

        kmem_free(nodes, nodes_sz);
        (void) md_fini_handle(mdp);

        return (rv);
}

/*
 * Look up a particular mblk in the MD. Returns the mde_cookie_t
 * representing that mblk if present, and MDE_INVAL_ELEM_COOKIE
 * otherwise. It is assumed the scratch array has already been
 * allocated so that it can accommodate the worst case scenario,
 * every node in the MD.
 *
 * A match is any MD "mblock" node whose [base, base+size) range
 * fully contains the requested mblk.  A node missing its 'base' or
 * 'size' property aborts the scan entirely.
 */
static mde_cookie_t
dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
{
        int             idx;
        int             nnodes;
        mde_cookie_t    rootnode;
        uint64_t        base_prop;
        uint64_t        size_prop;
        mde_cookie_t    result = MDE_INVAL_ELEM_COOKIE;

        rootnode = md_root_node(mdp);
        ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

        /*
         * Scan the DAG for all the mem nodes
         */
        nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "mblock"),
            md_find_name(mdp, "fwd"), listp);

        if (nnodes < 0) {
                DR_DBG_MEM("Scan for mblks failed\n");
                return (result);
        }

        DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nnodes);

        /*
         * Find the mblk of interest
         */
        for (idx = 0; idx < nnodes; idx++) {

                if (md_get_prop_val(mdp, listp[idx], "base", &base_prop)) {
                        DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
                            idx);
                        break;
                }

                if (md_get_prop_val(mdp, listp[idx], "size", &size_prop)) {
                        DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
                            idx);
                        break;
                }

                /* requested span must lie entirely within the MD mblock */
                if (base_prop <= mbp->addr &&
                    (base_prop + size_prop) >= (mbp->addr + mbp->size)) {
                        /* found a match */
                        DR_DBG_MEM("dr_mem_find_node_md: found mblk "
                            "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
                        result = listp[idx];
                        break;
                }
        }

        if (result == MDE_INVAL_ELEM_COOKIE) {
                DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
                    mbp->addr, mbp->size);
        }

        return (result);
}

/*
 * Add the span [base, base + npgs) of pages to the system.
 * Returns a DR_MEM_RES_* code: the kphysm_add_memory_dynamic()
 * result converted via cvt_err().  On success the span is also
 * added to the kernel cage growth list; a failure there is only
 * warned about, since the memory itself was added successfully.
 */
static int
mem_add(pfn_t base, pgcnt_t npgs)
{
        int rv, rc;

        DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

        /* an empty span is trivially successful */
        if (npgs == 0)
                return (DR_MEM_RES_OK);

        rv = kphysm_add_memory_dynamic(base, npgs);
        /* fixed: debug message was missing the trailing newline */
        DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d\n", __func__, base,
            npgs, rv);
        if (rv == KPHYSM_OK) {
                /* make the new memory available for cage growth */
                if ((rc = kcage_range_add(base, npgs, KCAGE_DOWN)) != 0)
                        cmn_err(CE_WARN, "kcage_range_add() = %d", rc);
        }
        rv = cvt_err(rv);
        return (rv);
}

static void
del_done(void *arg, int error)
{
        mem_sync_t *ms = arg;

        mutex_enter(&ms->lock);
        ms->error = error;
        ms->done = 1;
        cv_signal(&ms->cond);
        mutex_exit(&ms->lock);
}

/*
 * Delete the span [base, base + npgs) of pages from the system.
 * Returns a DR_MEM_RES_* code.  The sequence is: acquire a delete
 * handle, query the span for non-relocatable pages, remove the
 * span from the kcage growth list, register the span with the
 * handle, then start the delete and block until del_done() reports
 * completion (or a signal triggers a cancel).  On failure, any
 * spans already removed from the kcage list are added back.
 *
 * The 'convert' flag tracks whether 'rv' still holds a KPHYSM_*
 * code that needs cvt_err() conversion, or has already been set
 * to a DR_MEM_RES_* value by one of the error paths.
 */
static int
mem_del(pfn_t base, pgcnt_t npgs)
{
        int rv, err, del_range = 0;
        int convert = 1;
        mem_sync_t ms;
        memquery_t mq;
        memhandle_t mh;
        struct memlist *ml;
        struct memlist *d_ml = NULL;

        DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

        /* an empty span is trivially successful */
        if (npgs == 0)
                return (DR_MEM_RES_OK);

        if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
                cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
                rv = cvt_err(rv);
                return (rv);
        }
        if ((rv = kphysm_del_span_query(base, npgs, &mq))
            != KPHYSM_OK) {
                cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
                goto done;
        }
        /* refuse the delete outright if any page cannot be relocated */
        if (mq.nonrelocatable) {
                DR_DBG_MEM("%s: non-reloc pages = %ld",
                    __func__, mq.nonrelocatable);
                rv  = KPHYSM_ENONRELOC;
                goto done;
        }
        /* remove the span from the kcage growth list before deleting */
        if (rv = kcage_range_delete(base, npgs)) {
                switch (rv) {
                case EBUSY:
                        rv = DR_MEM_RES_ENOTVIABLE;
                        break;
                default:
                        rv = DR_MEM_RES_FAILURE;
                        break;
                }
                convert = 0; /* conversion done */
                cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
                goto done;
        } else {
                /* remember to restore the kcage list on failure */
                del_range++;
        }
        if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
                cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
                goto done;
        }
        /* track the span in d_ml so it can be re-added to kcage on error */
        if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
            != MEML_SPANOP_OK) {
                switch (rv) {
                case MEML_SPANOP_ESPAN:
                        rv = DR_MEM_RES_ESPAN;
                        break;
                case MEML_SPANOP_EALLOC:
                        rv = DR_MEM_RES_ERESOURCE;
                        break;
                default:
                        rv = DR_MEM_RES_FAILURE;
                        break;
                }
                convert = 0; /* conversion done */
                cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
                goto done;
        }

        DR_DBG_MEM("%s: reserved=0x%lx", __func__, npgs);

        bzero((void *) &ms, sizeof (ms));

        mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ms.cond, NULL, CV_DRIVER, NULL);
        mutex_enter(&ms.lock);

        if ((rv = kphysm_del_start(mh, del_done, (void *) &ms)) == KPHYSM_OK) {
                /*
                 * Since we've called drctl_config_init, we are the only
                 * DR ctl operation in progress.  Set dr_mh to the
                 * delete memhandle for use by stat and cancel.
                 */
                ASSERT(dr_mh == NULL);
                dr_mh = mh;

                /*
                 * Wait for completion or interrupt.
                 */
                while (!ms.done) {
                        if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
                                /*
                                 * There is a pending signal.
                                 */
                                (void) kphysm_del_cancel(mh);
                                DR_DBG_MEM("%s: cancel", __func__);
                                /*
                                 * Wait for completion.
                                 */
                                while (!ms.done)
                                        cv_wait(&ms.cond, &ms.lock);
                        }
                }
                /* delete finished (or was cancelled); clear the stat handle */
                dr_mh = NULL;
                rv = ms.error;
        } else {
                DR_DBG_MEM("%s: del_start() = %d", __func__, rv);
        }

        mutex_exit(&ms.lock);
        cv_destroy(&ms.cond);
        mutex_destroy(&ms.lock);

done:
        if (rv && del_range) {
                /*
                 * Add back the spans to the kcage growth list.
                 */
                for (ml = d_ml; ml; ml = ml->ml_next)
                        if (err = kcage_range_add(btop(ml->ml_address),
                            btop(ml->ml_size), KCAGE_DOWN))
                                cmn_err(CE_WARN, "kcage_range_add() = %d", err);
        }
        memlist_free_list(d_ml);

        /* always release the delete handle, even on success */
        if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
                cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
        if (convert)
                rv = cvt_err(rv);

        DR_DBG_MEM("%s: rv=%d", __func__, rv);

        return (rv);
}