root/usr/src/uts/common/os/mem_config.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>      /* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>   /* for installed_top_size() */
#include <sys/condvar_impl.h>   /* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>        /* for dump_resize() */
#include <sys/atomic.h>         /* for use in stats collection */
#include <sys/rwlock.h>
#include <sys/cpuvar.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define SUNDDI_IMPL             /* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>
#include <sys/modctl.h>

extern struct memlist *phys_avail;

extern uint_t page_ctrs_adjust(int);
void page_ctrs_cleanup(void);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

kmutex_t memseg_lists_lock;
struct memseg *memseg_va_avail;
struct memseg *memseg_alloc(void);
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(struct memseg *);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;

/*
 * Interfaces to manage externally allocated
 * page_t memory (metadata) for a memseg.
 */
#pragma weak    memseg_alloc_meta
#pragma weak    memseg_free_meta
#pragma weak    memseg_get_metapfn
#pragma weak    memseg_remap_meta

extern int ppvm_enable;
extern page_t *ppvm_base;
extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
extern void memseg_free_meta(void *, pgcnt_t);
extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
extern void memseg_remap_meta(struct memseg *);
static int memseg_is_dynamic(struct memseg *);
static int memseg_includes_meta(struct memseg *);
pfn_t memseg_get_start(struct memseg *);
static void memseg_cpu_vm_flush(void);

int meta_alloc_enable;

#ifdef  DEBUG
static int memseg_debug;
#define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
#else
#define MEMSEG_DEBUG(...)
#endif

/*
 * Add a chunk of memory to the system.
 * base: starting PAGESIZE page of new memory.
 * npgs: length in PAGESIZE pages.
 *
 * Adding mem this way doesn't increase the size of the hash tables;
 * growing them would be too hard.  This should be OK, but adding memory
 * dynamically most likely means more hash misses, since the tables will
 * be smaller than they otherwise would be.
 */
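/*
 * A minimal caller sketch (hypothetical "pa" and "sz" naming the
 * physical address and byte length of newly attached memory in a
 * platform DR driver; btop() converts bytes to PAGESIZE pages):
 *
 *      rv = kphysm_add_memory_dynamic(btop(pa), btop(sz));
 */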
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
        page_t *pp;
        page_t          *opp, *oepp, *segpp;
        struct memseg   *seg;
        uint64_t        avmem;
        pfn_t           pfn;
        pfn_t           pt_base = base;
        pgcnt_t         tpgs = npgs;
        pgcnt_t         metapgs = 0;
        int             exhausted;
        pfn_t           pnum;
        int             mnode;
        caddr_t         vaddr;
        int             reuse;
        int             mlret;
        int             rv;
        int             flags;
        int             meta_alloc = 0;
        void            *mapva;
        void            *metabase = (void *)base;
        pgcnt_t         nkpmpgs = 0;
        offset_t        kpm_pages_off = 0;

        cmn_err(CE_CONT,
            "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
            npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);

        /*
         * Add this span to the delete list to prevent interactions.
         */
        if (!delspan_reserve(base, npgs)) {
                return (KPHYSM_ESPAN);
        }
        /*
         * Check to see if any of the memory span has been added
         * by trying an add to the installed memory list. This
         * forms the interlocking process for add.
         */

        memlist_write_lock();

        mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
            (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

        if (mlret == MEML_SPANOP_OK)
                installed_top_size(phys_install, &physmax, &physinstalled);

        memlist_write_unlock();

        if (mlret != MEML_SPANOP_OK) {
                if (mlret == MEML_SPANOP_EALLOC) {
                        delspan_unreserve(pt_base, tpgs);
                        return (KPHYSM_ERESOURCE);
                } else if (mlret == MEML_SPANOP_ESPAN) {
                        delspan_unreserve(pt_base, tpgs);
                        return (KPHYSM_ESPAN);
                } else {
                        delspan_unreserve(pt_base, tpgs);
                        return (KPHYSM_ERESOURCE);
                }
        }

        if (meta_alloc_enable) {
                /*
                 * Allocate the page_t's from existing memory;
                 * if that fails, allocate from the incoming memory.
                 */
                rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
                if (rv == KPHYSM_OK) {
                        ASSERT(metapgs);
                        ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
                        meta_alloc = 1;
                        goto mapalloc;
                }
        }

        /*
         * We store the page_t's for this new memory in the first
         * few pages of the chunk. Here, we go and get'em ...
         */

        /*
         * The expression after the '-' gives the number of pages
         * that will fit in the new memory based on a requirement
         * of (PAGESIZE + sizeof (page_t)) bytes per page.
         */
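        /*
         * Derivation sketch: with n usable pages and m metadata pages,
         * n + m == npgs and n * sizeof (page_t) <= m * PAGESIZE must
         * hold, so n <= npgs * PAGESIZE / (PAGESIZE + sizeof (page_t)).
         * The truncating division below rounds the remainder
         * m = npgs - n up in favor of the metadata.
         */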
        metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
            (PAGESIZE + sizeof (page_t)));

        npgs -= metapgs;
        base += metapgs;

        ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

        exhausted = (metapgs == 0 || npgs == 0);

        if (kpm_enable && !exhausted) {
                pgcnt_t start, end, nkpmpgs_prelim;
                size_t  ptsz;

                /*
                 * A viable kpm large page mapping must not overlap two
                 * dynamic memsegs. Therefore we check that the total
                 * size is at least kpm_pgsz and that the start and end
                 * points are kpm_pgsz aligned.
                 */
                if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
                    pmodkpmp(base + npgs)) {

                        kphysm_addmem_error_undospan(pt_base, tpgs);

                        /*
                         * There is no specific error code for violating
                         * kpm granularity constraints.
                         */
                        return (KPHYSM_ENOTVIABLE);
                }

                start = kpmptop(ptokpmp(base));
                end = kpmptop(ptokpmp(base + npgs));
                nkpmpgs_prelim = ptokpmp(end - start);
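
                /*
                 * Recompute the metadata size: the page_t array plus
                 * one KPMPAGE_T_SZ kpm descriptor for each large page
                 * spanned by the segment.
                 */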
                ptsz = npgs * sizeof (page_t);
                metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
                exhausted = (tpgs <= metapgs);
                if (!exhausted) {
                        npgs = tpgs - metapgs;
                        base = pt_base + metapgs;

                        /* final nkpmpgs */
                        start = kpmptop(ptokpmp(base));
                        nkpmpgs = ptokpmp(end - start);
                        kpm_pages_off = ptsz +
                            (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
                }
        }

        /*
         * Is memory area supplied too small?
         */
        if (exhausted) {
                kphysm_addmem_error_undospan(pt_base, tpgs);
                /*
                 * There is no specific error code for 'too small'.
                 */
                return (KPHYSM_ERESOURCE);
        }

mapalloc:
        /*
         * We may re-use a previously allocated VA space for the page_ts
         * eventually, but we need to initialize and lock the pages first.
         */

        /*
         * Get an address in the kernel address map, map
         * the page_t pages and see if we can touch them.
         */

        mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
        if (mapva == NULL) {
                cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
                    " Can't allocate VA for page_ts");

                if (meta_alloc)
                        memseg_free_meta(metabase, metapgs);
                kphysm_addmem_error_undospan(pt_base, tpgs);

                return (KPHYSM_ERESOURCE);
        }
        pp = mapva;

        if (physmax < (pt_base + tpgs))
                physmax = (pt_base + tpgs);

        /*
         * In the remapping code we map one page at a time so we must do
         * the same here to match mapping sizes.
         */
        pfn = pt_base;
        vaddr = (caddr_t)pp;
        for (pnum = 0; pnum < metapgs; pnum++) {
                if (meta_alloc)
                        pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
                hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
                    PROT_READ | PROT_WRITE,
                    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
                pfn++;
                vaddr += ptob(1);
        }

        if (ddi_peek32((dev_info_t *)NULL,
            (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

                cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
                    " Can't access pp array at 0x%p [phys 0x%lx]",
                    (void *)pp, pt_base);

                hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
                    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

                vmem_free(heap_arena, mapva, ptob(metapgs));
                if (meta_alloc)
                        memseg_free_meta(metabase, metapgs);
                kphysm_addmem_error_undospan(pt_base, tpgs);

                return (KPHYSM_EFAULT);
        }

        /*
         * Add this memory slice to its memory node translation.
         *
         * Note that right now, each node may have only one slice;
         * this may change with COD or in larger SSM systems with
         * nested latency groups, so we must not assume that the
         * node does not yet exist.
         *
         * Note that there may be multiple memory nodes associated with
         * a single lgrp node on x86 systems.
         */
        pnum = pt_base + tpgs - 1;
        mem_node_add_range(pt_base, pnum);

        /*
         * Allocate or resize page counters as necessary to accommodate
         * the increase in memory pages.
         */
        mnode = PFN_2_MEM_NODE(pnum);
        PAGE_CTRS_ADJUST(base, npgs, rv);
        if (rv) {

                mem_node_del_range(pt_base, pnum);

                /* clean up the page counters */
                page_ctrs_cleanup();

                hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
                    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

                vmem_free(heap_arena, mapva, ptob(metapgs));
                if (meta_alloc)
                        memseg_free_meta(metabase, metapgs);
                kphysm_addmem_error_undospan(pt_base, tpgs);

                return (KPHYSM_ERESOURCE);
        }

        /*
         * Update the phys_avail memory list.
         * The phys_install list was done at the start.
         */

        memlist_write_lock();

        mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
            (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
        ASSERT(mlret == MEML_SPANOP_OK);

        memlist_write_unlock();

        /* See if we can find a memseg to re-use. */
        if (meta_alloc) {
                seg = memseg_reuse(0);
                reuse = 1;      /* force unmapping of temp mapva */
                flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
                /*
                 * There is a 1:1 fixed relationship between a pfn
                 * and a page_t VA.  The pfn is used as an index into
                 * the ppvm_base page_t table in order to calculate
                 * the page_t base address for a given pfn range.
                 */
                segpp = ppvm_base + base;
        } else {
                seg = memseg_reuse(metapgs);
                reuse = (seg != NULL);
                flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
                segpp = pp;
        }

        /*
         * Initialize the memseg structure representing this memory
         * and add it to the existing list of memsegs. Do some basic
         * initialization and add the memory to the system.
         * In order to prevent lock deadlocks, the add_physmem()
         * code is repeated here, but split into several stages.
         *
         * If a memseg is reused, invalidate memseg pointers in
         * all cpu vm caches.  We need to do this since the check
         *      pp >= seg->pages && pp < seg->epages
         * used in various places is not atomic and so the first compare
         * can happen before reuse and the second compare after reuse.
         * The invalidation ensures that a memseg is not dereferenced
         * while its page/pfn pointers are changing.
         */
        if (seg == NULL) {
                seg = memseg_alloc();
                ASSERT(seg != NULL);
                seg->msegflags = flags;
                MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
                    (void *)seg, (void *)(seg->pages));
                seg->pages = segpp;
        } else {
                ASSERT(seg->msegflags == flags);
                ASSERT(seg->pages_base == seg->pages_end);
                MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
                    (void *)seg, (void *)(seg->pages));
                if (meta_alloc) {
                        memseg_cpu_vm_flush();
                        seg->pages = segpp;
                }
        }

        seg->epages = seg->pages + npgs;
        seg->pages_base = base;
        seg->pages_end = base + npgs;

        /*
         * Initialize metadata. The page_ts are set to locked state
         * ready to be freed.
         */
        bzero((caddr_t)pp, ptob(metapgs));

        pfn = seg->pages_base;
        /* Save the original pp base in case we reuse a memseg. */
        opp = pp;
        oepp = opp + npgs;
        for (pp = opp; pp < oepp; pp++) {
                pp->p_pagenum = pfn;
                pfn++;
                page_iolock_init(pp);
                while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
                        continue;
                pp->p_offset = (u_offset_t)-1;
        }

        if (reuse) {
                /* Remap our page_ts to the re-used memseg VA space. */
                pfn = pt_base;
                vaddr = (caddr_t)seg->pages;
                for (pnum = 0; pnum < metapgs; pnum++) {
                        if (meta_alloc)
                                pfn = memseg_get_metapfn(metabase,
                                    (pgcnt_t)pnum);
                        hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
                            PROT_READ | PROT_WRITE,
                            HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
                        pfn++;
                        vaddr += ptob(1);
                }

                hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
                    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

                vmem_free(heap_arena, mapva, ptob(metapgs));
        }

        hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

        memsegs_lock(1);

        /*
         * The new memseg is inserted at the beginning of the list.
         * Not only does this save searching for the tail, but in the
         * case of a re-used memseg, it solves the problem of what
         * happens if some process has still got a pointer to the
         * memseg and follows the next pointer to continue traversing
         * the memsegs list.
         */

        hat_kpm_addmem_mseg_insert(seg);

        seg->next = memsegs;
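        /*
         * Make the seg->next linkage (and all prior initialization of
         * the memseg) globally visible before the new memseg is
         * published via the memsegs list head below; lockless readers
         * may be traversing the list concurrently.
         */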
        membar_producer();

        hat_kpm_addmem_memsegs_update(seg);

        memsegs = seg;

        build_pfn_hash();

        total_pages += npgs;

        /*
         * Recalculate the paging parameters now total_pages has changed.
         * This will also cause the clock hands to be reset before next use.
         */
        setupclock();

        memsegs_unlock(1);

        PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

        /*
         * Free the pages outside the lock to avoid locking loops.
         */
        for (pp = seg->pages; pp < seg->epages; pp++) {
                page_free(pp, 1);
        }

        /*
         * Now that we've updated the appropriate memory lists we
         * need to reset a number of globals, since we've increased memory.
         * Several have already been updated for us as noted above. The
         * globals we're interested in at this point are:
         *   physmax - highest page frame number.
         *   physinstalled - number of pages currently installed (done earlier)
         *   maxmem - max free pages in the system
         *   physmem - physical memory pages available
         *   availrmem - real memory available
         */

        mutex_enter(&freemem_lock);
        maxmem += npgs;
        physmem += npgs;
        availrmem += npgs;
        availrmem_initial += npgs;

        mutex_exit(&freemem_lock);

        dump_resize();

        page_freelist_coalesce_all(mnode);

        kphysm_setup_post_add(npgs);

        cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
            "(0x%" PRIx64 ")\n",
            physinstalled << (PAGESHIFT - 10),
            (uint64_t)physinstalled << PAGESHIFT);

        avmem = (uint64_t)freemem << PAGESHIFT;
        cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
            "avail mem = %" PRId64 "\n", avmem);

        /*
         * Update lgroup generation number on single lgroup systems
         */
        if (nlgrps == 1)
                lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

        /*
         * Inform DDI of update
         */
        ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
            (uint64_t)(tpgs) << PAGESHIFT);

        delspan_unreserve(pt_base, tpgs);

        return (KPHYSM_OK);             /* Successfully added system memory */
}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
        int mlret;

        /* Unreserve memory span. */
        memlist_write_lock();

        mlret = memlist_delete_span(
            (uint64_t)(pt_base) << PAGESHIFT,
            (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

        ASSERT(mlret == MEML_SPANOP_OK);
        phys_install_has_changed();
        installed_top_size(phys_install, &physmax, &physinstalled);

        memlist_write_unlock();
        delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size,
 * if a size is required.
 * When the metadata area has its own virtual address space we
 * will need to manage this more carefully and do best-fit
 * allocations, possibly splitting an available area.
 */
struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
        int type;
        struct memseg **segpp, *seg;

        mutex_enter(&memseg_lists_lock);

        segpp = &memseg_va_avail;
        for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
                caddr_t end;

                /*
                 * Make sure we are reusing the right segment type.
                 */
                type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;

                if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
                    != type)
                        continue;

                if (kpm_enable)
                        end = hat_kpm_mseg_reuse(seg);
                else
                        end = (caddr_t)seg->epages;

                /*
                 * Check for the right size if it is provided.
                 */
                if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
                        *segpp = seg->lnext;
                        seg->lnext = NULL;
                        break;
                }
        }
        mutex_exit(&memseg_lists_lock);

        return (seg);
}

static uint_t handle_gen;

struct memdelspan {
        struct memdelspan *mds_next;
        pfn_t           mds_base;
        pgcnt_t         mds_npgs;
        uint_t          *mds_bitmap;
        uint_t          *mds_bitmap_retired;
};

#define NBPBMW          (sizeof (uint_t) * NBBY)
#define MDS_BITMAPBYTES(MDSP) \
        ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
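/*
 * I.e. one bit per page in the span, rounded up to whole uint_t words
 * and expressed in bytes; with 32-bit words, a 1000 page span needs
 * ceil(1000/32) == 32 words, or 128 bytes.
 */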

struct transit_list {
        struct transit_list     *trl_next;
        struct memdelspan       *trl_spans;
        int                     trl_collect;
};

struct transit_list_head {
        kmutex_t                trh_lock;
        struct transit_list     *trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);

#ifdef DEBUG
#define MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
struct mem_del_stat {
        uint_t  nloop;
        uint_t  need_free;
        uint_t  free_loop;
        uint_t  free_low;
        uint_t  free_failed;
        uint_t  ncheck;
        uint_t  nopaget;
        uint_t  lockfail;
        uint_t  nfree;
        uint_t  nreloc;
        uint_t  nrelocfail;
        uint_t  already_done;
        uint_t  first_notfree;
        uint_t  npplocked;
        uint_t  nlockreloc;
        uint_t  nnorepl;
        uint_t  nmodreloc;
        uint_t  ndestroy;
        uint_t  nputpage;
        uint_t  nnoreclaim;
        uint_t  ndelay;
        uint_t  demotefail;
        uint64_t nticks_total;
        uint64_t nticks_pgrp;
        uint_t  retired;
        uint_t  toxic;
        uint_t  failing;
        uint_t  modtoxic;
        uint_t  npplkdtoxic;
        uint_t  gptlmodfail;
        uint_t  gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic operations are required.
 */
#define MDSTAT_INCR(MHP, FLD)   (MHP)->mh_delstat.FLD++
#define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck))
#define MDSTAT_PGRP(MHP, ntck)  ((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define MDSTAT_PRINT(MHP)       mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define MDSTAT_INCR(MHP, FLD)
#define MDSTAT_TOTAL(MHP, ntck)
#define MDSTAT_PGRP(MHP, ntck)
#define MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */

typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
        MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, depending on mh_state.
 */
struct mem_handle {
        kmutex_t        mh_mutex;
        struct mem_handle *mh_next;
        memhandle_t     mh_exthandle;
        mhnd_state_t    mh_state;
        struct transit_list mh_transit;
        pgcnt_t         mh_phys_pages;
        pgcnt_t         mh_vm_pages;
        pgcnt_t         mh_hold_todo;
        void            (*mh_delete_complete)(void *, int error);
        void            *mh_delete_complete_arg;
        volatile uint_t mh_cancel;
        volatile uint_t mh_dr_aio_cleanup_cancel;
        volatile uint_t mh_aio_cleanup_done;
        kcondvar_t      mh_cv;
        kthread_id_t    mh_thread_id;
        page_t          *mh_deleted;    /* link through p_next */
#ifdef MEM_DEL_STATS
        struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle(void)
{
        struct mem_handle *mhp;

        mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
        mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
        mutex_enter(&mem_handle_list_mutex);
        mutex_enter(&mhp->mh_mutex);
        /* handle_gen is protected by list mutex. */
        mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
        mhp->mh_next = mem_handle_head;
        mem_handle_head = mhp;
        mutex_exit(&mem_handle_list_mutex);

        return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
        struct mem_handle **mhpp;

        ASSERT(mutex_owned(&mhp->mh_mutex));
        ASSERT(mhp->mh_state == MHND_FREE);
        /*
         * Exit the mutex to preserve locking order. This is OK
         * here as once in the FREE state, the handle cannot
         * be found by a lookup.
         */
        mutex_exit(&mhp->mh_mutex);

        mutex_enter(&mem_handle_list_mutex);
        mhpp = &mem_handle_head;
        while (*mhpp != NULL && *mhpp != mhp)
                mhpp = &(*mhpp)->mh_next;
        ASSERT(*mhpp == mhp);
        /*
         * No need to lock the handle (mh_mutex) as only
         * mh_next is changing and this is the only thread that
         * can be referencing mhp.
         */
        *mhpp = mhp->mh_next;
        mutex_exit(&mem_handle_list_mutex);

        mutex_destroy(&mhp->mh_mutex);
        kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
        struct mem_handle *mhp;

        mutex_enter(&mem_handle_list_mutex);
        for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
                if (mhp->mh_exthandle == handle) {
                        mutex_enter(&mhp->mh_mutex);
                        /*
                         * The state of the handle could have been changed
                         * by kphysm_del_release() while waiting for mh_mutex.
                         */
                        if (mhp->mh_state == MHND_FREE) {
                                mutex_exit(&mhp->mh_mutex);
                                continue;
                        }
                        break;
                }
        }
        mutex_exit(&mem_handle_list_mutex);
        return (mhp);
}
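
/*
 * Allocate a new memory-delete handle, place it in the MHND_INIT state
 * and hand its external identifier back to the caller.
 */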

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
        struct mem_handle *mhp;

        mhp = kphysm_allocate_mem_handle();
        /*
         * The handle is allocated using KM_SLEEP, so cannot fail.
         * If the implementation is changed, the correct error to return
         * here would be KPHYSM_ENOHANDLES.
         */
        ASSERT(mhp->mh_state == MHND_FREE);
        mhp->mh_state = MHND_INIT;
        *xmhp = mhp->mh_exthandle;
        mutex_exit(&mhp->mh_mutex);
        return (KPHYSM_OK);
}

static int
overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
{
        pfn_t e1, e2;

        e1 = b1 + l1;
        e2 = b2 + l2;
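
        /*
         * Half-open ranges [b1, e1) and [b2, e2) overlap unless one
         * ends at or before the other begins.
         */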

        return (!(b2 >= e1 || b1 >= e2));
}

static int can_remove_pgs(pgcnt_t);
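
/*
 * Intersect the span [base, base + npgs) with the phys_install memlist
 * and return the overlap as a chain of freshly allocated memdelspans.
 */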

static struct memdelspan *
span_to_install(pfn_t base, pgcnt_t npgs)
{
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        uint64_t address, size, thislen;
        struct memlist *mlp;

        mdsp_new = NULL;

        address = (uint64_t)base << PAGESHIFT;
        size = (uint64_t)npgs << PAGESHIFT;
        while (size != 0) {
                memlist_read_lock();
                for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) {
                        if (address >= (mlp->ml_address + mlp->ml_size))
                                continue;
                        if ((address + size) > mlp->ml_address)
                                break;
                }
                if (mlp == NULL) {
                        address += size;
                        size = 0;
                        thislen = 0;
                } else {
                        if (address < mlp->ml_address) {
                                size -= (mlp->ml_address - address);
                                address = mlp->ml_address;
                        }
                        ASSERT(address >= mlp->ml_address);
                        if ((address + size) >
                            (mlp->ml_address + mlp->ml_size)) {
                                thislen =
                                    mlp->ml_size - (address - mlp->ml_address);
                        } else {
                                thislen = size;
                        }
                }
                memlist_read_unlock();
                /* TODO: phys_install could change now */
                if (thislen == 0)
                        continue;
                mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
                mdsp->mds_base = btop(address);
                mdsp->mds_npgs = btop(thislen);
                mdsp->mds_next = mdsp_new;
                mdsp_new = mdsp;
                address += thislen;
                size -= thislen;
        }
        return (mdsp_new);
}

static void
free_delspans(struct memdelspan *mdsp)
{
        struct memdelspan *amdsp;

        while ((amdsp = mdsp) != NULL) {
                mdsp = amdsp->mds_next;
                kmem_free(amdsp, sizeof (struct memdelspan));
        }
}

/*
 * Concatenate lists. No list ordering is required.
 */

static void
delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
{
        while (*mdspp != NULL)
                mdspp = &(*mdspp)->mds_next;

        *mdspp = mdsp;
}

/*
 * Given a new list of delspans, check there is no overlap with
 * all existing span activity (add or delete) and then concatenate
 * the new spans to the given list.
 * Return 1 for OK, 0 if overlapping.
 */
static int
delspan_insert(
        struct transit_list *my_tlp,
        struct memdelspan *mdsp_new)
{
        struct transit_list_head *trh;
        struct transit_list *tlp;
        int ret;

        trh = &transit_list_head;

        ASSERT(my_tlp != NULL);
        ASSERT(mdsp_new != NULL);

        ret = 1;
        mutex_enter(&trh->trh_lock);
        /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
        for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
                struct memdelspan *mdsp;

                for (mdsp = tlp->trl_spans; mdsp != NULL;
                    mdsp = mdsp->mds_next) {
                        struct memdelspan *nmdsp;

                        for (nmdsp = mdsp_new; nmdsp != NULL;
                            nmdsp = nmdsp->mds_next) {
                                if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
                                    nmdsp->mds_base, nmdsp->mds_npgs)) {
                                        ret = 0;
                                        goto done;
                                }
                        }
                }
        }
done:
        if (ret != 0) {
                if (my_tlp->trl_spans == NULL)
                        transit_list_insert(my_tlp);
                delspan_concat(&my_tlp->trl_spans, mdsp_new);
        }
        mutex_exit(&trh->trh_lock);
        return (ret);
}

static void
delspan_remove(
        struct transit_list *my_tlp,
        pfn_t base,
        pgcnt_t npgs)
{
        struct transit_list_head *trh;
        struct memdelspan *mdsp;

        trh = &transit_list_head;

        ASSERT(my_tlp != NULL);

        mutex_enter(&trh->trh_lock);
        if ((mdsp = my_tlp->trl_spans) != NULL) {
                if (npgs == 0) {
                        my_tlp->trl_spans = NULL;
                        free_delspans(mdsp);
                        transit_list_remove(my_tlp);
                } else {
                        struct memdelspan **prv;

                        prv = &my_tlp->trl_spans;
                        while (mdsp != NULL) {
                                pfn_t p_end;

                                p_end = mdsp->mds_base + mdsp->mds_npgs;
                                if (mdsp->mds_base >= base &&
                                    p_end <= (base + npgs)) {
                                        *prv = mdsp->mds_next;
                                        mdsp->mds_next = NULL;
                                        free_delspans(mdsp);
                                } else {
                                        prv = &mdsp->mds_next;
                                }
                                mdsp = *prv;
                        }
                        if (my_tlp->trl_spans == NULL)
                                transit_list_remove(my_tlp);
                }
        }
        mutex_exit(&trh->trh_lock);
}

/*
 * Reserve interface used by add to stop a delete from starting before
 * the add has finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the trh_lock mutex in
 * struct transit_list_head.
 */

static struct transit_list reserve_transit;

static int
delspan_reserve(pfn_t base, pgcnt_t npgs)
{
        struct memdelspan *mdsp;
        int ret;

        mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
        mdsp->mds_base = base;
        mdsp->mds_npgs = npgs;
        if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
                free_delspans(mdsp);
        }
        return (ret);
}

static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
        delspan_remove(&reserve_transit, base, npgs);
}

/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
        return (seg->msegflags & MEMSEG_DYNAMIC);
}
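
/*
 * Validate that the given span can be deleted on this handle: intersect
 * it with phys_install, check for conflicts with other in-transit
 * spans, and verify that every covered page is relocatable, splitting
 * memsegs where necessary.  On success the span is recorded against
 * the handle.
 */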

int
kphysm_del_span(
        memhandle_t handle,
        pfn_t base,
        pgcnt_t npgs)
{
        struct mem_handle *mhp;
        struct memseg *seg;
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        pgcnt_t phys_pages, vm_pages;
        pfn_t p_end;
        page_t *pp;
        int ret;

        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        if (mhp->mh_state != MHND_INIT) {
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        }

        /*
         * Intersect the span with the installed memory list (phys_install).
         */
        mdsp_new = span_to_install(base, npgs);
        if (mdsp_new == NULL) {
                /*
                 * No physical memory in this range. Is this an
                 * error? If an attempt is made to start the delete
                 * after an OK return such as this, start will
                 * return an error.
                 * Could return KPHYSM_ENOWORK.
                 */
                /*
                 * It is assumed that there are no error returns
                 * from span_to_install() due to kmem_alloc failure.
                 */
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_OK);
        }
        /*
         * Does this span overlap an existing span?
         */
        if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
                /*
                 * Differentiate between already on list for this handle
                 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
                 */
                ret = KPHYSM_EBUSY;
                for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
                    mdsp = mdsp->mds_next) {
                        if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
                            base, npgs)) {
                                ret = KPHYSM_EDUP;
                                break;
                        }
                }
                mutex_exit(&mhp->mh_mutex);
                free_delspans(mdsp_new);
                return (ret);
        }
        /*
         * At this point the spans in mdsp_new have been inserted into the
         * list of spans for this handle and thereby to the global list of
         * spans being processed. Each of these spans must now be checked
         * for relocatability. As a side-effect segments in the memseg list
         * may be split.
         *
         * Note that mdsp_new can no longer be used as it is now part of
         * a larger list. Select elements of this larger list based
         * on base and npgs.
         */
restart:
        phys_pages = 0;
        vm_pages = 0;
        ret = KPHYSM_OK;
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                pgcnt_t pages_checked;

                if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
                        continue;
                }
                p_end = mdsp->mds_base + mdsp->mds_npgs;
                /*
                 * The pages_checked count is a hack. All pages should be
                 * checked for relocatability. Those not covered by memsegs
                 * should be tested with arch_kphysm_del_span_ok().
                 */
                pages_checked = 0;
                for (seg = memsegs; seg; seg = seg->next) {
                        pfn_t mseg_start;

                        if (seg->pages_base >= p_end ||
                            seg->pages_end <= mdsp->mds_base) {
                                /* Span and memseg don't overlap. */
                                continue;
                        }
                        mseg_start = memseg_get_start(seg);
                        /* Check that segment is suitable for delete. */
                        if (memseg_includes_meta(seg)) {
                                /*
                                 * Check that this segment is completely
                                 * within the span.
                                 */
                                if (mseg_start < mdsp->mds_base ||
                                    seg->pages_end > p_end) {
                                        ret = KPHYSM_EBUSY;
                                        break;
                                }
                                pages_checked += seg->pages_end - mseg_start;
                        } else {
                                /*
                                 * If this segment is larger than the span,
                                 * try to split it. After the split, it
                                 * is necessary to restart.
                                 */
                                if (seg->pages_base < mdsp->mds_base ||
                                    seg->pages_end > p_end) {
                                        pfn_t abase;
                                        pgcnt_t anpgs;
                                        int s_ret;

                                        /* Split required.  */
                                        if (mdsp->mds_base < seg->pages_base)
                                                abase = seg->pages_base;
                                        else
                                                abase = mdsp->mds_base;
                                        if (p_end > seg->pages_end)
                                                anpgs = seg->pages_end - abase;
                                        else
                                                anpgs = p_end - abase;
                                        s_ret = kphysm_split_memseg(abase,
                                            anpgs);
                                        if (s_ret == 0) {
                                                /* Split failed. */
                                                ret = KPHYSM_ERESOURCE;
                                                break;
                                        }
                                        goto restart;
                                }
                                pages_checked +=
                                    seg->pages_end - seg->pages_base;
                        }
                        /*
                         * The memseg is wholly within the delete span.
                         * The individual pages can now be checked.
                         */
                        /* Cage test. */
                        for (pp = seg->pages; pp < seg->epages; pp++) {
                                if (PP_ISNORELOC(pp)) {
                                        ret = KPHYSM_ENONRELOC;
                                        break;
                                }
                        }
                        if (ret != KPHYSM_OK) {
                                break;
                        }
                        phys_pages += (seg->pages_end - mseg_start);
                        vm_pages += MSEG_NPAGES(seg);
                }
                if (ret != KPHYSM_OK)
                        break;
                if (pages_checked != mdsp->mds_npgs) {
                        ret = KPHYSM_ENONRELOC;
                        break;
                }
        }

        if (ret == KPHYSM_OK) {
                mhp->mh_phys_pages += phys_pages;
                mhp->mh_vm_pages += vm_pages;
        } else {
                /*
                 * Keep holding the mh_mutex to prevent the handle
                 * from going away.
                 */
                delspan_remove(&mhp->mh_transit, base, npgs);
        }
        mutex_exit(&mhp->mh_mutex);
        return (ret);
}
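
/*
 * Report, for the given span, how many pages are installed, how many of
 * those are managed (covered by page_ts) and how many are
 * non-relocatable, together with the first and last non-relocatable
 * pfns.
 */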

int
kphysm_del_span_query(
        pfn_t base,
        pgcnt_t npgs,
        memquery_t *mqp)
{
        struct memdelspan *mdsp;
        struct memdelspan *mdsp_new;
        int done_first_nonreloc;

        mqp->phys_pages = 0;
        mqp->managed = 0;
        mqp->nonrelocatable = 0;
        mqp->first_nonrelocatable = 0;
        mqp->last_nonrelocatable = 0;

        mdsp_new = span_to_install(base, npgs);
        /*
         * It is OK to proceed here if mdsp_new == NULL.
         */
        done_first_nonreloc = 0;
        for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
                pfn_t sbase;
                pgcnt_t snpgs;

                mqp->phys_pages += mdsp->mds_npgs;
                sbase = mdsp->mds_base;
                snpgs = mdsp->mds_npgs;
                while (snpgs != 0) {
                        struct memseg *lseg, *seg;
                        pfn_t p_end;
                        page_t *pp;
                        pfn_t mseg_start;

                        p_end = sbase + snpgs;
                        /*
                         * Find the lowest addressed memseg that starts
                         * after sbase and account for it.
                         * This is to catch dynamic memsegs whose start
                         * is hidden.
                         */
                        seg = NULL;
                        for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
                                if ((lseg->pages_base >= sbase) ||
                                    (lseg->pages_base < p_end &&
                                    lseg->pages_end > sbase)) {
                                        if (seg == NULL ||
                                            seg->pages_base > lseg->pages_base)
                                                seg = lseg;
                                }
                        }
                        if (seg != NULL) {
                                mseg_start = memseg_get_start(seg);
                                /*
                                 * Now have the full extent of the memseg so
                                 * do the range check.
                                 */
                                if (mseg_start >= p_end ||
                                    seg->pages_end <= sbase) {
                                        /* Span does not overlap memseg. */
                                        seg = NULL;
                                }
                        }
                        /*
                         * Account for the gap before the segment, if
                         * there is one, or else to the end of the span.
                         */
                        if (seg == NULL || mseg_start > sbase) {
                                pfn_t a_end;

                                a_end = (seg == NULL) ? p_end : mseg_start;
                                /*
                                 * Check with arch layer for relocatability.
                                 */
                                if (arch_kphysm_del_span_ok(sbase,
                                    (a_end - sbase))) {
                                        /*
                                         * No non-relocatable pages in this
                                         * area; avoid the fine-grained
                                         * test.
                                         */
                                        snpgs -= (a_end - sbase);
                                        sbase = a_end;
                                }
                                while (sbase < a_end) {
                                        if (!arch_kphysm_del_span_ok(sbase,
                                            1)) {
                                                mqp->nonrelocatable++;
                                                if (!done_first_nonreloc) {
                                                        mqp->
                                                            first_nonrelocatable
                                                            = sbase;
                                                        done_first_nonreloc = 1;
                                                }
                                                mqp->last_nonrelocatable =
                                                    sbase;
                                        }
                                        sbase++;
                                        snpgs--;
                                }
                        }
                        if (seg != NULL) {
                                ASSERT(mseg_start <= sbase);
                                if (seg->pages_base != mseg_start &&
                                    seg->pages_base > sbase) {
                                        pgcnt_t skip_pgs;

                                        /*
                                         * Skip the page_t area of a
                                         * dynamic memseg.
                                         */
                                        skip_pgs = seg->pages_base - sbase;
                                        if (snpgs <= skip_pgs) {
                                                sbase += snpgs;
                                                snpgs = 0;
                                                continue;
                                        }
                                        snpgs -= skip_pgs;
                                        sbase += skip_pgs;
                                }
                                ASSERT(snpgs != 0);
                                ASSERT(seg->pages_base <= sbase);
                                /*
                                 * The individual pages can now be checked.
                                 */
                                for (pp = seg->pages +
                                    (sbase - seg->pages_base);
                                    snpgs != 0 && pp < seg->epages; pp++) {
                                        mqp->managed++;
                                        if (PP_ISNORELOC(pp)) {
                                                mqp->nonrelocatable++;
                                                if (!done_first_nonreloc) {
                                                        mqp->
                                                            first_nonrelocatable
                                                            = sbase;
                                                        done_first_nonreloc = 1;
                                                }
                                                mqp->last_nonrelocatable =
                                                    sbase;
                                        }
                                        sbase++;
                                        snpgs--;
                                }
                        }
                }
        }

        free_delspans(mdsp_new);

        return (KPHYSM_OK);
}

/*
 * This release function can be called at any stage as follows:
 *      _gethandle only called
 *      _span(s) only called
 *      _start called but failed
 *      delete thread exited
 */
int
kphysm_del_release(memhandle_t handle)
{
        struct mem_handle *mhp;

        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        switch (mhp->mh_state) {
        case MHND_STARTING:
        case MHND_RUNNING:
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ENOTFINISHED);
        case MHND_FREE:
                ASSERT(mhp->mh_state != MHND_FREE);
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_EHANDLE);
        case MHND_INIT:
                break;
        case MHND_DONE:
                break;
        case MHND_RELEASE:
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        default:
#ifdef DEBUG
                cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
                    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_EHANDLE);
        }
        /*
         * Set state so that we can wait if necessary.
         * Also this means that we have read/write access to all
         * fields except mh_exthandle and mh_state.
         */
        mhp->mh_state = MHND_RELEASE;
        /*
         * The mem_handle cannot be de-allocated by any other operation
         * now, so no need to hold mh_mutex.
         */
        mutex_exit(&mhp->mh_mutex);

        delspan_remove(&mhp->mh_transit, 0, 0);
        mhp->mh_phys_pages = 0;
        mhp->mh_vm_pages = 0;
        mhp->mh_hold_todo = 0;
        mhp->mh_delete_complete = NULL;
        mhp->mh_delete_complete_arg = NULL;
        mhp->mh_cancel = 0;

        mutex_enter(&mhp->mh_mutex);
        ASSERT(mhp->mh_state == MHND_RELEASE);
        mhp->mh_state = MHND_FREE;

        kphysm_free_mem_handle(mhp);

        return (KPHYSM_OK);
}

/*
 * This cancel function can only be called with the thread running.
 */
int
kphysm_del_cancel(memhandle_t handle)
{
        struct mem_handle *mhp;

        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ENOTRUNNING);
        }
        /*
         * Set the cancel flag and wake the delete thread up.
         * The thread may be waiting on I/O, so the effect of the cancel
         * may be delayed.
         */
        if (mhp->mh_cancel == 0) {
                mhp->mh_cancel = KPHYSM_ECANCELLED;
                cv_signal(&mhp->mh_cv);
        }
        mutex_exit(&mhp->mh_mutex);
        return (KPHYSM_OK);
}

int
kphysm_del_status(
        memhandle_t handle,
        memdelstat_t *mdstp)
{
        struct mem_handle *mhp;

        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        /*
         * Calling kphysm_del_status() is allowed before the delete
         * is started to allow for status display.
         */
        if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
            mhp->mh_state != MHND_RUNNING) {
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ENOTRUNNING);
        }
        mdstp->phys_pages = mhp->mh_phys_pages;
        mdstp->managed = mhp->mh_vm_pages;
        mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
        mutex_exit(&mhp->mh_mutex);
        return (KPHYSM_OK);
}

static int mem_delete_additional_pages = 100;

static int
can_remove_pgs(pgcnt_t npgs)
{
        /*
         * If all pageable pages were paged out, freemem would
         * equal availrmem.  There is a minimum requirement for
         * availrmem.
         */
        if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
            < npgs)
                return (0);
        /* TODO: check swap space, etc. */
        return (1);
}
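
/*
 * Check that npgs pages can be removed and, if so, debit them from
 * availrmem, all under freemem_lock.
 */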

static int
get_availrmem(pgcnt_t npgs)
{
        int ret;

        mutex_enter(&freemem_lock);
        ret = can_remove_pgs(npgs);
        if (ret != 0)
                availrmem -= npgs;
        mutex_exit(&freemem_lock);
        return (ret);
}

static void
put_availrmem(pgcnt_t npgs)
{
        mutex_enter(&freemem_lock);
        availrmem += npgs;
        mutex_exit(&freemem_lock);
}

#define FREEMEM_INCR    100
static pgcnt_t freemem_incr = FREEMEM_INCR;
#define DEL_FREE_WAIT_FRAC      4
#define DEL_FREE_WAIT_TICKS     ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

#define DEL_BUSY_WAIT_FRAC      20
#define DEL_BUSY_WAIT_TICKS     ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)
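/*
 * Both wait intervals compute hz/FRAC rounded up to at least one tick:
 * roughly a quarter of a second between free-memory retries and a
 * twentieth of a second between busy-page retries.
 */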

static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);
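
/*
 * Detach up to freemem_incr pages from freemem on behalf of the delete,
 * pressuring pageout and waiting as necessary; returns the number of
 * pages obtained, or 0 if the delete has been cancelled.
 */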

static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
        pgcnt_t free_get;
        int ret;

        ASSERT(MUTEX_HELD(&mhp->mh_mutex));

        MDSTAT_INCR(mhp, need_free);
        /*
         * Get up to freemem_incr pages.
         */
        free_get = freemem_incr;
        if (free_get > mhp->mh_hold_todo)
                free_get = mhp->mh_hold_todo;
        /*
         * Take free_get pages away from freemem,
         * waiting if necessary.
         */

        while (!mhp->mh_cancel) {
                mutex_exit(&mhp->mh_mutex);
                MDSTAT_INCR(mhp, free_loop);
                /*
                 * Duplicate test from page_create_throttle()
                 * but don't override with !PG_WAIT.
                 */
                if (freemem < (free_get + throttlefree)) {
                        MDSTAT_INCR(mhp, free_low);
                        ret = 0;
                } else {
                        ret = page_create_wait(free_get, 0);
                        if (ret == 0) {
                                /* EMPTY */
                                MDSTAT_INCR(mhp, free_failed);
                        }
                }
                if (ret != 0) {
                        mutex_enter(&mhp->mh_mutex);
                        return (free_get);
                }

                /*
                 * Put pressure on pageout.
                 */
                page_needfree(free_get);
                WAKE_PAGEOUT_SCANNER(delthr);

                mutex_enter(&mhp->mh_mutex);
                (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
                    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
                mutex_exit(&mhp->mh_mutex);
                page_needfree(-(spgcnt_t)free_get);

                mutex_enter(&mhp->mh_mutex);
        }
        return (0);
}

#define DR_AIO_CLEANUP_DELAY    25000   /* 0.025secs, in usec */
#define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100
/*
 * This function is run as a helper thread for delete_memory_thread.
 * It is needed in order to force kaio cleanup, so that pages used in kaio
 * will be unlocked and subsequently relocated by delete_memory_thread.
 * The address of the delete_memory_thread's mem_handle is passed in to
 * this thread function, and is used to set the mh_aio_cleanup_done member
 * prior to calling thread_exit().
 */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
        proc_t *procp;
        int (*aio_cleanup_dr_delete_memory)(proc_t *);
        int cleaned;
        int n = 0;
        struct mem_handle *mhp;
        volatile uint_t *pcancel;

        mhp = (struct mem_handle *)amhp;
        ASSERT(mhp != NULL);
        pcancel = &mhp->mh_dr_aio_cleanup_cancel;
        if (modload("sys", "kaio") == -1) {
                mhp->mh_aio_cleanup_done = 1;
                cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
                thread_exit();
        }
        aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
            modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
        if (aio_cleanup_dr_delete_memory == NULL) {
                mhp->mh_aio_cleanup_done = 1;
                cmn_err(CE_WARN,
            "aio_cleanup_dr_delete_memory not found in kaio");
                thread_exit();
        }
        do {
                cleaned = 0;
                mutex_enter(&pidlock);
                for (procp = practive; (*pcancel == 0) && (procp != NULL);
                    procp = procp->p_next) {
                        mutex_enter(&procp->p_lock);
                        if (procp->p_aio != NULL) {
                                /* cleanup proc's outstanding kaio */
                                cleaned +=
                                    (*aio_cleanup_dr_delete_memory)(procp);
                        }
                        mutex_exit(&procp->p_lock);
                }
                mutex_exit(&pidlock);
                if ((*pcancel == 0) &&
                    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
                        /* delay a bit before retrying all procs again */
                        delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
                        n = 0;
                }
        } while (*pcancel == 0);
        mhp->mh_aio_cleanup_done = 1;
        thread_exit();
}

static void
delete_memory_thread(caddr_t amhp)
{
        struct mem_handle *mhp;
        struct memdelspan *mdsp;
        callb_cpr_t cprinfo;
        page_t *pp_targ;
        spgcnt_t freemem_left;
        void (*del_complete_funcp)(void *, int error);
        void *del_complete_arg;
        int comp_code;
        int ret;
        int first_scan;
        uint_t szc;
#ifdef MEM_DEL_STATS
        uint64_t start_total, ntick_total;
        uint64_t start_pgrp, ntick_pgrp;
#endif /* MEM_DEL_STATS */

        mhp = (struct mem_handle *)amhp;

#ifdef MEM_DEL_STATS
        start_total = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */

        CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
            callb_generic_cpr, "memdel");

        mutex_enter(&mhp->mh_mutex);
        ASSERT(mhp->mh_state == MHND_STARTING);

        mhp->mh_state = MHND_RUNNING;
        mhp->mh_thread_id = curthread;

        mhp->mh_hold_todo = mhp->mh_vm_pages;
        mutex_exit(&mhp->mh_mutex);

        /* Allocate the remap pages now, if necessary. */
        memseg_remap_init();

        /*
         * Subtract from availrmem now if possible as availrmem
         * may not be available by the end of the delete.
         */
        if (!get_availrmem(mhp->mh_vm_pages)) {
                comp_code = KPHYSM_ENOTVIABLE;
                mutex_enter(&mhp->mh_mutex);
                goto early_exit;
        }

        ret = kphysm_setup_pre_del(mhp->mh_vm_pages);

        mutex_enter(&mhp->mh_mutex);

        if (ret != 0) {
                mhp->mh_cancel = KPHYSM_EREFUSED;
                goto refused;
        }

        transit_list_collect(mhp, 1);

        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                ASSERT(mdsp->mds_bitmap == NULL);
                mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
                mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
                    KM_SLEEP);
        }

        first_scan = 1;
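        /*
         * freemem_left counts pages already taken from freemem but not
         * yet consumed by collection; any surplus is returned through
         * page_create_putback() when the loop ends.
         */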
        freemem_left = 0;
        /*
         * Start dr_aio_cleanup_thread, which periodically iterates
         * through the process list and invokes aio cleanup.  This
         * is needed in order to avoid a deadly embrace between the
         * delete_memory_thread (waiting on writer lock for page, with the
         * exclusive-wanted bit set), kaio read request threads (waiting for a
         * reader lock on the same page that is wanted by the
         * delete_memory_thread), and threads waiting for kaio completion
         * (blocked on spt_amp->lock).
         */
        mhp->mh_dr_aio_cleanup_cancel = 0;
        mhp->mh_aio_cleanup_done = 0;
        (void) thread_create(NULL, 0, dr_aio_cleanup_thread,
            (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
        while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
                pgcnt_t collected;

                MDSTAT_INCR(mhp, nloop);
                collected = 0;
                for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
                    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
                        pfn_t pfn, p_end;

                        p_end = mdsp->mds_base + mdsp->mds_npgs;
                        for (pfn = mdsp->mds_base; (pfn < p_end) &&
                            (mhp->mh_cancel == 0); pfn++) {
                                page_t *pp, *tpp, *tpp_targ;
                                pgcnt_t bit;
                                struct vnode *vp;
                                u_offset_t offset;
                                int mod, result;
                                spgcnt_t pgcnt;

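                                /*
                                 * A set bit in mds_bitmap means the
                                 * corresponding page of the span needs
                                 * no further processing.
                                 */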
                                bit = pfn - mdsp->mds_base;
                                if ((mdsp->mds_bitmap[bit / NBPBMW] &
                                    (1 << (bit % NBPBMW))) != 0) {
                                        MDSTAT_INCR(mhp, already_done);
                                        continue;
                                }
                                if (freemem_left == 0) {
                                        freemem_left += delthr_get_freemem(mhp);
                                        if (freemem_left == 0)
                                                break;
                                }

                                /*
                                 * Release mh_mutex - some of this
                                 * stuff takes some time (e.g. PUTPAGE).
                                 */

                                mutex_exit(&mhp->mh_mutex);
                                MDSTAT_INCR(mhp, ncheck);

                                pp = page_numtopp_nolock(pfn);
                                if (pp == NULL) {
                                        /*
                                         * Not covered by a page_t - will
                                         * be dealt with elsewhere.
                                         */
                                        MDSTAT_INCR(mhp, nopaget);
                                        mutex_enter(&mhp->mh_mutex);
                                        mdsp->mds_bitmap[bit / NBPBMW] |=
                                            (1 << (bit % NBPBMW));
                                        continue;
                                }

                                if (!page_try_reclaim_lock(pp, SE_EXCL,
                                    SE_EXCL_WANTED | SE_RETIRED)) {
                                        /*
                                         * Page in use elsewhere.  Skip it.
                                         */
                                        MDSTAT_INCR(mhp, lockfail);
                                        mutex_enter(&mhp->mh_mutex);
                                        continue;
                                }
                                /*
                                 * See if the cage expanded into the delete.
                                 * This can happen as we have to allow the
                                 * cage to expand.
                                 */
                                if (PP_ISNORELOC(pp)) {
                                        page_unlock(pp);
                                        mutex_enter(&mhp->mh_mutex);
                                        mhp->mh_cancel = KPHYSM_ENONRELOC;
                                        break;
                                }
                                if (PP_RETIRED(pp)) {
                                        /*
                                         * Page has been retired and is
                                         * not part of the cage so we
                                         * can now do the accounting for
                                         * it.
                                         */
                                        MDSTAT_INCR(mhp, retired);
                                        mutex_enter(&mhp->mh_mutex);
                                        mdsp->mds_bitmap[bit / NBPBMW]
                                            |= (1 << (bit % NBPBMW));
                                        mdsp->mds_bitmap_retired[bit /
                                            NBPBMW] |=
                                            (1 << (bit % NBPBMW));
                                        mhp->mh_hold_todo--;
                                        continue;
                                }
                                ASSERT(freemem_left != 0);
                                if (PP_ISFREE(pp)) {
                                        /*
                                         * Like page_reclaim(), except
                                         * that the 'freemem' processing
                                         * is already done.
                                         */
                                        MDSTAT_INCR(mhp, nfree);
                                free_page_collect:
                                        if (PP_ISAGED(pp)) {
                                                page_list_sub(pp,
                                                    PG_FREE_LIST);
                                        } else {
                                                page_list_sub(pp,
                                                    PG_CACHE_LIST);
                                        }
                                        PP_CLRFREE(pp);
                                        PP_CLRAGED(pp);
                                        collected++;
                                        mutex_enter(&mhp->mh_mutex);
                                        page_delete_collect(pp, mhp);
                                        mdsp->mds_bitmap[bit / NBPBMW] |=
                                            (1 << (bit % NBPBMW));
                                        freemem_left--;
                                        continue;
                                }
                                ASSERT(pp->p_vnode != NULL);
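                                /*
                                 * On the first scan only free pages
                                 * are collected; in-use pages are
                                 * skipped until later passes.
                                 */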
                                if (first_scan) {
                                        MDSTAT_INCR(mhp, first_notfree);
                                        page_unlock(pp);
                                        mutex_enter(&mhp->mh_mutex);
                                        continue;
                                }
                                /*
                                 * Keep stats on pages encountered that
                                 * are marked for retirement.
                                 */
                                if (PP_TOXIC(pp)) {
                                        MDSTAT_INCR(mhp, toxic);
                                } else if (PP_PR_REQ(pp)) {
                                        MDSTAT_INCR(mhp, failing);
                                }
                                /*
                                 * In certain cases below, special exceptions
                                 * are made for pages that are toxic.  This
                                 * is because the current meaning of toxic
                                 * is that an uncorrectable error has been
                                 * previously associated with the page.
                                 */
                                if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                                        if (!PP_TOXIC(pp)) {
                                                /*
                                                 * Must relocate locked in
                                                 * memory pages.
                                                 */
#ifdef MEM_DEL_STATS
                                                start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
                                                /*
                                                 * Lock all constituent pages
                                                 * of a large page to ensure
                                                 * that p_szc won't change.
                                                 */
                                                if (!group_page_trylock(pp,
                                                    SE_EXCL)) {
                                                        MDSTAT_INCR(mhp,
                                                            gptllckfail);
                                                        page_unlock(pp);
                                                        mutex_enter(
                                                            &mhp->mh_mutex);
                                                        continue;
                                                }
                                                MDSTAT_INCR(mhp, npplocked);
                                                pp_targ =
                                                    page_get_replacement_page(
                                                    pp, NULL, 0);
                                                if (pp_targ != NULL) {
#ifdef MEM_DEL_STATS
                                                        ntick_pgrp =
                                                            (uint64_t)
                                                            ddi_get_lbolt() -
                                                            start_pgrp;
#endif /* MEM_DEL_STATS */
                                                        MDSTAT_PGRP(mhp,
                                                            ntick_pgrp);
                                                        MDSTAT_INCR(mhp,
                                                            nlockreloc);
                                                        goto reloc;
                                                }
                                                group_page_unlock(pp);
                                                page_unlock(pp);
#ifdef MEM_DEL_STATS
                                                ntick_pgrp =
                                                    (uint64_t)ddi_get_lbolt() -
                                                    start_pgrp;
#endif /* MEM_DEL_STATS */
                                                MDSTAT_PGRP(mhp, ntick_pgrp);
                                                MDSTAT_INCR(mhp, nnorepl);
                                                mutex_enter(&mhp->mh_mutex);
                                                continue;
                                        } else {
                                                /*
                                                 * Cannot do anything about
                                                 * this page because it is
                                                 * toxic.
                                                 */
                                                MDSTAT_INCR(mhp, npplkdtoxic);
                                                page_unlock(pp);
                                                mutex_enter(&mhp->mh_mutex);
                                                continue;
                                        }
                                }
                                /*
                                 * Unload the mappings and check if mod bit
                                 * is set.
                                 */
                                ASSERT(!PP_ISKAS(pp));
                                (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
                                mod = hat_ismod(pp);

#ifdef MEM_DEL_STATS
                                start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
                                if (mod && !PP_TOXIC(pp)) {
                                        /*
                                         * Lock all constituent pages
                                         * of a large page to ensure
                                         * that p_szc won't change.
                                         */
                                        if (!group_page_trylock(pp, SE_EXCL)) {
                                                MDSTAT_INCR(mhp, gptlmodfail);
                                                page_unlock(pp);
                                                mutex_enter(&mhp->mh_mutex);
                                                continue;
                                        }
                                        pp_targ = page_get_replacement_page(pp,
                                            NULL, 0);
                                        if (pp_targ != NULL) {
                                                MDSTAT_INCR(mhp, nmodreloc);
#ifdef MEM_DEL_STATS
                                                ntick_pgrp =
                                                    (uint64_t)ddi_get_lbolt() -
                                                    start_pgrp;
#endif /* MEM_DEL_STATS */
                                                MDSTAT_PGRP(mhp, ntick_pgrp);
                                                goto reloc;
                                        }
                                        group_page_unlock(pp);
                                }

                                if (!page_try_demote_pages(pp)) {
                                        MDSTAT_INCR(mhp, demotefail);
                                        page_unlock(pp);
#ifdef MEM_DEL_STATS
                                        ntick_pgrp = (uint64_t)ddi_get_lbolt() -
                                            start_pgrp;
#endif /* MEM_DEL_STATS */
                                        MDSTAT_PGRP(mhp, ntick_pgrp);
                                        mutex_enter(&mhp->mh_mutex);
                                        continue;
                                }

                                /*
                                 * Regular 'page-out'.
                                 */
                                if (!mod) {
                                        MDSTAT_INCR(mhp, ndestroy);
                                        page_destroy(pp, 1);
                                        /*
                                         * page_destroy was called with
                                         * dontfree. As long as p_lckcnt
                                         * and p_cowcnt are both zero, the
                                         * only additional action of
                                         * page_destroy with !dontfree is to
                                         * call page_free, so we can collect
                                         * the page here.
                                         */
                                        collected++;
#ifdef MEM_DEL_STATS
                                        ntick_pgrp = (uint64_t)ddi_get_lbolt() -
                                            start_pgrp;
#endif /* MEM_DEL_STATS */
                                        MDSTAT_PGRP(mhp, ntick_pgrp);
                                        mutex_enter(&mhp->mh_mutex);
                                        page_delete_collect(pp, mhp);
                                        mdsp->mds_bitmap[bit / NBPBMW] |=
                                            (1 << (bit % NBPBMW));
                                        continue;
                                }
                                /*
                                 * The page is toxic and the mod bit is
                                 * set, we cannot do anything here to deal
                                 * with it.
                                 */
                                if (PP_TOXIC(pp)) {
                                        page_unlock(pp);
#ifdef MEM_DEL_STATS
                                        ntick_pgrp = (uint64_t)ddi_get_lbolt() -
                                            start_pgrp;
#endif /* MEM_DEL_STATS */
                                        MDSTAT_PGRP(mhp, ntick_pgrp);
                                        MDSTAT_INCR(mhp, modtoxic);
                                        mutex_enter(&mhp->mh_mutex);
                                        continue;
                                }
                                MDSTAT_INCR(mhp, nputpage);
                                vp = pp->p_vnode;
                                offset = pp->p_offset;
                                VN_HOLD(vp);
                                page_unlock(pp);
                                (void) VOP_PUTPAGE(vp, offset, PAGESIZE,
                                    B_INVAL|B_FORCE, kcred, NULL);
                                VN_RELE(vp);
#ifdef MEM_DEL_STATS
                                ntick_pgrp = (uint64_t)ddi_get_lbolt() -
                                    start_pgrp;
#endif /* MEM_DEL_STATS */
                                MDSTAT_PGRP(mhp, ntick_pgrp);
                                /*
                                 * Try to get the page back immediately
                                 * so that it can be collected.
                                 */
                                pp = page_numtopp_nolock(pfn);
                                if (pp == NULL) {
                                        MDSTAT_INCR(mhp, nnoreclaim);
                                        /*
                                         * This should not happen, as this
                                         * thread is deleting the page.
                                         * If this code is ever generalized,
                                         * it could become a real
                                         * possibility.
                                         */
#ifdef DEBUG
                                        cmn_err(CE_WARN,
                                            "delete_memory_thread(0x%p) "
                                            "pfn 0x%lx has no page_t",
                                            (void *)mhp, pfn);
#endif /* DEBUG */
                                        mutex_enter(&mhp->mh_mutex);
                                        continue;
                                }
                                if (page_try_reclaim_lock(pp, SE_EXCL,
                                    SE_EXCL_WANTED | SE_RETIRED)) {
                                        if (PP_ISFREE(pp)) {
                                                goto free_page_collect;
                                        }
                                        page_unlock(pp);
                                }
                                MDSTAT_INCR(mhp, nnoreclaim);
                                mutex_enter(&mhp->mh_mutex);
                                continue;

                        reloc:
                                /*
                                 * Got some freemem and a target
                                 * page, so move the data to avoid
                                 * I/O and lock problems.
                                 */
                                ASSERT(!page_iolock_assert(pp));
                                MDSTAT_INCR(mhp, nreloc);
                                /*
                                 * page_relocate() returns, via pgcnt, the
                                 * number of consecutive pages relocated.
                                 * If it is successful, pp will be a
                                 * linked list of the page structs that
                                 * were relocated. If page_relocate() is
                                 * unsuccessful, pp will be unmodified.
                                 */
#ifdef MEM_DEL_STATS
                                start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
                                result = page_relocate(&pp, &pp_targ, 0, 0,
                                    &pgcnt, NULL);
#ifdef MEM_DEL_STATS
                                ntick_pgrp = (uint64_t)ddi_get_lbolt() -
                                    start_pgrp;
#endif /* MEM_DEL_STATS */
                                MDSTAT_PGRP(mhp, ntick_pgrp);
                                if (result != 0) {
                                        MDSTAT_INCR(mhp, nrelocfail);
                                        /*
                                         * We did not succeed, so give
                                         * the pp_targ pages back; this is
                                         * page_free(pp_targ, 1) without
                                         * the freemem accounting.
                                         */
                                        group_page_unlock(pp);
                                        page_free_replacement_page(pp_targ);
                                        page_unlock(pp);
                                        mutex_enter(&mhp->mh_mutex);
                                        continue;
                                }

                                /*
                                 * We will then collect pgcnt pages.
                                 */
                                ASSERT(pgcnt > 0);
                                mutex_enter(&mhp->mh_mutex);
                                /*
                                 * We need to make sure freemem_left is
                                 * large enough.
                                 */
                                while ((freemem_left < pgcnt) &&
                                    (!mhp->mh_cancel)) {
                                        freemem_left +=
                                            delthr_get_freemem(mhp);
                                }

                                /*
                                 * Do not proceed if mh_cancel is set.
                                 */
                                if (mhp->mh_cancel) {
                                        while (pp_targ != NULL) {
                                                /*
                                                 * Unlink and unlock each page.
                                                 */
                                                tpp_targ = pp_targ;
                                                page_sub(&pp_targ, tpp_targ);
                                                page_unlock(tpp_targ);
                                        }
                                        /*
                                         * Give the pp pages back; this is
                                         * page_free(pp, 1) without the
                                         * freemem accounting.
                                         */
                                        page_free_replacement_page(pp);
                                        break;
                                }

                                /* Now remove pgcnt from freemem_left */
                                freemem_left -= pgcnt;
                                ASSERT(freemem_left >= 0);
                                szc = pp->p_szc;
                                while (pp != NULL) {
                                        /*
                                         * pp and pp_targ were passed back as
                                         * a linked list of pages.
                                         * Unlink and unlock each page.
                                         */
                                        tpp_targ = pp_targ;
                                        page_sub(&pp_targ, tpp_targ);
                                        page_unlock(tpp_targ);
                                        /*
                                         * The original page is now free
                                         * so remove it from the linked
                                         * list and collect it.
                                         */
                                        tpp = pp;
                                        page_sub(&pp, tpp);
                                        pfn = page_pptonum(tpp);
                                        collected++;
                                        ASSERT(PAGE_EXCL(tpp));
                                        ASSERT(tpp->p_vnode == NULL);
                                        ASSERT(!hat_page_is_mapped(tpp));
                                        ASSERT(tpp->p_szc == szc);
                                        tpp->p_szc = 0;
                                        page_delete_collect(tpp, mhp);
                                        bit = pfn - mdsp->mds_base;
                                        mdsp->mds_bitmap[bit / NBPBMW] |=
                                            (1 << (bit % NBPBMW));
                                }
                                ASSERT(pp_targ == NULL);
                        }
                }
                first_scan = 0;
                if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
                    (collected == 0)) {
                        /*
                         * This timed wait is needed because there is
                         * no way to wait for either a page to become
                         * unlocked or for the delete to be cancelled.
                         * The delay also gives other threads a chance
                         * to run on our cpu; otherwise they could hold
                         * page locks indefinitely.
                         */
                        MDSTAT_INCR(mhp, ndelay);
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
                            DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
                        CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
                }
        }
        /* stop the dr aio cleanup thread */
        mhp->mh_dr_aio_cleanup_cancel = 1;
        transit_list_collect(mhp, 0);
        if (freemem_left != 0) {
                /* Return any surplus. */
                page_create_putback(freemem_left);
                freemem_left = 0;
        }
#ifdef MEM_DEL_STATS
        ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
#endif /* MEM_DEL_STATS */
        MDSTAT_TOTAL(mhp, ntick_total);
        MDSTAT_PRINT(mhp);

        /*
         * If the memory delete was cancelled, exclusive-wanted bits must
         * be cleared. If there are retired pages being deleted, they need
         * to be unretired.
         */
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                pfn_t pfn, p_end;

                p_end = mdsp->mds_base + mdsp->mds_npgs;
                for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
                        page_t *pp;
                        pgcnt_t bit;

                        bit = pfn - mdsp->mds_base;
                        if (mhp->mh_cancel) {
                                pp = page_numtopp_nolock(pfn);
                                if (pp != NULL) {
                                        if ((mdsp->mds_bitmap[bit / NBPBMW] &
                                            (1 << (bit % NBPBMW))) == 0) {
                                                page_lock_clr_exclwanted(pp);
                                        }
                                }
                        } else {
                                pp = NULL;
                        }
                        if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
                            (1 << (bit % NBPBMW))) != 0) {
                                /* do we already have pp? */
                                if (pp == NULL) {
                                        pp = page_numtopp_nolock(pfn);
                                }
                                ASSERT(pp != NULL);
                                ASSERT(PP_RETIRED(pp));
                                if (mhp->mh_cancel != 0) {
                                        page_unlock(pp);
                                        /*
                                         * To satisfy the ASSERT in the
                                         * cancel code below.
                                         */
                                        mhp->mh_hold_todo++;
                                } else {
                                        (void) page_unretire_pp(pp,
                                            PR_UNR_CLEAN);
                                }
                        }
                }
        }
        /*
         * Free retired page bitmap and collected page bitmap
         */
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                ASSERT(mdsp->mds_bitmap_retired != NULL);
                kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
                mdsp->mds_bitmap_retired = NULL;        /* Paranoia. */
                ASSERT(mdsp->mds_bitmap != NULL);
                kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
                mdsp->mds_bitmap = NULL;        /* Paranoia. */
        }

        /* wait for our dr aio cancel thread to exit */
        while (!(mhp->mh_aio_cleanup_done)) {
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
                CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
        }
refused:
        if (mhp->mh_cancel != 0) {
                page_t *pp;

                comp_code = mhp->mh_cancel;
                /*
                 * Go through list of deleted pages (mh_deleted) freeing
                 * them.
                 */
                while ((pp = mhp->mh_deleted) != NULL) {
                        mhp->mh_deleted = pp->p_next;
                        mhp->mh_hold_todo++;
                        mutex_exit(&mhp->mh_mutex);
                        /* Restore p_next. */
                        pp->p_next = pp->p_prev;
                        if (PP_ISFREE(pp)) {
                                cmn_err(CE_PANIC,
                                    "page %p is free",
                                    (void *)pp);
                        }
                        page_free(pp, 1);
                        mutex_enter(&mhp->mh_mutex);
                }
                ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);

                mutex_exit(&mhp->mh_mutex);
                put_availrmem(mhp->mh_vm_pages);
                mutex_enter(&mhp->mh_mutex);

                goto t_exit;
        }

        /*
         * All the pages are no longer in use and are exclusively locked.
         */

        mhp->mh_deleted = NULL;

        kphysm_del_cleanup(mhp);

        /*
         * mem_node_del_range() must run after kphysm_del_cleanup() so
         * that mem_node_config[] remains intact for the cleanup.
         */
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                mem_node_del_range(mdsp->mds_base,
                    mdsp->mds_base + mdsp->mds_npgs - 1);
        }
        /* cleanup the page counters */
        page_ctrs_cleanup();

        comp_code = KPHYSM_OK;

t_exit:
        mutex_exit(&mhp->mh_mutex);
        kphysm_setup_post_del(mhp->mh_vm_pages,
            (comp_code == KPHYSM_OK) ? 0 : 1);
        mutex_enter(&mhp->mh_mutex);

early_exit:
        /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
        mhp->mh_state = MHND_DONE;
        del_complete_funcp = mhp->mh_delete_complete;
        del_complete_arg = mhp->mh_delete_complete_arg;
        CALLB_CPR_EXIT(&cprinfo);
        (*del_complete_funcp)(del_complete_arg, comp_code);
        thread_exit();
        /*NOTREACHED*/
}

/*
 * Start the delete of the memory from the system.
 */
int
kphysm_del_start(
        memhandle_t handle,
        void (*complete)(void *, int),
        void *complete_arg)
{
        struct mem_handle *mhp;

        mhp = kphysm_lookup_mem_handle(handle);
        if (mhp == NULL) {
                return (KPHYSM_EHANDLE);
        }
        switch (mhp->mh_state) {
        case MHND_FREE:
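                /*
                 * Lookup should never return a handle in the FREE
                 * state; trip the assertion on DEBUG kernels.
                 */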
                ASSERT(mhp->mh_state != MHND_FREE);
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_EHANDLE);
        case MHND_INIT:
                break;
        case MHND_STARTING:
        case MHND_RUNNING:
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        case MHND_DONE:
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        case MHND_RELEASE:
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ESEQUENCE);
        default:
#ifdef DEBUG
                cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
                    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_EHANDLE);
        }

        if (mhp->mh_transit.trl_spans == NULL) {
                mutex_exit(&mhp->mh_mutex);
                return (KPHYSM_ENOWORK);
        }

        ASSERT(complete != NULL);
        mhp->mh_delete_complete = complete;
        mhp->mh_delete_complete_arg = complete_arg;
        mhp->mh_state = MHND_STARTING;
        /*
         * Release the mutex in case thread_create sleeps.
         */
        mutex_exit(&mhp->mh_mutex);

        /*
         * The "obvious" process for this thread is pageout (proc_pageout),
         * but that would give the thread too much power over freemem,
         * resulting in freemem starvation.
         */
        (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
            TS_RUN, maxclsyspri - 1);

        return (KPHYSM_OK);
}

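/*
 * pp_dummy is a page-aligned area of dummy page_t's onto which the
 * metadata (page_t) pages of deleted memsegs are remapped.
 */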
static kmutex_t pp_dummy_lock;          /* Protects init. of pp_dummy. */
static caddr_t pp_dummy;
static pgcnt_t pp_dummy_npages;
static pfn_t *pp_dummy_pfn;     /* Array of dummy pfns. */

static void
memseg_remap_init_pages(page_t *pages, page_t *epages)
{
        page_t *pp;

        for (pp = pages; pp < epages; pp++) {
                pp->p_pagenum = PFN_INVALID;    /* XXXX */
                pp->p_offset = (u_offset_t)-1;
                page_iolock_init(pp);
                while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
                        continue;
                page_lock_delete(pp);
        }
}

void
memseg_remap_init()
{
        mutex_enter(&pp_dummy_lock);
        if (pp_dummy == NULL) {
                uint_t dpages;
                int i;

                /*
                 * dpages starts off as the size of the structure and
                 * ends up as the minimum number of pages that will
                 * hold a whole number of page_t structures.
                 */
                dpages = sizeof (page_t);
                ASSERT(dpages != 0);
                ASSERT(dpages <= MMU_PAGESIZE);

                while ((dpages & 1) == 0)
                        dpages >>= 1;

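                /*
                 * For example, if sizeof (page_t) were 120 (8 * 15),
                 * dpages would end up as 15; with 4K pages, those 15
                 * pages would hold exactly 512 page_t's.
                 */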
                pp_dummy_npages = dpages;
                /*
                 * Allocate pp_dummy pages directly from static_arena,
                 * since these are whole page allocations and are
                 * referenced by physical address.  This also has the
                 * nice fringe benefit of hiding the memory from
                 * ::findleaks since it doesn't deal well with allocated
                 * kernel heap memory that doesn't have any mappings.
                 */
                pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
                    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
                bzero(pp_dummy, ptob(pp_dummy_npages));
                ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
                pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
                    pp_dummy_npages, KM_SLEEP);
                for (i = 0; i < pp_dummy_npages; i++) {
                        pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
                            &pp_dummy[MMU_PAGESIZE * i]);
                        ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
                }
                /*
                 * Initialize the page_t's to a known 'deleted' state
                 * that matches the state of deleted pages.
                 */
                memseg_remap_init_pages((page_t *)pp_dummy,
                    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
                /* Remove kmem mappings for the pages for safety. */
                hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
                    HAT_UNLOAD_UNLOCK);
                /* Leave pp_dummy pointer set as flag that init is done. */
        }
        mutex_exit(&pp_dummy_lock);
}

/*
 * Remap a page-aligned range of page_t's to dummy pages.
 */
void
remap_to_dummy(caddr_t va, pgcnt_t metapgs)
{
        int phase;

        ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE));

        /*
         * We may start remapping at a non-zero page offset
         * within the dummy pages since the low/high ends
         * of the outgoing pp's could be shared by other
         * memsegs (see memseg_remap_meta).
         */
        phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages;
        /*CONSTCOND*/
        ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
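        /*
         * For example, with pp_dummy_npages == 15 and btop(va) == 32,
         * phase would be 2 and the first page of page_t's would be
         * remapped to pp_dummy_pfn[2].
         */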

        while (metapgs != 0) {
                pgcnt_t n;
                int i, j;

                n = pp_dummy_npages;
                if (n > metapgs)
                        n = metapgs;
                for (i = 0; i < n; i++) {
                        j = (i + phase) % pp_dummy_npages;
                        hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
                            PROT_READ,
                            HAT_LOAD | HAT_LOAD_NOCONSIST |
                            HAT_LOAD_REMAP);
                        va += ptob(1);
                }
                metapgs -= n;
        }
}

static void
memseg_remap_to_dummy(struct memseg *seg)
{
        caddr_t pp;
        pgcnt_t metapgs;

        ASSERT(memseg_is_dynamic(seg));
        ASSERT(pp_dummy != NULL);

        if (!memseg_includes_meta(seg)) {
                memseg_remap_meta(seg);
                return;
        }

        pp = (caddr_t)seg->pages;
        metapgs = seg->pages_base - memseg_get_start(seg);
        ASSERT(metapgs != 0);

        seg->pages_end = seg->pages_base;

        remap_to_dummy(pp, metapgs);
}

/*
 * Transition all the deleted pages to the deleted state so that
 * page_lock will not wait. The page_lock_delete call will
 * also wake up any waiters.
 */
static void
memseg_lock_delete_all(struct memseg *seg)
{
        page_t *pp;

        for (pp = seg->pages; pp < seg->epages; pp++) {
                pp->p_pagenum = PFN_INVALID;    /* XXXX */
                page_lock_delete(pp);
        }
}

static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
        struct memdelspan       *mdsp;
        struct memseg           *seg;
        struct memseg           **segpp;
        struct memseg           *seglist;
        pfn_t                   p_end;
        uint64_t                avmem;
        pgcnt_t                 avpgs;
        pgcnt_t                 npgs;

        avpgs = mhp->mh_vm_pages;

        memsegs_lock(1);

        /*
         * remove from main segment list.
         */
        npgs = 0;
        seglist = NULL;
        for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
            mdsp = mdsp->mds_next) {
                p_end = mdsp->mds_base + mdsp->mds_npgs;
                for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
                        if (seg->pages_base >= p_end ||
                            seg->pages_end <= mdsp->mds_base) {
                                /* Span and memseg don't overlap. */
                                segpp = &((*segpp)->next);
                                continue;
                        }
                        ASSERT(seg->pages_base >= mdsp->mds_base);
                        ASSERT(seg->pages_end <= p_end);

                        PLCNT_MODIFY_MAX(seg->pages_base,
                            seg->pages_base - seg->pages_end);

                        /* Hide the memseg from future scans. */
                        hat_kpm_delmem_mseg_update(seg, segpp);
                        *segpp = seg->next;
                        membar_producer();      /* TODO: Needed? */
                        npgs += MSEG_NPAGES(seg);

                        /*
                         * Leave the deleted segment's next pointer intact
                         * in case a memsegs scanning loop is walking this
                         * segment concurrently.
                         */
                        seg->lnext = seglist;
                        seglist = seg;
                }
        }

        build_pfn_hash();

        ASSERT(npgs < total_pages);
        total_pages -= npgs;

        /*
         * Recalculate the paging parameters now that total_pages has
         * changed.  This will also cause the clock hands to be reset
         * before the next use.
         */
        setupclock();

        memsegs_unlock(1);

        mutex_exit(&mhp->mh_mutex);

        while ((seg = seglist) != NULL) {
                pfn_t mseg_start;
                pfn_t mseg_base, mseg_end;
                pgcnt_t mseg_npgs;
                int mlret;

                seglist = seg->lnext;

                /*
                 * Put the page_t's into the deleted state to stop
                 * cv_wait()s on the pages. When we remap, the dummy
                 * page_t's will be in the same state.
                 */
                memseg_lock_delete_all(seg);
                /*
                 * Record the information based on pages_base and
                 * pages_end now, so that the memseg can be flagged as
                 * deleted early by setting pages_end == pages_base.
                 */
                mseg_base = seg->pages_base;
                mseg_end = seg->pages_end;
                mseg_npgs = MSEG_NPAGES(seg);
                mseg_start = memseg_get_start(seg);

                if (memseg_is_dynamic(seg)) {
                        /* Remap the meta data to our special dummy area. */
                        memseg_remap_to_dummy(seg);

                        mutex_enter(&memseg_lists_lock);
                        seg->lnext = memseg_va_avail;
                        memseg_va_avail = seg;
                        mutex_exit(&memseg_lists_lock);
                } else {
                        /*
                         * For memory whose page_t's were allocated
                         * at boot, we need to find a new use for
                         * the page_t memory.
                         * For the moment, just leak it.
                         * (It is held on the memseg_delete_junk list.)
                         */
                        seg->pages_end = seg->pages_base;

                        mutex_enter(&memseg_lists_lock);
                        seg->lnext = memseg_delete_junk;
                        memseg_delete_junk = seg;
                        mutex_exit(&memseg_lists_lock);
                }

                /* Must not use seg now as it could be re-used. */

                memlist_write_lock();

                mlret = memlist_delete_span(
                    (uint64_t)(mseg_base) << PAGESHIFT,
                    (uint64_t)(mseg_npgs) << PAGESHIFT,
                    &phys_avail);
                ASSERT(mlret == MEML_SPANOP_OK);

                mlret = memlist_delete_span(
                    (uint64_t)(mseg_start) << PAGESHIFT,
                    (uint64_t)(mseg_end - mseg_start) <<
                    PAGESHIFT,
                    &phys_install);
                ASSERT(mlret == MEML_SPANOP_OK);
                phys_install_has_changed();

                memlist_write_unlock();
        }

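        /* Recompute physmax and physinstalled from the shrunken list. */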
        memlist_read_lock();
        installed_top_size(phys_install, &physmax, &physinstalled);
        memlist_read_unlock();

        mutex_enter(&freemem_lock);
        maxmem -= avpgs;
        physmem -= avpgs;
        /* availrmem is adjusted during the delete. */
        availrmem_initial -= avpgs;

        mutex_exit(&freemem_lock);

        dump_resize();

        cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
            "(0x%" PRIx64 ")\n",
            physinstalled << (PAGESHIFT - 10),
            (uint64_t)physinstalled << PAGESHIFT);

        avmem = (uint64_t)freemem << PAGESHIFT;
        cmn_err(CE_CONT, "?kphysm_delete: "
            "avail mem = %" PRId64 "\n", avmem);

        /*
         * Update lgroup generation number on single lgroup systems
         */
        if (nlgrps == 1)
                lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

        /* Successfully deleted system memory */
        mutex_enter(&mhp->mh_mutex);
}

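/*
 * Count of vnode-less pages found with waiters on p_cv during
 * collection.
 */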
static uint_t mdel_nullvp_waiter;

static void
page_delete_collect(
        page_t *pp,
        struct mem_handle *mhp)
{
        if (pp->p_vnode) {
                page_hashout(pp, (kmutex_t *)NULL);
                /* do not do PP_SETAGED(pp); */
        } else {
                kmutex_t *sep;

                sep = page_se_mutex(pp);
                mutex_enter(sep);
                if (CV_HAS_WAITERS(&pp->p_cv)) {
                        mdel_nullvp_waiter++;
                        cv_broadcast(&pp->p_cv);
                }
                mutex_exit(sep);
        }
        ASSERT(pp->p_next == pp->p_prev);
        ASSERT(pp->p_next == NULL || pp->p_next == pp);
        pp->p_next = mhp->mh_deleted;
        mhp->mh_deleted = pp;
        ASSERT(mhp->mh_hold_todo != 0);
        mhp->mh_hold_todo--;
}

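/*
 * Set or clear the collect flag on this delete's transit list, which
 * determines whether pfn_is_being_deleted() reports its spans.
 */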
static void
transit_list_collect(struct mem_handle *mhp, int v)
{
        struct transit_list_head *trh;

        trh = &transit_list_head;
        mutex_enter(&trh->trh_lock);
        mhp->mh_transit.trl_collect = v;
        mutex_exit(&trh->trh_lock);
}

static void
transit_list_insert(struct transit_list *tlp)
{
        struct transit_list_head *trh;

        trh = &transit_list_head;
        ASSERT(MUTEX_HELD(&trh->trh_lock));
        tlp->trl_next = trh->trh_head;
        trh->trh_head = tlp;
}

static void
transit_list_remove(struct transit_list *tlp)
{
        struct transit_list_head *trh;
        struct transit_list **tlpp;

        trh = &transit_list_head;
        tlpp = &trh->trh_head;
        ASSERT(MUTEX_HELD(&trh->trh_lock));
        while (*tlpp != NULL && *tlpp != tlp)
                tlpp = &(*tlpp)->trl_next;
        ASSERT(*tlpp != NULL);
        if (*tlpp == tlp)
                *tlpp = tlp->trl_next;
        tlp->trl_next = NULL;
}

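/*
 * Find the transit list, if any, whose spans include pfnum.  The
 * caller must hold trh->trh_lock.
 */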
static struct transit_list *
pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
{
        struct transit_list *tlp;

        for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
                struct memdelspan *mdsp;

                for (mdsp = tlp->trl_spans; mdsp != NULL;
                    mdsp = mdsp->mds_next) {
                        if (pfnum >= mdsp->mds_base &&
                            pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
                                return (tlp);
                        }
                }
        }
        return (NULL);
}

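/*
 * Return nonzero if pfnum lies within a memory delete span whose
 * pages are currently being collected.
 */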
int
pfn_is_being_deleted(pfn_t pfnum)
{
        struct transit_list_head *trh;
        struct transit_list *tlp;
        int ret;

        trh = &transit_list_head;
        if (trh->trh_head == NULL)
                return (0);

        mutex_enter(&trh->trh_lock);
        tlp = pfnum_to_transit_list(trh, pfnum);
        ret = (tlp != NULL && tlp->trl_collect);
        mutex_exit(&trh->trh_lock);

        return (ret);
}

#ifdef MEM_DEL_STATS
extern int hz;
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
        uint64_t tmp;

        if (mem_del_stat_print) {
                printf("memory delete loop %x/%x, statistics%s\n",
                    (uint_t)mhp->mh_transit.trl_spans->mds_base,
                    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
                    (mhp->mh_cancel ? " (cancelled)" : ""));
                printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
                printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
                printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
                printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
                printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
                printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
                printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
                printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
                printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
                printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
                printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
                printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
                printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
                printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
                printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
                printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
                printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
                printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
                printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
                printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
                printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
                printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
                printf("\t%8u retired\n", mhp->mh_delstat.retired);
                printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
                printf("\t%8u failing\n", mhp->mh_delstat.failing);
                printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
                printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
                printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
                printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);
                tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
                printf(
                    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
                    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);

                tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
                printf(
                    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
                    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
        }
}
#endif /* MEM_DEL_STATS */

struct mem_callback {
        kphysm_setup_vector_t   *vec;
        void                    *arg;
};

#define NMEMCALLBACKS           100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;

int
kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
{
        uint_t i, found;

        /*
         * This test will become more complicated when the version must
         * change.
         */
        if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
                return (EINVAL);

        if (vec->post_add == NULL || vec->pre_del == NULL ||
            vec->post_del == NULL)
                return (EINVAL);

        rw_enter(&mem_callback_rwlock, RW_WRITER);
        for (i = 0, found = 0; i < nmemcallbacks; i++) {
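                /* Remember the first free slot, biased by one (0 = none). */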
                if (mem_callbacks[i].vec == NULL && found == 0)
                        found = i + 1;
                if (mem_callbacks[i].vec == vec &&
                    mem_callbacks[i].arg == arg) {
#ifdef DEBUG
                        /* Catch this in DEBUG kernels. */
                        cmn_err(CE_WARN, "kphysm_setup_func_register"
                            "(0x%p, 0x%p) duplicate registration from 0x%p",
                            (void *)vec, arg, (void *)caller());
#endif /* DEBUG */
                        rw_exit(&mem_callback_rwlock);
                        return (EEXIST);
                }
        }
        if (found != 0) {
                i = found - 1;
        } else {
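                /*
                 * The ASSERT catches callback table exhaustion in DEBUG
                 * kernels; non-DEBUG kernels fail gracefully with ENOMEM.
                 */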
                ASSERT(nmemcallbacks < NMEMCALLBACKS);
                if (nmemcallbacks == NMEMCALLBACKS) {
                        rw_exit(&mem_callback_rwlock);
                        return (ENOMEM);
                }
                i = nmemcallbacks++;
        }
        mem_callbacks[i].vec = vec;
        mem_callbacks[i].arg = arg;
        rw_exit(&mem_callback_rwlock);
        return (0);
}

void
kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
{
        uint_t i;

        rw_enter(&mem_callback_rwlock, RW_WRITER);
        for (i = 0; i < nmemcallbacks; i++) {
                if (mem_callbacks[i].vec == vec &&
                    mem_callbacks[i].arg == arg) {
                        mem_callbacks[i].vec = NULL;
                        mem_callbacks[i].arg = NULL;
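                        /* Shrink the table only when its last slot empties. */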
                        if (i == (nmemcallbacks - 1))
                                nmemcallbacks--;
                        break;
                }
        }
        rw_exit(&mem_callback_rwlock);
}
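
/*
 * A minimal usage sketch (the vector, handlers, and argument below are
 * hypothetical and named only for illustration; the member names match
 * the checks in kphysm_setup_func_register() above):
 *
 *	static kphysm_setup_vector_t foo_vec = {
 *		.version = KPHYSM_SETUP_VECTOR_VERSION,
 *		.post_add = foo_post_add,
 *		.pre_del = foo_pre_del,
 *		.post_del = foo_post_del
 *	};
 *
 *	if (kphysm_setup_func_register(&foo_vec, foo_arg) == 0) {
 *		...
 *		kphysm_setup_func_unregister(&foo_vec, foo_arg);
 *	}
 *
 * Registering the same (vec, arg) pair twice fails with EEXIST, and at
 * most NMEMCALLBACKS distinct registrations are supported (ENOMEM).
 */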

static void
kphysm_setup_post_add(pgcnt_t delta_pages)
{
        uint_t i;

        rw_enter(&mem_callback_rwlock, RW_READER);
        for (i = 0; i < nmemcallbacks; i++) {
                if (mem_callbacks[i].vec != NULL) {
                        (*mem_callbacks[i].vec->post_add)
                            (mem_callbacks[i].arg, delta_pages);
                }
        }
        rw_exit(&mem_callback_rwlock);
}

/*
 * Note the locking between pre_del and post_del: the reader lock is held
 * across the two calls so that the set of registered callback functions
 * cannot change in between.
 */

static int
kphysm_setup_pre_del(pgcnt_t delta_pages)
{
        uint_t i;
        int ret;
        int aret;

        ret = 0;
        rw_enter(&mem_callback_rwlock, RW_READER);
        for (i = 0; i < nmemcallbacks; i++) {
                if (mem_callbacks[i].vec != NULL) {
                        aret = (*mem_callbacks[i].vec->pre_del)
                            (mem_callbacks[i].arg, delta_pages);
                        ret |= aret;
                }
        }

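        /*
         * Note that the reader lock is deliberately not dropped here;
         * it is released by kphysm_setup_post_del().
         */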
        return (ret);
}

static void
kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
{
        uint_t i;

        for (i = 0; i < nmemcallbacks; i++) {
                if (mem_callbacks[i].vec != NULL) {
                        (*mem_callbacks[i].vec->post_del)
                            (mem_callbacks[i].arg, delta_pages, cancelled);
                }
        }
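        /* Drop the reader lock acquired in kphysm_setup_pre_del(). */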
        rw_exit(&mem_callback_rwlock);
}

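/*
 * Split the memseg containing [base, base + npgs) into up to three
 * memsegs, so that the span ends up covered by its own memseg (seg_mid).
 * Returns 1 if a split was performed, 0 if the span cannot be split
 * (not wholly within a single memseg, the memseg carries its own page_t
 * metadata, or the span already covers the whole memseg).
 */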
static int
kphysm_split_memseg(
        pfn_t base,
        pgcnt_t npgs)
{
        struct memseg *seg;
        struct memseg **segpp;
        pgcnt_t size_low, size_high;
        struct memseg *seg_low, *seg_mid, *seg_high;

        /*
         * Lock the memsegs list against other updates now
         */
        memsegs_lock(1);

        /*
         * Find the boot-time memseg that wholly covers this area.
         */

        /* First find the memseg with page 'base' in it. */
        for (segpp = &memsegs; (seg = *segpp) != NULL;
            segpp = &((*segpp)->next)) {
                if (base >= seg->pages_base && base < seg->pages_end)
                        break;
        }
        if (seg == NULL) {
                memsegs_unlock(1);
                return (0);
        }
        if (memseg_includes_meta(seg)) {
                memsegs_unlock(1);
                return (0);
        }
        if ((base + npgs) > seg->pages_end) {
                memsegs_unlock(1);
                return (0);
        }

        /*
         * Work out the size of the two segments that will
         * surround the new segment, one for low address
         * and one for high.
         */
        ASSERT(base >= seg->pages_base);
        size_low = base - seg->pages_base;
        ASSERT(seg->pages_end >= (base + npgs));
        size_high = seg->pages_end - (base + npgs);
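
        /*
         * For example (pfn values purely illustrative): if seg spans
         * [0x1000, 0x5000) and we split out base = 0x2000, npgs = 0x1000,
         * then size_low = 0x1000 pages ([0x1000, 0x2000)) and
         * size_high = 0x2000 pages ([0x3000, 0x5000)).
         */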

        /*
         * Sanity check: if both sizes are zero, the span covers the
         * whole memseg and there is nothing to split off.
         */
        if ((size_low + size_high) == 0) {
                memsegs_unlock(1);
                return (0);
        }

        /*
         * Allocate the new structures. The old memseg will not be freed
         * as there may be a reference to it.
         */
        seg_low = NULL;
        seg_high = NULL;

        if (size_low != 0)
                seg_low = memseg_alloc();

        seg_mid = memseg_alloc();

        if (size_high != 0)
                seg_high = memseg_alloc();

        /*
         * All allocation done now.
         */
        if (size_low != 0) {
                seg_low->pages = seg->pages;
                seg_low->epages = seg_low->pages + size_low;
                seg_low->pages_base = seg->pages_base;
                seg_low->pages_end = seg_low->pages_base + size_low;
                seg_low->next = seg_mid;
                seg_low->msegflags = seg->msegflags;
        }
        if (size_high != 0) {
                seg_high->pages = seg->epages - size_high;
                seg_high->epages = seg_high->pages + size_high;
                seg_high->pages_base = seg->pages_end - size_high;
                seg_high->pages_end = seg_high->pages_base + size_high;
                seg_high->next = seg->next;
                seg_high->msegflags = seg->msegflags;
        }

        seg_mid->pages = seg->pages + size_low;
        seg_mid->pages_base = seg->pages_base + size_low;
        seg_mid->epages = seg->epages - size_high;
        seg_mid->pages_end = seg->pages_end - size_high;
        seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
        seg_mid->msegflags = seg->msegflags;

        /*
         * Update hat_kpm specific info of all involved memsegs and
         * allow hat_kpm specific global chain updates.
         */
        hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);

        /*
         * At this point we have two equivalent memseg sub-chains,
         * seg and seg_low/seg_mid/seg_high, which both chain on to
         * the same place in the global chain. By re-writing the pointer
         * in the previous element we switch atomically from using the old
         * (seg) to the new.
         */
        *segpp = (seg_low != NULL) ? seg_low : seg_mid;

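        /*
         * Order the memseg initialization above before any subsequent
         * access through the newly linked chain.
         */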
        membar_enter();

        build_pfn_hash();
        memsegs_unlock(1);

        /*
         * We leave the old segment, 'seg', intact as there may be
         * references to it. Also, as the value of total_pages has not
         * changed and the memsegs list is effectively the same when
         * accessed via the old or the new pointer, we do not have to
         * cause pageout_scanner() to re-evaluate its hand pointers.
         *
         * We currently do not re-use or reclaim the page_t memory.
         * If we do, then this may have to change.
         */

        mutex_enter(&memseg_lists_lock);
        seg->lnext = memseg_edit_junk;
        memseg_edit_junk = seg;
        mutex_exit(&memseg_lists_lock);

        return (1);
}

/*
 * The sfmmu hat layer, for example, accesses some parts of the memseg
 * structure using physical addresses. Therefore a kmem_cache is
 * used with KMC_NOHASH to avoid page crossings within a memseg
 * structure. KMC_NOHASH requires that no external (outside of
 * slab) information is allowed. This, in turn, implies that the
 * cache's slabsize must be exactly a single page, since per-slab
 * information (e.g. the freelist for the slab) is kept at the
 * end of the slab, where it is easy to locate. This should be
 * changed when a more suitable kmem_cache interface/flag becomes
 * available.
 */
void
mem_config_init()
{
        memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
            0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
}

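/*
 * Allocate a zeroed memseg structure from the cache created above.
 */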
struct memseg *
memseg_alloc()
{
        struct memseg *seg;

        seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
        bzero(seg, sizeof (struct memseg));

        return (seg);
}

/*
 * Return whether the page_t memory for this memseg
 * is included in the memseg itself.
 */
static int
memseg_includes_meta(struct memseg *seg)
{
        return (seg->msegflags & MEMSEG_META_INCL);
}

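/*
 * Return the first pfn of the physical span backing this memseg,
 * including any leading page_t metadata.
 */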
pfn_t
memseg_get_start(struct memseg *seg)
{
        pfn_t           pt_start;

        if (memseg_includes_meta(seg)) {
                pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);

                /* The page_t metadata is required to be at the beginning. */
                ASSERT(pt_start < seg->pages_base);
        } else
                pt_start = seg->pages_base;

        return (pt_start);
}

/*
 * Invalidate the memseg pointers cached in each CPU's private
 * VM data (vm_cpu_data_t).
 */
static void
memseg_cpu_vm_flush()
{
        cpu_t *cp;
        vm_cpu_data_t *vc;

        mutex_enter(&cpu_lock);
        pause_cpus(NULL, NULL);

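        /*
         * With all other CPUs paused, clear each CPU's cached memseg
         * pointers so that no stale reference survives a delete or split.
         */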
        cp = cpu_list;
        do {
                vc = cp->cpu_vm_data;
                vc->vc_pnum_memseg = NULL;
                vc->vc_pnext_memseg = NULL;
        } while ((cp = cp->cpu_next) != cpu_list);

        start_cpus();
        mutex_exit(&cpu_lock);
}