root/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libproc.h>
#include <limits.h>
#include <procfs.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include "rcapd.h"
#include "rcapd_rfd.h"
#include "rcapd_mapping.h"
#include "utils.h"

/*
 * Forward declaration.  lpc_xmap_update() is called by scan() to obtain a
 * victim's optional segment residency information before paging it out.
 */
static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
/*
 * DEBUG-only cross-check used by merge_current_pagedata() to validate the
 * mappings_changed result of OR_pagedata() against two lmapping_t lists.
 */
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */

/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.  scan() passes this count to rfd_reserve() immediately before
 * calling Pgrab().
 */
#define PGRAB_FD_COUNT          10

/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.  A cursor is initialized by
 * set_prpageheader_cur(), moved to the next mapping by
 * advance_prpageheader_cur_nextmapping(), and moved page by page within a
 * mapping by advance_prpageheader_cur().
 */
typedef struct prpageheader_cur {
        int pr_nmap;            /* number of mappings in address space */
        int pr_map;             /* index of this mapping; -1 before the */
                                /* first mapping has been loaded */
        uint64_t pr_pgoff;      /* page offset into mapping */
        uint64_t pr_npage;      /* number of pages in mapping */
        uint64_t pr_pagesize;   /* page size of mapping */
        uintptr_t pr_addr;      /* base of mapping */
        prpageheader_t *pr_prpageheader;        /* associated page header */
        void *pr_pdaddr;        /* address of page's byte in pagedata */
        prxmap_t *pr_xmap;      /* array containing per-segment information */
                                /* (may be NULL when pr_nxmap <= 0) */
        int pr_nxmap;           /* number of xmaps in array */
        int64_t pr_rss;         /* number of resident pages in mapping, */
                                /* or -1 if xmap is out of sync */
        int64_t pr_pg_rss;      /* number of pageable pages in mapping, or -1 */
                                /* if unknown (no matching xmap entry, or */
                                /* the mapping is shared) */
} prpageheader_cur_t;

/*
 * Handle of the process currently being scanned.  scan() assigns it from
 * Pgrab() each time it takes control of a victim.
 */
static struct ps_prochandle *scan_pr;   /* currently-scanned process's handle */

/*
 * Verbosity classes accepted by st_debug().  STDL_HIGH messages are only
 * emitted when the message priority is at least RCM_DEBUG_HIGH; STDL_NORMAL
 * messages require only RCM_DEBUG.
 */
typedef enum {
        STDL_NORMAL,
        STDL_HIGH
} st_debug_level_t;

/*
 * Emit a scanner debug message on behalf of a collection.  The caller's
 * printf-style format is prefixed with the collection's type ("project" or
 * "zone") and name before being handed to vdprintfe().  Messages are dropped
 * when the current message priority is below the threshold selected by level,
 * or when the temporary format buffer cannot be allocated.  The whole body
 * compiles away unless DEBUG_MSG is defined.
 */
/*PRINTFLIKE3*/ /*ARGSUSED*/
static void
st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
{
#ifdef DEBUG_MSG
        const char *kind;
        char *fmt;
        size_t fmtlen;
        va_list ap;

        /*
         * High-verbosity messages need RCM_DEBUG_HIGH; all others need only
         * RCM_DEBUG.
         */
        if (level == STDL_HIGH) {
                if (get_message_priority() < RCM_DEBUG_HIGH)
                        return;
        } else if (get_message_priority() < RCM_DEBUG) {
                return;
        }

        fmtlen = strlen(msg) + LINELEN;
        if ((fmt = malloc(fmtlen)) == NULL)
                return;

        if (lcol->lcol_id.rcid_type == RCIDT_PROJECT)
                kind = "project";
        else
                kind = "zone";
        (void) snprintf(fmt, fmtlen, "%s %s scanner %s", kind,
            lcol->lcol_name, msg);

        va_start(ap, msg);
        vdprintfe(RCM_DEBUG, fmt, ap);
        va_end(ap);

        free(fmt);
#endif /* DEBUG_MSG */
}

/*
 * Choose the collection's victim.  The supplied process is the starting
 * candidate; if it is NULL or no longer a member of the collection, the
 * search starts at the head of the collection's process list instead.  From
 * the starting candidate, the first process not marked unscannable is
 * returned, or NULL if the list is exhausted.
 */
static lprocess_t *
get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
{
        lprocess_t *cand = lpc;

        if (cand == NULL || !lcollection_member(lcol, cand))
                cand = lcol->lcol_lprocess;

        /*
         * Walk forward past every process that cannot be scanned.
         */
        for (; cand != NULL; cand = cand->lpc_next) {
                if (cand->lpc_unscannable == 0)
                        break;
        }

        return (cand);
}

/*
 * Get a process's combined current pagedata (per-page referenced and modified
 * bits) and set the supplied pointer to it.  The buffer is sized from fstat()
 * and must be filled by a single read(); if the read comes up short or fails
 * with E2BIG, the pagedata is assumed to have changed size and the whole
 * sequence is retried.
 *
 * pghpp        receives a malloc()ed buffer on success; the caller is
 *              responsible for freeing it.  It is set to NULL on every
 *              failure path.
 * fd           open descriptor for the process's pagedata file.
 *
 * Returns 0 on success.  If the pagedata is unreadable, -1 is returned and
 * errno is left as set by the failing call.
 */
static int
get_pagedata(prpageheader_t **pghpp, int fd)
{
        ssize_t res;            /* read() returns ssize_t, not int */
        struct stat st;

        *pghpp = NULL;

redo:
        errno = 0;
        if (fstat(fd, &st) != 0) {
                debug("cannot stat pagedata\n");
                return (-1);
        }

        errno = 0;
        *pghpp = malloc(st.st_size);
        if (*pghpp == NULL) {
                /*
                 * off_t need not be long; cast so the argument matches %ld.
                 */
                debug("cannot malloc() %ld bytes for pagedata",
                    (long)st.st_size);
                return (-1);
        }
        (void) bzero(*pghpp, st.st_size);

        errno = 0;
        if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
                free(*pghpp);
                *pghpp = NULL;
                if (res > 0 || errno == E2BIG) {
                        /*
                         * The address space changed between fstat() and
                         * read(); resize the buffer and try again.
                         */
                        debug("pagedata changed size, retrying\n");
                        goto redo;
                } else {
                        debug("cannot read pagedata");
                        return (-1);
                }
        }

        return (0);
}

/*
 * Total, in kilobytes, the pages described by the given pagedata whose state
 * byte has every bit of mask set and no bit of notmask set.  A mask of zero
 * therefore matches any page not excluded by notmask.  If CP_CLEAR is present
 * in flags, each page's state byte is zeroed after it has been examined.
 *
 * The pagedata consists of a prpageheader_t followed by one prasmap_t per
 * mapping, each trailed by one state byte per page and padded so that the
 * next prasmap_t starts on a 64-bit boundary.
 */
#define CP_CLEAR        1
static uint64_t
count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
{
        caddr_t pos = (caddr_t)pghp + sizeof (*pghp);
        uint64_t kbytes = 0;
        int idx;

        for (idx = 0; idx < pghp->pr_nmap; idx++) {
                prasmap_t *seg = (prasmap_t *)(uintptr_t)pos;
                caddr_t last;

                /*
                 * The per-page bytes immediately follow the prasmap_t.
                 */
                pos += sizeof (*seg);
                for (last = pos + seg->pr_npage; pos < last; pos++) {
                        if ((*pos & mask) == mask && (*pos & notmask) == 0)
                                kbytes += seg->pr_pagesize / 1024;
                        if ((flags & CP_CLEAR) != 0)
                                *pos = 0;
                }

                /*
                 * Round up to the 64-bit boundary at which the next
                 * prasmap_t begins.
                 */
                pos = (caddr_t)((intptr_t)(pos + 7) & ~7);
        }

        return (kbytes);
}

/*
 * Return the amount of memory (in kilobytes) described by the process's
 * pagedata that has been neither referenced nor modified; this is the memory
 * that will be paged out first.  As a side effect, every page's referenced
 * and modified bits are cleared (CP_CLEAR).  Should be written to exclude
 * nonresident pages when sufficient interfaces exist.
 */
static uint64_t
unrm_size(lprocess_t *lpc)
{
        const int touched = PG_MODIFIED | PG_REFERENCED;
        uint64_t unrm_kb;

        unrm_kb = count_pages(lpc->lpc_prpageheader, CP_CLEAR, 0, touched);

        return (unrm_kb);
}

/*
 * Advance a prpageheader_cur_t to the address space's next mapping, returning
 * its base address, or NULL if there is none.  Any known nonpageable or
 * nonresident mappings will be skipped over.
 *
 * On return with a non-NULL address, the cursor is positioned at the first
 * page of the new mapping: pr_pgoff is 0, pr_pdaddr points at that page's
 * byte in the pagedata, and pr_npage, pr_pagesize and pr_addr describe the
 * mapping.  pr_rss and pr_pg_rss are filled in from the matching xmap entry,
 * or left at -1 when none matches.
 */
static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
{
        prasmap_t *pap;
        int i;

next:
        ASSERT(pcp->pr_map < pcp->pr_nmap);
        /*
         * The current mapping is the last one; there is nothing to advance
         * to.
         */
        if ((pcp->pr_map + 1) == pcp->pr_nmap)
                return ((uintptr_t)NULL);
        pcp->pr_map++;
        /*
         * Step over the page bytes of the current mapping which have not
         * been visited yet, leaving pr_pdaddr just past its last page byte.
         */
        if (pcp->pr_pgoff < pcp->pr_npage) {
                pcp->pr_pdaddr = (caddr_t)(uintptr_t)
                    ((uintptr_t)pcp->pr_pdaddr +
                    (pcp->pr_npage - pcp->pr_pgoff));
                pcp->pr_pgoff = pcp->pr_npage;
        }
        /*
         * Skip to next 64-bit-aligned address to get the next prasmap_t.
         */
        pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
        pap = (prasmap_t *)pcp->pr_pdaddr;
        pcp->pr_pgoff = 0;
        pcp->pr_npage = pap->pr_npage;
        pcp->pr_pagesize = pap->pr_pagesize;
        pcp->pr_addr = pap->pr_vaddr;
        /* The mapping's per-page bytes start right after its prasmap_t. */
        pcp->pr_pdaddr = pap + 1;

        /*
         * Skip any known nonpageable mappings.  Currently, the only one
         * detected is the schedctl page, recognized as a single-page mapping
         * whose flags are exactly shared, readable, writable, executable and
         * anonymous.
         */
        if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
            MA_ANON)) == 0 && pap->pr_npage == 1) {
                debug("identified nonpageable schedctl mapping at %p\n",
                    (void *)pcp->pr_addr);
                goto next;
        }

        /*
         * Skip mappings with no resident pages.  If the xmap does not
         * correspond to the pagedata for any reason, it will be ignored.
         * An xmap entry corresponds when both its base address and its size
         * equal those of the pagedata mapping.  No lookup happens at all
         * when pr_nxmap is not positive.
         */
        pcp->pr_rss = -1;
        pcp->pr_pg_rss = -1;
        for (i = 0; i < pcp->pr_nxmap; i++) {
                prxmap_t *xmap = &pcp->pr_xmap[i];

                if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
                    (pcp->pr_npage * pcp->pr_pagesize)) {
                        pcp->pr_rss = xmap->pr_rss;
                        /*
                         * Remove COW pages from the pageable RSS count.
                         * Only non-shared mappings get a pageable count;
                         * for shared mappings pr_pg_rss stays -1.
                         */
                        if ((xmap->pr_mflags & MA_SHARED) == 0)
                                pcp->pr_pg_rss = xmap->pr_anon;
                        break;
                }
        }
        /*
         * A value of -1 (unknown) never triggers a skip; only a definite
         * zero does.
         */
        if (pcp->pr_rss == 0) {
                debug("identified nonresident mapping at 0x%p\n",
                    (void *)pcp->pr_addr);
                goto next;
        } else if (pcp->pr_pg_rss == 0) {
                debug("identified unpageable mapping at 0x%p\n",
                    (void *)pcp->pr_addr);
                goto next;
        }

        return (pcp->pr_addr);
}

/*
 * Step a prpageheader_cur_t to the next page of its current mapping.  The
 * virtual address of that page is returned.  If the cursor already sits on
 * the mapping's last page, it is left unchanged and NULL is returned.
 */
static void *
advance_prpageheader_cur(prpageheader_cur_t *pcp)
{
        uint64_t nextpg;

        ASSERT(pcp->pr_pgoff < pcp->pr_npage);
        nextpg = pcp->pr_pgoff + 1;
        if (nextpg == pcp->pr_npage)
                return (NULL);

        /*
         * Each page is described by one byte of pagedata.
         */
        pcp->pr_pgoff = nextpg;
        pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;

        ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
        return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
}

/*
 * Initialize a prpageheader_cur_t over the given pagedata and position it at
 * the first page of the first mapping that is not skipped by
 * advance_prpageheader_cur_nextmapping().  That mapping's base address is
 * returned, or NULL if there is no such mapping.  The xmap array is optional;
 * pass a nonpositive nxmap to disable residency-based skipping.
 */
static void *
set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap)
{
        uintptr_t first;

        bzero(pcp, sizeof (*pcp));

        pcp->pr_prpageheader = php;
        pcp->pr_nmap = php->pr_nmap;
        pcp->pr_xmap = xmap;
        pcp->pr_nxmap = nxmap;

        /*
         * Start just before the first mapping; its prasmap_t directly
         * follows the page header.
         */
        pcp->pr_map = -1;
        pcp->pr_pdaddr = php + 1;

        first = advance_prpageheader_cur_nextmapping(pcp);

        return ((void *)first);
}

/*
 * Initialize a prpageheader_cur_t and position it at the first mapped page
 * whose address is greater than or equal to the given value.  If naddr falls
 * inside a mapping, the cursor lands on the page containing it; if it falls
 * between mappings, the cursor lands on the first page of the next mapping.
 * The resulting page address is returned, or NULL if no mapping lies at or
 * beyond naddr.
 */
static void *
set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap, void *naddr)
{
        void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);

        while (addr != NULL && addr <= naddr) {
                void *mapend = (void *)((caddr_t)pcp->pr_addr +
                    pcp->pr_pagesize * pcp->pr_npage);
                uint64_t pgdiff;

                if (naddr >= mapend) {
                        /*
                         * The target lies beyond this mapping; try the next.
                         */
                        addr =
                            (void *)advance_prpageheader_cur_nextmapping(pcp);
                        continue;
                }

                /*
                 * The target is within this mapping; move to its page.
                 */
                pgdiff = ((uintptr_t)naddr - (uintptr_t)pcp->pr_addr) /
                    pcp->pr_pagesize;
                pcp->pr_pgoff += pgdiff;
                pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
                addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
                    pcp->pr_pgoff;
                break;
        }

        return (addr);
}

/*
 * Revocation callback registered with rfd_open() for a process's pagedata
 * descriptor.  It marks the owning process as no longer having a pagedata
 * descriptor, so the next merge_current_pagedata() reopens the file.
 */
static void
revoke_pagedata(rfd_t *rfd)
{
        lprocess_t *owner = rfd->rfd_data;
        int pid = (int)owner->lpc_pid;

        st_debug(STDL_NORMAL, owner->lpc_collection,
            "revoking pagedata for process %d\n", pid);
        ASSERT(owner->lpc_pgdata_fd != -1);
        owner->lpc_pgdata_fd = -1;
}

#ifdef DEBUG
/*
 * Build an lmapping_t list with one entry (base address and byte size) for
 * each mapping the cursor visits in the given pagedata.  The list at *lm
 * must be empty on entry.
 */
static void
mklmapping(lmapping_t **lm, prpageheader_t *pgh)
{
        prpageheader_cur_t pos;
        void *addr;

        addr = set_prpageheader_cur(&pos, pgh, NULL, -1);
        ASSERT(*lm == NULL);
        for (; addr != NULL;
            addr = (void *)advance_prpageheader_cur_nextmapping(&pos)) {
                (void) lmapping_insert(lm, pos.pr_addr,
                    pos.pr_npage * pos.pr_pagesize);
        }
}

/*
 * Write the head pointer of an lmapping_t list, followed by the address and
 * size of each of its entries, to the debug log.
 */
static void
lmapping_dump(lmapping_t *lm)
{
        lmapping_t *node;

        debug("lm: %p\n", (void *)lm);
        for (node = lm; node != NULL; node = node->lm_next) {
                debug("\t(%p, %llx\n", (void *)node->lm_addr,
                    (unsigned long long)node->lm_size);
        }
}
#endif /* DEBUG */

/*
 * OR two prpageheader_t-headed pagedata buffers which are supposedly
 * snapshots of the same address space.  The per-page bytes of src are ORed
 * into dst; src is not modified.  Intersecting mappings with different page
 * sizes are tolerated but not normalized (not accurate).  If the mappings of
 * the two snapshots differ in any regard, the supplied mappings_changed flag
 * will be set to 1; otherwise it is set to 0.
 *
 * Both cursors are created without xmap information, so no mapping is
 * skipped on the basis of residency.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
        prpageheader_cur_t src_cur;
        prpageheader_cur_t dst_cur;
        uintptr_t src_addr;
        uintptr_t dst_addr;
        int mappings_changed = 0;

        /*
         * OR source pagedata with the destination, for pages of intersecting
         * mappings.
         */
        src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
        dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
        while (src_addr != (uintptr_t)NULL && dst_addr != (uintptr_t)NULL) {
                /*
                 * Walk both mappings in lockstep for as long as the cursors
                 * are at the same page address, merging each page's bits.
                 */
                while (src_addr == dst_addr && src_addr != (uintptr_t)NULL) {
                        *(char *)dst_cur.pr_pdaddr |=
                            *(char *)src_cur.pr_pdaddr;
                        src_addr = (uintptr_t)advance_prpageheader_cur(
                            &src_cur);
                        dst_addr = (uintptr_t)advance_prpageheader_cur(
                            &dst_cur);
                }
                /*
                 * If the mappings ended together, both addresses are NULL
                 * and equal.  Any other outcome means the two mappings did
                 * not line up page for page.
                 */
                if (src_addr != dst_addr)
                        mappings_changed = 1;
                src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
                dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
                /*
                 * Resynchronize: while the two cursors are at different
                 * mappings, advance whichever is at the lower address until
                 * they meet or one snapshot is exhausted.
                 */
                while (src_addr != dst_addr && src_addr != (uintptr_t)NULL &&
                    dst_addr != (uintptr_t)NULL) {
                        mappings_changed = 1;
                        if (src_addr < dst_addr)
                                src_addr = advance_prpageheader_cur_nextmapping(
                                    &src_cur);
                        else
                                dst_addr = advance_prpageheader_cur_nextmapping(
                                    &dst_cur);
                }
        }

        *mappings_changedp = mappings_changed;
}

/*
 * Merge the current pagedata with that on hand.  A fresh snapshot is read
 * from the process's pagedata file, the bits of the previously held snapshot
 * (if any) are ORed into it, and the fresh snapshot replaces the old one in
 * lpc->lpc_prpageheader.
 *
 * If the pagedata is unretrievable for any reason, such as the process having
 * exited or being a zombie, a nonzero value is returned; the caller should
 * mark the process unscannable and avoid future attempts to scan it, since
 * the symptom is probably permanent.  Otherwise 0 is returned.
 *
 * If the mappings of either pagedata differ in any respect, or if there was
 * no previous snapshot, the supplied callback (which may be NULL) will be
 * invoked once.
 */
static int
merge_current_pagedata(lprocess_t *lpc,
    void(*mappings_changed_cb) (lprocess_t *))
{
        prpageheader_t *pghp;
        int mappings_changed = 0;
        uint64_t cnt;

        /*
         * Read through the descriptor already held, if there is one.  If
         * there is none, or the read fails, (re)open the pagedata file
         * through the rfd layer and try once more.
         *
         * NOTE(review): when the read on an already-open descriptor fails,
         * lpc_pgdata_fd is overwritten with the rfd_open() result without a
         * visible rfd_close() of the old descriptor -- confirm that the rfd
         * layer reclaims it.
         */
        if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
            0) {
                char pathbuf[PROC_PATH_MAX];

                (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
                    (int)lpc->lpc_pid);
                if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
                    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
                    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
                        return (-1);
                debug("starting/resuming pagedata collection for %d\n",
                    (int)lpc->lpc_pid);
        }

        /*
         * Report what the fresh snapshot alone shows as referenced and
         * modified.
         */
        cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
        if (cnt != 0 || lpc->lpc_rss != 0)
                debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
                    (int)lpc->lpc_pid, (unsigned long long)cnt,
                    (unsigned long long)lpc->lpc_rss);
        if (lpc->lpc_prpageheader != NULL) {
                /*
                 * OR the two snapshots.  The old snapshot is the source and
                 * the fresh one the destination, so the accumulated bits
                 * carry forward into the buffer which is kept.
                 */
#ifdef DEBUG
                lmapping_t *old = NULL;
                lmapping_t *new = NULL;

                mklmapping(&new, pghp);
                mklmapping(&old, lpc->lpc_prpageheader);
#endif /* DEBUG */
                OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
#ifdef DEBUG
                /*
                 * Cross-check OR_pagedata()'s verdict against an independent
                 * comparison of the two mapping lists; disagreement is fatal.
                 */
                if (((mappings_changed != 0) ^
                    (lmapping_dump_diff(old, new) != 0))) {
                        debug("lmapping_changed inconsistent with lmapping\n");
                        debug("old\n");
                        lmapping_dump(old);
                        debug("new\n");
                        lmapping_dump(new);
                        debug("ignored\n");
                        lmapping_dump(lpc->lpc_ignore);
                        ASSERT(0);
                }
                lmapping_free(&new);
                lmapping_free(&old);
#endif /* DEBUG */
                free(lpc->lpc_prpageheader);
        } else
                /* No previous snapshot: treat every mapping as new. */
                mappings_changed = 1;
        lpc->lpc_prpageheader = pghp;

        /*
         * Report the merged total, including bits carried over from the
         * previous snapshot.
         */
        cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
        if (cnt != 0 || lpc->lpc_rss != 0)
                debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
                    (int)lpc->lpc_pid, (unsigned long long)cnt,
                    (unsigned long long)lpc->lpc_rss);
        if (mappings_changed != 0) {
                debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
                if (mappings_changed_cb != NULL)
                        mappings_changed_cb(lpc);
        }
        return (0);
}

/*
 * Attempt to page out a region of the given process's address space by
 * issuing an asynchronous, invalidating MC_SYNC memcntl through the agent.
 *
 * pid          victim's process ID, used only in diagnostics.
 * Pr           handle of the grabbed victim.
 * start, end   the half-open address range [start, end) to page out; an
 *              empty or inverted range is a successful no-op.
 *
 * Returns 0 on success, or if the request failed with EBUSY.  Otherwise the
 * nonzero result of pr_memcntl() is returned, meaning that not all of the
 * pages were pageable, for whatever reason; errno then holds the error
 * reported by pr_memcntl().
 */
static int
pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
{
        int res;
        int err;

        if (end <= start)
                return (0);

        errno = 0;
        res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
            (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
        /*
         * Capture errno before logging, since the logging calls may
         * themselves change it.
         */
        err = errno;
        debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);

        /*
         * EBUSY indicates none of the pages have backing store allocated, or
         * some pages were locked, which are less interesting than other
         * conditions, which are noted.
         */
        if (res != 0) {
                if (err == EBUSY) {
                        res = 0;
                } else {
                        debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
                            (void *)start, (unsigned long long)(end - start),
                            err);
                }
        }

        /*
         * Leave the error from pr_memcntl(), not from logging, in errno.
         */
        errno = err;
        return (res);
}

/*
 * Compute the change in the victim process's RSS since the last call, as the
 * difference between the pr_rssize of a freshly read psinfo and that of the
 * previous one.  A shrinkage is credited to the collection's lcols_pg_eff
 * statistic, and the fresh psinfo then becomes the previous one.  If the
 * psinfo cannot be obtained, no work is done and zero is returned rather than
 * an error; it is up to the caller to detect the process' termination via
 * other means.
 */
static int64_t
rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
{
        int64_t d_rss;

        if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
            lprocess_update_psinfo_fd_cb, vic, vic) != 0)
                return (0);

        d_rss = (int64_t)new_psinfo->pr_rssize -
            (int64_t)old_psinfo->pr_rssize;
        if (d_rss < 0) {
                vic->lpc_collection->lcol_stat.lcols_pg_eff +=
                    (- d_rss);
        }
        *old_psinfo = *new_psinfo;

        return (d_rss);
}

/*
 * Empty the process's set of ignored mappings by freeing the whole
 * lpc_ignore list.  Used by scan() both as the mappings-changed callback for
 * merge_current_pagedata() and for its periodic flush of the ignored set.
 */
static void
unignore_mappings(lprocess_t *lpc)
{
        lmapping_free(&lpc->lpc_ignore);
}

/*
 * Remove from the process's ignored set every mapping that contains at least
 * one referenced or modified page, according to the pagedata on hand.  Each
 * mapping is scanned page by page; as soon as a touched page is found, the
 * mapping is removed from the set and the scan moves on to the next mapping.
 */
static void
unignore_referenced_mappings(lprocess_t *lpc)
{
        prpageheader_cur_t pos;
        void *addr;

        addr = set_prpageheader_cur(&pos, lpc->lpc_prpageheader, NULL, -1);
        while (addr != NULL) {
                char pgbits = *(char *)pos.pr_pdaddr;

                if ((pgbits & (PG_REFERENCED | PG_MODIFIED)) == 0) {
                        /*
                         * Untouched page: keep looking within this mapping
                         * for as long as it has pages left.
                         */
                        addr = advance_prpageheader_cur(&pos);
                        if (addr != NULL)
                                continue;
                } else if (lmapping_remove(&lpc->lpc_ignore, pos.pr_addr,
                    pos.pr_npage * pos.pr_pagesize) == 0) {
                        debug("removed mapping 0x%p+0t%llukB from"
                            " ignored set\n", (void *)pos.pr_addr,
                            (unsigned long long)(pos.pr_npage *
                            pos.pr_pagesize / 1024));
                }

                /*
                 * Either the mapping was touched or it has been exhausted.
                 */
                addr = (void *)advance_prpageheader_cur_nextmapping(&pos);
        }
}

/*
 * Resume scanning, starting with the last victim, if it is still valid, or any
 * other one, otherwise.
 */
void
scan(lcollection_t *lcol, int64_t excess)
{
        lprocess_t *vic, *lpc;
        void *vicaddr, *endaddr, *nvicaddr;
        prpageheader_cur_t cur;
        psinfo_t old_psinfo, new_psinfo;
        hrtime_t scan_start;
        int res, resumed;
        uint64_t col_unrm_size;

        st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
            (long long)excess);

        /*
         * Determine the address to start scanning at, depending on whether
         * scanning can be resumed.
         */
        endaddr = NULL;
        if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
            lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
                vicaddr = lcol->lcol_resaddr;
                st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
                    (int)vic->lpc_pid);
                resumed = 1;
        } else {
                vicaddr = NULL;
                resumed = 0;
        }

        scan_start = gethrtime();
        /*
         * Obtain the most current pagedata for the processes that might be
         * scanned, and remove from the ignored set any mappings which have
         * referenced or modified pages (in the hopes that the pageability of
         * the mapping's pages may have changed).  Determine if the
         * unreferenced and unmodified portion is impossibly small to suffice
         * to reduce the excess completely.  If so, ignore these bits so that
         * even working set will be paged out.
         */
        col_unrm_size = 0;
        lpc = vic;
        while (lpc != NULL && should_run) {
                if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
                        st_debug(STDL_NORMAL, lcol, "process %d:"
                            " exited/temporarily unscannable",
                            (int)lpc->lpc_pid);
                        goto next;
                }
                debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
                    (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
                    (unsigned long long)lpc->lpc_size);
                col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);

                if ((lcol->lcol_stat.lcols_scan_count %
                    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
                        /*
                         * Periodically clear the set of ignored mappings.
                         * This will allow processes whose ignored segments'
                         * pageability have changed (without a corresponding
                         * reference or modification to a page) to be
                         * recognized.
                         */
                        if (lcol->lcol_stat.lcols_scan_count > 0)
                                unignore_mappings(lpc);
                } else {
                        /*
                         * Ensure mappings with referenced or modified pages
                         * are not in the ignored set.  Their usage might mean
                         * the condition which made them unpageable is gone.
                         */
                        unignore_referenced_mappings(lpc);
                }
next:
                lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
                    lpc->lpc_next) : NULL;
        }
        if (col_unrm_size < excess) {
                lpc = vic;
                debug("will not reduce excess with only unreferenced pages\n");
                while (lpc != NULL && should_run) {
                        if (lpc->lpc_prpageheader != NULL) {
                                (void) count_pages(lpc->lpc_prpageheader,
                                    CP_CLEAR, 0, 0);
                                if (lpc->lpc_pgdata_fd >= 0) {
                                        if (rfd_close(lpc->lpc_pgdata_fd) != 0)
                                                debug("coud not close %d"
                                                    " lpc_pgdata_fd %d",
                                                    (int)lpc->lpc_pid,
                                                    lpc->lpc_pgdata_fd);
                                        lpc->lpc_pgdata_fd = -1;
                                }
                        }
                        lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
                            lpc->lpc_next) : NULL;
                }
        }

        /*
         * Examine each process for pages to remove until the excess is
         * reduced.
         */
        while (vic != NULL && excess > 0 && should_run) {
                /*
                 * Skip processes whose death was reported when the merging of
                 * pagedata was attempted.
                 */
                if (vic->lpc_prpageheader == NULL)
                        goto nextproc;

                /*
                 * Obtain optional segment residency information.
                 */
                if (lpc_xmap_update(vic) != 0)
                        st_debug(STDL_NORMAL, lcol, "process %d: xmap"
                            " unreadable; ignoring", (int)vic->lpc_pid);

#ifdef DEBUG_MSG
                {
                        void *ovicaddr = vicaddr;
#endif /* DEBUG_MSG */
                vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
                    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
#ifdef DEBUG_MSG
                        st_debug(STDL_NORMAL, lcol, "trying to resume from"
                            " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
                }
#endif /* DEBUG_MSG */

                /*
                 * Take control of the victim.
                 */
                if (get_psinfo(vic->lpc_pid, &old_psinfo,
                    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
                    vic, vic) != 0) {
                        st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
                            (int)vic->lpc_pid);
                        goto nextproc;
                }
                (void) rfd_reserve(PGRAB_FD_COUNT);
                if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
                        st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
                            (int)vic->lpc_pid, res);
                        goto nextproc;
                }
                if (Pcreate_agent(scan_pr) != 0) {
                        st_debug(STDL_NORMAL, lcol, "cannot control %d",
                            (int)vic->lpc_pid);
                        goto nextproc;
                }
                /*
                 * Be very pessimistic about the state of the agent LWP --
                 * verify it's actually stopped.
                 */
                errno = 0;
                while (Pstate(scan_pr) == PS_RUN)
                        (void) Pwait(scan_pr, 0);
                if (Pstate(scan_pr) != PS_STOP) {
                        st_debug(STDL_NORMAL, lcol, "agent not in expected"
                            " state (%d)", Pstate(scan_pr));
                        goto nextproc;
                }

                /*
                 * Within the victim's address space, find contiguous ranges of
                 * unreferenced pages to page out.
                 */
                st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
                    (int)vic->lpc_pid);
                while (excess > 0 && vicaddr != NULL && should_run) {
                        /*
                         * Skip mappings in the ignored set.  Mappings get
                         * placed in the ignored set when all their resident
                         * pages are unreferenced and unmodified, yet unpageable
                         * -- such as when they are locked, or involved in
                         * asynchronous I/O.  They will be scanned again when
                         * some page is referenced or modified.
                         */
                        if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
                            cur.pr_npage * cur.pr_pagesize)) {
                                debug("ignored mapping at 0x%p\n",
                                    (void *)cur.pr_addr);
                                /*
                                 * Update statistics.
                                 */
                                lcol->lcol_stat.lcols_pg_att +=
                                    cur.pr_npage * cur.pr_pagesize / 1024;

                                vicaddr = (void *)
                                    advance_prpageheader_cur_nextmapping(&cur);
                                continue;
                        }

                        /*
                         * Determine a range of unreferenced pages to page out,
                         * and clear the R/M bits in the preceding referenced
                         * range.
                         */
                        st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
                            " npage %llu\n", vicaddr,
                            (unsigned long long)cur.pr_npage);
                        while (vicaddr != NULL &&
                            *(caddr_t)cur.pr_pdaddr != 0) {
                                *(caddr_t)cur.pr_pdaddr = 0;
                                vicaddr = advance_prpageheader_cur(&cur);
                        }
                        st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
                            " %p\n", vicaddr, cur.pr_pdaddr);
                        if (vicaddr == NULL) {
                                /*
                                 * The end of mapping was reached before any
                                 * unreferenced pages were seen.
                                 */
                                vicaddr = (void *)
                                    advance_prpageheader_cur_nextmapping(&cur);
                                continue;
                        }
                        do
                                endaddr = advance_prpageheader_cur(&cur);
                        while (endaddr != NULL &&
                            *(caddr_t)cur.pr_pdaddr == 0 &&
                            (((intptr_t)endaddr - (intptr_t)vicaddr) /
                            1024) < excess)
                                ;
                        st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
                            endaddr, *(caddr_t)cur.pr_pdaddr);

                        /*
                         * Page out from vicaddr to the end of the mapping, or
                         * endaddr if set, then continue scanning after
                         * endaddr, or the next mapping, if not set.
                         */
                        nvicaddr = endaddr;
                        if (endaddr == NULL)
                                endaddr = (caddr_t)cur.pr_addr +
                                    cur.pr_pagesize * cur.pr_npage;
                        if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
                            0) {
                                int64_t d_rss, att;
                                int willignore = 0;

                                excess += (d_rss = rss_delta(
                                    &new_psinfo, &old_psinfo, vic));

                                /*
                                 * If this pageout attempt was unsuccessful
                                 * (the resident portion was not affected), and
                                 * was for the whole mapping, put it in the
                                 * ignored set, so it will not be scanned again
                                 * until some page is referenced or modified.
                                 */
                                if (d_rss >= 0 && (void *)cur.pr_addr ==
                                    vicaddr && (cur.pr_pagesize * cur.pr_npage)
                                    == ((uintptr_t)endaddr -
                                    (uintptr_t)vicaddr)) {
                                        if (lmapping_insert(
                                            &vic->lpc_ignore,
                                            cur.pr_addr,
                                            cur.pr_pagesize *
                                            cur.pr_npage) != 0)
                                                debug("not enough memory to add"
                                                    " mapping at %p to ignored"
                                                    " set\n",
                                                    (void *)cur.pr_addr);
                                        willignore = 1;
                                }

                                /*
                                 * Update statistics.
                                 */
                                lcol->lcol_stat.lcols_pg_att += (att =
                                    ((intptr_t)endaddr - (intptr_t)vicaddr) /
                                    1024);
                                st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
                                    "+0t(%llu/%llu)kB%s\n", vicaddr,
                                    (unsigned long long)((d_rss <
                                    0) ? - d_rss : 0), (unsigned long long)att,
                                    willignore ? " (will ignore)" : "");
                        } else {
                                st_debug(STDL_NORMAL, lcol,
                                    "process %d: exited/unscannable\n",
                                    (int)vic->lpc_pid);
                                vic->lpc_unscannable = 1;
                                goto nextproc;
                        }

                        /*
                         * Update the statistics file, if it's time.
                         */
                        check_update_statistics();

                        vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
                            *)advance_prpageheader_cur_nextmapping(&cur);
                }
                excess += rss_delta(&new_psinfo, &old_psinfo, vic);
                st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
                    (long long)excess);
nextproc:
                /*
                 * If a process was grabbed, release it, destroying its agent.
                 */
                if (scan_pr != NULL) {
                        (void) Prelease(scan_pr, 0);
                        scan_pr = NULL;
                }
                lcol->lcol_victim = vic;
                /*
                 * Scan the collection at most once.  Only if scanning was not
                 * aborted for any reason, and the end of lprocess has not been
                 * reached, determine the next victim and scan it.
                 */
                if (vic != NULL) {
                        if (vic->lpc_next != NULL) {
                                /*
                                 * Determine the next process to be scanned.
                                 */
                                if (excess > 0) {
                                        vic = get_valid_victim(lcol,
                                            vic->lpc_next);
                                        vicaddr = 0;
                                }
                        } else {
                                /*
                                 * A complete scan of the collection was made,
                                 * so tick the scan counter and stop scanning
                                 * until the next request.
                                 */
                                lcol->lcol_stat.lcols_scan_count++;
                                lcol->lcol_stat.lcols_scan_time_complete
                                    = lcol->lcol_stat.lcols_scan_time;
                                /*
                                 * If an excess still exists, tick the
                                 * "ineffective scan" counter, signalling that
                                 * the cap may be unenforceable.
                                 */
                                if (resumed == 0 && excess > 0)
                                        lcol->lcol_stat
                                            .lcols_scan_ineffective++;
                                /*
                                 * Scanning should start at the beginning of
                                 * the process list at the next request.
                                 */
                                if (excess > 0)
                                        vic = NULL;
                        }
                }
        }
        lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
        st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
            (long long)excess);

        lcol->lcol_resaddr = vicaddr;
        if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
                lcol->lcol_victim = get_valid_victim(lcol,
                    lcol->lcol_victim->lpc_next);
        }
}

/*
 * Abort the scan in progress, and destroy the agent LWP of any grabbed
 * processes.
 *
 * The process handle is cleared once it has been released, so that calling
 * this function again, or reaching any other code that releases a non-NULL
 * scan_pr, cannot release the same handle a second time.
 */
void
scan_abort(void)
{
        if (scan_pr != NULL) {
                (void) Prelease(scan_pr, 0);
                scan_pr = NULL;
        }
}

/*
 * Revocation callback handed to rfd_open() for a process's xmap descriptor.
 * The owning lprocess_t is recovered from rfd_data, and its lpc_xmap_fd is
 * reset to -1 so the descriptor will be reopened on the next xmap update.
 * The descriptor is not closed here (presumably the rfd layer takes care of
 * that -- confirm against the rfd implementation).
 */
static void
revoke_xmap(rfd_t *rfd)
{
        lprocess_t *lpc = rfd->rfd_data;
        int pid = (int)lpc->lpc_pid;

        debug("revoking xmap for process %d\n", pid);

        /* A revocation only makes sense for a descriptor that is held. */
        ASSERT(lpc->lpc_xmap_fd != -1);
        lpc->lpc_xmap_fd = -1;
}

/*
 * Retrieve the process's current xmap, which is used to determine the size of
 * the resident portion of its segments.  Return zero if successful.
 *
 * Any previously cached xmap is freed first.  On success, lpc_xmap points to
 * a malloc()ed copy of the xmap file and lpc_nxmap holds the number of entries
 * in it.  On failure, -1 is returned, lpc_xmap is NULL, and lpc_nxmap is -1.
 *
 * The xmap descriptor is opened on demand through rfd_open() and is left open
 * across calls.  It is closed (and lpc_xmap_fd reset to -1) only when it
 * cannot be stat()ed, when its size is not a whole number of entries, or when
 * the buffer cannot be allocated; a failed read leaves it open.
 */
static int
lpc_xmap_update(lprocess_t *lpc)
{
        ssize_t res;            /* pread() returns ssize_t, not int */
        struct stat st;

        free(lpc->lpc_xmap);
        lpc->lpc_xmap = NULL;
        lpc->lpc_nxmap = -1;

        if (lpc->lpc_xmap_fd == -1) {
                char pathbuf[PROC_PATH_MAX];

                (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
                    (int)lpc->lpc_pid);
                if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
                    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
                        return (-1);
        }

redo:
        errno = 0;
        if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
                debug("cannot stat xmap\n");
                goto closefd;
        }

        /*
         * The file must consist of a whole number of xmap entries.
         */
        if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
                debug("xmap wrong size\n");
                goto closefd;
        }

        lpc->lpc_xmap = malloc(st.st_size);
        if (lpc->lpc_xmap == NULL) {
                /*
                 * st_size is an off_t, whose width varies with the
                 * compilation environment, so widen it explicitly.
                 */
                debug("cannot malloc() %lld bytes for xmap",
                    (long long)st.st_size);
                goto closefd;
        }

        res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0);
        if (res != st.st_size) {
                free(lpc->lpc_xmap);
                lpc->lpc_xmap = NULL;
                if (res > 0) {
                        /*
                         * A short read means the xmap no longer has the size
                         * fstat() reported; start over with a fresh size.
                         */
                        debug("xmap changed size, retrying\n");
                        goto redo;
                }
                debug("cannot read xmap");
                return (-1);
        }
        lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);

        return (0);

closefd:
        /*
         * Give up on this descriptor, so that the next update reopens it.
         */
        (void) rfd_close(lpc->lpc_xmap_fd);
        lpc->lpc_xmap_fd = -1;
        return (-1);
}