root/usr/src/cmd/ppgsz/ppgsz.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2001-2003 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <ctype.h>
#include <string.h>
#include <signal.h>
#include <errno.h>
#include <dirent.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <libproc.h>
#include <sys/sysmacros.h>
#include <libgen.h>
#include <thread.h>

/*
 * Boolean constants, defined here only when none of the headers included
 * above has already provided them.
 */
#ifndef TRUE
#define TRUE    1
#endif
#ifndef FALSE
#define FALSE   0
#endif

/* libproc handle of the process currently being operated on by main() */
static struct   ps_prochandle *Pr;
/* basename of argv[0]; used as the prefix of every diagnostic */
static char     *command;
/* set to 1 by intr() when a caught signal arrives; polled by main() */
static volatile int interrupt;
/* flags handed to Pgrab(); becomes PGRAB_FORCE when -F is given */
static int      Fflag;
/* non-zero: launch a command (default); cleared by -p (operate on pids) */
static int      cflag = 1;

static void     intr(int);
static int      setpgsz(struct ps_prochandle *, int, size_t *);
static int      setpgsz_anon(struct ps_prochandle *, size_t, int);
static caddr_t  setup_mha(uint_t, size_t, int);
static size_t   discover_optimal_pagesize(struct ps_prochandle *,
                uint_t, pid_t);
static void     usage();

/*
 * Sentinel meaning "no page size given" or "invalid page size".  cnvpgsz()
 * only accepts values that pass ISP2(), so 3 cannot collide with a page
 * size that is actually accepted.
 */
#define INVPGSZ         3

/*
 * Sub-options accepted by -o.
 *
 * suboptstr[] is the keyword table handed to getsubopt() and
 * enum suboptenum names the index of each keyword, so the two must be kept
 * in the same order.  The same indices are used for the pgsz[] array in
 * main() and for the pgszcmd[] table in setpgsz().
 */

static char     *suboptstr[] = {
        "heap",
        "stack",
        "anon",
        NULL
};

enum    suboptenum {
        E_HEAP,
        E_STACK,
        E_ANON
};

/*
 * Convert a page size argument such as "4194304", "0x400000", "4M" or
 * "512k" into a byte count.
 *
 * The number is parsed by strtoll() with base 0, so decimal, octal and
 * hexadecimal notations are accepted.  It may be followed by one scale
 * suffix (K, M, G or T in either case, each a further factor of 1024),
 * by a bare 'B'/'b' meaning bytes, or by a scale suffix plus 'B'/'b'
 * (e.g. "4KB").  Nothing else may follow.
 *
 * Returns the size in bytes, or INVPGSZ when optarg is NULL or empty,
 * contains no digits, is negative, has trailing junk, or does not fit in
 * a size_t.
 */
static size_t
atosz(char *optarg)
{
        long long       val;
        size_t          sz;
        int             scale;
        char            *endptr;

        if (optarg == NULL || optarg[0] == '\0')
                return (INVPGSZ);

        errno = 0;
        val = strtoll(optarg, &endptr, 0);

        /*
         * endptr == optarg means no digits were consumed; without this
         * check an argument like "abc" would be read as a page size of 0.
         */
        if (endptr == optarg || errno == ERANGE || val < 0)
                return (INVPGSZ);

        /* Reject values that do not survive the narrowing to size_t. */
        sz = (size_t)val;
        if ((long long)sz != val)
                return (INVPGSZ);

        /* Number of times the value has to be multiplied by 1024. */
        switch (*endptr) {
        case 'T':
        case 't':
                scale = 4;
                break;
        case 'G':
        case 'g':
                scale = 3;
                break;
        case 'M':
        case 'm':
                scale = 2;
                break;
        case 'K':
        case 'k':
                scale = 1;
                break;
        default:
                scale = 0;
                break;
        }

        /* Step over the scale letter and an optional byte marker. */
        if (scale != 0)
                endptr++;
        if (*endptr == 'B' || *endptr == 'b')
                endptr++;
        if (*endptr != '\0')
                return (INVPGSZ);

        while (scale-- > 0) {
                if (sz > (size_t)-1 / 1024)
                        return (INVPGSZ);       /* would overflow */
                sz *= 1024;
        }
        return (sz);
}

/*
 * pgsz array sufficient for max page sizes
 *
 * pgsza[] has one slot per bit of a pointer and is filled in by getpgsz();
 * nelem is the number of entries that are valid.  cnvpgsz() treats
 * pgsza[0] as the smallest page size and scans down from pgsza[nelem - 1],
 * i.e. it assumes ascending order.
 */

static size_t   pgsza[8 * sizeof (void *)];
static int      nelem;

static void
getpgsz()
{
        if ((nelem = getpagesizes(NULL, 0)) == 0) {
                (void) fprintf(stderr, "%s: cannot determine system page"
                    " sizes\n", command);
                exit(125);
        }

        (void) getpagesizes(pgsza, nelem);
}

static size_t
cnvpgsz(char *optarg)
{
        size_t          pgsz = atosz(optarg);
        int             i;

        if (!ISP2(pgsz) || ((pgsz < pgsza[0]) && pgsz != 0)) {
                pgsz = INVPGSZ;
        } else {
                for (i = nelem - 1; i >= 0; i--) {
                        if (pgsz == pgsza[i])
                                break;
                        if (pgsz > pgsza[i]) {
                                pgsz = INVPGSZ;
                                break;
                        }
                }
        }
        if (pgsz == INVPGSZ) {
                if (optarg != NULL) {
                        (void) fprintf(stderr,
                            "%s: invalid page size specified (%s)\n",
                            command, optarg);
                } else {
                        usage();
                }
                exit(125);
        }
        return (pgsz);
}

/*
 * Write the usage message, prefixed with the program name, to stderr and
 * terminate the program with exit status 125.  Does not return.
 */
static void
usage()
{
        static const char       text[] =
            "usage:\t%s -o option[,option] [-F] cmd | -p pid ...\n"
            "    (set preferred page size of cmd or each process)\n"
            "    -o option[,option]: options are\n"
            "         stack=sz\n"
            "         heap=sz\n"
            "         anon=sz           (sz: valid page size or 0 (zero))\n"
            "    -F: force grabbing of the target process(es)\n"
            "    cmd: launch command\n"
            "    -p pid ...: process id list\n";

        (void) fprintf(stderr, text, command);
        exit(125);
}

/*
 * ppgsz entry point.
 *
 *      ppgsz -o option[,option] [-F] cmd | -p pid ...
 *
 * At least one of the heap=, stack= and anon= sub-options must be given.
 *
 * Without -p the remaining arguments are a command line: the command is
 * created under libproc control, the requested page sizes are applied,
 * the command is released to run and ppgsz waits for it.  With -p the
 * remaining arguments name existing processes, each of which is grabbed,
 * adjusted and released in turn.
 *
 * Exit status:
 *      command mode    the command's own exit status if it exited
 *                      normally, otherwise its wait status or'ed with
 *                      WCOREFLG; 126 if the command cannot be executed;
 *                      127 if it cannot be found
 *      pid mode        0 if every process was handled successfully
 *      either mode     125 for usage errors, interruption by a signal or
 *                      any other failure
 */
int
main(int argc, char *argv[])
{
        int             rc, err = 0;
        int             opt, subopt;
        int             errflg = 0;
        char            *options, *value;
        size_t          pgsz[] = {INVPGSZ, INVPGSZ, INVPGSZ};
        pid_t           pid;
        int             status;

        if ((command = strrchr(argv[0], '/')) != NULL)
                command++;
        else
                command = argv[0];

        /* cnvpgsz() below needs the table of supported page sizes */
        getpgsz();

        /* options */
        while ((opt = getopt(argc, argv, "o:Fp")) != EOF) {
                switch (opt) {
                case 'o':               /* options */
                        options = optarg;
                        while (*options != '\0') {
                                subopt = getsubopt(&options, suboptstr, &value);
                                switch (subopt) {
                                case E_HEAP:
                                case E_STACK:
                                case E_ANON:
                                        /* exits on an invalid size */
                                        pgsz[subopt] = cnvpgsz(value);
                                        break;
                                default:
                                        errflg = 1;
                                        break;
                                }
                        }
                        break;
                case 'F':               /* force grabbing (no O_EXCL) */
                        Fflag = PGRAB_FORCE;
                        break;
                case 'p':
                        cflag = 0;
                        break;
                default:
                        errflg = 1;
                        break;
                }
        }

        argc -= optind;
        argv += optind;

        /* nothing to set, a bad option, or no command/pid: usage error */
        if ((pgsz[E_HEAP] == INVPGSZ && pgsz[E_STACK] == INVPGSZ &&
            pgsz[E_ANON] == INVPGSZ) || errflg || argc <= 0) {
                usage();
        }

        /*
         * catch signals from terminal; a signal that is not at its default
         * disposition is left ignored
         */
        if (sigset(SIGHUP, SIG_IGN) == SIG_DFL)
                (void) sigset(SIGHUP, intr);
        if (sigset(SIGINT, SIG_IGN) == SIG_DFL)
                (void) sigset(SIGINT, intr);
        if (sigset(SIGQUIT, SIG_IGN) == SIG_DFL)
                (void) sigset(SIGQUIT, intr);
        (void) sigset(SIGTERM, intr);

        if (cflag && !interrupt) {              /* command */
                int             perr;   /* Pcreate() error code */
                char            path[PATH_MAX];

                Pr = Pcreate(argv[0], &argv[0], &perr, path, sizeof (path));
                if (Pr == NULL) {
                        switch (perr) {
                        case C_PERM:
                                (void) fprintf(stderr,
                                    "%s: cannot control set-id or "
                                    "unreadable object file: %s\n",
                                    command, path);
                                break;
                        case C_LP64:
                                (void) fprintf(stderr,
                                    "%s: cannot control _LP64 "
                                    "program: %s\n", command, path);
                                break;
                        case C_NOEXEC:
                                (void) fprintf(stderr, "%s: cannot execute "
                                    "program: %s\n", command, argv[0]);
                                exit(126);
                                break;
                        case C_NOENT:
                                (void) fprintf(stderr, "%s: cannot find "
                                    "program: %s\n", command, argv[0]);
                                exit(127);
                                break;
                        case C_STRANGE:
                                /* no message of our own for this code */
                                break;
                        default:
                                (void) fprintf(stderr,
                                    "%s: %s\n", command, Pcreate_error(perr));
                                break;
                        }
                        exit(125);
                }

                if ((rc = setpgsz(Pr, Pstatus(Pr)->pr_dmodel, pgsz)) != 0) {
                        (void) fprintf(stderr, "%s: set page size "
                            "failed for program: %s\n", command, argv[0]);
                        /* do not let the half-configured command run */
                        (void) pr_exit(Pr, 1);
                        exit(125);
                }

                /*
                 * release the command to run, wait for it and
                 * return its exit status if we can.
                 */
                Prelease(Pr, 0);
                do {
                        pid = wait(&status);
                } while (pid == -1 && errno == EINTR);

                if (pid == -1) {
                        (void) fprintf(stderr, "%s: wait() error: %s\n",
                            command, strerror(errno));
                        exit(125);
                }

                /*
                 * Pass thru the child's exit value.
                 */
                if (WIFEXITED(status))
                        exit(WEXITSTATUS(status));
                exit(status | WCOREFLG);
        }

        /* process pids */

        while (--argc >= 0 && !interrupt) {
                char *arg;
                psinfo_t psinfo;
                int gret;

                (void) fflush(stdout);  /* line-at-a-time */

                /* get the specified pid and the psinfo struct */
                arg = *argv++;
                pid = proc_arg_psinfo(arg, PR_ARG_PIDS, &psinfo, &gret);

                if (pid == -1) {
                        (void) fprintf(stderr, "%s: cannot examine pid %s:"
                            " %s\n", command, arg, Pgrab_error(gret));
                        /*
                         * An argument that is neither numeric nor a /proc
                         * path is most likely a command name given after
                         * -p by mistake.  The cast keeps isdigit() defined
                         * for characters with the high bit set.
                         */
                        if (!isdigit((unsigned char)arg[0]) &&
                            strncmp(arg, "/proc/", 6)) {
                                (void) fprintf(stderr,
                                    "\tdo not use -p option"
                                    " to launch a command\n");
                        }
                        err++;
                } else if ((Pr = Pgrab(pid, Fflag, &gret)) != NULL) {
                        rc = setpgsz(Pr, Pstatus(Pr)->pr_dmodel, pgsz);
                        if (rc != 0) {
                                (void) fprintf(stderr, "%s: set page size "
                                    "failed for pid: %d\n", command, (int)pid);
                                err++;
                        }
                        Prelease(Pr, 0);
                        Pr = NULL;
                } else {
                        switch (gret) {
                        case G_SYS:
                                proc_unctrl_psinfo(&psinfo);
                                (void) fprintf(stderr, "%s: cannot set page "
                                    "size for system process: %d [ %s ]\n",
                                    command, (int)pid, psinfo.pr_psargs);
                                err++;
                                break;
                        case G_SELF:
                                /* do it to own self */
                                rc = setpgsz(NULL, psinfo.pr_dmodel, pgsz);
                                if (rc != 0) {
                                        (void) fprintf(stderr, "%s: set page "
                                            "size failed for self: %d\n",
                                            command, (int)pid);
                                        err++;
                                }
                                break;
                        default:
                                (void) fprintf(stderr, "%s: %s: %d\n",
                                    command, Pgrab_error(gret), (int)pid);
                                err++;
                                break;
                        }
                }
        }

        if (interrupt || err)
                exit(125);

        return (0);
}

/*
 * Signal handler installed by main() for SIGHUP, SIGINT, SIGQUIT and
 * SIGTERM.  It only records that a signal arrived by setting the global
 * interrupt flag, which main() checks before starting on each target; the
 * signal number itself is not used.
 */
/* ARGSUSED */
static void
intr(int sig)
{
        interrupt = 1;
}

/* ------ begin specific code ------ */

/*
 * Set process page size.
 *
 * Applies every page size in pgsz[] (indexed by enum suboptenum) that is
 * not INVPGSZ to the target process.  Heap and stack sizes are set with a
 * single MC_HAT_ADVISE request each; the anon size is delegated to
 * setpgsz_anon().
 *
 *      Pr      handle of the target process.  main() passes NULL when the
 *              target is ppgsz itself; presumably pr_memcntl() then acts
 *              on the calling process -- confirm against libproc.
 *      dmodel  data model of the target, used to pick the layout of the
 *              memcntl argument structure
 *      pgsz    requested sizes for heap, stack and anon
 *
 * A warning is printed for every request that fails.  Returns the number
 * of failed requests, i.e. 0 on complete success.
 */
static int
setpgsz(struct  ps_prochandle *Pr, int dmodel, size_t pgsz[])
{
        static const uint_t     pgszcmd[] =
        {MHA_MAPSIZE_BSSBRK, MHA_MAPSIZE_STACK, MHA_MAPSIZE_VA};
        int                     rc;
        int                     err = 0;
        int                     saved_errno;
        caddr_t                 mpss;
        pid_t                   pid;
        int                     i;

        for (i = E_HEAP; i <= E_ANON; i++) {
                if (pgsz[i] == INVPGSZ)
                        continue;

                if (i == E_ANON)
                        rc = setpgsz_anon(Pr, pgsz[i], dmodel);
                else {
                        mpss = setup_mha(pgszcmd[i], pgsz[i], dmodel);
                        rc = pr_memcntl(Pr, NULL, 0, MC_HAT_ADVISE, mpss, 0, 0);
                }

                if (rc < 0) {
                        /* keep the failure reason safe from later calls */
                        saved_errno = errno;

                        /*
                         * There is no handle to query when operating on
                         * ourselves, so ask the system for our own pid
                         * rather than going through a NULL handle.
                         */
                        if (Pr != NULL)
                                pid = Pstatus(Pr)->pr_pid;
                        else
                                pid = getpid();

                        (void) fprintf(stderr, "%s: warning: set %s page size "
                            "failed (%s) for pid %d\n", command, suboptstr[i],
                            strerror(saved_errno), (int)pid);
                        err++;
                }
        }
        return (err);
}


/*
 * Walk through the process' address space segments.  Set all anonymous
 * segments to the new page size.
 *
 * Segments are read from /proc/<pid>/map.  Segments that are not
 * anonymous, that are shared, that overlap the heap, that lie within the
 * stack, or that cannot hold one aligned page of the requested size are
 * left alone.  For the others, the largest pgsz-aligned sub-range is
 * passed to pr_memcntl(MC_HAT_ADVISE).
 *
 *      Pr      handle of the (already running) target process
 *      pgsz    requested page size; 0 asks discover_optimal_pagesize() to
 *              pick one, and if that also yields 0 each segment is passed
 *              whole with a page size of 0
 *      dmodel  data model of the target
 *
 * Returns 0 on success, including when individual segments failed (those
 * are only reported as warnings), and -1 with errno set if the process
 * information or the map file cannot be obtained.  In command mode (cflag
 * set) nothing is done and 0 is returned.
 */
static int
setpgsz_anon(struct ps_prochandle *Pr, size_t pgsz, int dmodel)
{
        caddr_t         mpss;
        prmap_t         map;
        uintptr_t       addr;
        size_t          size;
        const psinfo_t  *psinfo;
        const pstatus_t *pstatus;
        int             fd;
        int             rc;
        char            path[PATH_MAX];

        /*
         * Setting the page size for anonymous segments on a process before it
         * has run will have no effect, since it has not configured anonymous
         * memory and the page size setting is not "sticky" inside the kernel.
         * Any anonymous memory subsequently mapped will have the default page
         * size.
         */
        if (cflag)
                return (0);

        /*
         * main() passes a NULL handle when the target is ppgsz itself.
         * NOTE(review): Ppsinfo() and Pstatus() are not expected to cope
         * with a NULL handle (confirm against libproc), so fail cleanly
         * instead of handing them one.
         */
        if (Pr == NULL) {
                errno = ENOTSUP;
                return (-1);
        }

        if ((psinfo = Ppsinfo(Pr)) == NULL)
                return (-1);
        if ((pstatus = Pstatus(Pr)) == NULL)
                return (-1);

        if (pgsz == 0)
                pgsz = discover_optimal_pagesize(Pr, dmodel, psinfo->pr_pid);

        /* built after the discovery above, which reuses the same buffer */
        mpss = setup_mha(MHA_MAPSIZE_VA, pgsz, dmodel);

        (void) snprintf(path, PATH_MAX, "/proc/%d/map", (int)psinfo->pr_pid);
        if ((fd = open(path, O_RDONLY)) < 0)
                return (-1);

        while (read(fd, &map, sizeof (map)) == sizeof (map)) {
                if ((map.pr_mflags & MA_ANON) == 0) {
                        /* Not anon. */
                        continue;
                } else if (map.pr_mflags & MA_SHARED) {
                        /* Can't change pagesize for shared mappings. */
                        continue;
                } else if (map.pr_vaddr + map.pr_size >
                    pstatus->pr_brkbase &&
                    map.pr_vaddr <
                    pstatus->pr_brkbase + pstatus->pr_brksize) {
                        /* Heap. */
                        continue;
                } else if (map.pr_vaddr >= pstatus->pr_stkbase &&
                    map.pr_vaddr + map.pr_size <=
                    pstatus->pr_stkbase + pstatus->pr_stksize) {
                        /* Stack. */
                        continue;
                } else if (map.pr_size < pgsz) {
                        /* Too small. */
                        continue;
                }

                if (pgsz == 0) {
                        /* No alignment constraint: use the whole segment. */
                        addr = map.pr_vaddr;
                        size = map.pr_size;
                } else {
                        /*
                         * Find the first address in the segment that is
                         * page-aligned.
                         */
                        if ((map.pr_vaddr % pgsz) == 0)
                                addr = map.pr_vaddr;
                        else
                                addr = map.pr_vaddr +
                                    (pgsz - (map.pr_vaddr % pgsz));

                        /*
                         * Calculate how many pages will fit in the segment:
                         * drop the unaligned head (addr - pr_vaddr) and the
                         * unaligned tail.  This cannot underflow because
                         * pr_size >= pgsz was checked above.
                         */
                        size = map.pr_size - (addr - map.pr_vaddr) -
                            ((map.pr_vaddr + map.pr_size) % pgsz);
                }

                /*
                 * If no aligned pages fit in the segment, ignore it.
                 */
                if (size < pgsz) {
                        continue;
                }

                rc = pr_memcntl(Pr, (caddr_t)addr, size,
                    MC_HAT_ADVISE, mpss, 0, 0);

                /*
                 * If an error occurs on any segment, report the error here and
                 * then go on to try setting the page size for the remaining
                 * segments.
                 */
                if (rc < 0) {
                        (void) fprintf(stderr, "%s: warning: set page size "
                            "failed (%s) for pid %d for anon segment at "
                            "address: %p\n", command, strerror(errno),
                            (int)psinfo->pr_pid, (void *)map.pr_vaddr);
                }
        }

        (void) close(fd);
        return (0);
}

/*
 * Discover the optimal page size for the process.
 * Do this by creating a scratch anonymous segment in the target process
 * whose length is the largest supported page size (pgsza[nelem - 1]),
 * setting its pagesize to 0, touching it, and reading the process' xmap
 * file to discover the page size selected by the system.
 *
 *      Pr      handle of the target process
 *      dmodel  data model of the target
 *      pid     process id of the target, used to locate /proc/<pid>/xmap
 *
 * The scratch segment is unmapped again before returning.  Returns the
 * hat page size reported for the segment, or 0 if any step fails.
 */
static size_t
discover_optimal_pagesize(struct ps_prochandle *Pr, uint_t dmodel, pid_t pid)
{
        size_t                  size = 0;
        size_t                  len = pgsza[nelem - 1];
        prxmap_t                xmap;
        caddr_t                 mha;
        void                    *addr;
        int                     found = 0;
        int                     fd = -1;
        char                    path[PATH_MAX];

        (void) snprintf(path, PATH_MAX, "/proc/%d/xmap", (int)pid);
        if ((fd = open(path, O_RDONLY)) < 0)
                return (size);

        /* With MAP_ALIGN the address argument carries the alignment. */
        if ((addr = pr_mmap(Pr, (void *)len, len, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANON | MAP_ALIGN, -1, 0)) == MAP_FAILED) {
                goto err;
        }

        mha = setup_mha(MHA_MAPSIZE_VA, 0, dmodel);
        if (pr_memcntl(Pr, addr, len, MC_HAT_ADVISE, mha, 0, 0) < 0) {
                goto err;
        }

        /*
         * Touch a page in the segment so the hat mapping gets created.
         * What is written does not matter; len is simply a convenient
         * source buffer.
         */
        (void) Pwrite(Pr, &len, sizeof (len), (uintptr_t)addr);

        /*
         * Read through the address map looking for our segment.  A flag is
         * used so that xmap is never examined unless a read() filled it in.
         */
        while (read(fd, &xmap, sizeof (xmap)) == sizeof (xmap)) {
                if (xmap.pr_vaddr == (uintptr_t)addr) {
                        found = 1;
                        break;
                }
        }
        if (found)
                size = xmap.pr_hatpagesize;

err:
        if (addr != MAP_FAILED) {
                if (pr_munmap(Pr, addr, len) == -1) {
                        (void) fprintf(stderr,
                            "%s: couldn't delete segment at %p\n",
                            command, addr);
                }
        }
        if (fd != -1)
                (void) close(fd);

        return (size);
}

/*
 * Static argument buffers for MC_HAT_ADVISE requests: one in the native
 * layout and, when built _LP64, one in the 32-bit layout for ILP32 targets.
 */
static struct memcntl_mha       gmha;
#ifdef _LP64
static struct memcntl_mha32     gmha32;
#endif

/*
 * Fill in the memcntl argument structure that matches the target's data
 * model and return its address.
 *
 *      command         MHA_MAPSIZE_* request code
 *      pagesize        page size to request
 *      dmodel          data model of the target; only consulted when this
 *                      program is built _LP64
 *
 * The returned pointer refers to one of the static buffers above, so its
 * contents are overwritten by the next call that uses the same layout.
 */
/* ARGSUSED */
static caddr_t
setup_mha(uint_t command, size_t pagesize, int dmodel)
{
        struct memcntl_mha      *native = &gmha;
#ifdef _LP64
        struct memcntl_mha32    *narrow = &gmha32;

        /* A 32-bit target needs the 32-bit structure layout. */
        if (dmodel == PR_MODEL_ILP32) {
                narrow->mha_pagesize = pagesize;
                narrow->mha_flags = 0;
                narrow->mha_cmd = command;
                return ((caddr_t)narrow);
        }
#endif
        native->mha_pagesize = pagesize;
        native->mha_flags = 0;
        native->mha_cmd = command;
        return ((caddr_t)native);
}