root/usr/src/uts/common/syscall/acctctl.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/user.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/pathname.h>
#include <sys/modctl.h>
#include <sys/acctctl.h>
#include <sys/bitmap.h>
#include <sys/exacct.h>
#include <sys/policy.h>

/*
 * acctctl(2)
 *
 *   acctctl() provides the administrative interface to the extended accounting
 *   subsystem.  The process and task accounting facilities are configurable:
 *   resources can be individually specified for recording in the appropriate
 *   accounting file.
 *
 *   The current implementation of acctctl() requires that the process, task,
 *   flow, and net accounting files be distinct across all zones.
 *
 * Locking
 *   Each accounting species has an ac_info_t which contains a mutex,
 *   used to protect the ac_info_t's contents, and to serialize access to the
 *   appropriate file.
 */

/*
 * List of every zone's struct exacct_globals, linked through ac_link.
 * Entries are appended by exacct_zone_init() and removed by
 * exacct_zone_fini(); ac_file_in_use() walks the list to compare a vnode
 * against the accounting files of all zones.
 */
static list_t exacct_globals_list;
/* Held around every insertion, removal and traversal of the list above. */
static kmutex_t exacct_globals_list_lock;

/*
 * Switch an accounting species on or off.
 *
 * `buf' is a user address holding one int, which must be AC_ON or AC_OFF,
 * and `bufsz' must be exactly sizeof (int).  Returns 0 on success, EINVAL
 * for a bad argument or value, or EFAULT if the user buffer is unreadable.
 */
static int
ac_state_set(ac_info_t *info, void *buf, size_t bufsz)
{
        int newstate;

        if (buf == NULL)
                return (EINVAL);
        if (bufsz != sizeof (int))
                return (EINVAL);

        if (copyin(buf, &newstate, bufsz) != 0)
                return (EFAULT);

        /* Only the two documented states are accepted. */
        if (!(newstate == AC_ON || newstate == AC_OFF))
                return (EINVAL);

        mutex_enter(&info->ac_lock);
        info->ac_state = newstate;
        mutex_exit(&info->ac_lock);

        return (0);
}

/*
 * Report whether an accounting species is currently on or off.
 *
 * `buf' is a user address that receives one int (the current ac_state);
 * `bufsz' must be exactly sizeof (int).  Returns 0 on success, EINVAL for a
 * bad argument, or EFAULT if the user buffer cannot be written.
 */
static int
ac_state_get(ac_info_t *info, void *buf, size_t bufsz)
{
        int state;

        if (buf == NULL || (bufsz != sizeof (int)))
                return (EINVAL);

        /*
         * Snapshot the state under ac_lock, then copy it out after the lock
         * has been dropped: copyout() may fault on the user address and
         * block, and ac_lock should not be held while that happens.
         * ac_res_get() follows the same snapshot-then-copyout pattern.
         */
        mutex_enter(&info->ac_lock);
        state = info->ac_state;
        mutex_exit(&info->ac_lock);

        if (copyout(&state, buf, sizeof (state)) != 0)
                return (EFAULT);
        return (0);
}

/*
 * Decide whether `vp' is already the accounting file of any species in any
 * zone.  A NULL vnode is never in use.
 *
 * Every ac_lock of every zone is taken (proc, task, flow, net, in list
 * order) before any comparison is made, and all of them are released again
 * before returning; the list lock is held throughout.
 */
static boolean_t
ac_file_in_use(vnode_t *vp)
{
        struct exacct_globals *zg;
        boolean_t busy = B_FALSE;

        if (vp == NULL)
                return (B_FALSE);

        mutex_enter(&exacct_globals_list_lock);

        /*
         * Pass 1: acquire the lock of every species in every zone.
         */
        zg = list_head(&exacct_globals_list);
        while (zg != NULL) {
                mutex_enter(&zg->ac_proc.ac_lock);
                mutex_enter(&zg->ac_task.ac_lock);
                mutex_enter(&zg->ac_flow.ac_lock);
                mutex_enter(&zg->ac_net.ac_lock);
                zg = list_next(&exacct_globals_list, zg);
        }

        /*
         * Pass 2: look for the vnode among the files currently open for
         * accounting; stop at the first match.
         */
        zg = list_head(&exacct_globals_list);
        while (zg != NULL) {
                if (vn_compare(zg->ac_proc.ac_vnode, vp) ||
                    vn_compare(zg->ac_task.ac_vnode, vp) ||
                    vn_compare(zg->ac_flow.ac_vnode, vp) ||
                    vn_compare(zg->ac_net.ac_vnode, vp)) {
                        busy = B_TRUE;
                        break;
                }
                zg = list_next(&exacct_globals_list, zg);
        }

        /*
         * Pass 3: release everything acquired in pass 1.
         */
        zg = list_head(&exacct_globals_list);
        while (zg != NULL) {
                mutex_exit(&zg->ac_proc.ac_lock);
                mutex_exit(&zg->ac_task.ac_lock);
                mutex_exit(&zg->ac_flow.ac_lock);
                mutex_exit(&zg->ac_net.ac_lock);
                zg = list_next(&exacct_globals_list, zg);
        }

        mutex_exit(&exacct_globals_list_lock);
        return (busy);
}

/*
 * Set, switch or close the accounting file of one species.
 *
 * ubuf == NULL closes the current file (if any) and forgets its name.
 * Otherwise ubuf is a user address holding an absolute path of at most
 * bufsz bytes (2 <= bufsz <= MAXPATHLEN, including the terminating NUL).
 * The file is created if necessary and opened for writing; if another file
 * was active it is closed and replaced.  An exacct header is written only
 * when the newly opened file is empty.
 *
 * Returns 0 on success, otherwise an errno value:
 *   EINVAL  bufsz out of range, or the path is not absolute
 *   EBUSY   the file is already an accounting file somewhere (any zone)
 *   EACCES  the path does not name a regular file
 *   or whatever copyinstr(), vn_open(), VOP_CLOSE(), VOP_GETATTR() or
 *   exacct_write_header() reported.
 */
static int
ac_file_set(ac_info_t *info, void *ubuf, size_t bufsz)
{
        int error = 0;
        void *kbuf;
        void *namebuf;
        int namelen;
        vnode_t *vp;
        void *hdr;
        size_t hdrsize;
        vattr_t va;

        if (ubuf == NULL) {
                mutex_enter(&info->ac_lock);

                /*
                 * Closing accounting file
                 */
                if (info->ac_vnode != NULL) {
                        error = VOP_CLOSE(info->ac_vnode, FWRITE, 1, 0,
                            CRED(), NULL);
                        if (error) {
                                mutex_exit(&info->ac_lock);
                                return (error);
                        }
                        VN_RELE(info->ac_vnode);
                        info->ac_vnode = NULL;
                }
                if (info->ac_file != NULL) {
                        kmem_free(info->ac_file, strlen(info->ac_file) + 1);
                        info->ac_file = NULL;
                }

                mutex_exit(&info->ac_lock);
                return (error);
        }

        if (bufsz < 2 || bufsz > MAXPATHLEN)
                return (EINVAL);

        /*
         * We have to copy in the whole buffer since we can't tell the length
         * of the string in user's address space.
         */
        kbuf = kmem_zalloc(bufsz, KM_SLEEP);
        if ((error = copyinstr((char *)ubuf, (char *)kbuf, bufsz, NULL)) != 0) {
                kmem_free(kbuf, bufsz);
                return (error);
        }
        if (*((char *)kbuf) != '/') {
                kmem_free(kbuf, bufsz);
                return (EINVAL);
        }

        /*
         * Now, allocate the space where we are going to save the
         * name of the accounting file and kmem_free kbuf. We have to do this
         * now because it is not good to sleep in kmem_alloc() while
         * holding ac_info's lock.
         */
        namelen = strlen(kbuf) + 1;
        namebuf = kmem_alloc(namelen, KM_SLEEP);
        (void) strcpy(namebuf, kbuf);
        kmem_free(kbuf, bufsz);

        /*
         * Check if this file already exists.
         */
        error = lookupname(namebuf, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);

        /*
         * Check if the file is already in use.  A failed lookup is not an
         * error here: the file may simply not exist yet and will be created
         * by vn_open() below.
         */
        if (!error) {
                if (ac_file_in_use(vp)) {
                        /*
                         * If we're already using it then return EBUSY
                         */
                        kmem_free(namebuf, namelen);
                        VN_RELE(vp);
                        return (EBUSY);
                }
                VN_RELE(vp);
        }

        /*
         * Create an exacct header here because exacct_create_header() may
         * sleep so we should not be holding ac_lock. At this point we cannot
         * reliably know if we need the header or not, so we may end up not
         * using the header.
         */
        hdr = exacct_create_header(&hdrsize);

        /*
         * Now, grab info's ac_lock and try to set up everything.
         */
        mutex_enter(&info->ac_lock);

        if ((error = vn_open(namebuf, UIO_SYSSPACE,
            FCREAT | FWRITE | FOFFMAX, 0600, &vp, CRCREAT, 0)) != 0) {
                mutex_exit(&info->ac_lock);
                kmem_free(namebuf, namelen);
                kmem_free(hdr, hdrsize);
                return (error);
        }

        if (vp->v_type != VREG) {
                /*
                 * vn_open() succeeded, so the open must be undone with a
                 * close before the hold is dropped; otherwise the open on
                 * the (non-regular) object is leaked.
                 */
                (void) VOP_CLOSE(vp, FWRITE, 1, 0, CRED(), NULL);
                VN_RELE(vp);
                mutex_exit(&info->ac_lock);
                kmem_free(namebuf, namelen);
                kmem_free(hdr, hdrsize);
                return (EACCES);
        }

        if (info->ac_vnode != NULL) {
                /*
                 * Switch from an old file to a new file by swapping
                 * their vnode pointers.
                 */
                vnode_t *oldvp;
                oldvp = info->ac_vnode;
                info->ac_vnode = vp;
                vp = oldvp;
        } else {
                /*
                 * Start writing accounting records to a new file.
                 */
                info->ac_vnode = vp;
                vp = NULL;
        }
        if (vp) {
                /*
                 * We still need to close the old file.
                 */
                if ((error = VOP_CLOSE(vp, FWRITE, 1, 0, CRED(), NULL)) != 0) {
                        VN_RELE(vp);
                        mutex_exit(&info->ac_lock);
                        kmem_free(namebuf, namelen);
                        kmem_free(hdr, hdrsize);
                        return (error);
                }
                VN_RELE(vp);
                if (info->ac_file != NULL) {
                        kmem_free(info->ac_file,
                            strlen(info->ac_file) + 1);
                        info->ac_file = NULL;
                }
        }
        info->ac_file = namebuf;

        /*
         * Write the exacct header only if the file is empty.  The set of
         * attributes wanted is requested through va_mask; the third
         * argument of VOP_GETATTR() is a flags word, not an attribute mask.
         */
        va.va_mask = AT_SIZE;
        error = VOP_GETATTR(info->ac_vnode, &va, 0, CRED(), NULL);
        if (error == 0 && va.va_size == 0)
                error = exacct_write_header(info, hdr, hdrsize);

        mutex_exit(&info->ac_lock);
        kmem_free(hdr, hdrsize);
        return (error);
}

/*
 * Copy the name of the active accounting file out to the user buffer `buf'
 * of `bufsz' bytes.
 *
 * Returns ENOTACTIVE when no file is set, ENOMEM when the name (plus its
 * terminating NUL) does not fit in bufsz bytes, otherwise the result of
 * copyoutstr().
 */
static int
ac_file_get(ac_info_t *info, void *buf, size_t bufsz)
{
        int rv;

        mutex_enter(&info->ac_lock);

        if (info->ac_file == NULL || info->ac_vnode == NULL) {
                rv = ENOTACTIVE;
        } else if (strlen(info->ac_file) >= bufsz) {
                rv = ENOMEM;
        } else {
                /*
                 * The length was checked against bufsz above, so the copy
                 * stops at the NUL well before the MAXPATHLEN limit.
                 */
                rv = copyoutstr(info->ac_file, buf, MAXPATHLEN, NULL);
        }

        mutex_exit(&info->ac_lock);
        return (rv);
}

/*
 * Enable or disable individual resources in a species' resource mask.
 *
 * `buf' is a user array of ac_res_t occupying `bufsz' bytes; processing
 * stops at the first entry whose ar_id is AC_NONE, at the end of the array,
 * or after maxres + 1 entries, whichever comes first.  Each entry must name
 * an id in [1, maxres] and a state of AC_ON or AC_OFF.
 *
 * Returns 0 on success, EINVAL for a malformed buffer size or entry, or
 * EFAULT if the user buffer cannot be read.  Entries preceding an invalid
 * one have already been applied to the mask when EINVAL is returned.
 */
static int
ac_res_set(ac_info_t *info, void *buf, size_t bufsz, int maxres)
{
        ac_res_t *res;
        ulong_t *maskp;
        size_t nres;
        size_t limit;
        size_t i;
        int id;
        int error = 0;

        /*
         * Validate that a non-zero buffer, sized within limits and to an
         * integral number of ac_res_t's has been specified.
         */
        if (bufsz == 0 ||
            bufsz > sizeof (ac_res_t) * (AC_MAX_RES + 1) ||
            (bufsz / sizeof (ac_res_t)) * sizeof (ac_res_t) != bufsz)
                return (EINVAL);

        res = kmem_alloc(bufsz, KM_SLEEP);
        if (copyin(buf, res, bufsz) != 0) {
                kmem_free(res, bufsz);
                return (EFAULT);
        }

        /*
         * Never look beyond the entries that were actually copied in: the
         * caller is not obliged to supply an AC_NONE terminator, and the
         * array may be shorter than maxres + 1 entries.
         */
        nres = bufsz / sizeof (ac_res_t);
        limit = (size_t)maxres + 1;
        if (limit > nres)
                limit = nres;

        maskp = (ulong_t *)&info->ac_mask;

        mutex_enter(&info->ac_lock);
        for (i = 0; i < limit; i++) {
                id = res[i].ar_id;
                if (id == AC_NONE)
                        break;
                if (id > maxres || id < 0) {
                        error = EINVAL;
                        break;
                }
                if (res[i].ar_state == AC_ON) {
                        BT_SET(maskp, id);
                } else if (res[i].ar_state == AC_OFF) {
                        BT_CLEAR(maskp, id);
                } else {
                        error = EINVAL;
                        break;
                }
        }
        mutex_exit(&info->ac_lock);

        kmem_free(res, bufsz);
        return (error);
}

/*
 * Report the state of every resource of a species.
 *
 * Writes maxres + 1 ac_res_t entries to the user buffer `buf': one for each
 * id in [1, maxres] carrying the corresponding bit of the resource mask,
 * followed by an { AC_NONE, AC_OFF } terminator.  `bufsz' must be at least
 * that large.
 *
 * Returns 0 on success, EINVAL if the buffer is too small, or EFAULT if the
 * user buffer cannot be written.
 */
static int
ac_res_get(ac_info_t *info, void *buf, size_t bufsz, int maxres)
{
        int error = 0;
        ac_res_t *res;
        ac_res_t *tmp;
        size_t ressz = sizeof (ac_res_t) * (maxres + 1);
        ulong_t *maskp;
        int id;

        if (bufsz < ressz)
                return (EINVAL);
        tmp = res = kmem_alloc(ressz, KM_SLEEP);

        /* Build the reply under the lock, copy it out after dropping it. */
        mutex_enter(&info->ac_lock);
        maskp = (ulong_t *)&info->ac_mask;
        for (id = 1; id <= maxres; id++) {
                tmp->ar_id = id;
                tmp->ar_state = BT_TEST(maskp, id);
                tmp++;
        }
        tmp->ar_id = AC_NONE;
        tmp->ar_state = AC_OFF;
        mutex_exit(&info->ac_lock);

        /*
         * copyout() reports failure as a non-zero value that is not an
         * errno; translate it so the caller does not hand that raw value
         * to set_errno().
         */
        if (copyout(res, buf, ressz) != 0)
                error = EFAULT;
        kmem_free(res, ressz);
        return (error);
}

/*
 * acctctl()
 *
 * Overview
 *   acctctl() is the entry point for the acctctl(2) system call.
 *
 * Return values
 *   On successful completion, return 0; otherwise -1 is returned and errno is
 *   set appropriately.
 *
 * Caller's context
 *   Called from the system call path.
 */
int
acctctl(int cmd, void *buf, size_t bufsz)
{
        int error = 0;
        int mode = AC_MODE(cmd);
        int option = AC_OPTION(cmd);
        int maxres;
        ac_info_t *info;
        zone_t *zone = curproc->p_zone;
        struct exacct_globals *acg;

        acg = zone_getspecific(exacct_zone_key, zone);
        /*
         * exacct_zone_key and associated per-zone state were initialized when
         * the module was loaded.
         */
        ASSERT(exacct_zone_key != ZONE_KEY_UNINITIALIZED);
        ASSERT(acg != NULL);

        switch (mode) { /* sanity check */
        case AC_TASK:
                info = &acg->ac_task;
                maxres = AC_TASK_MAX_RES;
                break;
        case AC_PROC:
                info = &acg->ac_proc;
                maxres = AC_PROC_MAX_RES;
                break;
        /*
         * Flow/net accounting isn't configurable in non-global
         * zones, but we have this field on a per-zone basis for future
         * expansion as well as the ability to return default "unset"
         * values for the various AC_*_GET queries.  AC_*_SET commands
         * fail with EPERM for AC_FLOW and AC_NET in non-global zones.
         */
        case AC_FLOW:
                info = &acg->ac_flow;
                maxres = AC_FLOW_MAX_RES;
                break;
        case AC_NET:
                info = &acg->ac_net;
                maxres = AC_NET_MAX_RES;
                break;
        default:
                return (set_errno(EINVAL));
        }

        switch (option) {
        case AC_STATE_SET:
                if ((error = secpolicy_acct(CRED())) != 0)
                        break;
                if ((mode == AC_FLOW || mode == AC_NET) &&
                    getzoneid() != GLOBAL_ZONEID) {
                        error = EPERM;
                        break;
                }
                error = ac_state_set(info, buf, bufsz);
                break;
        case AC_STATE_GET:
                error = ac_state_get(info, buf, bufsz);
                break;
        case AC_FILE_SET:
                if ((error = secpolicy_acct(CRED())) != 0)
                        break;
                if ((mode == AC_FLOW || mode == AC_NET) &&
                    getzoneid() != GLOBAL_ZONEID) {
                        error = EPERM;
                        break;
                }
                error = ac_file_set(info, buf, bufsz);
                break;
        case AC_FILE_GET:
                error = ac_file_get(info, buf, bufsz);
                break;
        case AC_RES_SET:
                if ((error = secpolicy_acct(CRED())) != 0)
                        break;
                if ((mode == AC_FLOW || mode == AC_NET) &&
                    getzoneid() != GLOBAL_ZONEID) {
                        error = EPERM;
                        break;
                }
                error = ac_res_set(info, buf, bufsz, maxres);
                break;
        case AC_RES_GET:
                error = ac_res_get(info, buf, bufsz, maxres);
                break;
        default:
                return (set_errno(EINVAL));
        }
        if (error)
                return (set_errno(error));
        return (0);
}

/*
 * System call table entry for acctctl().
 * NOTE(review): the leading 3 is presumably the argument count (acctctl()
 * takes three arguments) and SE_NOUNLOAD presumably marks the module as not
 * unloadable, which would match _fini() always returning EBUSY -- confirm
 * against the struct sysent definition.
 */
static struct sysent ac_sysent = {
        3,
        SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
        acctctl
};

/* Module linkage element registering ac_sysent as a system call. */
static struct modlsys modlsys = {
        &mod_syscallops,
        "acctctl system call",
        &ac_sysent
};

#ifdef _SYSCALL32_IMPL
/*
 * Second linkage element, built only when _SYSCALL32_IMPL is defined; it
 * registers the same ac_sysent through mod_syscallops32.
 */
static struct modlsys modlsys32 = {
        &mod_syscallops32,
        "32-bit acctctl system call",
        &ac_sysent
};
#endif

/*
 * NULL-terminated list of linkage elements handed to mod_install() in
 * _init() and to mod_info() in _info().
 */
static struct modlinkage modlinkage = {
        MODREV_1,
        &modlsys,
#ifdef _SYSCALL32_IMPL
        &modlsys32,
#endif
        NULL
};

/*
 * Zone-key create callback: allocate the per-zone accounting state, with
 * every species off and no file set (the structure is zero-filled), and
 * publish it on the global list.  The returned pointer becomes the zone's
 * value for exacct_zone_key.
 */
/* ARGSUSED */
static void *
exacct_zone_init(zoneid_t zoneid)
{
        struct exacct_globals *acg;

        acg = kmem_zalloc(sizeof (*acg), KM_SLEEP);

        /*
         * Initialize the per-species locks explicitly, pairing with the
         * mutex_destroy() calls in exacct_zone_fini(), and do so before the
         * structure becomes visible to ac_file_in_use() through the list.
         */
        mutex_init(&acg->ac_proc.ac_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&acg->ac_task.ac_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&acg->ac_flow.ac_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&acg->ac_net.ac_lock, NULL, MUTEX_DEFAULT, NULL);

        mutex_enter(&exacct_globals_list_lock);
        list_insert_tail(&exacct_globals_list, acg);
        mutex_exit(&exacct_globals_list_lock);
        return (acg);
}

/*
 * Shut one accounting species down: close and release its file (if open),
 * free the saved file name (if any) and mark the species AC_OFF.  A failure
 * to close is ignored, since this runs at zone shutdown and nothing can be
 * done about it.
 */
static void
exacct_free_info(ac_info_t *info)
{
        mutex_enter(&info->ac_lock);

        /*
         * The vnode and the name are handled independently, as in the
         * close path of ac_file_set(), so that a name is never leaked and
         * strlen() is never applied to a NULL name.
         */
        if (info->ac_vnode != NULL) {
                (void) VOP_CLOSE(info->ac_vnode, FWRITE, 1, 0, kcred, NULL);
                VN_RELE(info->ac_vnode);
                info->ac_vnode = NULL;
        }
        if (info->ac_file != NULL) {
                kmem_free(info->ac_file, strlen(info->ac_file) + 1);
                info->ac_file = NULL;
        }
        info->ac_state = AC_OFF;

        mutex_exit(&info->ac_lock);
}

/*
 * Zone-key shutdown callback: close the accounting files of all four
 * species of the zone whose state is passed in `data'.
 */
/* ARGSUSED */
static void
exacct_zone_shutdown(zoneid_t zoneid, void *data)
{
        struct exacct_globals *globals = data;
        ac_info_t *species[4];
        uint_t i;

        /*
         * This has to happen at shutdown time and not at destroy time:
         * while the files are open the filesystem holding them may refuse
         * to unmount, which would make the whole zone halt/reboot fail.
         */
        species[0] = &globals->ac_proc;
        species[1] = &globals->ac_task;
        species[2] = &globals->ac_flow;
        species[3] = &globals->ac_net;

        for (i = 0; i < sizeof (species) / sizeof (species[0]); i++)
                exacct_free_info(species[i]);
}

/*
 * Zone-key destroy callback: take the zone's accounting state (`data') off
 * the global list, destroy its four species locks and free it.
 */
/* ARGSUSED */
static void
exacct_zone_fini(zoneid_t zoneid, void *data)
{
        struct exacct_globals *globals = data;

        /* Unpublish first so that list walkers can no longer find it. */
        mutex_enter(&exacct_globals_list_lock);
        list_remove(&exacct_globals_list, globals);
        mutex_exit(&exacct_globals_list_lock);

        mutex_destroy(&globals->ac_proc.ac_lock);
        mutex_destroy(&globals->ac_task.ac_lock);
        mutex_destroy(&globals->ac_flow.ac_lock);
        mutex_destroy(&globals->ac_net.ac_lock);

        kmem_free(globals, sizeof (*globals));
}

/*
 * Module load entry point: set up the global list and its lock, create the
 * zone key with its init/shutdown/fini callbacks, then install the module.
 * If installation fails, everything set up here is torn down again and the
 * error from mod_install() is returned.
 */
int
_init()
{
        int rv;

        mutex_init(&exacct_globals_list_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&exacct_globals_list, sizeof (struct exacct_globals),
            offsetof(struct exacct_globals, ac_link));
        zone_key_create(&exacct_zone_key, exacct_zone_init,
            exacct_zone_shutdown, exacct_zone_fini);

        rv = mod_install(&modlinkage);
        if (rv == 0)
                return (0);

        /* Installation failed: undo the setup above. */
        (void) zone_key_delete(exacct_zone_key);
        exacct_zone_key = ZONE_KEY_UNINITIALIZED;
        mutex_destroy(&exacct_globals_list_lock);
        list_destroy(&exacct_globals_list);
        return (rv);
}

/*
 * Module information entry point: hands the request to mod_info() together
 * with this module's linkage and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
        int rv;

        rv = mod_info(&modlinkage, modinfop);
        return (rv);
}

/*
 * Module unload entry point.  Always returns EBUSY and performs no
 * teardown, i.e. unloading is refused unconditionally.
 * NOTE(review): presumably because the system call is registered with
 * SE_NOUNLOAD and the zone key must outlive any zone -- confirm.
 */
int
_fini()
{
        return (EBUSY);
}