root/usr/src/uts/common/syscall/exacctsys.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/acctctl.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/exacct.h>
#include <sys/modctl.h>
#include <sys/procset.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/task.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/policy.h>

/*
 * getacct(2), putacct(2), and wracct(2) system calls
 *
 *   The extended accounting subsystem provides three root-privileged system
 *   calls for interacting with the actual resource data associated with each
 *   task or process.  getacct() copies a packed exacct record reflecting the
 *   resource usage out to the buffer provided by the user.  wracct() writes a
 *   record to the appropriate extended accounting file.  putacct() takes the
 *   buffer provided by the user, and appends a "tag" record associated with the
 *   specified task or process that encapsulates the user data.  All three of
 *   these functions exit early if extended accounting is not active for the
 *   requested entity type.
 *
 * Locking
 *   Under the terminology introduced in os/task.c, all three of these system
 *   calls are task observers, when executing on an existing task.
 */

/*
 * getacct_callback() hands an assembled accounting record back to the
 * caller of getacct().  When ubuf is non-NULL, the smaller of usize and
 * ksize bytes are copied from kbuf out to ubuf; a NULL ubuf skips the copy
 * altogether.  On success *actual is set to ksize--the size the user buffer
 * must have to receive the whole record--and 0 is returned.  If copyout()
 * fails, EFAULT is returned and *actual is left untouched.
 */
/* ARGSUSED */
static int
getacct_callback(ac_info_t *unused, void *ubuf, size_t usize, void *kbuf,
    size_t ksize, size_t *actual)
{
        if (ubuf != NULL) {
                /* Never copy more than either buffer can hold. */
                size_t ncopy = (usize < ksize) ? usize : ksize;

                if (copyout(kbuf, ubuf, ncopy) != 0)
                        return (EFAULT);
        }

        *actual = ksize;
        return (0);
}

/*
 * getacct_task() produces the usage record for the task identified by tkid
 * and delivers it through getacct_callback() into buf (at most bufsize
 * bytes), with the size reported via sizep.
 *
 * Returns ENOTACTIVE if ac_task's state is AC_OFF, ESRCH if
 * task_hold_by_id() returns NULL, and otherwise whatever
 * exacct_assemble_task_usage() returns.
 */
static int
getacct_task(ac_info_t *ac_task, taskid_t tkid, void *buf, size_t bufsize,
    size_t *sizep)
{
        task_t *task;
        int inactive;
        int rv;

        /* The accounting state is only examined while ac_lock is held. */
        mutex_enter(&ac_task->ac_lock);
        inactive = (ac_task->ac_state == AC_OFF);
        mutex_exit(&ac_task->ac_lock);
        if (inactive)
                return (ENOTACTIVE);

        task = task_hold_by_id(tkid);
        if (task == NULL)
                return (ESRCH);

        /* The hold taken above is dropped once the record is assembled. */
        rv = exacct_assemble_task_usage(ac_task, task, getacct_callback,
            buf, bufsize, sizep, EW_PARTIAL);
        task_rele(task);

        return (rv);
}

/*
 * getacct_proc() produces the usage record for the process identified by
 * pid and delivers it through getacct_callback() into buf (at most bufsize
 * bytes), with the size reported via sizep.
 *
 * Returns ENOTACTIVE if ac_proc's state is AC_OFF, ESRCH if prfind() does
 * not find the process, and otherwise whatever
 * exacct_assemble_proc_usage() returns.
 */
static int
getacct_proc(ac_info_t *ac_proc, pid_t pid, void *buf, size_t bufsize,
    size_t *sizep)
{
        ulong_t mask_copy[AC_MASK_SZ];
        proc_usage_t *usage;
        proc_t *proc;
        int rv;

        /*
         * Check the state and take a private copy of the resource mask
         * while ac_lock is held; the copy is what gets used afterwards.
         */
        mutex_enter(&ac_proc->ac_lock);
        if (ac_proc->ac_state == AC_OFF) {
                mutex_exit(&ac_proc->ac_lock);
                return (ENOTACTIVE);
        }
        bt_copy(&ac_proc->ac_mask[0], mask_copy, AC_MASK_SZ);
        mutex_exit(&ac_proc->ac_lock);

        /* Both KM_SLEEP allocations are made before pidlock is taken. */
        usage = kmem_zalloc(sizeof (*usage), KM_SLEEP);
        usage->pu_command = kmem_zalloc(MAXCOMLEN + 1, KM_SLEEP);

        mutex_enter(&pidlock);
        proc = prfind(pid);
        if (proc == NULL) {
                mutex_exit(&pidlock);
                rv = ESRCH;
                goto done;
        }

        /* Take p_lock before letting go of pidlock. */
        mutex_enter(&proc->p_lock);
        mutex_exit(&pidlock);

        exacct_calculate_proc_usage(proc, usage, mask_copy, EW_PARTIAL, 0);
        mutex_exit(&proc->p_lock);

        rv = exacct_assemble_proc_usage(ac_proc, usage, getacct_callback,
            buf, bufsize, sizep, EW_PARTIAL);

done:
        kmem_free(usage->pu_command, MAXCOMLEN + 1);
        kmem_free(usage, sizeof (*usage));

        return (rv);
}

/*
 * getacct() dispatches on idtype: P_PID goes to getacct_proc() and P_TASKID
 * to getacct_task(), each with the calling process's zone-specific exacct
 * state; any other idtype is EINVAL.  A bufsize above EXACCT_MAX_BUFSIZE is
 * clamped to that limit rather than rejected.
 *
 * On success the size reported by the helper is returned; on failure the
 * result of set_errno() for the error is returned.
 */
static ssize_t
getacct(idtype_t idtype, id_t id, void *buf, size_t bufsize)
{
        struct exacct_globals *acg;
        size_t needed = 0;
        int rv;

        if (bufsize > EXACCT_MAX_BUFSIZE)
                bufsize = EXACCT_MAX_BUFSIZE;

        acg = zone_getspecific(exacct_zone_key, curproc->p_zone);

        if (idtype == P_PID)
                rv = getacct_proc(&acg->ac_proc, id, buf, bufsize, &needed);
        else if (idtype == P_TASKID)
                rv = getacct_task(&acg->ac_task, id, buf, bufsize, &needed);
        else
                rv = EINVAL;

        if (rv != 0)
                return (set_errno(rv));

        return ((ssize_t)needed);
}

/*
 * putacct() copies bufsize bytes of user data in from buf and passes them
 * to exacct_tag_proc() (idtype P_PID) or exacct_tag_task() (idtype
 * P_TASKID) for the entity named by id.
 *
 * Errors, all reported through set_errno():
 *   EINVAL  bufsize is 0 or exceeds EXACCT_MAX_BUFSIZE, or idtype is
 *           neither P_PID nor P_TASKID
 *   EFAULT  copyin() of the user buffer failed
 *   ESRCH   prfind() / task_hold_by_id() did not find the entity
 * Any error from the tagging routine is passed along as well.  Returns 0 on
 * success.
 */
static int
putacct(idtype_t idtype, id_t id, void *buf, size_t bufsize, int flags)
{
        struct exacct_globals *acg;
        void *kbuf;
        proc_t *proc;
        task_t *task;
        zone_t *zone;
        taskid_t tkid;
        int rv;

        if (bufsize == 0 || bufsize > EXACCT_MAX_BUFSIZE)
                return (set_errno(EINVAL));

        kbuf = kmem_alloc(bufsize, KM_SLEEP);
        if (copyin(buf, kbuf, bufsize) != 0) {
                kmem_free(kbuf, bufsize);
                return (set_errno(EFAULT));
        }

        acg = zone_getspecific(exacct_zone_key, curproc->p_zone);

        switch (idtype) {
        case P_PID:
                mutex_enter(&pidlock);
                proc = prfind(id);
                if (proc == NULL) {
                        mutex_exit(&pidlock);
                        rv = ESRCH;
                        break;
                }

                /*
                 * Read the task id and take the zone hold while pidlock
                 * is still held; the zone's node name is used after the
                 * lock has been dropped.
                 */
                zone = proc->p_zone;
                tkid = proc->p_task->tk_tkid;
                zone_hold(zone);
                mutex_exit(&pidlock);

                rv = exacct_tag_proc(&acg->ac_proc, id, tkid, kbuf, bufsize,
                    flags, zone->zone_nodename);
                zone_rele(zone);
                break;

        case P_TASKID:
                task = task_hold_by_id(id);
                if (task == NULL) {
                        rv = ESRCH;
                        break;
                }
                rv = exacct_tag_task(&acg->ac_task, task, kbuf, bufsize,
                    flags);
                task_rele(task);
                break;

        default:
                rv = EINVAL;
                break;
        }

        kmem_free(kbuf, bufsize);

        if (rv != 0)
                return (set_errno(rv));

        return (0);
}

/*
 * wracct_task() assembles the usage record for the task identified by tkid
 * and hands it to exacct_commit_callback(), using the record type given by
 * flag.  No user buffer is involved (NULL/0 are passed).
 *
 * Returns ENOTACTIVE if ac_task's state is AC_OFF or it has no vnode, ESRCH
 * if task_hold_by_id() returns NULL, and otherwise whatever
 * exacct_assemble_task_usage() returns.
 */
static int
wracct_task(ac_info_t *ac_task, taskid_t tkid, int flag, size_t *sizep)
{
        task_t *task;
        int writable;
        int rv;

        /* Both the state and the vnode are examined under ac_lock. */
        mutex_enter(&ac_task->ac_lock);
        writable = (ac_task->ac_state != AC_OFF &&
            ac_task->ac_vnode != NULL);
        mutex_exit(&ac_task->ac_lock);
        if (!writable)
                return (ENOTACTIVE);

        task = task_hold_by_id(tkid);
        if (task == NULL)
                return (ESRCH);

        rv = exacct_assemble_task_usage(ac_task, task,
            exacct_commit_callback, NULL, 0, sizep, flag);
        task_rele(task);

        return (rv);
}

/*
 * wracct_proc() assembles the usage record for the process identified by
 * pid and hands it to exacct_commit_callback(), using the record type given
 * by flag.  No user buffer is involved (NULL/0 are passed).
 *
 * Returns ENOTACTIVE if ac_proc's state is AC_OFF or it has no vnode, ESRCH
 * if prfind() does not find the process, and otherwise whatever
 * exacct_assemble_proc_usage() returns.
 */
static int
wracct_proc(ac_info_t *ac_proc, pid_t pid, int flag, size_t *sizep)
{
        ulong_t mask_copy[AC_MASK_SZ];
        proc_usage_t *usage;
        proc_t *proc;
        int rv;

        /*
         * Check the state and vnode, and take a private copy of the
         * resource mask, while ac_lock is held.
         */
        mutex_enter(&ac_proc->ac_lock);
        if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
                mutex_exit(&ac_proc->ac_lock);
                return (ENOTACTIVE);
        }
        bt_copy(&ac_proc->ac_mask[0], mask_copy, AC_MASK_SZ);
        mutex_exit(&ac_proc->ac_lock);

        /* Both KM_SLEEP allocations are made before pidlock is taken. */
        usage = kmem_zalloc(sizeof (*usage), KM_SLEEP);
        usage->pu_command = kmem_zalloc(MAXCOMLEN + 1, KM_SLEEP);

        mutex_enter(&pidlock);
        proc = prfind(pid);
        if (proc == NULL) {
                mutex_exit(&pidlock);
                rv = ESRCH;
                goto done;
        }

        /* Take p_lock before letting go of pidlock. */
        mutex_enter(&proc->p_lock);
        mutex_exit(&pidlock);

        exacct_calculate_proc_usage(proc, usage, mask_copy, flag, 0);
        mutex_exit(&proc->p_lock);

        rv = exacct_assemble_proc_usage(ac_proc, usage,
            exacct_commit_callback, NULL, 0, sizep, flag);

done:
        kmem_free(usage->pu_command, MAXCOMLEN + 1);
        kmem_free(usage, sizeof (*usage));

        return (rv);
}

/*
 * wracct() validates flags and dispatches on idtype: P_PID goes to
 * wracct_proc() and P_TASKID to wracct_task(), each with the calling
 * process's zone-specific exacct state.
 *
 * Errors, all reported through set_errno():
 *   EINVAL   flags is neither EW_PARTIAL nor EW_INTERVAL, or idtype is
 *            neither P_PID nor P_TASKID
 *   ENOTSUP  EW_INTERVAL was requested for a process (P_PID)
 * Any error from the helper is passed along as well.  Returns 0 on success.
 */
static int
wracct(idtype_t idtype, id_t id, int flags)
{
        struct exacct_globals *acg;
        size_t written = 0;
        int rv;

        /* Only the two known record types are accepted. */
        if (flags != EW_PARTIAL && flags != EW_INTERVAL)
                return (set_errno(EINVAL));

        acg = zone_getspecific(exacct_zone_key, curproc->p_zone);

        if (idtype == P_PID) {
                /* Interval records are rejected for processes. */
                if (flags == EW_INTERVAL)
                        return (set_errno(ENOTSUP));
                rv = wracct_proc(&acg->ac_proc, id, flags, &written);
        } else if (idtype == P_TASKID) {
                rv = wracct_task(&acg->ac_task, id, flags, &written);
        } else {
                rv = EINVAL;
        }

        if (rv != 0)
                return (set_errno(rv));

        return (0);
}

/*
 * exacct() is the single entry point registered for this system call
 * module.  It performs the checks common to all three operations and then
 * dispatches on the subcode:
 *
 *   code 0  getacct(idtype, id, buf, bufsize)
 *   code 1  putacct(idtype, id, buf, bufsize, flags)
 *   code 2  wracct(idtype, id, flags)
 *
 * Fails with EPERM if secpolicy_acct() rejects the caller's credentials,
 * with ENOTACTIVE if exacct_zone_key is still ZONE_KEY_UNINITIALIZED, and
 * with EINVAL for any other subcode.
 */
static long
exacct(int code, idtype_t idtype, id_t id, void *buf, size_t bufsize,
    int flags)
{
        long rv;

        if (secpolicy_acct(CRED()) != 0)
                return (set_errno(EPERM));

        if (exacct_zone_key == ZONE_KEY_UNINITIALIZED)
                return (set_errno(ENOTACTIVE));

        switch (code) {
        case 0:
                rv = getacct(idtype, id, buf, bufsize);
                break;
        case 1:
                rv = putacct(idtype, id, buf, bufsize, flags);
                break;
        case 2:
                rv = wracct(idtype, id, flags);
                break;
        default:
                rv = set_errno(EINVAL);
                break;
        }

        return (rv);
}

/*
 * SE_LRVAL is the return-value flag used for the native sysent entry:
 * SE_64RVAL when building _LP64, SE_32RVAL1 otherwise.  exacct() returns a
 * long; presumably this selects the flag matching the width of that return
 * value -- confirm against the sysent flag definitions.
 */
#if defined(_LP64)
#define SE_LRVAL        SE_64RVAL
#else
#define SE_LRVAL        SE_32RVAL1
#endif

/*
 * Native system call table entry; exacct() is the handler.
 */
static struct sysent exacctsys_sysent = {
        /* Presumably the argument count; exacct() takes six parameters. */
        6,
        SE_NOUNLOAD | SE_ARGC | SE_LRVAL,
        /* Cast through uintptr_t to fit the entry's generic handler type. */
        (int (*)())(uintptr_t)exacct
};

/*
 * Module linkage element tying the native sysent entry to mod_syscallops,
 * together with a descriptive name string.
 */
static struct modlsys modlsys = {
        &mod_syscallops,
        "extended accounting facility",
        &exacctsys_sysent
};

/*
 * When _SYSCALL32_IMPL is defined, a second entry is registered through
 * mod_syscallops32.  It uses the same exacct() handler as the native entry
 * but always specifies SE_32RVAL1 as its return-value flag.
 */
#ifdef _SYSCALL32_IMPL

static struct sysent exacctsys_sysent32 = {
        /* Presumably the argument count; exacct() takes six parameters. */
        6,
        SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
        (int (*)())(uintptr_t)exacct
};

static struct modlsys modlsys32 = {
        &mod_syscallops32,
        "32-bit extended accounting facility",
        &exacctsys_sysent32
};

#endif

/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info().  It
 * lists the native linkage element, the 32-bit one when _SYSCALL32_IMPL is
 * defined, and a terminating NULL.
 */
static struct modlinkage modlinkage = {
        MODREV_1,
        &modlsys,
#ifdef _SYSCALL32_IMPL
        &modlsys32,
#endif
        NULL
};

/*
 * _init() hands this module's linkage to mod_install() and returns its
 * result unchanged.
 */
int
_init(void)
{
        int rv;

        rv = mod_install(&modlinkage);
        return (rv);
}

/*
 * _fini() hands this module's linkage to mod_remove() and returns its
 * result unchanged.
 */
int
_fini(void)
{
        int rv;

        rv = mod_remove(&modlinkage);
        return (rv);
}

/*
 * _info() passes this module's linkage and the caller's modinfo buffer to
 * mod_info() and returns its result unchanged.
 */
int
_info(struct modinfo *mip)
{
        int rv;

        rv = mod_info(&modlinkage, mip);
        return (rv);
}