/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*
 * Contracts
 * ---------
 *
 * Contracts are a primitive which enrich the relationships between
 * processes and system resources.  The primary purpose of contracts is
 * to provide a means for the system to negotiate the departure from a
 * binding relationship (e.g. pages locked in memory or a thread bound
 * to processor), but they can also be used as a purely asynchronous
 * error reporting mechanism as they are with process contracts.
 *
 * More information on how one interfaces with contracts and what
 * contracts can do for you can be found in:
 *   PSARC 2003/193 Solaris Contracts
 *   PSARC 2004/460 Contracts addendum
 *
 * This file contains the core contracts framework.  By itself it is
 * useless: it depends on the contracts filesystem (ctfs) to provide an
 * interface to user processes and individual contract types to
 * implement the process/resource relationships.
 *
 * Data structure overview
 * -----------------------
 *
 * A contract is represented by a contract_t, which itself points to an
 * encapsulating contract-type specific contract object.  A contract_t
 * contains the contract's static identity (including its terms), its
 * linkage to various bookkeeping structures, the contract-specific
 * event queue, and a reference count.
 *
 * A contract template is represented by a ct_template_t, which, like a
 * contract, points to an encapsulating contract-type specific template
 * object.  A ct_template_t contains the template's terms.
 *
 * An event queue is represented by a ct_equeue_t, and consists of a
 * list of events, a list of listeners, and a list of listeners who are
 * waiting for new events (affectionately referred to as "tail
 * listeners").  There are three queue types, defined by ct_listnum_t
 * (an enum).  An event may be on one of each type of queue
 * simultaneously; the list linkage used by a queue is determined by
 * its type.
 *
 * An event is represented by a ct_kevent_t, which contains mostly
 * static event data (e.g. id, payload).  It also has an array of
 * ct_member_t structures, each of which contains a list_node_t and
 * represents the event's linkage in a specific event queue.
 *
 * Each open of an event endpoint results in the creation of a new
 * listener, represented by a ct_listener_t.  In addition to linkage
 * into the aforementioned lists in the event_queue, a ct_listener_t
 * contains a pointer to the ct_kevent_t it is currently positioned at
 * as well as a set of status flags and other administrative data.
 *
 * Each process has a list of contracts it owns, p_ct_held; a pointer
 * to the process contract it is a member of, p_ct_process; the linkage
 * for that membership, p_ct_member; and an array of event queue
 * structures representing the process bundle queues.
 *
 * Each LWP has an array of its active templates, lwp_ct_active, and
 * an array of the contracts it most recently created, lwp_ct_latest.
 *
 * A process contract has a list of member processes and a list of
 * inherited contracts.
 *
 * There is a system-wide list of all contracts, as well as per-type
 * lists of contracts.
 *
 * Lock ordering overview
 * ----------------------
 *
 * Locks at the top are taken first:
 *
 *                   ct_evtlock
 *                   regent ct_lock
 *                   member ct_lock
 *                   pidlock
 *                   p_lock
 *    contract ctq_lock         contract_lock
 *    pbundle ctq_lock
 *    cte_lock
 *                   ct_reflock
 *
 * contract_lock and ctq_lock/cte_lock are not currently taken at the
 * same time.
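 *
 * As a minimal sketch, the inheritance path in contract_abandon
 * (below) follows this ordering when handing a contract from its
 * owner to its regent:
 *
 *      mutex_enter(&parent->ct_lock);          (regent ct_lock)
 *      mutex_enter(&ct->ct_lock);              (member ct_lock)
 *      ...
 *      mutex_exit(&ct->ct_lock);
 *      mutex_exit(&parent->ct_lock);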
 *
 * Reference counting and locking
 * ------------------------------
 *
 * A contract has a reference count, protected by ct_reflock.
 * (ct_reflock is also used in a couple other places where atomic
 * access to a variable is needed in an innermost context).  A process
 * maintains a hold on each contract it owns.  A process contract has a
 * hold on each contract it has inherited.  Each event has a hold on
 * the contract which generated it.  Process contract templates have
 * holds on the contracts referred to by their transfer terms.  CTFS
 * contract directory nodes have holds on contracts.  Lastly, various
 * code paths may temporarily take holds on contracts to prevent them
 * from disappearing while other processing is going on.  It is
 * important to note that the global contract lists do not hold
 * references on contracts; a contract is removed from these structures
 * atomically with the release of its last reference.
 *
 * At a given point in time, a contract can either be owned by a
 * process, inherited by a regent process contract, or orphaned.  A
 * contract_t's owner and regent pointers, ct_owner and ct_regent, are
 * protected by its ct_lock.  The linkage in the holder's (holder =
 * owner or regent) list of contracts, ct_ctlist, is protected by
 * whatever lock protects the holder's data structure.  In order for
 * these two directions to remain consistent, changing the holder of a
 * contract requires that both locks be held.
 *
 * Events also have reference counts.  There is one hold on an event
 * per queue it is present on, in addition to those needed for the
 * usual sundry reasons.  Individual listeners are associated with
 * specific queues, and increase a queue-specific reference count
 * stored in the ct_member_t structure.
 *
 * The dynamic contents of an event (reference count and flags) are
 * protected by its cte_lock, while the contents of the embedded
 * ct_member_t structures are protected by the locks of the queues they
 * are linked into.  A ct_listener_t's contents are also protected by
 * its event queue's ctq_lock.
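 *
 * As a minimal sketch, the temporary-hold pattern mentioned above
 * (contract_abandon, for instance, uses it around its call to
 * cte_trim) looks like:
 *
 *      contract_hold(ct);
 *      ... use ct without it disappearing out from under us ...
 *      contract_rele(ct);      (may drop the last reference)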
 *
 * Resource controls
 * -----------------
 *
 * Control:      project.max-contracts (rc_project_contract)
 * Description:  Maximum number of contracts allowed a project.
 *
 *   When a contract is created, the project's allocation is tested and
 *   (assuming success) increased.  When the last reference to a
 *   contract is released, the creating project's allocation is
 *   decreased.
 */

#include <sys/mutex.h>
#include <sys/debug.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/id_space.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/ctfs.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/dditypes.h>
#include <sys/contract/device_impl.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/model.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/task.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

extern rctl_hndl_t rc_project_contract;

static id_space_t       *contract_ids;
static avl_tree_t       contract_avl;
static kmutex_t         contract_lock;

int                     ct_ntypes = CTT_MAXTYPE;
static ct_type_t        *ct_types_static[CTT_MAXTYPE];
ct_type_t               **ct_types = ct_types_static;
int                     ct_debug;

static void cte_queue_create(ct_equeue_t *, ct_listnum_t, int, int);
static void cte_queue_destroy(ct_equeue_t *);
static void cte_queue_drain(ct_equeue_t *, int);
static void cte_trim(ct_equeue_t *, contract_t *);
static void cte_copy(ct_equeue_t *, ct_equeue_t *);

/*
 * contract_compar
 *
 * A contract comparator which sorts on contract ID.
 */
int
contract_compar(const void *x, const void *y)
{
        const contract_t *ct1 = x;
        const contract_t *ct2 = y;

        if (ct1->ct_id < ct2->ct_id)
                return (-1);
        if (ct1->ct_id > ct2->ct_id)
                return (1);
        return (0);
}

/*
 * contract_init
 *
 * Initializes the contract subsystem, the specific contract types, and
 * process 0.
 */
void
contract_init(void)
{
        /*
         * Initialize contract subsystem.
         */
        contract_ids = id_space_create("contracts", 1, INT_MAX);
        avl_create(&contract_avl, contract_compar, sizeof (contract_t),
            offsetof(contract_t, ct_ctavl));
        mutex_init(&contract_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Initialize contract types.
         */
        contract_process_init();
        contract_device_init();

        /*
         * Initialize p0/lwp0 contract state.
         */
        avl_create(&p0.p_ct_held, contract_compar, sizeof (contract_t),
            offsetof(contract_t, ct_ctlist));
}

/*
 * contract_dtor
 *
 * Performs basic destruction of the common portions of a contract.
 * Called from the failure path of contract_ctor and from
 * contract_rele.
 */
static void
contract_dtor(contract_t *ct)
{
        cte_queue_destroy(&ct->ct_events);
        list_destroy(&ct->ct_vnodes);
        mutex_destroy(&ct->ct_reflock);
        mutex_destroy(&ct->ct_lock);
        mutex_destroy(&ct->ct_evtlock);
}

/*
 * contract_ctor
 *
 * Called by a contract type to initialize a contract.  Fails if the
 * max-contract resource control would have been exceeded.  After a
 * successful call to contract_ctor, the contract is unlocked and
 * visible in all namespaces; any type-specific initialization should
 * be completed before calling contract_ctor.  Returns 0 on success.
 *
 * Because not all callers can tolerate failure, a 0 value for canfail
 * instructs contract_ctor to ignore the project.max-contracts resource
 * control.  Obviously, this "out" should only be employed by callers
 * who are sufficiently constrained in other ways (e.g. newproc).
 */
int
contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
    ctflags_t flags, proc_t *author, int canfail)
{
        avl_index_t where;
        klwp_t *curlwp = ttolwp(curthread);

        ASSERT(author == curproc);

        mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&ct->ct_evtlock, NULL, MUTEX_DEFAULT, NULL);
        ct->ct_id = id_alloc(contract_ids);

        cte_queue_create(&ct->ct_events, CTEL_CONTRACT, 20, 0);
        list_create(&ct->ct_vnodes, sizeof (contract_vnode_t),
            offsetof(contract_vnode_t, ctv_node));

        /*
         * Instance data
         */
        ct->ct_ref = 2;         /* one for the holder, one for "latest" */
        ct->ct_cuid = crgetuid(CRED());
        ct->ct_type = type;
        ct->ct_data = data;
        gethrestime(&ct->ct_ctime);
        ct->ct_state = CTS_OWNED;
        ct->ct_flags = flags;
        ct->ct_regent = author->p_ct_process ?
            &author->p_ct_process->conp_contract : NULL;
        ct->ct_ev_info = tmpl->ctmpl_ev_info;
        ct->ct_ev_crit = tmpl->ctmpl_ev_crit;
        ct->ct_cookie = tmpl->ctmpl_cookie;
        ct->ct_owner = author;
        ct->ct_ntime.ctm_total = -1;
        ct->ct_qtime.ctm_total = -1;
        ct->ct_nevent = NULL;

        /*
         * Test project.max-contracts.
         */
        mutex_enter(&author->p_lock);
        mutex_enter(&contract_lock);
        if (canfail && rctl_test(rc_project_contract,
            author->p_task->tk_proj->kpj_rctls, author, 1,
            RCA_SAFE) & RCT_DENY) {
                id_free(contract_ids, ct->ct_id);
                mutex_exit(&contract_lock);
                mutex_exit(&author->p_lock);
                ct->ct_events.ctq_flags |= CTQ_DEAD;
                contract_dtor(ct);
                return (1);
        }
        ct->ct_proj = author->p_task->tk_proj;
        ct->ct_proj->kpj_data.kpd_contract++;
        (void) project_hold(ct->ct_proj);
        mutex_exit(&contract_lock);

        /*
         * Insert into holder's avl of contracts.
         * We use an avl not because order is important, but because
         * readdir of /proc/contracts requires we be able to use a
         * scalar as an index into the process's list of contracts
         */
        ct->ct_zoneid = author->p_zone->zone_id;
        ct->ct_czuniqid = ct->ct_mzuniqid = author->p_zone->zone_uniqid;
        VERIFY(avl_find(&author->p_ct_held, ct, &where) == NULL);
        avl_insert(&author->p_ct_held, ct, where);
        mutex_exit(&author->p_lock);

        /*
         * Insert into global contract AVL
         */
        mutex_enter(&contract_lock);
        VERIFY(avl_find(&contract_avl, ct, &where) == NULL);
        avl_insert(&contract_avl, ct, where);
        mutex_exit(&contract_lock);

        /*
         * Insert into type AVL
         */
        mutex_enter(&type->ct_type_lock);
        VERIFY(avl_find(&type->ct_type_avl, ct, &where) == NULL);
        avl_insert(&type->ct_type_avl, ct, where);
        type->ct_type_timestruc = ct->ct_ctime;
        mutex_exit(&type->ct_type_lock);

        if (curlwp->lwp_ct_latest[type->ct_type_index])
                contract_rele(curlwp->lwp_ct_latest[type->ct_type_index]);
        curlwp->lwp_ct_latest[type->ct_type_index] = ct;

        return (0);
}

/*
 * contract_rele
 *
 * Releases a reference to a contract.  If the caller had the last
 * reference, the contract is removed from all namespaces, its
 * allocation against the max-contracts resource control is released,
 * and the contract type's free entry point is invoked for any
 * type-specific deconstruction and to (presumably) free the object.
 */
void
contract_rele(contract_t *ct)
{
        uint64_t nref;

        mutex_enter(&ct->ct_reflock);
        ASSERT(ct->ct_ref > 0);
        nref = --ct->ct_ref;
        mutex_exit(&ct->ct_reflock);
        if (nref == 0) {
                /*
                 * ct_owner is cleared when it drops its reference.
                 */
                ASSERT(ct->ct_owner == NULL);
                ASSERT(ct->ct_evcnt == 0);

                /*
                 * Remove from global contract AVL
                 */
                mutex_enter(&contract_lock);
                avl_remove(&contract_avl, ct);
                mutex_exit(&contract_lock);

                /*
                 * Remove from type AVL
                 */
                mutex_enter(&ct->ct_type->ct_type_lock);
                avl_remove(&ct->ct_type->ct_type_avl, ct);
                mutex_exit(&ct->ct_type->ct_type_lock);

                /*
                 * Release the contract's ID
                 */
                id_free(contract_ids, ct->ct_id);

                /*
                 * Release project hold
                 */
                mutex_enter(&contract_lock);
                ct->ct_proj->kpj_data.kpd_contract--;
                project_rele(ct->ct_proj);
                mutex_exit(&contract_lock);

                /*
                 * Free the contract
                 */
                contract_dtor(ct);
                ct->ct_type->ct_type_ops->contop_free(ct);
        }
}

/*
 * contract_hold
 *
 * Adds a reference to a contract
 */
void
contract_hold(contract_t *ct)
{
        mutex_enter(&ct->ct_reflock);
        ASSERT(ct->ct_ref < UINT64_MAX);
        ct->ct_ref++;
        mutex_exit(&ct->ct_reflock);
}

/*
 * contract_getzuniqid
 *
 * Get a contract's zone unique ID.  Needed because 64-bit reads and
 * writes aren't atomic on x86.  Since there are contexts where we are
 * unable to take ct_lock, we instead use ct_reflock; in actuality any
 * lock would do.
 */
uint64_t
contract_getzuniqid(contract_t *ct)
{
        uint64_t zuniqid;

        mutex_enter(&ct->ct_reflock);
        zuniqid = ct->ct_mzuniqid;
        mutex_exit(&ct->ct_reflock);

        return (zuniqid);
}

/*
 * contract_setzuniqid
 *
 * Sets a contract's zone unique ID.  See contract_getzuniqid.
 */
void
contract_setzuniqid(contract_t *ct, uint64_t zuniqid)
{
        mutex_enter(&ct->ct_reflock);
        ct->ct_mzuniqid = zuniqid;
        mutex_exit(&ct->ct_reflock);
}

/*
 * contract_abandon
 *
 * Abandons the specified contract.  If "explicit" is clear, the
 * contract was implicitly abandoned (by process exit) and should be
 * inherited if its terms allow it and its owner was a member of a
 * regent contract.  Otherwise, the contract type's abandon entry point
 * is invoked to either destroy or orphan the contract.
 */
int
contract_abandon(contract_t *ct, proc_t *p, int explicit)
{
        ct_equeue_t *q = NULL;
        contract_t *parent = &p->p_ct_process->conp_contract;
        int inherit = 0;

        VERIFY(p == curproc);

        mutex_enter(&ct->ct_lock);

        /*
         * Multiple contract locks are taken contract -> subcontract.
         * Check if the contract will be inherited so we can acquire
         * all the necessary locks before making sensitive changes.
         */
        if (!explicit && (ct->ct_flags & CTF_INHERIT) &&
            contract_process_accept(parent)) {
                mutex_exit(&ct->ct_lock);
                mutex_enter(&parent->ct_lock);
                mutex_enter(&ct->ct_lock);
                inherit = 1;
        }

        if (ct->ct_owner != p) {
                mutex_exit(&ct->ct_lock);
                if (inherit)
                        mutex_exit(&parent->ct_lock);
                return (EINVAL);
        }

        mutex_enter(&p->p_lock);
        if (explicit)
                avl_remove(&p->p_ct_held, ct);
        ct->ct_owner = NULL;
        mutex_exit(&p->p_lock);

        /*
         * Since we can't call cte_trim with the contract lock held,
         * we grab the queue pointer here.
         */
        if (p->p_ct_equeue)
                q = p->p_ct_equeue[ct->ct_type->ct_type_index];

        /*
         * contop_abandon may destroy the contract so we rely on it to
         * drop ct_lock.  We retain a reference on the contract so that
         * the cte_trim which follows functions properly.  Even though
         * cte_trim doesn't dereference the contract pointer, it is
         * still necessary to retain a reference to the contract so
         * that we don't trim events which are sent by a subsequently
         * allocated contract infortuitously located at the same address.
         */
        contract_hold(ct);

        if (inherit) {
                ct->ct_state = CTS_INHERITED;
                VERIFY(ct->ct_regent == parent);
                contract_process_take(parent, ct);

                /*
                 * We are handing off the process's reference to the
                 * parent contract.  For this reason, the order in
                 * which we drop the contract locks is also important.
                 */
                mutex_exit(&ct->ct_lock);
                mutex_exit(&parent->ct_lock);
        } else {
                ct->ct_regent = NULL;
                ct->ct_type->ct_type_ops->contop_abandon(ct);
        }

        /*
         * ct_lock has been dropped; we can safely trim the event
         * queue now.
         */
        if (q) {
                mutex_enter(&q->ctq_lock);
                cte_trim(q, ct);
                mutex_exit(&q->ctq_lock);
        }

        contract_rele(ct);

        return (0);
}

int
contract_newct(contract_t *ct)
{
        return (ct->ct_type->ct_type_ops->contop_newct(ct));
}

/*
 * contract_adopt
 *
 * Adopts a contract.  After a successful call to this routine, the
 * previously inherited contract will belong to the calling process,
 * and its events will have been appended to its new owner's process
 * bundle queue.
 */
int
contract_adopt(contract_t *ct, proc_t *p)
{
        avl_index_t where;
        ct_equeue_t *q;
        contract_t *parent;

        ASSERT(p == curproc);

        /*
         * Ensure the process has an event queue.  Checked by ASSERTs
         * below.
         */
        (void) contract_type_pbundle(ct->ct_type, p);

        mutex_enter(&ct->ct_lock);
        parent = ct->ct_regent;
        if (ct->ct_state != CTS_INHERITED ||
            &p->p_ct_process->conp_contract != parent ||
            p->p_zone->zone_uniqid != ct->ct_czuniqid) {
                mutex_exit(&ct->ct_lock);
                return (EINVAL);
        }

        /*
         * Multiple contract locks are taken contract -> subcontract.
         */
        mutex_exit(&ct->ct_lock);
        mutex_enter(&parent->ct_lock);
        mutex_enter(&ct->ct_lock);

        /*
         * It is possible that the contract was adopted by someone else
         * while its lock was dropped.  It isn't possible for the
         * contract to have been inherited by a different regent
         * contract.
         */
        if (ct->ct_state != CTS_INHERITED) {
                mutex_exit(&parent->ct_lock);
                mutex_exit(&ct->ct_lock);
                return (EBUSY);
        }
        ASSERT(ct->ct_regent == parent);

        ct->ct_state = CTS_OWNED;

        contract_process_adopt(ct, p);

        mutex_enter(&p->p_lock);
        ct->ct_owner = p;
        VERIFY(avl_find(&p->p_ct_held, ct, &where) == NULL);
        avl_insert(&p->p_ct_held, ct, where);
        mutex_exit(&p->p_lock);

        ASSERT(ct->ct_owner->p_ct_equeue);
        ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
        q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
        cte_copy(&ct->ct_events, q);
        mutex_exit(&ct->ct_lock);

        return (0);
}

/*
 * contract_ack
 *
 * Acknowledges receipt of a critical event.
 */
int
contract_ack(contract_t *ct, uint64_t evid, int ack)
{
        ct_kevent_t *ev;
        list_t *queue = &ct->ct_events.ctq_events;
        int error = ESRCH;
        int nego = 0;
        uint_t evtype;

        ASSERT(ack == CT_ACK || ack == CT_NACK);

        mutex_enter(&ct->ct_lock);
        mutex_enter(&ct->ct_events.ctq_lock);
        /*
         * We are probably ACKing something near the head of the queue.
         */
        for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
                if (ev->cte_id == evid) {
                        if (ev->cte_flags & CTE_NEG)
                                nego = 1;
                        else if (ack == CT_NACK)
                                break;
                        if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
                                ev->cte_flags |= CTE_ACK;
                                ct->ct_evcnt--;
                                evtype = ev->cte_type;
                                error = 0;
                        }
                        break;
                }
        }
        mutex_exit(&ct->ct_events.ctq_lock);
        mutex_exit(&ct->ct_lock);

        /*
         * Not all critical events are negotiation events; however, every
         * negotiation event is a critical event.  NEGEND events are
         * critical events but are not negotiation events.
         */
        if (error || !nego)
                return (error);

        if (ack == CT_ACK)
                error = ct->ct_type->ct_type_ops->contop_ack(ct, evtype, evid);
        else
                error = ct->ct_type->ct_type_ops->contop_nack(ct, evtype, evid);

        return (error);
}

/*ARGSUSED*/
int
contract_ack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
{
        cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
            ct->ct_id);
        return (ENOSYS);
}

/*ARGSUSED*/
int
contract_qack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
{
        cmn_err(CE_PANIC, "contract_qack_inval: unsupported call: ctid: %u",
            ct->ct_id);
        return (ENOSYS);
}

/*ARGSUSED*/
int
contract_qack_notsup(contract_t *ct, uint_t evtype, uint64_t evid)
{
        return (ERANGE);
}

/*
 * contract_qack
 *
 * Asks that negotiations be extended by another time quantum.
 */
int
contract_qack(contract_t *ct, uint64_t evid)
{
        ct_kevent_t *ev;
        list_t *queue = &ct->ct_events.ctq_events;
        int nego = 0;
        uint_t evtype;

        mutex_enter(&ct->ct_lock);
        mutex_enter(&ct->ct_events.ctq_lock);

        for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
                if (ev->cte_id == evid) {
                        if ((ev->cte_flags & (CTE_NEG | CTE_ACK)) == CTE_NEG) {
                                evtype = ev->cte_type;
                                nego = 1;
                        }
                        break;
                }
        }
        mutex_exit(&ct->ct_events.ctq_lock);
        mutex_exit(&ct->ct_lock);

        /*
         * Only a negotiated event (which is by definition also a critical
         * event) that has not yet been acknowledged can provide time
         * quanta to a negotiating owner process.
         */
        if (!nego)
                return (ESRCH);

        return (ct->ct_type->ct_type_ops->contop_qack(ct, evtype, evid));
}

/*
 * contract_orphan
 *
 * Icky-poo.  This is a process-contract special, used to ACK all
 * critical messages when a contract is orphaned.
 */
void
contract_orphan(contract_t *ct)
{
        ct_kevent_t *ev;
        list_t *queue = &ct->ct_events.ctq_events;

        ASSERT(MUTEX_HELD(&ct->ct_lock));
        ASSERT(ct->ct_state != CTS_ORPHAN);

        mutex_enter(&ct->ct_events.ctq_lock);
        ct->ct_state = CTS_ORPHAN;
        for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
                if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
                        ev->cte_flags |= CTE_ACK;
                        ct->ct_evcnt--;
                }
        }
        mutex_exit(&ct->ct_events.ctq_lock);

        ASSERT(ct->ct_evcnt == 0);
}

/*
 * contract_destroy
 *
 * Explicit contract destruction.  Called when the contract is empty.
 * The contract will actually stick around until all of its events are
 * removed from the bundle and process bundle queues, and all fds
 * which refer to it are closed.  See contract_dtor if you are looking
 * for what destroys the contract structure.
 */
void
contract_destroy(contract_t *ct)
{
        ASSERT(MUTEX_HELD(&ct->ct_lock));
        ASSERT(ct->ct_state != CTS_DEAD);
        ASSERT(ct->ct_owner == NULL);

        ct->ct_state = CTS_DEAD;
        cte_queue_drain(&ct->ct_events, 1);
        mutex_exit(&ct->ct_lock);
        mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
        cte_trim(&ct->ct_type->ct_type_events, ct);
        mutex_exit(&ct->ct_type->ct_type_events.ctq_lock);
        mutex_enter(&ct->ct_lock);
        ct->ct_type->ct_type_ops->contop_destroy(ct);
        mutex_exit(&ct->ct_lock);
        contract_rele(ct);
}

/*
 * contract_vnode_get
 *
 * Obtains the contract directory vnode for this contract, if there is
 * one.  The caller must VN_RELE the vnode when they are through using
 * it.
 */
vnode_t *
contract_vnode_get(contract_t *ct, vfs_t *vfsp)
{
        contract_vnode_t *ctv;
        vnode_t *vp = NULL;

        mutex_enter(&ct->ct_lock);
        for (ctv = list_head(&ct->ct_vnodes); ctv != NULL;
            ctv = list_next(&ct->ct_vnodes, ctv))
                if (ctv->ctv_vnode->v_vfsp == vfsp) {
                        vp = ctv->ctv_vnode;
                        VN_HOLD(vp);
                        break;
                }
        mutex_exit(&ct->ct_lock);
        return (vp);
}

/*
 * contract_vnode_set
 *
 * Sets the contract directory vnode for this contract.  We don't hold
 * a reference on the vnode because we don't want to prevent it from
 * being freed.  The vnode's inactive entry point will take care of
 * notifying us when it should be removed.
 */
void
contract_vnode_set(contract_t *ct, contract_vnode_t *ctv, vnode_t *vnode)
{
        mutex_enter(&ct->ct_lock);
        ctv->ctv_vnode = vnode;
        list_insert_head(&ct->ct_vnodes, ctv);
        mutex_exit(&ct->ct_lock);
}

/*
 * contract_vnode_clear
 *
 * Removes this vnode as the contract directory vnode for this
 * contract.  Called from a contract directory's inactive entry point,
 * this may return 0 indicating that the vnode gained another reference
 * because of a simultaneous call to contract_vnode_get.
 */
int
contract_vnode_clear(contract_t *ct, contract_vnode_t *ctv)
{
        vnode_t *vp = ctv->ctv_vnode;
        int result;

        mutex_enter(&ct->ct_lock);
        mutex_enter(&vp->v_lock);
        if (vp->v_count == 1) {
                list_remove(&ct->ct_vnodes, ctv);
                result = 1;
        } else {
                VN_RELE_LOCKED(vp);
                result = 0;
        }
        mutex_exit(&vp->v_lock);
        mutex_exit(&ct->ct_lock);

        return (result);
}

/*
 * contract_exit
 *
 * Abandons all contracts held by process p, and drains process p's
 * bundle queues.  Called on process exit.
 */
void
contract_exit(proc_t *p)
{
        contract_t *ct;
        void *cookie = NULL;
        int i;

        ASSERT(p == curproc);

        /*
         * Abandon held contracts.  contract_abandon knows enough not
         * to remove the contract from the list a second time.  We are
         * exiting, so no locks are needed here.  But because
         * contract_abandon will take p_lock, we need to make sure we
         * aren't holding it.
         */
        ASSERT(MUTEX_NOT_HELD(&p->p_lock));
        while ((ct = avl_destroy_nodes(&p->p_ct_held, &cookie)) != NULL)
                VERIFY(contract_abandon(ct, p, 0) == 0);

        /*
         * Drain pbundles.  Because a process bundle queue could have
         * been passed to another process, they may not be freed right
         * away.
         */
        if (p->p_ct_equeue) {
                for (i = 0; i < CTT_MAXTYPE; i++)
                        if (p->p_ct_equeue[i])
                                cte_queue_drain(p->p_ct_equeue[i], 0);
                kmem_free(p->p_ct_equeue, CTT_MAXTYPE * sizeof (ct_equeue_t *));
        }
}

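/*
 * get_time_left
 *
 * Returns the number of seconds remaining in the interval described
 * by t, clamped at zero, or -1 if t has no total time set.
 */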
static int
get_time_left(struct ct_time *t)
{
        clock_t ticks_elapsed;
        int secs_left;

        if (t->ctm_total == -1)
                return (-1);

        ticks_elapsed = ddi_get_lbolt() - t->ctm_start;
        secs_left = t->ctm_total - (drv_hztousec(ticks_elapsed)/MICROSEC);
        return (secs_left > 0 ? secs_left : 0);
}

/*
 * contract_status_common
 *
 * Populates a ct_status structure.  Used by contract types in their
 * status entry points and ctfs when only common information is
 * requested.
 */
void
contract_status_common(contract_t *ct, zone_t *zone, void *status,
    model_t model)
{
        STRUCT_HANDLE(ct_status, lstatus);

        STRUCT_SET_HANDLE(lstatus, model, status);
        ASSERT(MUTEX_HELD(&ct->ct_lock));
        if (zone->zone_uniqid == GLOBAL_ZONEUNIQID ||
            zone->zone_uniqid == ct->ct_czuniqid) {
                zone_t *czone;
                zoneid_t zoneid = -1;

                /*
                 * Contracts don't have holds on the zones they were
                 * created by.  If the contract's zone no longer
                 * exists, we say its zoneid is -1.
                 */
                if (zone->zone_uniqid == ct->ct_czuniqid ||
                    ct->ct_czuniqid == GLOBAL_ZONEUNIQID) {
                        zoneid = ct->ct_zoneid;
                } else if ((czone = zone_find_by_id(ct->ct_zoneid)) != NULL) {
                        if (czone->zone_uniqid == ct->ct_mzuniqid)
                                zoneid = ct->ct_zoneid;
                        zone_rele(czone);
                }

                STRUCT_FSET(lstatus, ctst_zoneid, zoneid);
                STRUCT_FSET(lstatus, ctst_holder,
                    (ct->ct_state == CTS_OWNED) ? ct->ct_owner->p_pid :
                    (ct->ct_state == CTS_INHERITED) ? ct->ct_regent->ct_id : 0);
                STRUCT_FSET(lstatus, ctst_state, ct->ct_state);
        } else {
                /*
                 * We are looking at a contract which was created by a
                 * process outside of our zone.  We provide fake zone,
                 * holder, and state information.
                 */

                STRUCT_FSET(lstatus, ctst_zoneid, zone->zone_id);
                /*
                 * Since "zone" can't disappear until the calling ctfs
                 * is unmounted, zone_zsched must be valid.
                 */
                STRUCT_FSET(lstatus, ctst_holder, (ct->ct_state < CTS_ORPHAN) ?
                    zone->zone_zsched->p_pid : 0);
                STRUCT_FSET(lstatus, ctst_state, (ct->ct_state < CTS_ORPHAN) ?
                    CTS_OWNED : ct->ct_state);
        }
        STRUCT_FSET(lstatus, ctst_nevents, ct->ct_evcnt);
        STRUCT_FSET(lstatus, ctst_ntime, get_time_left(&ct->ct_ntime));
        STRUCT_FSET(lstatus, ctst_qtime, get_time_left(&ct->ct_qtime));
        STRUCT_FSET(lstatus, ctst_nevid,
            ct->ct_nevent ? ct->ct_nevent->cte_id : 0);
        STRUCT_FSET(lstatus, ctst_critical, ct->ct_ev_crit);
        STRUCT_FSET(lstatus, ctst_informative, ct->ct_ev_info);
        STRUCT_FSET(lstatus, ctst_cookie, ct->ct_cookie);
        STRUCT_FSET(lstatus, ctst_type, ct->ct_type->ct_type_index);
        STRUCT_FSET(lstatus, ctst_id, ct->ct_id);
}

/*
 * contract_checkcred
 *
 * Determines if the specified contract is owned by a process with the
 * same effective uid as the specified credential.  The caller must
 * ensure that the uid spaces are the same.  Returns 1 on success.
 */
static int
contract_checkcred(contract_t *ct, const cred_t *cr)
{
        proc_t *p;
        int fail = 1;

        mutex_enter(&ct->ct_lock);
        if ((p = ct->ct_owner) != NULL) {
                mutex_enter(&p->p_crlock);
                fail = crgetuid(cr) != crgetuid(p->p_cred);
                mutex_exit(&p->p_crlock);
        }
        mutex_exit(&ct->ct_lock);

        return (!fail);
}

/*
 * contract_owned
 *
 * Determines if the specified credential can view an event generated
 * by the specified contract.  If locked is set, the contract's ct_lock
 * is held and the caller will need to do additional work to determine
 * if they truly can see the event.  Returns 1 on success.
 */
int
contract_owned(contract_t *ct, const cred_t *cr, int locked)
{
        int owner, cmatch, zmatch;
        uint64_t zuniqid, mzuniqid;
        uid_t euid;

        ASSERT(locked || MUTEX_NOT_HELD(&ct->ct_lock));

        zuniqid = curproc->p_zone->zone_uniqid;
        mzuniqid = contract_getzuniqid(ct);
        euid = crgetuid(cr);

        /*
         * owner: we own the contract
         * cmatch: we are in the creator's (and holder's) zone and our
         *   uid matches the creator's or holder's
         * zmatch: we are in the effective zone of a contract created
         *   in the global zone, and our uid matches that of the
         *   virtualized holder's (zsched/kcred)
         */
        owner = (ct->ct_owner == curproc);
        cmatch = (zuniqid == ct->ct_czuniqid) &&
            ((ct->ct_cuid == euid) || (!locked && contract_checkcred(ct, cr)));
        zmatch = (ct->ct_czuniqid != mzuniqid) && (zuniqid == mzuniqid) &&
            (crgetuid(kcred) == euid);

        return (owner || cmatch || zmatch);
}


/*
 * contract_type_init
 *
 * Called by contract types to register themselves with the contracts
 * framework.
 */
ct_type_t *
contract_type_init(ct_typeid_t type, const char *name, contops_t *ops,
    ct_f_default_t *dfault)
{
        ct_type_t *result;

        ASSERT(type < CTT_MAXTYPE);

        result = kmem_alloc(sizeof (ct_type_t), KM_SLEEP);

        mutex_init(&result->ct_type_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&result->ct_type_avl, contract_compar, sizeof (contract_t),
            offsetof(contract_t, ct_cttavl));
        cte_queue_create(&result->ct_type_events, CTEL_BUNDLE, 20, 0);
        result->ct_type_name = name;
        result->ct_type_ops = ops;
        result->ct_type_default = dfault;
        result->ct_type_evid = 0;
        gethrestime(&result->ct_type_timestruc);
        result->ct_type_index = type;

        ct_types[type] = result;

        return (result);
}

/*
 * contract_type_count
 *
 * Obtains the number of contracts of a particular type.
 */
int
contract_type_count(ct_type_t *type)
{
        ulong_t count;

        mutex_enter(&type->ct_type_lock);
        count = avl_numnodes(&type->ct_type_avl);
        mutex_exit(&type->ct_type_lock);

        return (count);
}

/*
 * contract_type_max
 *
 * Obtains the maximum contract id of a particular type.
 */
ctid_t
contract_type_max(ct_type_t *type)
{
        contract_t *ct;
        ctid_t res;

        mutex_enter(&type->ct_type_lock);
        ct = avl_last(&type->ct_type_avl);
        res = ct ? ct->ct_id : -1;
        mutex_exit(&type->ct_type_lock);

        return (res);
}

/*
 * contract_max
 *
 * Obtains the maximum contract id.
 */
ctid_t
contract_max(void)
{
        contract_t *ct;
        ctid_t res;

        mutex_enter(&contract_lock);
        ct = avl_last(&contract_avl);
        res = ct ? ct->ct_id : -1;
        mutex_exit(&contract_lock);

        return (res);
}

/*
 * contract_lookup_common
 *
 * Common code for contract_lookup and contract_type_lookup.  Takes a
 * pointer to an AVL tree to search in.  Should be called with the
 * appropriate tree-protecting lock held (unfortunately unassertable).
 */
static ctid_t
contract_lookup_common(avl_tree_t *tree, uint64_t zuniqid, ctid_t current)
{
        contract_t template, *ct;
        avl_index_t where;
        ctid_t res;

        template.ct_id = current;
        ct = avl_find(tree, &template, &where);
        if (ct == NULL)
                ct = avl_nearest(tree, where, AVL_AFTER);
        if (zuniqid != GLOBAL_ZONEUNIQID)
                while (ct && (contract_getzuniqid(ct) != zuniqid))
                        ct = AVL_NEXT(tree, ct);
        res = ct ? ct->ct_id : -1;

        return (res);
}

/*
 * contract_type_lookup
 *
 * Returns the next contract of the given type after the specified id,
 * visible from the specified zone.
 */
ctid_t
contract_type_lookup(ct_type_t *type, uint64_t zuniqid, ctid_t current)
{
        ctid_t res;

        mutex_enter(&type->ct_type_lock);
        res = contract_lookup_common(&type->ct_type_avl, zuniqid, current);
        mutex_exit(&type->ct_type_lock);

        return (res);
}

/*
 * contract_lookup
 *
 * Returns the next contract after the specified id, visible from the
 * specified zone.
 */
ctid_t
contract_lookup(uint64_t zuniqid, ctid_t current)
{
        ctid_t res;

        mutex_enter(&contract_lock);
        res = contract_lookup_common(&contract_avl, zuniqid, current);
        mutex_exit(&contract_lock);

        return (res);
}

/*
 * contract_plookup
 *
 * Returns the next contract held by process p after the specified id,
 * visible from the specified zone.  Made complicated by the fact that
 * contracts visible in a zone but held by processes outside of the
 * zone need to appear as being held by zsched to zone members.
 */
ctid_t
contract_plookup(proc_t *p, ctid_t current, uint64_t zuniqid)
{
        contract_t template, *ct;
        avl_index_t where;
        ctid_t res;

        template.ct_id = current;
        if (zuniqid != GLOBAL_ZONEUNIQID &&
            (p->p_flag & (SSYS|SZONETOP)) == (SSYS|SZONETOP)) {
                /* This is inelegant. */
                mutex_enter(&contract_lock);
                ct = avl_find(&contract_avl, &template, &where);
                if (ct == NULL)
                        ct = avl_nearest(&contract_avl, where, AVL_AFTER);
                while (ct && !(ct->ct_state < CTS_ORPHAN &&
                    contract_getzuniqid(ct) == zuniqid &&
                    ct->ct_czuniqid == GLOBAL_ZONEUNIQID))
                        ct = AVL_NEXT(&contract_avl, ct);
                res = ct ? ct->ct_id : -1;
                mutex_exit(&contract_lock);
        } else {
                mutex_enter(&p->p_lock);
                ct = avl_find(&p->p_ct_held, &template, &where);
                if (ct == NULL)
                        ct = avl_nearest(&p->p_ct_held, where, AVL_AFTER);
                res = ct ? ct->ct_id : -1;
                mutex_exit(&p->p_lock);
        }

        return (res);
}

/*
 * contract_ptr_common
 *
 * Common code for contract_ptr and contract_type_ptr.  Takes a pointer
 * to an AVL tree to search in.  Should be called with the appropriate
 * tree-protecting lock held (unfortunately unassertable).
 */
static contract_t *
contract_ptr_common(avl_tree_t *tree, ctid_t id, uint64_t zuniqid)
{
        contract_t template, *ct;

        template.ct_id = id;
        ct = avl_find(tree, &template, NULL);
        if (ct == NULL || (zuniqid != GLOBAL_ZONEUNIQID &&
            contract_getzuniqid(ct) != zuniqid)) {
                return (NULL);
        }

        /*
         * Check to see if a thread is in the window in contract_rele
         * between dropping the reference count and removing the
         * contract from the type AVL.
         */
        mutex_enter(&ct->ct_reflock);
        if (ct->ct_ref) {
                ct->ct_ref++;
                mutex_exit(&ct->ct_reflock);
        } else {
                mutex_exit(&ct->ct_reflock);
                ct = NULL;
        }

        return (ct);
}

/*
 * contract_type_ptr
 *
 * Returns a pointer to the contract with the specified id.  The
 * contract is held, so the caller needs to release the reference when
 * it is through with the contract.
 */
contract_t *
contract_type_ptr(ct_type_t *type, ctid_t id, uint64_t zuniqid)
{
        contract_t *ct;

        mutex_enter(&type->ct_type_lock);
        ct = contract_ptr_common(&type->ct_type_avl, id, zuniqid);
        mutex_exit(&type->ct_type_lock);

        return (ct);
}

/*
 * contract_ptr
 *
 * Returns a pointer to the contract with the specified id.  The
 * contract is held, so the caller needs to release the reference when
 * it is through with the contract.
 */
contract_t *
contract_ptr(ctid_t id, uint64_t zuniqid)
{
        contract_t *ct;

        mutex_enter(&contract_lock);
        ct = contract_ptr_common(&contract_avl, id, zuniqid);
        mutex_exit(&contract_lock);

        return (ct);
}

/*
 * contract_type_time
 *
 * Obtains the last time a contract of a particular type was created.
 */
void
contract_type_time(ct_type_t *type, timestruc_t *time)
{
        mutex_enter(&type->ct_type_lock);
        *time = type->ct_type_timestruc;
        mutex_exit(&type->ct_type_lock);
}

/*
 * contract_type_bundle
 *
 * Obtains a type's bundle queue.
 */
ct_equeue_t *
contract_type_bundle(ct_type_t *type)
{
        return (&type->ct_type_events);
}

/*
 * contract_type_pbundle
 *
 * Obtains a process's bundle queue.  If one doesn't exist, one is
 * created.  Often used simply to ensure that a bundle queue is
 * allocated.
 */
ct_equeue_t *
contract_type_pbundle(ct_type_t *type, proc_t *pp)
{
        /*
         * If there isn't an array of bundle queues, allocate one.
         */
        if (pp->p_ct_equeue == NULL) {
                size_t size = CTT_MAXTYPE * sizeof (ct_equeue_t *);
                ct_equeue_t **qa = kmem_zalloc(size, KM_SLEEP);

                mutex_enter(&pp->p_lock);
                if (pp->p_ct_equeue)
                        kmem_free(qa, size);
                else
                        pp->p_ct_equeue = qa;
                mutex_exit(&pp->p_lock);
        }

        /*
         * If there isn't a bundle queue of the required type, allocate
         * one.
         */
        if (pp->p_ct_equeue[type->ct_type_index] == NULL) {
                ct_equeue_t *q = kmem_zalloc(sizeof (ct_equeue_t), KM_SLEEP);
                cte_queue_create(q, CTEL_PBUNDLE, 20, 1);

                mutex_enter(&pp->p_lock);
                if (pp->p_ct_equeue[type->ct_type_index])
                        cte_queue_drain(q, 0);
                else
                        pp->p_ct_equeue[type->ct_type_index] = q;
                mutex_exit(&pp->p_lock);
        }

        return (pp->p_ct_equeue[type->ct_type_index]);
}

/*
 * ctparam_copyin
 *
 * copyin a ct_param_t for CT_TSET or CT_TGET commands.
 * If ctparam_copyout() is not called after ctparam_copyin(), then
 * the caller must kmem_free() the buffer pointed to by kparam->ctpm_kbuf.
 *
 * The copyin/out of ct_param_t is not done in ctmpl_set() and ctmpl_get()
 * because prctioctl() calls ctmpl_set() and ctmpl_get() while holding a
 * process lock.
 */
int
ctparam_copyin(const void *uaddr, ct_kparam_t *kparam, int flag, int cmd)
{
        uint32_t size;
        void *ubuf;
        ct_param_t *param = &kparam->param;
        STRUCT_DECL(ct_param, uarg);

        STRUCT_INIT(uarg, flag);
        if (copyin(uaddr, STRUCT_BUF(uarg), STRUCT_SIZE(uarg)))
                return (EFAULT);
        size = STRUCT_FGET(uarg, ctpm_size);
        ubuf = STRUCT_FGETP(uarg, ctpm_value);

        if (size > CT_PARAM_MAX_SIZE || size == 0)
                return (EINVAL);

        kparam->ctpm_kbuf = kmem_alloc(size, KM_SLEEP);
        if (cmd == CT_TSET) {
                if (copyin(ubuf, kparam->ctpm_kbuf, size)) {
                        kmem_free(kparam->ctpm_kbuf, size);
                        return (EFAULT);
                }
        }
        param->ctpm_id = STRUCT_FGET(uarg, ctpm_id);
        param->ctpm_size = size;
        param->ctpm_value = ubuf;
        kparam->ret_size = 0;

        return (0);
}

/*
 * ctparam_copyout
 *
 * copyout a ct_kparam_t and free the buffer pointed to by the
 * ctpm_kbuf member of the ct_kparam_t.
 */
int
ctparam_copyout(ct_kparam_t *kparam, void *uaddr, int flag)
{
        int r = 0;
        ct_param_t *param = &kparam->param;
        STRUCT_DECL(ct_param, uarg);

        STRUCT_INIT(uarg, flag);

        STRUCT_FSET(uarg, ctpm_id, param->ctpm_id);
        STRUCT_FSET(uarg, ctpm_size, kparam->ret_size);
        STRUCT_FSETP(uarg, ctpm_value, param->ctpm_value);
        if (copyout(STRUCT_BUF(uarg), uaddr, STRUCT_SIZE(uarg))) {
                r = EFAULT;
                goto error;
        }
        if (copyout(kparam->ctpm_kbuf, param->ctpm_value,
            MIN(kparam->ret_size, param->ctpm_size))) {
                r = EFAULT;
        }

error:
        kmem_free(kparam->ctpm_kbuf, param->ctpm_size);

        return (r);
}
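
/*
 * As a minimal sketch of how a caller might pair these routines for a
 * CT_TGET request (illustrative only; the names follow the parameters
 * used above):
 *
 *      ct_kparam_t kparam;
 *      int error;
 *
 *      error = ctparam_copyin(uaddr, &kparam, flag, CT_TGET);
 *      if (error != 0)
 *              return (error);
 *      error = ctmpl_get(template, &kparam);
 *      if (error != 0)
 *              kmem_free(kparam.ctpm_kbuf, kparam.param.ctpm_size);
 *      else
 *              error = ctparam_copyout(&kparam, uaddr, flag);
 *      return (error);
 */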

/*
 * ctmpl_free
 *
 * Frees a template.
 */
void
ctmpl_free(ct_template_t *template)
{
        mutex_destroy(&template->ctmpl_lock);
        template->ctmpl_ops->ctop_free(template);
}

/*
 * ctmpl_dup
 *
 * Creates a copy of a template.
 */
ct_template_t *
ctmpl_dup(ct_template_t *template)
{
        ct_template_t *new;

        if (template == NULL)
                return (NULL);

        new = template->ctmpl_ops->ctop_dup(template);
        /*
         * ctmpl_lock was taken by ctop_dup's call to ctmpl_copy and
         * should have remained held until now.
         */
        mutex_exit(&template->ctmpl_lock);

        return (new);
}

/*
 * ctmpl_set
 *
 * Sets the requested terms of a template.
 */
int
ctmpl_set(ct_template_t *template, ct_kparam_t *kparam, const cred_t *cr)
{
        int result = 0;
        ct_param_t *param = &kparam->param;
        uint64_t param_value;

        param_value = 0;
        if (param->ctpm_id == CTP_COOKIE ||
            param->ctpm_id == CTP_EV_INFO ||
            param->ctpm_id == CTP_EV_CRITICAL) {
                if (param->ctpm_size < sizeof (uint64_t)) {
                        return (EINVAL);
                } else {
                        param_value = *(uint64_t *)kparam->ctpm_kbuf;
                }
        }

        mutex_enter(&template->ctmpl_lock);
        switch (param->ctpm_id) {
        case CTP_COOKIE:
                template->ctmpl_cookie = param_value;
                break;
        case CTP_EV_INFO:
                if (param_value & ~(uint64_t)template->ctmpl_ops->allevents)
                        result = EINVAL;
                else
                        template->ctmpl_ev_info = param_value;
                break;
        case CTP_EV_CRITICAL:
                if (param_value & ~(uint64_t)template->ctmpl_ops->allevents) {
                        result = EINVAL;
                        break;
                } else if ((~template->ctmpl_ev_crit & param_value) == 0) {
                        /*
                         * Assume that a pure reduction of the critical
                         * set is allowed by the contract type.
                         */
                        template->ctmpl_ev_crit = param_value;
                        break;
                }
                /*
                 * There may be restrictions on what we can make
                 * critical, so we defer to the judgement of the
                 * contract type.
                 */
                /* FALLTHROUGH */
        default:
                result = template->ctmpl_ops->ctop_set(template, kparam, cr);
        }
        mutex_exit(&template->ctmpl_lock);

        return (result);
}

/*
 * ctmpl_get
 *
 * Obtains the requested terms from a template.
 *
 * If the term requested is a variable-sized term and the buffer
 * provided is too small for the data, we truncate the data and return
 * the buffer size necessary to fit the term in kparam->ret_size. If the
 * term requested is fixed-size (uint64_t) and the buffer provided is too
 * small, we return EINVAL.  This should never happen if you're using
 * libcontract(3LIB), only if you call ioctl with a hand constructed
 * ct_param_t argument.
 *
 * Currently, only contract-specific parameters are variable-sized.
 */
int
ctmpl_get(ct_template_t *template, ct_kparam_t *kparam)
{
        int result = 0;
        ct_param_t *param = &kparam->param;
        uint64_t *param_value;

        param_value = NULL;
        if (param->ctpm_id == CTP_COOKIE ||
            param->ctpm_id == CTP_EV_INFO ||
            param->ctpm_id == CTP_EV_CRITICAL) {
                if (param->ctpm_size < sizeof (uint64_t)) {
                        return (EINVAL);
                } else {
                        param_value = kparam->ctpm_kbuf;
                        kparam->ret_size = sizeof (uint64_t);
                }
        }

        mutex_enter(&template->ctmpl_lock);
        switch (param->ctpm_id) {
        case CTP_COOKIE:
                if (param_value != NULL)
                        *param_value = template->ctmpl_cookie;
                break;
        case CTP_EV_INFO:
                if (param_value != NULL)
                        *param_value = template->ctmpl_ev_info;
                break;
        case CTP_EV_CRITICAL:
                if (param_value != NULL)
                        *param_value = template->ctmpl_ev_crit;
                break;
        default:
                result = template->ctmpl_ops->ctop_get(template, kparam);
        }
        mutex_exit(&template->ctmpl_lock);

        return (result);
}

/*
 * ctmpl_makecurrent
 *
 * Used by ctmpl_activate and ctmpl_clear to set the current thread's
 * active template.  Frees the old active template, if there was one.
 */
static void
ctmpl_makecurrent(ct_template_t *template, ct_template_t *new)
{
        klwp_t *curlwp = ttolwp(curthread);
        proc_t *p = curproc;
        ct_template_t *old;

        mutex_enter(&p->p_lock);
        old = curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index];
        curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index] = new;
        mutex_exit(&p->p_lock);

        if (old)
                ctmpl_free(old);
}

/*
 * ctmpl_activate
 *
 * Copies the specified template as the current thread's active
 * template of that type.
 */
void
ctmpl_activate(ct_template_t *template)
{
        ctmpl_makecurrent(template, ctmpl_dup(template));
}

/*
 * ctmpl_clear
 *
 * Clears the current thread's active template of the same type as
 * the specified template.
 */
void
ctmpl_clear(ct_template_t *template)
{
        ctmpl_makecurrent(template, NULL);
}

/*
 * ctmpl_create
 *
 * Creates a new contract using the specified template.
 */
int
ctmpl_create(ct_template_t *template, ctid_t *ctidp)
{
        return (template->ctmpl_ops->ctop_create(template, ctidp));
}

/*
 * ctmpl_init
 *
 * Initializes the common portion of a new contract template.
 */
void
ctmpl_init(ct_template_t *new, ctmplops_t *ops, ct_type_t *type, void *data)
{
        mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
        new->ctmpl_ops = ops;
        new->ctmpl_type = type;
        new->ctmpl_data = data;
        new->ctmpl_ev_info = new->ctmpl_ev_crit = 0;
        new->ctmpl_cookie = 0;
}

/*
 * ctmpl_copy
 *
 * Copies the common portions of a contract template.  Intended for use
 * by a contract type's ctop_dup template op.  Returns with the old
 * template's lock held, which should remain held until the
 * template op returns (it is dropped by ctmpl_dup).
 */
void
ctmpl_copy(ct_template_t *new, ct_template_t *old)
{
        mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_enter(&old->ctmpl_lock);
        new->ctmpl_ops = old->ctmpl_ops;
        new->ctmpl_type = old->ctmpl_type;
        new->ctmpl_ev_crit = old->ctmpl_ev_crit;
        new->ctmpl_ev_info = old->ctmpl_ev_info;
        new->ctmpl_cookie = old->ctmpl_cookie;
}

/*
 * ctmpl_create_inval
 *
 * Returns EINVAL.  Provided for the convenience of those contract
 * types which don't support ct_tmpl_create(3contract) and would
 * otherwise need to create their own stub for the ctop_create template
 * op.
 */
/*ARGSUSED*/
int
ctmpl_create_inval(ct_template_t *template, ctid_t *ctidp)
{
        return (EINVAL);
}


/*
 * cte_queue_create
 *
 * Initializes a queue of a particular type.  If dynamic is set, the
 * queue is to be freed when its last listener is removed after being
 * drained.
 */
static void
cte_queue_create(ct_equeue_t *q, ct_listnum_t list, int maxinf, int dynamic)
{
        mutex_init(&q->ctq_lock, NULL, MUTEX_DEFAULT, NULL);
        q->ctq_listno = list;
        list_create(&q->ctq_events, sizeof (ct_kevent_t),
            offsetof(ct_kevent_t, cte_nodes[list].ctm_node));
        list_create(&q->ctq_listeners, sizeof (ct_listener_t),
            offsetof(ct_listener_t, ctl_allnode));
        list_create(&q->ctq_tail, sizeof (ct_listener_t),
            offsetof(ct_listener_t, ctl_tailnode));
        gethrestime(&q->ctq_atime);
        q->ctq_nlisteners = 0;
        q->ctq_nreliable = 0;
        q->ctq_ninf = 0;
        q->ctq_max = maxinf;

        /*
         * Bundle queues and contract queues are embedded in other
 * structures and are implicitly reference counted by virtue
         * of their vnodes' indirect hold on their contracts.  Process
         * bundle queues are dynamically allocated and may persist
         * after the death of the process, so they must be explicitly
         * reference counted.
         */
        q->ctq_flags = dynamic ? CTQ_REFFED : 0;
}

/*
 * cte_queue_destroy
 *
 * Destroys the specified queue.  The queue is freed if it is
 * reference counted.
 */
static void
cte_queue_destroy(ct_equeue_t *q)
{
        ASSERT(q->ctq_flags & CTQ_DEAD);
        ASSERT(q->ctq_nlisteners == 0);
        ASSERT(q->ctq_nreliable == 0);
        list_destroy(&q->ctq_events);
        list_destroy(&q->ctq_listeners);
        list_destroy(&q->ctq_tail);
        mutex_destroy(&q->ctq_lock);
        if (q->ctq_flags & CTQ_REFFED)
                kmem_free(q, sizeof (ct_equeue_t));
}

/*
 * cte_hold
 *
 * Takes a hold on the specified event.
 */
static void
cte_hold(ct_kevent_t *e)
{
        mutex_enter(&e->cte_lock);
        ASSERT(e->cte_refs > 0);
        e->cte_refs++;
        mutex_exit(&e->cte_lock);
}

/*
 * cte_rele
 *
 * Releases a hold on the specified event.  If the caller had the last
 * reference, frees the event and releases its hold on the contract
 * that generated it.
 */
static void
cte_rele(ct_kevent_t *e)
{
        mutex_enter(&e->cte_lock);
        ASSERT(e->cte_refs > 0);
        if (--e->cte_refs) {
                mutex_exit(&e->cte_lock);
                return;
        }

        contract_rele(e->cte_contract);

        mutex_destroy(&e->cte_lock);
        nvlist_free(e->cte_data);
        nvlist_free(e->cte_gdata);
        kmem_free(e, sizeof (ct_kevent_t));
}

/*
 * cte_qrele
 *
 * Remove this listener's hold on the specified event, removing and
 * releasing the queue's hold on the event if appropriate.
 */
static void
cte_qrele(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
{
        ct_member_t *member = &e->cte_nodes[q->ctq_listno];

        ASSERT(MUTEX_HELD(&q->ctq_lock));

        if (l->ctl_flags & CTLF_RELIABLE)
                member->ctm_nreliable--;
        if ((--member->ctm_refs == 0) && member->ctm_trimmed) {
                member->ctm_trimmed = 0;
                list_remove(&q->ctq_events, e);
                cte_rele(e);
        }
}

/*
 * cte_qmove
 *
 * Move this listener to the specified event in the queue.
 */
static ct_kevent_t *
cte_qmove(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
{
        ct_kevent_t *olde;

        ASSERT(MUTEX_HELD(&q->ctq_lock));
        ASSERT(l->ctl_equeue == q);

        if ((olde = l->ctl_position) == NULL)
                list_remove(&q->ctq_tail, l);

        while (e != NULL && e->cte_nodes[q->ctq_listno].ctm_trimmed)
                e = list_next(&q->ctq_events, e);

        if (e != NULL) {
                e->cte_nodes[q->ctq_listno].ctm_refs++;
                if (l->ctl_flags & CTLF_RELIABLE)
                        e->cte_nodes[q->ctq_listno].ctm_nreliable++;
        } else {
                list_insert_tail(&q->ctq_tail, l);
        }

        l->ctl_position = e;
        if (olde)
                cte_qrele(q, l, olde);

        return (e);
}

/*
 * cte_checkcred
 *
 * Determines if the specified event's contract is owned by a process
 * with the same effective uid as the specified credential.  Called
 * after a failed call to contract_owned with locked set.  Because it
 * drops the queue lock, its caller (cte_qreadable) needs to make sure
 * we're still in the same place after we return.  Returns 1 on
 * success.
 */
static int
cte_checkcred(ct_equeue_t *q, ct_kevent_t *e, const cred_t *cr)
{
        int result;
        contract_t *ct = e->cte_contract;

        cte_hold(e);
        mutex_exit(&q->ctq_lock);
        result = curproc->p_zone->zone_uniqid == ct->ct_czuniqid &&
            contract_checkcred(ct, cr);
        mutex_enter(&q->ctq_lock);
        cte_rele(e);

        return (result);
}

/*
 * cte_qreadable
 *
 * Ensures that the listener is pointing to a valid event that the
 * caller has the credentials to read.  Returns 0 if we can read the
 * event we're pointing to.
 */
static int
cte_qreadable(ct_equeue_t *q, ct_listener_t *l, const cred_t *cr,
    uint64_t zuniqid, int crit)
{
        ct_kevent_t *e, *next;
        contract_t *ct;

        ASSERT(MUTEX_HELD(&q->ctq_lock));
        ASSERT(l->ctl_equeue == q);

        if (l->ctl_flags & CTLF_COPYOUT)
                return (1);

        next = l->ctl_position;
        while (e = cte_qmove(q, l, next)) {
                ct = e->cte_contract;
                /*
                 * Check obvious things first.  If we are looking for a
                 * critical message, is this one?  If we aren't in the
                 * global zone, is this message meant for us?
                 */
                if ((crit && (e->cte_flags & (CTE_INFO | CTE_ACK))) ||
                    (cr != NULL && zuniqid != GLOBAL_ZONEUNIQID &&
                    zuniqid != contract_getzuniqid(ct))) {

                        next = list_next(&q->ctq_events, e);

                /*
                 * Next, see if our effective uid equals that of the owner
                 * or author of the contract.  Since we are holding the
                 * queue lock, contract_owned can't always check if we
                 * have the same effective uid as the contract's
                 * owner.  If it comes to that, it fails and we take
                 * the slow(er) path.
                 */
                } else if (cr != NULL && !contract_owned(ct, cr, B_TRUE)) {

                        /*
                         * At this point we either don't have any claim
                         * to this contract or we match the effective
                         * uid of the owner but couldn't tell.  We
                         * first test for a NULL holder so that events
                         * from orphans and inherited contracts avoid
                         * the penalty phase.
                         */
                        if (e->cte_contract->ct_owner == NULL &&
                            !secpolicy_contract_observer_choice(cr))
                                next = list_next(&q->ctq_events, e);

                        /*
                         * cte_checkcred will juggle locks to see if we
                         * have the same uid as the event's contract's
                         * current owner.  If it succeeds, we have to
                         * make sure we are in the same point in the
                         * queue.
                         */
                        else if (cte_checkcred(q, e, cr) &&
                            l->ctl_position == e)
                                break;

                        /*
                         * cte_checkcred failed; see if we're in the
                         * same place.
                         */
                        else if (l->ctl_position == e)
                                if (secpolicy_contract_observer_choice(cr))
                                        break;
                                else
                                        next = list_next(&q->ctq_events, e);

                        /*
                         * cte_checkcred failed, and our position was
                         * changed.  Start from there.
                         */
                        else
                                next = l->ctl_position;
                } else {
                        break;
                }
        }

        /*
         * We check for CTLF_COPYOUT again in case we dropped the queue
         * lock in cte_checkcred.
         */
        return ((l->ctl_flags & CTLF_COPYOUT) || (l->ctl_position == NULL));
}

/*
 * cte_qwakeup
 *
 * Wakes up any waiting listeners and points them at the specified event.
 */
static void
cte_qwakeup(ct_equeue_t *q, ct_kevent_t *e)
{
        ct_listener_t *l;

        ASSERT(MUTEX_HELD(&q->ctq_lock));

        while (l = list_head(&q->ctq_tail)) {
                list_remove(&q->ctq_tail, l);
                e->cte_nodes[q->ctq_listno].ctm_refs++;
                if (l->ctl_flags & CTLF_RELIABLE)
                        e->cte_nodes[q->ctq_listno].ctm_nreliable++;
                l->ctl_position = e;
                cv_signal(&l->ctl_cv);
                pollwakeup(&l->ctl_pollhead, POLLIN);
        }
}

/*
 * cte_copy
 *
 * Copies events from the specified contract event queue to the
 * end of the specified process bundle queue.  Only called from
 * contract_adopt.
 *
 * We copy to the end of the target queue instead of mixing the events
 * in their proper order because otherwise the act of adopting a
 * contract would require a process to reset all process bundle
 * listeners it needed to see the new events.  This would, in turn,
 * require the process to keep track of which preexisting events had
 * already been processed.
 */
static void
cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
{
        ct_kevent_t *e, *first = NULL;

        VERIFY(q->ctq_listno == CTEL_CONTRACT);
        VERIFY(newq->ctq_listno == CTEL_PBUNDLE);

        mutex_enter(&q->ctq_lock);
        mutex_enter(&newq->ctq_lock);

        /*
         * For now, only copy critical events.
         */
        for (e = list_head(&q->ctq_events); e != NULL;
            e = list_next(&q->ctq_events, e)) {
                if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
                        if (first == NULL)
                                first = e;
                        /*
                         * It is possible for adoption to race with an owner's
                         * cte_publish_all(); we must only enqueue events that
                         * have not already been enqueued.
                         */
                        if (!list_link_active((list_node_t *)
                            ((uintptr_t)e + newq->ctq_events.list_offset))) {
                                list_insert_tail(&newq->ctq_events, e);
                                cte_hold(e);
                        }
                }
        }

        mutex_exit(&q->ctq_lock);

        if (first)
                cte_qwakeup(newq, first);

        mutex_exit(&newq->ctq_lock);
}

/*
 * cte_trim
 *
 * Trims unneeded events from an event queue.  Algorithm works as
 * follows:
 *
 *   Removes all informative and acknowledged critical events until the
 *   first referenced event is found.
 *
 *   If a contract is specified, removes all events (regardless of
 *   acknowledgement) generated by that contract until the first event
 *   referenced by a reliable listener is found.  Referenced events are
 *   removed by marking them "trimmed".  Such events will be removed
 *   when the last reference is dropped and will be skipped by future
 *   listeners.
 *
 * This is pretty basic.  Ideally this would also remove events from
 * the middle of the list (i.e. beyond the first referenced event), and
 * even referenced events themselves.
 */
static void
cte_trim(ct_equeue_t *q, contract_t *ct)
{
        ct_kevent_t *e, *next;
        int flags, stopper;
        int start = 1;

        VERIFY(MUTEX_HELD(&q->ctq_lock));

        for (e = list_head(&q->ctq_events); e != NULL; e = next) {
                next = list_next(&q->ctq_events, e);
                flags = e->cte_flags;
                stopper = (q->ctq_listno != CTEL_PBUNDLE) &&
                    (e->cte_nodes[q->ctq_listno].ctm_nreliable > 0);
                if (e->cte_nodes[q->ctq_listno].ctm_refs == 0) {
                        if ((start && (flags & (CTE_INFO | CTE_ACK))) ||
                            (e->cte_contract == ct)) {
                                /*
                                 * Toss informative and ACKed critical messages.
                                 */
                                list_remove(&q->ctq_events, e);
                                cte_rele(e);
                        }
                } else if ((e->cte_contract == ct) && !stopper) {
                        ASSERT(q->ctq_nlisteners != 0);
                        e->cte_nodes[q->ctq_listno].ctm_trimmed = 1;
                } else if (ct && !stopper) {
                        start = 0;
                } else {
                        /*
                         * Don't free messages past the first reader.
                         */
                        break;
                }
        }
}

/*
 * cte_queue_drain
 *
 * Drain all events from the specified queue, and mark it dead.  If
 * "ack" is set, acknowledge any critical events we find along the
 * way.
 */
static void
cte_queue_drain(ct_equeue_t *q, int ack)
{
        ct_kevent_t *e, *next;
        ct_listener_t *l;

        mutex_enter(&q->ctq_lock);

        for (e = list_head(&q->ctq_events); e != NULL; e = next) {
                next = list_next(&q->ctq_events, e);
                if (ack && ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0)) {
                        /*
                         * Make sure critical messages are eventually
                         * removed from the bundle queues.
                         */
                        mutex_enter(&e->cte_lock);
                        e->cte_flags |= CTE_ACK;
                        mutex_exit(&e->cte_lock);
                        ASSERT(MUTEX_HELD(&e->cte_contract->ct_lock));
                        e->cte_contract->ct_evcnt--;
                }
                list_remove(&q->ctq_events, e);
                e->cte_nodes[q->ctq_listno].ctm_refs = 0;
                e->cte_nodes[q->ctq_listno].ctm_nreliable = 0;
                e->cte_nodes[q->ctq_listno].ctm_trimmed = 0;
                cte_rele(e);
        }

        /*
         * This is necessary only because of CTEL_PBUNDLE listeners;
         * the events they point to can move from one pbundle to
         * another.  Fortunately, this only happens if the contract is
         * inherited, which (in turn) only happens if the process
         * exits, which means it's an all-or-nothing deal.  If this
         * wasn't the case, we would instead need to keep track of
         * listeners on a per-event basis, not just a per-queue basis.
         * This would have the side benefit of letting us clean up
         * trimmed events sooner (i.e. immediately), but would
         * unfortunately make events even bigger than they already
         * are.
         */
        for (l = list_head(&q->ctq_listeners); l;
            l = list_next(&q->ctq_listeners, l)) {
                l->ctl_flags |= CTLF_DEAD;
                if (l->ctl_position) {
                        l->ctl_position = NULL;
                        list_insert_tail(&q->ctq_tail, l);
                }
                cv_broadcast(&l->ctl_cv);
        }

        /*
         * Disallow events.
         */
        q->ctq_flags |= CTQ_DEAD;

        /*
         * If we represent the last reference to a reference counted
         * process bundle queue, free it.
         */
        if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_nlisteners == 0))
                cte_queue_destroy(q);
        else
                mutex_exit(&q->ctq_lock);
}

/*
 * cte_publish
 *
 * Publishes an event to a specific queue.  Only called by
 * cte_publish_all.
 */
static void
cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist)
{
        ASSERT(MUTEX_HELD(&q->ctq_lock));

        q->ctq_atime = *tsp;

        /*
         * If this event may already exist on this queue, check to see if it
         * is already there and return if so.
         */
        if (mayexist && list_link_active((list_node_t *)((uintptr_t)e +
            q->ctq_events.list_offset))) {
                mutex_exit(&q->ctq_lock);
                cte_rele(e);
                return;
        }

        /*
         * Don't publish if the event is informative and there aren't
         * any listeners, or if the queue has been shut down.
         */
        if (((q->ctq_nlisteners == 0) && (e->cte_flags & (CTE_INFO|CTE_ACK))) ||
            (q->ctq_flags & CTQ_DEAD)) {
                mutex_exit(&q->ctq_lock);
                cte_rele(e);
                return;
        }

        /*
         * Enqueue event
         */
        VERIFY(!list_link_active((list_node_t *)
            ((uintptr_t)e + q->ctq_events.list_offset)));
        list_insert_tail(&q->ctq_events, e);

        /*
         * Check for waiting listeners
         */
        cte_qwakeup(q, e);

        /*
         * Trim unnecessary events from the queue.
         */
        cte_trim(q, NULL);
        mutex_exit(&q->ctq_lock);
}

/*
 * cte_publish_all
 *
 * Publish an event to all necessary event queues.  The event, e, must
 * be zallocated by the caller, and the event's flags and type must be
 * set.  The rest of the event's fields are initialized here.
 */
uint64_t
cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
{
        ct_equeue_t *q;
        timespec_t ts;
        uint64_t evid;
        ct_kevent_t *negev;
        int negend;

        e->cte_contract = ct;
        e->cte_data = data;
        e->cte_gdata = gdata;
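        /*
         * Three initial references, one for each queue the event is
         * offered to below: the contract queue, the contract type's
         * bundle queue, and the owner's process bundle queue.
         * cte_publish drops its reference if it decides not to
         * enqueue the event, and the pbundle reference is dropped
         * directly if the contract has no owner.
         */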
        e->cte_refs = 3;
        evid = e->cte_id = atomic_inc_64_nv(&ct->ct_type->ct_type_evid);
        contract_hold(ct);

        /*
         * For a negotiation event we set the ct->ct_nevent field of the
         * contract for the duration of the negotiation
         */
        negend = 0;
        if (e->cte_flags & CTE_NEG) {
                cte_hold(e);
                ct->ct_nevent = e;
        } else if (e->cte_type == CT_EV_NEGEND) {
                negend = 1;
        }

        gethrestime(&ts);

        /*
         * ct_evtlock simply (and only) ensures that two events sent
         * from the same contract are delivered to all queues in the
         * same order.
         */
        mutex_enter(&ct->ct_evtlock);

        /*
         * CTEL_CONTRACT - First deliver to the contract queue, acking
         * the event if the contract has been orphaned.
         */
        mutex_enter(&ct->ct_lock);
        mutex_enter(&ct->ct_events.ctq_lock);
        if ((e->cte_flags & CTE_INFO) == 0) {
                if (ct->ct_state >= CTS_ORPHAN)
                        e->cte_flags |= CTE_ACK;
                else
                        ct->ct_evcnt++;
        }
        mutex_exit(&ct->ct_lock);
        cte_publish(&ct->ct_events, e, &ts, B_FALSE);

        /*
         * CTEL_BUNDLE - Next deliver to the contract type's bundle
         * queue.
         */
        mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
        cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE);

        /*
         * CTEL_PBUNDLE - Finally, if the contract has an owner,
         * deliver to the owner's process bundle queue.
         */
        mutex_enter(&ct->ct_lock);
        if (ct->ct_owner) {
                /*
                 * proc_exit doesn't free event queues until it has
                 * abandoned all contracts.
                 */
                ASSERT(ct->ct_owner->p_ct_equeue);
                ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
                q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
                mutex_enter(&q->ctq_lock);
                mutex_exit(&ct->ct_lock);

                /*
                 * It is possible for this code to race with adoption; we
                 * publish the event indicating that the event may already
                 * be enqueued because adoption beat us to it (in which case
                 * cte_publish() does nothing).
                 */
                cte_publish(q, e, &ts, B_TRUE);
        } else {
                mutex_exit(&ct->ct_lock);
                cte_rele(e);
        }

        if (negend) {
                mutex_enter(&ct->ct_lock);
                negev = ct->ct_nevent;
                ct->ct_nevent = NULL;
                cte_rele(negev);
                mutex_exit(&ct->ct_lock);
        }

        mutex_exit(&ct->ct_evtlock);

        return (evid);
}
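
/*
 * Illustrative sketch only (not part of this file): a contract type
 * publishes an event by zallocating a ct_kevent_t, setting its flags
 * and type, and handing it (along with any event data nvlists, whose
 * ownership passes to the event) to cte_publish_all.  The event type,
 * nvlist contents, and "foo" names below are hypothetical.
 *
 *	ct_kevent_t *e = kmem_zalloc(sizeof (ct_kevent_t), KM_SLEEP);
 *	nvlist_t *nvl;
 *
 *	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint32(nvl, "foo_value", value) == 0);
 *	e->cte_flags = CTE_INFO;
 *	e->cte_type = CT_FOO_EV_SOMETHING;
 *	(void) cte_publish_all(ct, e, nvl, NULL);
 */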

/*
 * cte_add_listener
 *
 * Add a new listener to an event queue.
 */
void
cte_add_listener(ct_equeue_t *q, ct_listener_t *l)
{
        cv_init(&l->ctl_cv, NULL, CV_DEFAULT, NULL);
        l->ctl_equeue = q;
        l->ctl_position = NULL;
        l->ctl_flags = 0;

        mutex_enter(&q->ctq_lock);
        list_insert_head(&q->ctq_tail, l);
        list_insert_head(&q->ctq_listeners, l);
        q->ctq_nlisteners++;
        mutex_exit(&q->ctq_lock);
}

/*
 * cte_remove_listener
 *
 * Remove a listener from an event queue.  No other queue activities
 * (e.g. cte_get_event) may be in progress at this endpoint when this
 * is called.
 */
void
cte_remove_listener(ct_listener_t *l)
{
        ct_equeue_t *q = l->ctl_equeue;
        ct_kevent_t *e;

        mutex_enter(&q->ctq_lock);

        ASSERT((l->ctl_flags & (CTLF_COPYOUT|CTLF_RESET)) == 0);

        if ((e = l->ctl_position) != NULL)
                cte_qrele(q, l, e);
        else
                list_remove(&q->ctq_tail, l);
        l->ctl_position = NULL;

        q->ctq_nlisteners--;
        list_remove(&q->ctq_listeners, l);

        if (l->ctl_flags & CTLF_RELIABLE)
                q->ctq_nreliable--;

        /*
         * If we are the last listener of a dead reference counted
         * queue (i.e. a process bundle) we free it.  Otherwise we just
         * trim any events which may have been kept around for our
         * benefit.
         */
        if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_flags & CTQ_DEAD) &&
            (q->ctq_nlisteners == 0)) {
                cte_queue_destroy(q);
        } else {
                cte_trim(q, NULL);
                mutex_exit(&q->ctq_lock);
        }
}

/*
 * cte_reset_listener
 *
 * Moves a listener's queue pointer to the beginning of the queue.
 */
void
cte_reset_listener(ct_listener_t *l)
{
        ct_equeue_t *q = l->ctl_equeue;

        mutex_enter(&q->ctq_lock);

        /*
         * We allow an asynchronous reset because it doesn't make a
         * whole lot of sense to make reset block or fail.  We already
         * have most of the mechanism needed thanks to queue trimming,
         * so implementing it isn't a big deal.
         */
        if (l->ctl_flags & CTLF_COPYOUT)
                l->ctl_flags |= CTLF_RESET;

        (void) cte_qmove(q, l, list_head(&q->ctq_events));

        /*
         * Inform blocked readers.
         */
        cv_broadcast(&l->ctl_cv);
        pollwakeup(&l->ctl_pollhead, POLLIN);
        mutex_exit(&q->ctq_lock);
}

/*
 * cte_next_event
 *
 * Moves the event pointer for the specified listener to the next event
 * on the queue.  To avoid races, this movement only occurs if the
 * specified event id matches that of the current event.  This is used
 * primarily to skip events that have been read but whose extended data
 * haven't been copied out.
 */
int
cte_next_event(ct_listener_t *l, uint64_t id)
{
        ct_equeue_t *q = l->ctl_equeue;
        ct_kevent_t *old;

        mutex_enter(&q->ctq_lock);

        if (l->ctl_flags & CTLF_COPYOUT)
                l->ctl_flags |= CTLF_RESET;

        if (((old = l->ctl_position) != NULL) && (old->cte_id == id))
                (void) cte_qmove(q, l, list_next(&q->ctq_events, old));

        mutex_exit(&q->ctq_lock);

        return (0);
}

/*
 * cte_get_event
 *
 * Reads an event from an event endpoint.  If "nonblock" is clear, we
 * block until a suitable event is ready.  If "crit" is set, we only
 * read critical events.  Note that while "cr" is the caller's cred,
 * "zuniqid" is the unique id of the zone the calling contract
 * filesystem was mounted in.
 */
int
cte_get_event(ct_listener_t *l, int nonblock, void *uaddr, const cred_t *cr,
    uint64_t zuniqid, int crit)
{
        ct_equeue_t *q = l->ctl_equeue;
        ct_kevent_t *temp;
        int result = 0;
        int partial = 0;
        size_t size, gsize, len;
        model_t mdl = get_udatamodel();
        STRUCT_DECL(ct_event, ev);
        STRUCT_INIT(ev, mdl);

        /*
         * cte_qreadable checks for CTLF_COPYOUT as well as ensures
         * that there exists, and we are pointing to, an appropriate
         * event.  It may temporarily drop ctq_lock, but that doesn't
         * really matter to us.
         */
        mutex_enter(&q->ctq_lock);
        while (cte_qreadable(q, l, cr, zuniqid, crit)) {
                if (nonblock) {
                        result = EAGAIN;
                        goto error;
                }
                if (q->ctq_flags & CTQ_DEAD) {
                        result = EIDRM;
                        goto error;
                }
                result = cv_wait_sig(&l->ctl_cv, &q->ctq_lock);
                if (result == 0) {
                        result = EINTR;
                        goto error;
                }
        }
        temp = l->ctl_position;
        cte_hold(temp);
        l->ctl_flags |= CTLF_COPYOUT;
        mutex_exit(&q->ctq_lock);

        /*
         * We now have an event.  Copy in the user event structure to
         * see how much space we have to work with.
         */
        result = copyin(uaddr, STRUCT_BUF(ev), STRUCT_SIZE(ev));
        if (result)
                goto copyerr;

        /*
         * Determine what data we have and what the user should be
         * allowed to see.
         */
        size = gsize = 0;
        if (temp->cte_data) {
                VERIFY(nvlist_size(temp->cte_data, &size,
                    NV_ENCODE_NATIVE) == 0);
                ASSERT(size != 0);
        }
        if (zuniqid == GLOBAL_ZONEUNIQID && temp->cte_gdata) {
                VERIFY(nvlist_size(temp->cte_gdata, &gsize,
                    NV_ENCODE_NATIVE) == 0);
                ASSERT(gsize != 0);
        }

        /*
         * If we have enough space, copy out the extended event data.
         */
        len = size + gsize;
        if (len) {
                if (STRUCT_FGET(ev, ctev_nbytes) >= len) {
                        char *buf = kmem_alloc(len, KM_SLEEP);

                        if (size)
                                VERIFY(nvlist_pack(temp->cte_data, &buf, &size,
                                    NV_ENCODE_NATIVE, KM_SLEEP) == 0);
                        if (gsize) {
                                char *tmp = buf + size;

                                VERIFY(nvlist_pack(temp->cte_gdata, &tmp,
                                    &gsize, NV_ENCODE_NATIVE, KM_SLEEP) == 0);
                        }

                        /* This shouldn't have changed */
                        ASSERT(size + gsize == len);
                        result = copyout(buf, STRUCT_FGETP(ev, ctev_buffer),
                            len);
                        kmem_free(buf, len);
                        if (result)
                                goto copyerr;
                } else {
                        partial = 1;
                }
        }

        /*
         * Copy out the common event data.
         */
        STRUCT_FSET(ev, ctev_id, temp->cte_contract->ct_id);
        STRUCT_FSET(ev, ctev_evid, temp->cte_id);
        STRUCT_FSET(ev, ctev_cttype,
            temp->cte_contract->ct_type->ct_type_index);
        STRUCT_FSET(ev, ctev_flags, temp->cte_flags &
            (CTE_ACK|CTE_INFO|CTE_NEG));
        STRUCT_FSET(ev, ctev_type, temp->cte_type);
        STRUCT_FSET(ev, ctev_nbytes, len);
        STRUCT_FSET(ev, ctev_goffset, size);
        result = copyout(STRUCT_BUF(ev), uaddr, STRUCT_SIZE(ev));

copyerr:
        /*
         * Only move our location in the queue if all copyouts were
         * successful, the caller provided enough space for the entire
         * event, and our endpoint wasn't reset or otherwise moved by
         * another thread.
         */
        mutex_enter(&q->ctq_lock);
        if (result)
                result = EFAULT;
        else if (!partial && ((l->ctl_flags & CTLF_RESET) == 0) &&
            (l->ctl_position == temp))
                (void) cte_qmove(q, l, list_next(&q->ctq_events, temp));
        l->ctl_flags &= ~(CTLF_COPYOUT|CTLF_RESET);
        /*
         * Signal any readers blocked on our CTLF_COPYOUT.
         */
        cv_signal(&l->ctl_cv);
        cte_rele(temp);

error:
        mutex_exit(&q->ctq_lock);
        return (result);
}

/*
 * cte_set_reliable
 *
 * Requests that events be reliably delivered to an event endpoint.
 * Unread informative and acknowledged critical events will not be
 * removed from the queue until this listener reads or skips them.
 * Because a listener could maliciously request reliable delivery and
 * then do nothing, this requires that PRIV_CONTRACT_EVENT be in the
 * caller's effective set.
 */
int
cte_set_reliable(ct_listener_t *l, const cred_t *cr)
{
        ct_equeue_t *q = l->ctl_equeue;
        int error;

        if ((error = secpolicy_contract_event(cr)) != 0)
                return (error);

        mutex_enter(&q->ctq_lock);
        if ((l->ctl_flags & CTLF_RELIABLE) == 0) {
                l->ctl_flags |= CTLF_RELIABLE;
                q->ctq_nreliable++;
                if (l->ctl_position != NULL)
                        l->ctl_position->cte_nodes[q->ctq_listno].
                            ctm_nreliable++;
        }
        mutex_exit(&q->ctq_lock);

        return (0);
}