root/usr/src/uts/common/io/signalfd.c
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 */

/*
 * Support for the signalfd facility, a Linux-borne facility for
 * file descriptor-based synchronous signal consumption.
 *
 * As described on the signalfd(3C) man page, the general idea behind these
 * file descriptors is that they can be used to synchronously consume signals
 * via the read(2) syscall.  While that capability already exists with the
 * sigwaitinfo(3C) function, signalfd holds an advantage since it is file
 * descriptor based: It is able to use the event facilities (poll(2), /dev/poll,
 * event ports) to notify interested parties when consumable signals arrive.
 *
 * The signalfd lifecycle begins when a process opens /dev/signalfd.  A minor
 * will be allocated for them along with an associated signalfd_state_t struct.
 * It is there where the mask of desired signals resides.
 *
 * Reading from the signalfd is straightforward and mimics the kernel behavior
 * for sigtimedwait().  Signals continue to live on either the proc's p_sig, or
 * thread's t_sig, member.  During a read operation, those which match the mask
 * are consumed so they are no longer pending.
 *
 * The poll side is more complex.  Every time a signal is delivered, all of the
 * signalfds on the process need to be examined in order to pollwake threads
 * waiting for signal arrival.
 *
 * When a thread polling on a signalfd requires a pollhead, several steps must
 * be taken to safely ensure the proper result.  A sigfd_proc_state_t is
 * created for the calling process if it does not yet exist.  It is there where
 * a list of signalfd_poller_t structures reside which associate pollheads to
 * signalfd_state_t entries.  The sigfd_proc_state_t list is walked to find any
 * signalfd_poller_t which is both associated with the polling process and
 * corresponds to the signalfd resource being polled.  If none matching those
 * conditions is found, then a new one with the appropriate associations is
 * created.
 *
 * The complications imposed by fork(2) are why the pollhead is stored in the
 * associated signalfd_poller_t instead of directly in the signalfd_state_t.
 * More than one process can hold a reference to the signalfd at a time but
 * arriving signals should wake only process-local pollers.  Additionally,
 * signalfd_close is called only when the last referencing fd is closed, hiding
 * occurrences of preceding threads which released their references.  This
 * necessitates a pollhead for each signalfd/process pair when being polled.
 * Doing so ensures that those pollheads will live long enough for the greater
 * poll machinery to act upon them without risk of use-after-free.  When a
 * signalfd is closed, existing signalfd_poller_t instances are dissociated from
 * their respective processes, causing pollwake() calls for any blocked pollers.
 *
 * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb
 * is called via the pointer in sigfd_proc_state_t.  It will walk over the
 * signalfd_poller_t entries present in the list, searching for any possessing a
 * signal mask which matches the incoming signal.  (Changes to the signal mask
 * held in signalfd_state_t are propagated to the signalfd_poller_t instance to
 * avoid the need for additional locks during the callback.) The approach of
 * keeping the poller list in p_sigfd was chosen because a process is likely to
 * use few signalfds relative to its total file descriptors.  It reduces the
 * work required for each received signal.
 *
 * When matching signalfd_poller_t entries are encountered in the poller list
 * during signalfd_pollwake_cb, they are dispatched into signalfd_wakeq to
 * perform the pollwake.  This is due to a lock ordering conflict between
 * signalfd_poll and signalfd_pollwake_cb.  The former acquires
 * pollcache_t`pc_lock before proc_t`p_lock.  The latter (via sigtoproc)
 * reverses the order.  Deferring the pollwake into a taskq means it can be
 * performed without proc_t`p_lock held, avoiding the deadlock.
 *
 * Poller entries in sigfd_proc_state_t`sigfd_list are cleaned up under two
 * different circumstances.  When a signalfd instance is being closed, it will
 * dissociate all of its remaining signalfd_poller_t instances from their
 * polling processes.  When a process (which polled on signalfd instance(s)
 * which have not yet been closed) exits, the exit helper (signalfd_exit_helper)
 * is called, and it dissociates all signalfd_poller_t instances tied to the
 * exiting process.
 *
 * The structures associated with signalfd state are designed to operate
 * correctly across fork, but there is one caveat that applies.  Using
 * fork-shared signalfd descriptors in conjunction with fork-shared caching poll
 * descriptors (such as /dev/poll or event ports) will result in missed poll
 * wake-ups.  This is caused by the pollhead identity of signalfd descriptors
 * being dependent on the process they are polled from.  Because it has a
 * thread-local cache, poll(2) is unaffected by this limitation.
 *
 * Lock ordering:
 *
 * Calling signalfd_poll:
 * 1. pollcache_t`pc_lock
 * 2. signalfd_state_t`sfd_lock
 * 3. proc_t`p_lock
 *
 * Signal delivery, waking a pollhead:
 * 1. proc_t`p_lock
 * 2. signalfd_poller_t`sp_lock
 *
 * Process exit, cleaning up signalfd pollers:
 * 1. proc_t`p_lock
 * 2. signalfd_poller_t`sp_lock
 *
 * Waking a pollhead, from taskq:
 * 1. signalfd_poller_t`sp_lock
 * ... Disjoint from signalfd_poller_t`sp_lock hold ...
 * 1. pollcache_t`pc_lock
 *
 * Closing signalfd, dissociating pollers:
 * 1. signalfd_state_t`sfd_lock
 * 2. pidlock
 * 3. proc_t`p_lock
 *
 */

#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/signalfd.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/filio.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/schedctl.h>
#include <sys/id_space.h>
#include <sys/sdt.h>
#include <sys/disp.h>
#include <sys/taskq_impl.h>
#include <sys/condvar.h>
#include <sys/stdbool.h>

/*
 * Per-instance signalfd device state.  One of these is allocated as driver
 * soft state, keyed by minor number, each time the signalfd device is opened.
 */
typedef struct signalfd_state {
        kmutex_t        sfd_lock;       /* protects fields below */
        list_t          sfd_pollers;    /* signalfd_poller_t's on this instance */
        k_sigset_t      sfd_mask;       /* signal mask for this instance */
        minor_t         sfd_minor;      /* dev minor, fixed at creation */
} signalfd_state_t;

/*
 * Association between one signalfd instance (signalfd_state_t) and one process
 * which has polled on it.  Each poller is linked on two lists at once: the
 * owning instance's sfd_pollers and the polling process's sigfd_list.
 */
typedef struct signalfd_poller {
        /*
         * List node referenced by containing signalfd_state_t
         * Protected by signalfd_state`sfd_lock
         */
        list_node_t     sp_state_node;

        /*
         * List node referenced by containing sigfd_proc_state_t
         * Protected by proc_t`p_lock
         */
        list_node_t     sp_proc_node;

        /*
         * The pollhead handed back to the poll framework by signalfd_poll()
         * for this signalfd/process pair.
         */
        pollhead_t      sp_pollhead;

        /*
         * The signalfd_state_t to which this poller is associated.
         * It remains fixed after its initialization at creation time.
         */
        signalfd_state_t        *sp_state;

        /*
         * The proc_t to which this poller is associated.
         * It is initialized under the protection of proc_t`p_lock when this
         * poller is created.  It is NULLed out, again under the protection of
         * proc_t`p_lock, when the poller is dissociated from the process.
         */
        proc_t          *sp_proc;

        kmutex_t        sp_lock;        /* protects fields below */
        kcondvar_t      sp_cv;          /* CV for cleaning up */
        short           sp_pollev;      /* Event(s) pending delivery */
        bool            sp_pending;     /* pollwakeup() via taskq in progress */
        taskq_ent_t     sp_taskent;     /* pollwakeup() dispatch taskq */
        k_sigset_t      sp_mask;        /* signal match mask */
} signalfd_poller_t;

/*
 * Driver-wide state.  All four are established in signalfd_attach() and torn
 * down again in signalfd_detach().
 */
static dev_info_t       *signalfd_devi;         /* device info */
static id_space_t       *signalfd_minors;       /* minor number arena */
static void             *signalfd_softstate;    /* softstate pointer */
static taskq_t          *signalfd_wakeq;        /* pollwake event taskq */

/*
 * Release the per-process signalfd state hanging off of proc_t`p_sigfd.
 *
 * The caller must hold p_lock, and every poller must already have been
 * removed from the process's sigfd_list.
 */
static void
signalfd_proc_clean(proc_t *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));

        sigfd_proc_state_t *ps = p->p_sigfd;

        ASSERT(ps != NULL);
        VERIFY(list_is_empty(&ps->sigfd_list));

        /* Detach the state from the process, then give the memory back. */
        list_destroy(&ps->sigfd_list);
        p->p_sigfd = NULL;
        kmem_free(ps, sizeof (*ps));
}

static void
signalfd_wake_task(void *arg)
{
        signalfd_poller_t *sp = arg;

        mutex_enter(&sp->sp_lock);
        VERIFY(sp->sp_pollev != 0);
        VERIFY(sp->sp_pending);
        do {
                const short pollev = sp->sp_pollev;
                const bool is_err = (pollev & POLLERR) != 0;
                sp->sp_pollev = 0;
                mutex_exit(&sp->sp_lock);

                /*
                 * Actions against the pollhead and associated pollcache(s) are
                 * taken without signalfd_poller_t`sp_lock held, since the chain
                 * of dependencies through pollcache_t`pc_lock and
                 * signalfd_state_t`sfd_lock form a potential for deadlock.
                 */
                pollwakeup(&sp->sp_pollhead, pollev);
                if (is_err) {
                        pollhead_clean(&sp->sp_pollhead);
                }

                mutex_enter(&sp->sp_lock);
                /*
                 * Once pollhead/pollcache actions are complete, check for newly
                 * queued events which could have appeared in the mean time.  We
                 * can bail immediately if POLLER was being delivered, since the
                 * underlying resource is undergoing clean-up.
                 */
                if (is_err) {
                        break;
                }
        } while (sp->sp_pollev != 0);

        /*
         * Indicate that wake task processing is complete.
         *
         * Wake any thread waiting for event delivery to complete if this poller
         * is being torn down.
         */
        sp->sp_pending = false;
        cv_signal(&sp->sp_cv);
        mutex_exit(&sp->sp_lock);
}

/*
 * Queue poll event(s) for delivery to a poller's pollhead.
 *
 * The caller must hold sp_lock.  The pollwakeup() itself is performed later by
 * signalfd_wake_task() on signalfd_wakeq.
 */
static void
signalfd_poller_wake(signalfd_poller_t *sp, short ev)
{
        ASSERT(MUTEX_HELD(&sp->sp_lock));

        sp->sp_pollev |= ev;

        /* A wake task that is already in flight will notice the new bits. */
        if (sp->sp_pending) {
                return;
        }

        sp->sp_pending = true;
        taskq_dispatch_ent(signalfd_wakeq, signalfd_wake_task, sp, 0,
            &sp->sp_taskent);
}

/*
 * Notification callback associated to processes which are being polled for
 * signalfd events.  Called by sigtoproc().
 */
static void
signalfd_pollwake_cb(void *arg0, int sig)
{
        proc_t *p = (proc_t *)arg0;
        sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(pstate != NULL);

        list_t *pollers = &pstate->sigfd_list;
        for (signalfd_poller_t *sp = list_head(pollers); sp != NULL;
            sp = list_next(pollers, sp)) {
                mutex_enter(&sp->sp_lock);
                if (sigismember(&sp->sp_mask, sig)) {
                        signalfd_poller_wake(sp, POLLRDNORM | POLLIN);
                }
                mutex_exit(&sp->sp_lock);
        }
}

/*
 * Return the sigfd_proc_state_t of a process, creating it on first use.
 *
 * p_lock must be held on entry and is held again on return, but it may be
 * dropped in between while memory is allocated.
 */
static sigfd_proc_state_t *
signalfd_proc_pstate(proc_t *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));

        /* Common case: the state already exists. */
        if (p->p_sigfd != NULL) {
                return (p->p_sigfd);
        }

        /* The allocation can sleep, so it is done without p_lock. */
        mutex_exit(&p->p_lock);
        sigfd_proc_state_t *fresh = kmem_zalloc(sizeof (*fresh), KM_SLEEP);
        list_create(&fresh->sigfd_list, sizeof (signalfd_poller_t),
            offsetof(signalfd_poller_t, sp_proc_node));
        fresh->sigfd_pollwake_cb = signalfd_pollwake_cb;
        mutex_enter(&p->p_lock);

        /* Re-check: another thread may have installed state meanwhile. */
        if (p->p_sigfd != NULL) {
                list_destroy(&fresh->sigfd_list);
                kmem_free(fresh, sizeof (*fresh));
                return (p->p_sigfd);
        }

        p->p_sigfd = fresh;
        return (fresh);
}

/*
 * Find, or create, the signalfd_poller_t which ties a signalfd instance to a
 * polling process.
 *
 * The caller must hold state->sfd_lock.  p_lock is taken (and dropped)
 * internally.
 */
static signalfd_poller_t *
signalfd_poller_associate(signalfd_state_t *state, proc_t *p)
{
        ASSERT(MUTEX_HELD(&state->sfd_lock));

        /*
         * Look for a poller already allocated for this
         * signalfd_state_t/proc_t pair.
         */
        mutex_enter(&p->p_lock);
        list_t *plist = &signalfd_proc_pstate(p)->sigfd_list;
        signalfd_poller_t *sp = list_head(plist);
        while (sp != NULL && sp->sp_state != state) {
                sp = list_next(plist, sp);
        }
        mutex_exit(&p->p_lock);

        if (sp != NULL) {
                return (sp);
        }

        /*
         * Nothing found, so build a new poller.  Because sfd_lock stays held
         * throughout, no other operation can race to create a poller for the
         * same pair.
         */
        sp = kmem_zalloc(sizeof (*sp), KM_SLEEP);
        sp->sp_state = state;
        sp->sp_proc = p;
        sigorset(&sp->sp_mask, &state->sfd_mask);
        cv_init(&sp->sp_cv, NULL, CV_DEFAULT, NULL);
        mutex_init(&sp->sp_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * The per-process state must be looked up afresh: it may have been
         * freed or reallocated while p_lock was not held.
         */
        mutex_enter(&p->p_lock);
        plist = &signalfd_proc_pstate(p)->sigfd_list;
        list_insert_tail(plist, sp);
        list_insert_tail(&state->sfd_pollers, sp);
        mutex_exit(&p->p_lock);

        return (sp);
}

/*
 * Detach every poller of a signalfd instance from the process it was polling
 * in, queueing POLLERR against each so that lingering pollers are woken.
 *
 * The caller must hold state->sfd_lock.  pidlock and each affected process's
 * p_lock are taken internally.
 */
static void
signalfd_pollers_dissociate(signalfd_state_t *state)
{
        ASSERT(MUTEX_HELD(&state->sfd_lock));

        list_t *plist = &state->sfd_pollers;

        mutex_enter(&pidlock);
        for (signalfd_poller_t *sp = list_head(plist); sp != NULL;
            sp = list_next(plist, sp)) {
                proc_t *p = sp->sp_proc;

                /* Already dissociated, nothing to do. */
                if (p == NULL) {
                        continue;
                }

                /*
                 * The process may be racing us through proc_exit(), but with
                 * pidlock held it cannot finish exiting and free itself, so
                 * its p_lock is safe to take.
                 */
                mutex_enter(&p->p_lock);
                if (sp->sp_proc != NULL) {
                        VERIFY3P(sp->sp_proc, ==, p);
                        VERIFY3P(sp->sp_state, ==, state);
                        VERIFY3P(p->p_sigfd, !=, NULL);

                        sigfd_proc_state_t *ps = p->p_sigfd;

                        list_remove(&ps->sigfd_list, sp);
                        sp->sp_proc = NULL;

                        /* Wake any lingering pollers on the pollhead */
                        mutex_enter(&sp->sp_lock);
                        signalfd_poller_wake(sp, POLLERR);
                        mutex_exit(&sp->sp_lock);

                        /*
                         * When that was the last poller of the process, its
                         * per-process state goes away too.
                         */
                        if (list_is_empty(&ps->sigfd_list)) {
                                signalfd_proc_clean(p);
                        }
                }
                mutex_exit(&p->p_lock);
        }
        mutex_exit(&pidlock);
}

/*
 * Free every poller belonging to a signalfd instance.
 *
 * The caller must hold state->sfd_lock, and every poller must already have
 * been dissociated from its process (sp_proc == NULL).  For each poller, any
 * wake task still in flight is waited for before the poller is destroyed.
 */
static void
signalfd_pollers_free(signalfd_state_t *state)
{
        ASSERT(MUTEX_HELD(&state->sfd_lock));

        signalfd_poller_t *sp;
        while ((sp = list_remove_head(&state->sfd_pollers)) != NULL) {
                ASSERT3P(sp->sp_proc, ==, NULL);

                /*
                 * signalfd_wake_task() clears sp_pending and signals sp_cv
                 * once it has finished touching the pollhead.
                 */
                mutex_enter(&sp->sp_lock);
                while (sp->sp_pending) {
                        cv_wait(&sp->sp_cv, &sp->sp_lock);
                }
                /*
                 * With the poller dissociated from its polling process, and any
                 * lingering events delivered, the pollhead should be empty.
                 */
                ASSERT3P(sp->sp_pollhead.ph_list, ==, NULL);

                /*
                 * Drop sp_lock before destroying it: a mutex should not be
                 * held when it is handed to mutex_destroy().  The poller is
                 * off both of its lists and has no wake task pending, so
                 * nothing else can reach it from here on.
                 */
                mutex_exit(&sp->sp_lock);

                cv_destroy(&sp->sp_cv);
                mutex_destroy(&sp->sp_lock);
                kmem_free(sp, sizeof (*sp));
        }
}

/*
 * Callback for cleaning up signalfd state from a process during proc_exit().
 */
static void
signalfd_exit_helper(void)
{
        proc_t *p = curproc;

        mutex_enter(&p->p_lock);

        sigfd_proc_state_t *pstate = p->p_sigfd;
        if (pstate == NULL) {
                mutex_exit(&p->p_lock);
                return;
        }

        signalfd_poller_t *sp;
        while ((sp = list_remove_head(&pstate->sigfd_list)) != NULL) {
                /*
                 * Having been removed from the sigfd_list, make it clear that
                 * this signalfd_poller_t is disssociated from the process.
                 */
                sp->sp_proc = NULL;

                /* Wake any lingering pollers referencing the pollhead */
                mutex_enter(&sp->sp_lock);
                signalfd_poller_wake(sp, POLLERR);
                mutex_exit(&sp->sp_lock);
        }
        signalfd_proc_clean(p);
        mutex_exit(&p->p_lock);
}

/*
 * Open entry point.  Each successful open of the signalfd minor node is cloned
 * onto a freshly allocated minor with its own signalfd_state_t.
 *
 * Returns 0 on success, ENXIO if a minor other than the signalfd node is being
 * opened, ENOMEM if no minor number is available, or ENODEV if the soft state
 * could not be allocated.
 */
_NOTE(ARGSUSED(1))
static int
signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
        if (getminor(*devp) != SIGNALFDMNRN_SIGNALFD) {
                return (ENXIO);
        }

        const minor_t newminor = (minor_t)id_allocff_nosleep(signalfd_minors);
        if (newminor == -1) {
                return (ENOMEM);
        }

        if (ddi_soft_state_zalloc(signalfd_softstate, newminor) !=
            DDI_SUCCESS) {
                /* Hand the minor number back before failing. */
                id_free(signalfd_minors, newminor);
                return (ENODEV);
        }

        signalfd_state_t *sfd =
            ddi_get_soft_state(signalfd_softstate, newminor);
        sfd->sfd_minor = newminor;
        list_create(&sfd->sfd_pollers, sizeof (signalfd_poller_t),
            offsetof(signalfd_poller_t, sp_state_node));
        mutex_init(&sfd->sfd_lock, NULL, MUTEX_DEFAULT, NULL);

        /* Redirect the caller to the newly created minor. */
        *devp = makedevice(getemajor(*devp), newminor);

        return (0);
}

/*
 * Consume one signal from our set in a manner similar to sigtimedwait().
 * The should_block parameter is used to control whether we wait for a signal
 * or return immediately if no signal is pending. We use the thread's t_sigwait
 * member in the same way that it is used by sigtimedwait.
 *
 * On success, one signalfd_siginfo_t describing the consumed signal is copied
 * out through uio.
 *
 * Return 0 if we successfully consumed a signal or an errno if not:
 *   EAGAIN - no matching signal was available
 *   EINTR  - the wait ended without a signal from the requested set
 *   other  - whatever uiomove() reported while copying out the result
 */
static int
signalfd_consume_signal(k_sigset_t set, uio_t *uio, bool should_block)
{
        kthread_t *t = curthread;
        klwp_t *lwp = ttolwp(t);
        proc_t *p = ttoproc(t);
        int ret = 0;

        /*
         * Identify signals of interest so they can be processed, even if other
         * parts of the machinery would be poised to ignore them.
         */
        t->t_sigwait = set;

        mutex_enter(&p->p_lock);

        /* Set thread signal mask to unmask those in the specified set. */
        schedctl_finish_sigblock(t);
        const k_sigset_t oldmask = t->t_hold;
        sigdiffset(&t->t_hold, &t->t_sigwait);

        if (should_block) {
                /*
                 * Keep waiting for as long as the wait returns a positive
                 * value; a result <= 0 ends the loop.
                 * NOTE(review): presumably > 0 is a plain wakeup with no
                 * signal and 0 is a signal - confirm against
                 * cv_waituntil_sig().
                 */
                do {
                        ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock,
                            NULL, 0);
                } while (ret > 0);
        } else {
                /*
                 * Non-blocking: just ask whether a signal is ready to be
                 * taken.  issig() is called without p_lock held here.
                 */
                mutex_exit(&p->p_lock);
                if (issig(FORREAL) == 0) {
                        ret = -1;
                }
                mutex_enter(&p->p_lock);
        }

        /*
         * Restore thread's signal mask to its previous value.
         * Set t_sig_check so post_syscall sees new t_hold mask.
         */
        t->t_hold = oldmask;
        t->t_sig_check = 1;

        if (ret == -1) {
                /* no signals pending */
                mutex_exit(&p->p_lock);
                sigemptyset(&t->t_sigwait);
                return (EAGAIN);
        }

        /* Do not bother with signal if it is not in request set. */
        if (lwp->lwp_cursig == 0 ||
            !sigismember(&t->t_sigwait, lwp->lwp_cursig)) {
                /*
                 * lwp_cursig is zero if pokelwps() awakened cv_wait_sig().
                 * This happens if some other thread in this process called
                 * forkall() or exit().
                 */
                mutex_exit(&p->p_lock);
                sigemptyset(&t->t_sigwait);
                return (EINTR);
        }

        /* Convert signal info into external, datamodel independent, struct. */
        signalfd_siginfo_t ssi;
        bzero(&ssi, sizeof (ssi));
        if (lwp->lwp_curinfo != NULL) {
                k_siginfo_t *infop = &lwp->lwp_curinfo->sq_info;

                ssi.ssi_signo   = infop->si_signo;
                ssi.ssi_errno   = infop->si_errno;
                ssi.ssi_code    = infop->si_code;
                ssi.ssi_pid     = infop->si_pid;
                ssi.ssi_uid     = infop->si_uid;
                ssi.ssi_fd      = infop->si_fd;
                ssi.ssi_band    = infop->si_band;
                ssi.ssi_trapno  = infop->si_trapno;
                ssi.ssi_status  = infop->si_status;
                ssi.ssi_utime   = infop->si_utime;
                ssi.ssi_stime   = infop->si_stime;
                ssi.ssi_addr    = (uint64_t)(intptr_t)infop->si_addr;

                DTRACE_PROC2(signal__clear, int, 0, ksiginfo_t *, infop);
        } else {
                /* Convert to the format expected by the probe. */
                k_siginfo_t info = {
                        .si_signo = lwp->lwp_cursig,
                        .si_code = SI_NOINFO,
                };

                ssi.ssi_signo = info.si_signo;
                ssi.ssi_code = info.si_code;

                DTRACE_PROC2(signal__clear, int, 0, ksiginfo_t *, &info);
        }

        /*
         * The signal is now consumed: account for it and clear the lwp's
         * current-signal state so it is not delivered through any other path.
         */
        lwp->lwp_ru.nsignals++;
        lwp->lwp_cursig = 0;
        lwp->lwp_extsig = 0;
        if (lwp->lwp_curinfo != NULL) {
                siginfofree(lwp->lwp_curinfo);
                lwp->lwp_curinfo = NULL;
        }
        mutex_exit(&p->p_lock);

        /* Copy out only after p_lock is dropped. */
        ret = uiomove(&ssi, sizeof (ssi), UIO_READ, uio);
        sigemptyset(&t->t_sigwait);
        return (ret);
}

/*
 * This is similar to sigtimedwait. Based on the fd mode, we may wait until a
 * signal within our specified set is posted. We consume as many available
 * signals within our set as we can.
 *
 * Returns 0 if at least one signal was consumed.  Otherwise returns ENXIO when
 * the minor has no state, EINVAL when the buffer cannot hold even one
 * signalfd_siginfo_t or the instance's mask is empty, or the error reported by
 * signalfd_consume_signal().
 */
_NOTE(ARGSUSED(2))
static int
signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
{
        signalfd_state_t *state;
        k_sigset_t set;
        bool should_block = true, got_one = false;
        int res;

        state = ddi_get_soft_state(signalfd_softstate, getminor(dev));
        if (state == NULL) {
                return (ENXIO);
        }

        if (uio->uio_resid < sizeof (signalfd_siginfo_t)) {
                return (EINVAL);
        }

        if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
                should_block = false;
        }

        /* Snapshot the mask; sfd_lock is not held across the wait. */
        mutex_enter(&state->sfd_lock);
        set = state->sfd_mask;
        mutex_exit(&state->sfd_lock);

        /*
         * Hand the error code back directly, as every other failure path in
         * this entry point does, rather than going through set_errno().
         */
        if (sigisempty(&set)) {
                return (EINVAL);
        }

        do {
                res = signalfd_consume_signal(set, uio, should_block);

                if (res == 0) {
                        /*
                         * After consuming one signal, do not block while
                         * trying to consume more.
                         */
                        got_one = true;
                        should_block = false;

                        /*
                         * Refresh the matching signal set in case it was
                         * updated during the wait.
                         */
                        mutex_enter(&state->sfd_lock);
                        set = state->sfd_mask;
                        mutex_exit(&state->sfd_lock);
                        if (sigisempty(&set)) {
                                break;
                        }
                }
        } while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t));

        /* Having delivered anything at all makes the read a success. */
        if (got_one) {
                res = 0;
        }

        return (res);
}

/*
 * Report whether any signal in 'set' is pending against either the process
 * (p_sig) or the thread (t_sig).  The result is non-zero if so.
 *
 * Were a k_sigset_t a single word this would simply be:
 *      return (((p->p_sig | t->t_sig) & set) & fillset);
 * Instead each of the three words is checked in turn, with only the final word
 * needing to be trimmed by FILLSET2.
 */
static int
signalfd_sig_pending(proc_t *p, kthread_t *t, k_sigset_t set)
{
        return ((set.__sigbits[0] &
            (p->p_sig.__sigbits[0] | t->t_sig.__sigbits[0])) |
            (set.__sigbits[1] &
            (p->p_sig.__sigbits[1] | t->t_sig.__sigbits[1])) |
            (FILLSET2 & (set.__sigbits[2] &
            (p->p_sig.__sigbits[2] | t->t_sig.__sigbits[2]))));
}

/*
 * Poll entry point.  Reports POLLIN/POLLRDNORM when a signal matching the
 * instance's mask is pending for the calling thread or its process.  When the
 * poll framework needs a pollhead, it is given the one belonging to the
 * poller for this signalfd/process pair.
 *
 * Returns 0, or ENXIO if the minor has no associated state.
 */
static int
signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        signalfd_state_t *sfd =
            ddi_get_soft_state(signalfd_softstate, getminor(dev));
        if (sfd == NULL) {
                return (ENXIO);
        }

        kthread_t *t = curthread;
        proc_t *p = ttoproc(t);

        mutex_enter(&sfd->sfd_lock);

        short ready = 0;
        if (signalfd_sig_pending(p, t, sfd->sfd_mask) != 0) {
                ready = POLLRDNORM | POLLIN;
        }
        *reventsp = ready & events;

        /*
         * A pollhead is required when nothing is ready and no other fd has
         * reported events yet, and always for edge-triggered polling.
         */
        const bool edge = (events & POLLET) != 0;
        if (edge || (*reventsp == 0 && !anyyet)) {
                signalfd_poller_t *sp = signalfd_poller_associate(sfd, p);

                *phpp = &sp->sp_pollhead;
        }

        mutex_exit(&sfd->sfd_lock);

        return (0);
}

/*
 * Install a new signal mask on a signalfd instance.
 *
 * The user-format mask is converted to kernel form, stored in the instance,
 * and copied into every existing poller so that signalfd_pollwake_cb() can
 * match signals without having to take sfd_lock.
 */
static void
signalfd_set_mask(signalfd_state_t *state, const sigset_t *umask)
{
        k_sigset_t newmask;

        sigutok(umask, &newmask);

        mutex_enter(&state->sfd_lock);
        state->sfd_mask = newmask;

        signalfd_poller_t *sp = list_head(&state->sfd_pollers);
        while (sp != NULL) {
                mutex_enter(&sp->sp_lock);
                sp->sp_mask = newmask;
                mutex_exit(&sp->sp_lock);

                sp = list_next(&state->sfd_pollers, sp);
        }
        mutex_exit(&state->sfd_lock);
}

/*
 * Ioctl entry point.  The only supported command is SIGNALFDIOC_MASK, which
 * replaces the instance's signal mask with the sigset_t pointed to by arg.
 *
 * Returns 0 on success, ENXIO if the minor has no state, EFAULT if the mask
 * cannot be copied in, or ENOTTY for any other command.
 */
_NOTE(ARGSUSED(4))
static int
signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
        signalfd_state_t *sfd =
            ddi_get_soft_state(signalfd_softstate, getminor(dev));
        if (sfd == NULL) {
                return (ENXIO);
        }

        if (cmd != SIGNALFDIOC_MASK) {
                return (ENOTTY);
        }

        sigset_t newmask;
        if (ddi_copyin((caddr_t)arg, &newmask, sizeof (newmask), md) != 0) {
                return (EFAULT);
        }
        signalfd_set_mask(sfd, &newmask);

        return (0);
}

/*
 * Close entry point: tear down a signalfd instance.
 *
 * All pollers are dissociated from their processes and freed, then the
 * instance's list, lock, soft state and minor number are released.
 *
 * Returns 0 on success or ENXIO if the minor has no associated state.
 */
_NOTE(ARGSUSED(1))
static int
signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
        signalfd_state_t *state;
        const minor_t minor = getminor(dev);

        state = ddi_get_soft_state(signalfd_softstate, minor);
        if (state == NULL) {
                return (ENXIO);
        }

        /*
         * With this signalfd instance being closed, sfd_lock is a formality, as
         * nothing else should be reaching for it to add pollers at this point.
         */
        mutex_enter(&state->sfd_lock);

        /* Dissociate any pollers from their respective processes */
        signalfd_pollers_dissociate(state);

        /* ... and free all those (now-dissociated) pollers */
        signalfd_pollers_free(state);
        ASSERT(list_is_empty(&state->sfd_pollers));

        /* Pair the list_create() done in signalfd_open(). */
        list_destroy(&state->sfd_pollers);

        /* The lock must be released before it can be destroyed. */
        mutex_exit(&state->sfd_lock);
        mutex_destroy(&state->sfd_lock);

        ddi_soft_state_free(signalfd_softstate, minor);
        id_free(signalfd_minors, minor);

        return (0);
}

/*
 * Attach entry point.  Creates the minor number arena, the soft state
 * container and the "signalfd" minor node, then installs the process exit
 * helper and creates the wake taskq.
 *
 * Only DDI_ATTACH is handled, and only while no instance is attached yet.
 * Returns DDI_SUCCESS or DDI_FAILURE; on failure everything set up so far is
 * undone.
 */
static int
signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        if (cmd != DDI_ATTACH) {
                return (DDI_FAILURE);
        }
        if (signalfd_devi != NULL) {
                return (DDI_FAILURE);
        }

        signalfd_minors = id_space_create("signalfd_minors", 1, L_MAXMIN32 + 1);
        if (signalfd_minors == NULL) {
                cmn_err(CE_WARN, "signalfd couldn't create id space");
                return (DDI_FAILURE);
        }

        if (ddi_soft_state_init(&signalfd_softstate,
            sizeof (signalfd_state_t), 0) != 0) {
                cmn_err(CE_WARN, "signalfd failed to create soft state");
                goto fail_minors;
        }

        if (ddi_create_minor_node(devi, "signalfd", S_IFCHR,
            SIGNALFDMNRN_SIGNALFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
                cmn_err(CE_NOTE, "signalfd couldn't create minor node");
                goto fail_softstate;
        }

        /* Hook process exit so poller associations get cleaned up. */
        sigfd_exit_helper = signalfd_exit_helper;

        /* Taskq on which deferred pollwakeup() calls are performed. */
        signalfd_wakeq = taskq_create("signalfd_wake", 1, minclsyspri,
            0, INT_MAX, TASKQ_PREPOPULATE);

        ddi_report_dev(devi);
        signalfd_devi = devi;

        return (DDI_SUCCESS);

fail_softstate:
        ddi_soft_state_fini(&signalfd_softstate);
fail_minors:
        id_space_destroy(signalfd_minors);
        return (DDI_FAILURE);
}

/*
 * Detach entry point.  Undoes everything signalfd_attach() set up.  Only
 * DDI_DETACH is handled; any other command fails.
 */
_NOTE(ARGSUSED(0))
static int
signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        if (cmd == DDI_DETACH) {
                /*
                 * No instances remain at this point.  The waker taskq must
                 * therefore be empty and can be destroyed, and the exit helper
                 * must be unreachable (there are no proc_t`p_sigfd
                 * associations left) and can be unhooked.
                 */
                taskq_destroy(signalfd_wakeq);
                sigfd_exit_helper = NULL;

                /* Release the minor arena, soft state and device node. */
                id_space_destroy(signalfd_minors);
                ddi_soft_state_fini(&signalfd_softstate);
                ddi_remove_minor_node(signalfd_devi, NULL);
                signalfd_devi = NULL;

                return (DDI_SUCCESS);
        }

        return (DDI_FAILURE);
}

/*
 * getinfo entry point.  Answers the dev_t to dev_info and dev_t to instance
 * queries; since there is only a single instance, the latter is always 0.
 * Any other query fails.
 */
_NOTE(ARGSUSED(0))
static int
signalfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        if (infocmd == DDI_INFO_DEVT2DEVINFO) {
                *result = (void *)signalfd_devi;
                return (DDI_SUCCESS);
        }

        if (infocmd == DDI_INFO_DEVT2INSTANCE) {
                *result = (void *)0;
                return (DDI_SUCCESS);
        }

        return (DDI_FAILURE);
}

/*
 * Character device entry points.  Only open, close, read, ioctl and poll are
 * implemented by this driver.
 */
static struct cb_ops signalfd_cb_ops = {
        signalfd_open,          /* open */
        signalfd_close,         /* close */
        nulldev,                /* strategy */
        nulldev,                /* print */
        nodev,                  /* dump */
        signalfd_read,          /* read */
        nodev,                  /* write */
        signalfd_ioctl,         /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        signalfd_poll,          /* poll */
        ddi_prop_op,            /* cb_prop_op */
        0,                      /* streamtab  */
        D_NEW | D_MP            /* Driver compatibility flag */
};

/* Device operations: getinfo/attach/detach plus the cb_ops table above. */
static struct dev_ops signalfd_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        signalfd_info,          /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        signalfd_attach,        /* attach */
        signalfd_detach,        /* detach */
        nodev,                  /* reset */
        &signalfd_cb_ops,       /* driver operations */
        NULL,                   /* bus operations */
        nodev,                  /* dev power */
        ddi_quiesce_not_needed, /* quiesce */
};

/* Loadable module description for this driver. */
static struct modldrv modldrv = {
        &mod_driverops,         /* module type (this is a pseudo driver) */
        "signalfd support",     /* name of module */
        &signalfd_ops,          /* driver ops */
};

/* Module linkage handed to mod_install(), mod_info() and mod_remove(). */
static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};

/* Module load entry point: install the module through its modlinkage. */
int
_init(void)
{
        const int rv = mod_install(&modlinkage);

        return (rv);
}

/* Module information entry point: report on this module's modlinkage. */
int
_info(struct modinfo *modinfop)
{
        const int rv = mod_info(&modlinkage, modinfop);

        return (rv);
}

/* Module unload entry point: remove the module through its modlinkage. */
int
_fini(void)
{
        const int rv = mod_remove(&modlinkage);

        return (rv);
}