root/sys/kern/sys_timerfd.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
 * Copyright (c) 2023 Jake Freeland <jfree@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/selinfo.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/timerfd.h>
#include <sys/timespec.h>
#include <sys/uio.h>
#include <sys/user.h>

#include <security/audit/audit.h>

static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures");

static struct mtx timerfd_list_lock;
static LIST_HEAD(, timerfd) timerfd_list;
MTX_SYSINIT(timerfd, &timerfd_list_lock, "timerfd_list_lock", MTX_DEF);

static struct unrhdr64 tfdino_unr;

#define TFD_NOJUMP      0       /* Realtime clock has not jumped. */
#define TFD_READ        1       /* Jumped, tfd has been read since. */
#define TFD_ZREAD       2       /* Jumped backwards, CANCEL_ON_SET=false. */
#define TFD_CANCELED    4       /* Jumped, CANCEL_ON_SET=true. */
#define TFD_JUMPED      (TFD_ZREAD | TFD_CANCELED)

/*
 * One structure allocated per timerfd descriptor.
 *
 * Locking semantics:
 * (t)  locked by tfd_lock mtx
 * (l)  locked by timerfd_list_lock sx
 * (c)  const until freeing
 */
struct timerfd {
        /* User specified. */
        struct itimerspec tfd_time;     /* (t) tfd timer */
        clockid_t       tfd_clockid;    /* (c) timing base */
        int             tfd_flags;      /* (c) creation flags */
        int             tfd_timflags;   /* (t) timer flags */

        /* Used internally. */
        timerfd_t       tfd_count;      /* (t) expiration count since read */
        bool            tfd_expired;    /* (t) true upon initial expiration */
        struct mtx      tfd_lock;       /* tfd mtx lock */
        struct callout  tfd_callout;    /* (t) expiration notification */
        struct selinfo  tfd_sel;        /* (t) I/O alerts */
        struct timespec tfd_boottim;    /* (t) cached boottime */
        int             tfd_jumped;     /* (t) timer jump status */
        LIST_ENTRY(timerfd) entry;      /* (l) entry in list */

        /* For stat(2). */
        ino_t           tfd_ino;        /* (c) inode number */
        struct timespec tfd_atim;       /* (t) time of last read */
        struct timespec tfd_mtim;       /* (t) time of last settime */
        struct timespec tfd_birthtim;   /* (c) creation time */
};

static void
timerfd_init(void *data)
{
        new_unrhdr64(&tfdino_unr, 1);
}

SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);

static inline void
timerfd_getboottime(struct timespec *ts)
{
        struct timeval tv;

        getboottime(&tv);
        TIMEVAL_TO_TIMESPEC(&tv, ts);
}

static void
timerfd_wakeup(struct timerfd *tfd)
{
        wakeup(&tfd->tfd_count);
        selwakeup(&tfd->tfd_sel);
        KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
}

/*
 * Call when a discontinuous jump has occured in CLOCK_REALTIME and
 * update timerfd's cached boottime. A jump can be triggered using
 * functions like clock_settime(2) or settimeofday(2).
 *
 * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
 * and the realtime clock jumps.
 * Timer is marked TFD_ZREAD if TFD_TIMER_CANCEL_ON_SET is not set,
 * but the realtime clock jumps backwards.
 */
void
timerfd_jumped(void)
{
        struct timerfd *tfd;
        struct timespec boottime, diff;

        if (LIST_EMPTY(&timerfd_list))
                return;

        timerfd_getboottime(&boottime);
        mtx_lock(&timerfd_list_lock);
        LIST_FOREACH(tfd, &timerfd_list, entry) {
                mtx_lock(&tfd->tfd_lock);
                if (tfd->tfd_clockid != CLOCK_REALTIME ||
                    (tfd->tfd_timflags & TFD_TIMER_ABSTIME) == 0 ||
                    timespeccmp(&boottime, &tfd->tfd_boottim, ==)) {
                        mtx_unlock(&tfd->tfd_lock);
                        continue;
                }

                if (callout_active(&tfd->tfd_callout)) {
                        if ((tfd->tfd_timflags & TFD_TIMER_CANCEL_ON_SET) != 0)
                                tfd->tfd_jumped = TFD_CANCELED;
                        else if (timespeccmp(&boottime, &tfd->tfd_boottim, <))
                                tfd->tfd_jumped = TFD_ZREAD;

                        /*
                         * Do not reschedule callout when
                         * inside interval time loop.
                         */
                        if (!tfd->tfd_expired) {
                                timespecsub(&boottime,
                                    &tfd->tfd_boottim, &diff);
                                timespecsub(&tfd->tfd_time.it_value,
                                    &diff, &tfd->tfd_time.it_value);
                                if (callout_stop(&tfd->tfd_callout) == 1) {
                                        callout_schedule_sbt(&tfd->tfd_callout,
                                            tstosbt_sat(tfd->tfd_time.it_value),
                                            0, C_ABSOLUTE);
                                }
                        }
                }

                tfd->tfd_boottim = boottime;
                if ((tfd->tfd_jumped & TFD_JUMPED) != 0)
                        timerfd_wakeup(tfd);
                mtx_unlock(&tfd->tfd_lock);
        }
        mtx_unlock(&timerfd_list_lock);
}

static int
timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
        struct timerfd *tfd = fp->f_data;
        timerfd_t count;
        int error = 0;

        if (uio->uio_resid < sizeof(timerfd_t))
                return (EINVAL);

        mtx_lock(&tfd->tfd_lock);
retry:
        getnanotime(&tfd->tfd_atim);
        if ((tfd->tfd_jumped & TFD_JUMPED) != 0) {
                if (tfd->tfd_jumped == TFD_CANCELED)
                        error = ECANCELED;
                tfd->tfd_jumped = TFD_READ;
                tfd->tfd_count = 0;
                mtx_unlock(&tfd->tfd_lock);
                return (error);
        } else {
                tfd->tfd_jumped = TFD_NOJUMP;
        }
        if (tfd->tfd_count == 0) {
                if ((fp->f_flag & FNONBLOCK) != 0) {
                        mtx_unlock(&tfd->tfd_lock);
                        return (EAGAIN);
                }
                error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock,
                    PCATCH, "tfdrd", 0);
                if (error == 0) {
                        goto retry;
                } else {
                        mtx_unlock(&tfd->tfd_lock);
                        return (error);
                }
        }

        count = tfd->tfd_count;
        tfd->tfd_count = 0;
        mtx_unlock(&tfd->tfd_lock);
        error = uiomove(&count, sizeof(timerfd_t), uio);

        return (error);
}

static int
timerfd_ioctl(struct file *fp, u_long cmd, void *data,
    struct ucred *active_cred, struct thread *td)
{
        switch (cmd) {
        case FIOASYNC:
                if (*(int *)data != 0)
                        atomic_set_int(&fp->f_flag, FASYNC);
                else
                        atomic_clear_int(&fp->f_flag, FASYNC);
                return (0);
        case FIONBIO:
                if (*(int *)data != 0)
                        atomic_set_int(&fp->f_flag, FNONBLOCK);
                else
                        atomic_clear_int(&fp->f_flag, FNONBLOCK);
                return (0);
        }
        return (ENOTTY);
}

static int
timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
        struct timerfd *tfd = fp->f_data;
        int revents = 0;

        mtx_lock(&tfd->tfd_lock);
        if ((events & (POLLIN | POLLRDNORM)) != 0 &&
            tfd->tfd_count > 0 && tfd->tfd_jumped != TFD_READ)
                revents |= events & (POLLIN | POLLRDNORM);
        if (revents == 0)
                selrecord(td, &tfd->tfd_sel);
        mtx_unlock(&tfd->tfd_lock);

        return (revents);
}

static void
filt_timerfddetach(struct knote *kn)
{
        struct timerfd *tfd = kn->kn_hook;

        mtx_lock(&tfd->tfd_lock);
        knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
        mtx_unlock(&tfd->tfd_lock);
}

static int
filt_timerfdread(struct knote *kn, long hint)
{
        struct timerfd *tfd = kn->kn_hook;

        mtx_assert(&tfd->tfd_lock, MA_OWNED);
        kn->kn_data = (int64_t)tfd->tfd_count;
        return (tfd->tfd_count > 0 && tfd->tfd_jumped != TFD_READ);
}

static const struct filterops timerfd_rfiltops = {
        .f_isfd = 1,
        .f_detach = filt_timerfddetach,
        .f_event = filt_timerfdread,
};

static int
timerfd_kqfilter(struct file *fp, struct knote *kn)
{
        struct timerfd *tfd = fp->f_data;

        if (kn->kn_filter != EVFILT_READ)
                return (EINVAL);

        kn->kn_fop = &timerfd_rfiltops;
        kn->kn_hook = tfd;
        knlist_add(&tfd->tfd_sel.si_note, kn, 0);

        return (0);
}

static int
timerfd_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
        struct timerfd *tfd = fp->f_data;

        bzero(sb, sizeof(*sb));
        sb->st_nlink = fp->f_count - 1;
        sb->st_uid = fp->f_cred->cr_uid;
        sb->st_gid = fp->f_cred->cr_gid;
        sb->st_blksize = PAGE_SIZE;
        mtx_lock(&tfd->tfd_lock);
        sb->st_atim = tfd->tfd_atim;
        sb->st_mtim = tfd->tfd_mtim;
        mtx_unlock(&tfd->tfd_lock);
        sb->st_ctim = sb->st_mtim;
        sb->st_ino = tfd->tfd_ino;
        sb->st_birthtim = tfd->tfd_birthtim;

        return (0);
}

static int
timerfd_close(struct file *fp, struct thread *td)
{
        struct timerfd *tfd = fp->f_data;

        mtx_lock(&timerfd_list_lock);
        LIST_REMOVE(tfd, entry);
        mtx_unlock(&timerfd_list_lock);

        callout_drain(&tfd->tfd_callout);
        seldrain(&tfd->tfd_sel);
        knlist_destroy(&tfd->tfd_sel.si_note);
        mtx_destroy(&tfd->tfd_lock);
        free(tfd, M_TIMERFD);
        fp->f_ops = &badfileops;

        return (0);
}

static int
timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
        struct timerfd *tfd = fp->f_data;

        kif->kf_type = KF_TYPE_TIMERFD;
        kif->kf_un.kf_timerfd.kf_timerfd_clockid = tfd->tfd_clockid;
        kif->kf_un.kf_timerfd.kf_timerfd_flags = tfd->tfd_flags;
        kif->kf_un.kf_timerfd.kf_timerfd_addr = (uintptr_t)tfd;

        return (0);
}

static const struct fileops timerfdops = {
        .fo_read = timerfd_read,
        .fo_write = invfo_rdwr,
        .fo_truncate = invfo_truncate,
        .fo_ioctl = timerfd_ioctl,
        .fo_poll = timerfd_poll,
        .fo_kqfilter = timerfd_kqfilter,
        .fo_stat = timerfd_stat,
        .fo_close = timerfd_close,
        .fo_chmod = invfo_chmod,
        .fo_chown = invfo_chown,
        .fo_sendfile = invfo_sendfile,
        .fo_fill_kinfo = timerfd_fill_kinfo,
        .fo_cmp = file_kcmp_generic,
        .fo_flags = DFLAG_PASSABLE,
};

static void
timerfd_curval(struct timerfd *tfd, struct itimerspec *old_value)
{
        struct timespec curr_value;

        mtx_assert(&tfd->tfd_lock, MA_OWNED);
        *old_value = tfd->tfd_time;
        if (timespecisset(&tfd->tfd_time.it_value)) {
                nanouptime(&curr_value);
                timespecsub(&tfd->tfd_time.it_value, &curr_value,
                    &old_value->it_value);
        }
}

static void
timerfd_expire(void *arg)
{
        struct timerfd *tfd = (struct timerfd *)arg;
        sbintime_t exp, interval, now, next, diff;

        ++tfd->tfd_count;
        tfd->tfd_expired = true;
        if (timespecisset(&tfd->tfd_time.it_interval)) {
                exp = tstosbt_sat(tfd->tfd_time.it_value);
                interval = tstosbt_sat(tfd->tfd_time.it_interval);
                now = sbinuptime();
                next = now > SBT_MAX - interval ? SBT_MAX : now + interval;

                /* Count missed events. */
                if (now > exp) {
                        diff = now - exp;
                        tfd->tfd_count += diff / interval;
                        next -= diff % interval;
                }

                callout_schedule_sbt(&tfd->tfd_callout, next, 0, C_ABSOLUTE);
                tfd->tfd_time.it_value = sbttots(next);
        } else {
                /* Single shot timer. */
                callout_deactivate(&tfd->tfd_callout);
                timespecclear(&tfd->tfd_time.it_value);
        }

        timerfd_wakeup(tfd);
}

int
kern_timerfd_create(struct thread *td, int clockid, int flags)
{
        struct file *fp;
        struct timerfd *tfd;
        int error, fd, fflags;

        AUDIT_ARG_VALUE(clockid);
        AUDIT_ARG_FFLAGS(flags);

        switch (clockid) {
        case CLOCK_REALTIME:
                /* FALLTHROUGH */
        case CLOCK_MONOTONIC:
                /* FALLTHROUGH */
        case CLOCK_UPTIME:
                /*
                 * CLOCK_BOOTTIME should be added once different from
                 * CLOCK_UPTIME
                 */
                break;
        default:
                return (EINVAL);
        }
        if ((flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) != 0)
                return (EINVAL);

        fflags = FREAD;
        if ((flags & TFD_CLOEXEC) != 0)
                fflags |= O_CLOEXEC;
        if ((flags & TFD_NONBLOCK) != 0)
                fflags |= FNONBLOCK;

        error = falloc(td, &fp, &fd, fflags);
        if (error != 0)
                return (error);

        tfd = malloc(sizeof(*tfd), M_TIMERFD, M_WAITOK | M_ZERO);
        tfd->tfd_clockid = (clockid_t)clockid;
        tfd->tfd_flags = flags;
        tfd->tfd_ino = alloc_unr64(&tfdino_unr);
        mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
        callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
        knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
        timerfd_getboottime(&tfd->tfd_boottim);
        getnanotime(&tfd->tfd_birthtim);
        mtx_lock(&timerfd_list_lock);
        LIST_INSERT_HEAD(&timerfd_list, tfd, entry);
        mtx_unlock(&timerfd_list_lock);

        finit(fp, fflags, DTYPE_TIMERFD, tfd, &timerfdops);

        fdrop(fp, td);

        td->td_retval[0] = fd;
        return (0);
}

int
kern_timerfd_gettime(struct thread *td, int fd, struct itimerspec *curr_value)
{
        struct file *fp;
        struct timerfd *tfd;
        int error;

        error = fget(td, fd, &cap_write_rights, &fp);
        if (error != 0)
                return (error);
        if (fp->f_type != DTYPE_TIMERFD) {
                fdrop(fp, td);
                return (EINVAL);
        }
        tfd = fp->f_data;

        mtx_lock(&tfd->tfd_lock);
        timerfd_curval(tfd, curr_value);
        mtx_unlock(&tfd->tfd_lock);

        fdrop(fp, td);
        return (0);
}

int
kern_timerfd_settime(struct thread *td, int fd, int flags,
    const struct itimerspec *new_value, struct itimerspec *old_value)
{
        struct file *fp;
        struct timerfd *tfd;
        struct timespec ts;
        int error = 0;

        if ((flags & ~(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)) != 0)
                return (EINVAL);
        if (!timespecvalid_interval(&new_value->it_value) ||
            !timespecvalid_interval(&new_value->it_interval))
                return (EINVAL);

        error = fget(td, fd, &cap_write_rights, &fp);
        if (error != 0)
                return (error);
        if (fp->f_type != DTYPE_TIMERFD) {
                fdrop(fp, td);
                return (EINVAL);
        }
        tfd = fp->f_data;

        mtx_lock(&tfd->tfd_lock);
        getnanotime(&tfd->tfd_mtim);
        tfd->tfd_timflags = flags;

        /* Store old itimerspec, if applicable. */
        if (old_value != NULL)
                timerfd_curval(tfd, old_value);

        /* Set new expiration. */
        tfd->tfd_time = *new_value;
        if (timespecisset(&tfd->tfd_time.it_value)) {
                if ((flags & TFD_TIMER_ABSTIME) == 0) {
                        nanouptime(&ts);
                        timespecadd(&tfd->tfd_time.it_value, &ts,
                            &tfd->tfd_time.it_value);
                } else if (tfd->tfd_clockid == CLOCK_REALTIME) {
                        /* ECANCELED if unread jump is pending. */
                        if (tfd->tfd_jumped == TFD_CANCELED)
                                error = ECANCELED;
                        /* Convert from CLOCK_REALTIME to CLOCK_BOOTTIME. */
                        timespecsub(&tfd->tfd_time.it_value, &tfd->tfd_boottim,
                            &tfd->tfd_time.it_value);
                }
                callout_reset_sbt(&tfd->tfd_callout,
                    tstosbt_sat(tfd->tfd_time.it_value),
                    0, timerfd_expire, tfd, C_ABSOLUTE);
        } else {
                callout_stop(&tfd->tfd_callout);
        }
        tfd->tfd_count = 0;
        tfd->tfd_expired = false;
        tfd->tfd_jumped = TFD_NOJUMP;
        mtx_unlock(&tfd->tfd_lock);

        fdrop(fp, td);
        return (error);
}

int
sys_timerfd_create(struct thread *td, struct timerfd_create_args *uap)
{
        return (kern_timerfd_create(td, uap->clockid, uap->flags));
}

int
sys_timerfd_gettime(struct thread *td, struct timerfd_gettime_args *uap)
{
        struct itimerspec curr_value;
        int error;

        error = kern_timerfd_gettime(td, uap->fd, &curr_value);
        if (error == 0)
                error = copyout(&curr_value, uap->curr_value,
                    sizeof(curr_value));

        return (error);
}

int
sys_timerfd_settime(struct thread *td, struct timerfd_settime_args *uap)
{
        struct itimerspec new_value, old_value;
        int error;

        error = copyin(uap->new_value, &new_value, sizeof(new_value));
        if (error != 0)
                return (error);
        if (uap->old_value == NULL) {
                error = kern_timerfd_settime(td, uap->fd, uap->flags,
                    &new_value, NULL);
        } else {
                error = kern_timerfd_settime(td, uap->fd, uap->flags,
                    &new_value, &old_value);
                if (error == 0)
                        error = copyout(&old_value, uap->old_value,
                            sizeof(old_value));
        }
        return (error);
}