root/net/sched/sch_etf.c
// SPDX-License-Identifier: GPL-2.0

/* net/sched/sch_etf.c  Earliest TxTime First queueing discipline.
 *
 * Authors:     Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
 *              Vinicius Costa Gomes <vinicius.gomes@intel.com>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/rbtree.h>
#include <linux/skbuff.h>
#include <linux/posix-timers.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/sock.h>

/* Flag tests for any object with a 'flags' member (used below on
 * struct tc_etf_qopt). Each expands to the masked flag value, i.e. it is
 * non-zero when the flag is set rather than a normalized 0/1.
 */
#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
#define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK)

/* Per-qdisc private state, obtained through qdisc_priv(). */
struct etf_sched_data {
        /* Set from TC_ETF_OFFLOAD_ON at init; gates etf_disable_offload(). */
        bool offload;
        /* Set from TC_ETF_DEADLINE_MODE_ON; the dequeue path then hands out
         * the head skb without waiting for (txtime - delta).
         */
        bool deadline_mode;
        /* Set from TC_ETF_SKIP_SOCK_CHECK; is_packet_valid() then skips all
         * checks that look at the skb's socket.
         */
        bool skip_sock_check;
        /* Clock id taken from the user-supplied tc_etf_qopt. */
        int clockid;
        /* Index of this qdisc's tx queue, relative to the device's queue 0. */
        int queue;
        s32 delta; /* in ns */
        ktime_t last; /* The txtime of the last skb sent to the netdevice. */
        /* Queued skbs, kept in an rbtree ordered by skb->tstamp. */
        struct rb_root_cached head;
        /* Armed for (head txtime - delta) by reset_watchdog(). */
        struct qdisc_watchdog watchdog;
        /* Reads the current time of the clock selected by clockid. */
        ktime_t (*get_time)(void);
};

/* Netlink policy for the nested ETF options: TCA_ETF_PARMS is declared with
 * .len = sizeof(struct tc_etf_qopt); etf_init() reads its payload as a
 * struct tc_etf_qopt.
 */
static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
        [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) },
};

/* Sanity-check the user supplied qdisc options.
 *
 * The checks are applied in this order, the first failure wins:
 *      * a negative clockid (dynamic clock) yields -ENOTSUPP;
 *      * any clockid other than CLOCK_TAI yields -EINVAL;
 *      * a negative delta yields -EINVAL (zero is accepted).
 *
 * On failure a message is attached to @extack. Returns 0 when all
 * checks pass.
 *
 * Also note that for the HW offload case, we must
 * expect that system clocks have been synchronized to PHC.
 */
static inline int validate_input_params(struct tc_etf_qopt *qopt,
                                        struct netlink_ext_ack *extack)
{
        int ret = 0;

        if (qopt->clockid < 0) {
                NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
                ret = -ENOTSUPP;
        } else if (qopt->clockid != CLOCK_TAI) {
                NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
                ret = -EINVAL;
        } else if (qopt->delta < 0) {
                NL_SET_ERR_MSG(extack, "Delta must be positive");
                ret = -EINVAL;
        }

        return ret;
}

/* Decide whether @nskb may be queued on @sch.
 *
 * Unless the qdisc was configured to skip socket checks, the skb must
 * belong to a full socket that has SOCK_TXTIME set and whose clockid and
 * deadline-mode setting match the qdisc's. In every case the skb's txtime
 * must be neither before the current time nor before the txtime of the
 * last skb handed to the netdevice.
 */
static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        ktime_t txtime = nskb->tstamp;
        ktime_t now;

        if (!q->skip_sock_check) {
                struct sock *sk = nskb->sk;

                if (!sk || !sk_fullsock(sk))
                        return false;

                if (!sock_flag(sk, SOCK_TXTIME))
                        return false;

                /* No crosstimestamping is done: the socket's clock has to
                 * be the qdisc's clock.
                 */
                if (sk->sk_clockid != q->clockid)
                        return false;

                if (sk->sk_txtime_deadline_mode != q->deadline_mode)
                        return false;
        }

        now = q->get_time();

        return !ktime_before(txtime, now) && !ktime_before(txtime, q->last);
}

/* Return the skb at the leftmost (cached first) node of the time-sorted
 * tree without removing it, or NULL when the tree is empty.
 */
static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct rb_node *first = rb_first_cached(&q->head);

        return first ? rb_to_skb(first) : NULL;
}

/* Re-arm the qdisc watchdog for the skb at the head of the queue, i.e. for
 * (its txtime - delta). With an empty queue the watchdog is cancelled.
 */
static void reset_watchdog(struct Qdisc *sch)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *first = etf_peek_timesortedlist(sch);

        if (first) {
                ktime_t wakeup = ktime_sub_ns(first->tstamp, q->delta);

                qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(wakeup));
        } else {
                qdisc_watchdog_cancel(&q->watchdog);
        }
}

/* Queue a SO_EE_ORIGIN_TXTIME error carrying @err and @code on the error
 * queue of the socket owning @skb.
 *
 * Nothing is reported when the skb has no full socket, when the socket did
 * not ask for txtime error reports, or when cloning the skb fails. The
 * skb's txtime is split over ee_data (upper half) and ee_info (lower half).
 */
static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
{
        struct sock *sk = skb->sk;
        ktime_t txtime = skb->tstamp;
        struct sock_exterr_skb *serr;
        struct sk_buff *clone;

        if (!sk)
                return;

        if (!sk_fullsock(sk))
                return;

        if (!sk->sk_txtime_report_errors)
                return;

        clone = skb_clone(skb, GFP_ATOMIC);
        if (!clone)
                return;

        serr = SKB_EXT_ERR(clone);
        serr->ee.ee_errno = err;
        serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME;
        serr->ee.ee_type = 0;
        serr->ee.ee_code = code;
        serr->ee.ee_pad = 0;
        serr->ee.ee_info = txtime; /* low part of tstamp */
        serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */

        /* The clone is still ours if it could not be queued. */
        if (sock_queue_err_skb(sk, clone))
                kfree_skb(clone);
}

static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
                                      struct sk_buff **to_free)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL;
        ktime_t txtime = nskb->tstamp;
        bool leftmost = true;

        if (!is_packet_valid(sch, nskb)) {
                report_sock_error(nskb, EINVAL,
                                  SO_EE_CODE_TXTIME_INVALID_PARAM);
                return qdisc_drop(nskb, sch, to_free);
        }

        while (*p) {
                struct sk_buff *skb;

                parent = *p;
                skb = rb_to_skb(parent);
                if (ktime_compare(txtime, skb->tstamp) >= 0) {
                        p = &parent->rb_right;
                        leftmost = false;
                } else {
                        p = &parent->rb_left;
                }
        }
        rb_link_node(&nskb->rbnode, parent, p);
        rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost);

        qdisc_qstats_backlog_inc(sch, nskb);
        sch->q.qlen++;

        /* Now we may need to re-arm the qdisc watchdog for the next packet. */
        reset_watchdog(sch);

        return NET_XMIT_SUCCESS;
}

/* Drop expired skbs from the time-sorted tree, starting at @skb.
 *
 * Every visited skb whose tstamp is not after @now is unlinked, reported to
 * its socket with ECANCELED / SO_EE_CODE_TXTIME_MISSED and dropped; the walk
 * stops at the first skb whose tstamp is after @now.
 *
 * NOTE(review): the early break presumably relies on
 * skb_rbtree_walk_from_safe() visiting nodes in ascending tstamp order --
 * confirm against the macro's definition.
 */
static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb,
                                ktime_t now)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *to_free = NULL;
        struct sk_buff *tmp = NULL;

        skb_rbtree_walk_from_safe(skb, tmp) {
                if (ktime_after(skb->tstamp, now))
                        break;

                rb_erase_cached(&skb->rbnode, &q->head);

                /* The rbnode field in the skb re-uses these fields, now that
                 * we are done with the rbnode, reset them.
                 */
                skb->next = NULL;
                skb->prev = NULL;
                skb->dev = qdisc_dev(sch);

                report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);

                /* Account for the drop: backlog shrinks, the skb goes onto
                 * the local to_free list, and the miss is counted as an
                 * overlimit.
                 */
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_drop(skb, sch, &to_free);
                qdisc_qstats_overlimit(sch);
                sch->q.qlen--;
        }

        /* Free everything collected on the local list in one go. */
        kfree_skb_list(to_free);
}

/* Unlink @skb from the time-sorted tree because it is about to be handed
 * to the netdevice: restore the skb fields that the rbnode overlaid,
 * record its txtime as the last one sent, and update qlen, backlog and
 * byte/packet statistics.
 */
static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);

        rb_erase_cached(&skb->rbnode, &q->head);

        /* The rbnode field in the skb re-uses these fields, so they have
         * to be reset once the rbnode is no longer in use.
         */
        skb->next = NULL;
        skb->prev = NULL;
        skb->dev = dev;

        /* Later enqueues are validated against this txtime. */
        q->last = skb->tstamp;

        sch->q.qlen--;
        qdisc_qstats_backlog_dec(sch, skb);
        qdisc_bstats_update(sch, skb);
}

/* Dequeue the skb with the earliest txtime, if it is due.
 *
 * Looking only at the head of the queue:
 *      * if its txtime is already in the past, it and any other expired
 *        skbs are dropped and NULL is returned;
 *      * in deadline mode it is dequeued right away and its txtime is
 *        overwritten with the current time;
 *      * otherwise it is dequeued only once the current time is past
 *        (txtime - delta); until then NULL is returned.
 *
 * Except for the empty-queue case, the watchdog is re-armed before
 * returning.
 */
static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb = etf_peek_timesortedlist(sch);
        ktime_t now;

        if (!skb)
                return NULL;

        now = q->get_time();

        if (ktime_before(skb->tstamp, now)) {
                /* Expired while sitting in the queue. */
                timesortedlist_drop(sch, skb, now);
                skb = NULL;
        } else if (q->deadline_mode) {
                /* The txtime was a deadline: send as soon as possible. */
                timesortedlist_remove(sch, skb);
                skb->tstamp = now;
        } else if (ktime_after(now, ktime_sub_ns(skb->tstamp, q->delta))) {
                /* Inside the transmission window for this skb. */
                timesortedlist_remove(sch, skb);
        } else {
                /* Too early, leave it queued. */
                skb = NULL;
        }

        /* The head may have changed, so re-arm the watchdog. */
        reset_watchdog(sch);

        return skb;
}

static void etf_disable_offload(struct net_device *dev,
                                struct etf_sched_data *q)
{
        struct tc_etf_qopt_offload etf = { };
        const struct net_device_ops *ops;
        int err;

        if (!q->offload)
                return;

        ops = dev->netdev_ops;
        if (!ops->ndo_setup_tc)
                return;

        etf.queue = q->queue;
        etf.enable = 0;

        err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
        if (err < 0)
                pr_warn("Couldn't disable ETF offload for queue %d\n",
                        etf.queue);
}

static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
                              struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        struct tc_etf_qopt_offload etf = { };
        int err;

        if (!ops->ndo_setup_tc) {
                NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
                return -EOPNOTSUPP;
        }

        etf.queue = q->queue;
        etf.enable = 1;

        err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
        if (err < 0) {
                NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload");
                return err;
        }

        return 0;
}

/* Qdisc init callback: parse and validate the netlink options in @opt,
 * optionally enable hardware offload, and set up the private state.
 *
 * Returns 0 on success or a negative errno; on failure a message is attached
 * to @extack where one is available.
 */
static int etf_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct nlattr *tb[TCA_ETF_MAX + 1];
        struct tc_etf_qopt *qopt;
        int err;

        /* ETF cannot be created without options. */
        if (!opt) {
                NL_SET_ERR_MSG(extack,
                               "Missing ETF qdisc options which are mandatory");
                return -EINVAL;
        }

        err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy,
                                          extack);
        if (err < 0)
                return err;

        if (!tb[TCA_ETF_PARMS]) {
                NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
                return -EINVAL;
        }

        qopt = nla_data(tb[TCA_ETF_PARMS]);

        pr_debug("delta %d clockid %d offload %s deadline %s\n",
                 qopt->delta, qopt->clockid,
                 OFFLOAD_IS_ON(qopt) ? "on" : "off",
                 DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");

        err = validate_input_params(qopt, extack);
        if (err < 0)
                return err;

        /* Index of this qdisc's tx queue, as an offset from queue 0. */
        q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);

        /* Offload is enabled before any parameter is stored, so a driver
         * failure returns with q->offload still unset.
         */
        if (OFFLOAD_IS_ON(qopt)) {
                err = etf_enable_offload(dev, q, extack);
                if (err < 0)
                        return err;
        }

        /* Everything went OK, save the parameters used. */
        q->delta = qopt->delta;
        q->clockid = qopt->clockid;
        q->offload = OFFLOAD_IS_ON(qopt);
        q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
        q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt);

        /* Pick the time source matching the clockid. Note that
         * validate_input_params() currently only lets CLOCK_TAI through, so
         * the other cases are not reachable from this function as written.
         */
        switch (q->clockid) {
        case CLOCK_REALTIME:
                q->get_time = ktime_get_real;
                break;
        case CLOCK_MONOTONIC:
                q->get_time = ktime_get;
                break;
        case CLOCK_BOOTTIME:
                q->get_time = ktime_get_boottime;
                break;
        case CLOCK_TAI:
                q->get_time = ktime_get_clocktai;
                break;
        default:
                NL_SET_ERR_MSG(extack, "Clockid is not supported");
                return -ENOTSUPP;
        }

        /* The watchdog is initialized with the same clockid as get_time. */
        qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);

        return 0;
}

/* Empty the time-sorted tree: every skb is unlinked, handed to
 * rtnl_kfree_skbs() and accounted for in qlen.
 */
static void timesortedlist_clear(struct Qdisc *sch)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct rb_node *node, *next;

        for (node = rb_first_cached(&q->head); node; node = next) {
                struct sk_buff *skb = rb_to_skb(node);

                /* Fetch the successor before this node is erased. */
                next = rb_next(node);

                rb_erase_cached(&skb->rbnode, &q->head);
                rtnl_kfree_skbs(skb, skb);
                sch->q.qlen--;
        }
}

static void etf_reset(struct Qdisc *sch)
{
        struct etf_sched_data *q = qdisc_priv(sch);

        /* Only cancel watchdog if it's been initialized. */
        if (q->watchdog.qdisc == sch)
                qdisc_watchdog_cancel(&q->watchdog);

        /* No matter which mode we are on, it's safe to clear both lists. */
        timesortedlist_clear(sch);
        __qdisc_reset_queue(&sch->q);

        q->last = 0;
}

static void etf_destroy(struct Qdisc *sch)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);

        /* Only cancel watchdog if it's been initialized. */
        if (q->watchdog.qdisc == sch)
                qdisc_watchdog_cancel(&q->watchdog);

        etf_disable_offload(dev, q);
}

/* Qdisc dump callback: emit the current configuration as a
 * struct tc_etf_qopt in a TCA_ETF_PARMS attribute nested under TCA_OPTIONS.
 *
 * Returns the result of nla_nest_end() on success. If the nest cannot be
 * started or the attribute cannot be added, the nest is cancelled and -1 is
 * returned.
 */
static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct etf_sched_data *q = qdisc_priv(sch);
        struct tc_etf_qopt opt = { };
        struct nlattr *nest = nla_nest_start_noflag(skb, TCA_OPTIONS);

        if (nest) {
                opt.delta = READ_ONCE(q->delta);
                opt.clockid = READ_ONCE(q->clockid);

                /* Rebuild the user-visible flags from the stored booleans. */
                if (READ_ONCE(q->offload))
                        opt.flags |= TC_ETF_OFFLOAD_ON;
                if (READ_ONCE(q->deadline_mode))
                        opt.flags |= TC_ETF_DEADLINE_MODE_ON;
                if (READ_ONCE(q->skip_sock_check))
                        opt.flags |= TC_ETF_SKIP_SOCK_CHECK;

                if (!nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
                        return nla_nest_end(skb, nest);
        }

        /* Failure: undo whatever part of the nest was written. */
        nla_nest_cancel(skb, nest);
        return -1;
}

/* Qdisc operations table registered under the id "etf". The private area
 * is sized for struct etf_sched_data, and enqueue/dequeue/peek all operate
 * on the time-sorted rbtree.
 */
static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
        .id             =       "etf",
        .priv_size      =       sizeof(struct etf_sched_data),
        .enqueue        =       etf_enqueue_timesortedlist,
        .dequeue        =       etf_dequeue_timesortedlist,
        .peek           =       etf_peek_timesortedlist,
        .init           =       etf_init,
        .reset          =       etf_reset,
        .destroy        =       etf_destroy,
        .dump           =       etf_dump,
        .owner          =       THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("etf");

/* Module entry point: register the ETF qdisc operations and return the
 * result of register_qdisc().
 */
static int __init etf_module_init(void)
{
        return register_qdisc(&etf_qdisc_ops);
}

/* Module exit point: unregister the ETF qdisc operations. */
static void __exit etf_module_exit(void)
{
        unregister_qdisc(&etf_qdisc_ops);
}
/* Hook the init/exit functions into module load/unload and declare the
 * module's license and description.
 */
module_init(etf_module_init)
module_exit(etf_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Earliest TxTime First (ETF) qdisc");