/* sys/dev/cxgbe/t4_sched.c */
/*-
 * Copyright (c) 2017 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/sysctl.h>

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_msg.h"

/*
 * Range check with the driver's "negative means unspecified" convention:
 * any negative value is accepted, otherwise val must lie in [lo, hi].
 */
static int
in_range(int val, int lo, int hi)
{

        if (val < 0)
                return (1);
        return (val >= lo && val <= hi);
}

/*
 * Program the firmware's global packet-scheduler min-max configuration.
 * A negative minmax is rejected with EINVAL.
 */
static int
set_sched_class_config(struct adapter *sc, int minmax)
{
        int rc;

        if (minmax < 0)
                return (EINVAL);

        rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
        if (rc == 0) {
                if (hw_off_limits(sc))
                        rc = ENXIO;
                else
                        rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED,
                            minmax, 1);
                end_synchronized_op(sc, 0);
        }

        return (rc);
}

static int
set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
    int sleep_ok)
{
        int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
        struct port_info *pi;
        struct tx_cl_rl_params *tc, old;
        bool check_pktsize = false;

        if (p->level == SCHED_CLASS_LEVEL_CL_RL)
                fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
        else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
                fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
        else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
                fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
        else
                return (EINVAL);

        if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
                if (p->mode == SCHED_CLASS_MODE_CLASS)
                        fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
                else if (p->mode == SCHED_CLASS_MODE_FLOW) {
                        check_pktsize = true;
                        fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
                } else
                        return (EINVAL);
        } else
                fw_mode = 0;

        /* Valid channel must always be provided. */
        if (p->channel < 0)
                return (EINVAL);
        if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
                return (ERANGE);

        pi = sc->port[sc->chan_map[p->channel]];
        if (pi == NULL)
                return (ENXIO);
        MPASS(pi->tx_chan == p->channel);
        top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */

        if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
            p->level == SCHED_CLASS_LEVEL_CH_RL) {
                /*
                 * Valid rate (mode, unit and values) must be provided.
                 */

                if (p->minrate < 0)
                        p->minrate = 0;
                if (p->maxrate < 0)
                        return (EINVAL);

                if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
                        fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
                        /* ratemode could be relative (%) or absolute. */
                        if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
                                fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
                                /* maxrate is % of port bandwidth. */
                                if (!in_range(p->minrate, 0, 100) ||
                                    !in_range(p->maxrate, 0, 100)) {
                                        return (ERANGE);
                                }
                        } else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
                                fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
                                /* maxrate is absolute value in kbps. */
                                if (!in_range(p->minrate, 0, top_speed) ||
                                    !in_range(p->maxrate, 0, top_speed)) {
                                        return (ERANGE);
                                }
                        } else
                                return (EINVAL);
                } else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
                        /* maxrate is the absolute value in pps. */
                        check_pktsize = true;
                        fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
                } else
                        return (EINVAL);
        } else {
                MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);

                /*
                 * Valid weight must be provided.
                 */
                if (p->weight < 0)
                       return (EINVAL);
                if (!in_range(p->weight, 1, 99))
                        return (ERANGE);

                fw_rateunit = 0;
                fw_ratemode = 0;
        }

        if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
            p->level == SCHED_CLASS_LEVEL_CL_WRR) {
                /*
                 * Valid scheduling class must be provided.
                 */
                if (p->cl < 0)
                        return (EINVAL);
                if (!in_range(p->cl, 0, sc->params.nsched_cls - 1))
                        return (ERANGE);
        }

        if (check_pktsize) {
                if (p->pktsize < 0)
                        return (EINVAL);
                if (!in_range(p->pktsize, 64, if_getmtu(pi->vi[0].ifp)))
                        return (ERANGE);
        }

        if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
                tc = &pi->sched_params->cl_rl[p->cl];
                mtx_lock(&sc->tc_lock);
                if (tc->refcount > 0 || tc->state == CS_HW_UPDATE_IN_PROGRESS)
                        rc = EBUSY;
                else {
                        old = *tc;

                        tc->flags |= CF_USER;
                        tc->state = CS_HW_UPDATE_IN_PROGRESS;
                        tc->ratemode = fw_ratemode;
                        tc->rateunit = fw_rateunit;
                        tc->mode = fw_mode;
                        tc->maxrate = p->maxrate;
                        tc->pktsize = p->pktsize;
                        rc = 0;
                }
                mtx_unlock(&sc->tc_lock);
                if (rc != 0)
                        return (rc);
        }

        rc = begin_synchronized_op(sc, NULL,
            sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
        if (rc != 0) {
                if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
                        mtx_lock(&sc->tc_lock);
                        MPASS(tc->refcount == 0);
                        MPASS(tc->flags & CF_USER);
                        MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);
                        *tc = old;
                        mtx_unlock(&sc->tc_lock);
                }
                return (rc);
        }
        if (!hw_off_limits(sc)) {
                rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level,
                    fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl,
                    p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok);
        }
        end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);

        if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
                mtx_lock(&sc->tc_lock);
                MPASS(tc->refcount == 0);
                MPASS(tc->flags & CF_USER);
                MPASS(tc->state == CS_HW_UPDATE_IN_PROGRESS);

                if (rc == 0)
                        tc->state = CS_HW_CONFIGURED;
                else {
                        /* parameters failed so we don't park at params_set */
                        tc->state = CS_UNINITIALIZED;
                        tc->flags &= ~CF_USER;
                        CH_ERR(pi, "failed to configure traffic class %d: %d.  "
                            "params: mode %d, rateunit %d, ratemode %d, "
                            "channel %d, minrate %d, maxrate %d, pktsize %d, "
                            "burstsize %d\n", p->cl, rc, fw_mode, fw_rateunit,
                            fw_ratemode, p->channel, p->minrate, p->maxrate,
                            p->pktsize, 0);
                }
                mtx_unlock(&sc->tc_lock);
        }

        return (rc);
}

/*
 * Taskqueue handler (sc->tc_task): walk every traffic class on every port and
 * push any class in CS_HW_UPDATE_REQUESTED state out to the hardware.
 *
 * tc_lock is dropped around the firmware call and reacquired afterwards; the
 * MPASS after reacquiring asserts that nobody else moved the class's state in
 * the meantime.
 */
static void
update_tx_sched(void *context, int pending)
{
        int i, j, rc;
        struct port_info *pi;
        struct tx_cl_rl_params *tc;
        struct adapter *sc = context;
        const int n = sc->params.nsched_cls;

        mtx_lock(&sc->tc_lock);
        for_each_port(sc, i) {
                pi = sc->port[i];
                tc = &pi->sched_params->cl_rl[0];
                for (j = 0; j < n; j++, tc++) {
                        MPASS(mtx_owned(&sc->tc_lock));
                        if (tc->state != CS_HW_UPDATE_REQUESTED)
                                continue;
                        /* Drop the lock for the sleepable firmware call. */
                        mtx_unlock(&sc->tc_lock);

                        if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
                            "t4utxs") != 0) {
                                /* Skip this class; it stays in the
                                 * UPDATE_REQUESTED state for a later run. */
                                mtx_lock(&sc->tc_lock);
                                continue;
                        }
                        rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
                            FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
                            tc->ratemode, pi->hw_port, j, 0, tc->maxrate, 0,
                            tc->pktsize, tc->burstsize, 1);
                        end_synchronized_op(sc, 0);

                        mtx_lock(&sc->tc_lock);
                        MPASS(tc->state == CS_HW_UPDATE_REQUESTED);
                        if (rc == 0) {
                                tc->state = CS_HW_CONFIGURED;
                                continue;
                        }
                        /* parameters failed so we try to avoid params_set */
                        if (tc->refcount > 0)
                                tc->state = CS_PARAMS_SET;
                        else
                                tc->state = CS_UNINITIALIZED;
                        CH_ERR(pi, "failed to configure traffic class %d: %d.  "
                            "params: mode %d, rateunit %d, ratemode %d, "
                            "channel %d, minrate %d, maxrate %d, pktsize %d, "
                            "burstsize %d\n", j, rc, tc->mode, tc->rateunit,
                            tc->ratemode, pi->hw_port, 0, tc->maxrate,
                            tc->pktsize, tc->burstsize);
                }
        }
        mtx_unlock(&sc->tc_lock);
}

/*
 * Ioctl entry point for scheduling class operations.  Dispatches to the
 * global config or per-class params handler; only packet scheduling is
 * supported.
 */
int
t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
{
        int rc;

        if (p->type != SCHED_CLASS_TYPE_PACKET)
                return (EINVAL);

        switch (p->subcmd) {
        case SCHED_CLASS_SUBCMD_CONFIG:
                rc = set_sched_class_config(sc, p->u.config.minmax);
                break;
        case SCHED_CLASS_SUBCMD_PARAMS:
                rc = set_sched_class_params(sc, &p->u.params, 1);
                break;
        default:
                rc = EINVAL;
                break;
        }

        return (rc);
}

/*
 * Bind the txq to the traffic class at index idx (-1 means unbind).  While
 * tc_lock is dropped for the firmware call the txq's tc_idx is set to -2 to
 * mark a bind/unbind in progress; concurrent callers back off with EBUSY.
 * A reference is taken on the new class up front and the loser (old class on
 * success, new class on failure) gets its reference dropped at the end.
 */
static int
bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
{
        struct tx_cl_rl_params *tc0, *tc;
        int rc, old_idx;
        uint32_t fw_mnem, fw_class;

        if (!(txq->eq.flags & EQ_HW_ALLOCATED))
                return (ENXIO);

        mtx_lock(&sc->tc_lock);
        if (txq->tc_idx == -2) {
                rc = EBUSY;     /* Another bind/unbind in progress already. */
                goto done;
        }
        if (idx == txq->tc_idx) {
                rc = 0;         /* No change, nothing to do. */
                goto done;
        }

        tc0 = &sc->port[txq->eq.port_id]->sched_params->cl_rl[0];
        if (idx != -1) {
                /*
                 * Bind to a different class at index idx.
                 */
                tc = &tc0[idx];
                if (tc->state != CS_HW_CONFIGURED) {
                        rc = ENXIO;
                        goto done;
                } else {
                        /*
                         * Ok to proceed.  Place a reference on the new class
                         * while still holding on to the reference on the
                         * previous class, if any.
                         */
                        tc->refcount++;
                }
        }
        /* Mark as busy before letting go of the lock. */
        old_idx = txq->tc_idx;
        txq->tc_idx = -2;
        mtx_unlock(&sc->tc_lock);

        rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
        if (rc == 0) {
                /* Tell the firmware which class this eq belongs to now. */
                fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
                    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
                    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
                /* 0xffffffff asks the firmware to unbind the queue. */
                fw_class = idx < 0 ? 0xffffffff : idx;
                rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem,
                    &fw_class);
                end_synchronized_op(sc, 0);
        }

        mtx_lock(&sc->tc_lock);
        MPASS(txq->tc_idx == -2);
        if (rc == 0) {
                /*
                 * Unbind, bind, or bind to a different class succeeded.  Remove
                 * the reference on the old traffic class, if any.
                 */
                if (old_idx != -1) {
                        tc = &tc0[old_idx];
                        MPASS(tc->refcount > 0);
                        tc->refcount--;
                }
                txq->tc_idx = idx;
        } else {
                /*
                 * Unbind, bind, or bind to a different class failed.  Remove
                 * the anticipatory reference on the new traffic class, if any.
                 */
                if (idx != -1) {
                        tc = &tc0[idx];
                        MPASS(tc->refcount > 0);
                        tc->refcount--;
                }
                txq->tc_idx = old_idx;
        }
done:
        MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->params.nsched_cls);
        mtx_unlock(&sc->tc_lock);
        return (rc);
}

/*
 * Ioctl handler: bind a single txq (p->queue >= 0) or all txqs (p->queue
 * negative) of the port's main VI to scheduling class p->cl (-1 unbinds).
 */
int
t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
{
        struct port_info *pi = NULL;
        struct vi_info *vi;
        struct sge_txq *txq;
        int i, rc;

        if (p->port >= sc->params.nports)
                return (EINVAL);

        /*
         * XXX: cxgbetool allows the user to specify the physical port only.  So
         * we always operate on the main VI.
         */
        pi = sc->port[p->port];
        vi = &pi->vi[0];

        /* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
        if (!(vi->flags & VI_INIT_DONE))
                return (EAGAIN);
        MPASS(vi->ntxq > 0);

        /* Negative queue/cl values pass in_range by design (see in_range). */
        if (!in_range(p->queue, 0, vi->ntxq - 1) ||
            !in_range(p->cl, 0, sc->params.nsched_cls - 1))
                return (EINVAL);

        if (p->queue < 0) {
                /*
                 * Change the scheduling on all the TX queues for the
                 * interface.
                 */
                for_each_txq(vi, i, txq) {
                        rc = bind_txq_to_traffic_class(sc, txq, p->cl);
                        if (rc != 0)
                                break;
                }
        } else {
                /*
                 * If op.queue is non-negative, then we're only changing the
                 * scheduling on a single specified TX queue.
                 */
                txq = &sc->sge.txq[vi->first_txq + p->queue];
                rc = bind_txq_to_traffic_class(sc, txq, p->cl);
        }

        return (rc);
}

/*
 * One-time setup of the adapter's tx scheduling state: the tc_lock, the
 * deferred hw-update task, and per-port arrays of nsched_cls class-rate-limit
 * parameter slots (zeroed, so every class starts CS_UNINITIALIZED).
 */
int
t4_init_tx_sched(struct adapter *sc)
{
        const int n = sc->params.nsched_cls;
        struct port_info *pi;
        int i;

        mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
        TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
        for_each_port(sc, i) {
                pi = sc->port[i];
                pi->sched_params = malloc(sizeof(*pi->sched_params) +
                    n * sizeof(struct tx_cl_rl_params), M_CXGBE,
                    M_ZERO | M_WAITOK);
        }

        return (0);
}

/*
 * Tear down everything t4_init_tx_sched() set up.  Drains the update task
 * first so it can't touch the per-port state being freed.
 */
int
t4_free_tx_sched(struct adapter *sc)
{
        struct port_info *pi;
        int i;

        taskqueue_drain(taskqueue_thread, &sc->tc_task);

        for_each_port(sc, i) {
                pi = sc->port[i];
                if (pi != NULL)
                        free(pi->sched_params, M_CXGBE);
        }

        if (mtx_initialized(&sc->tc_lock))
                mtx_destroy(&sc->tc_lock);

        return (0);
}

/*
 * Schedule update_tx_sched() on the system taskqueue to push any pending
 * traffic class changes (CS_HW_UPDATE_REQUESTED) to the hardware.
 */
void
t4_update_tx_sched(struct adapter *sc)
{

        taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
}

/*
 * Find (or claim) a per-flow, absolute-bitrate class-rate-limit traffic class
 * on the port matching maxrate/pktsize/burstsize, take a reference on it, and
 * return its index in *tc_idx.  Classes not yet configured in hardware are
 * moved to CS_HW_UPDATE_REQUESTED and the update task is kicked.  Returns
 * ENOSPC (with *tc_idx = -1) when no class matches and none is free.
 */
int
t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
    int *tc_idx)
{
        int rc = 0, fa, fa2, i, pktsize, burstsize;
        bool update;
        struct tx_cl_rl_params *tc;
        struct port_info *pi;

        MPASS(port_id >= 0 && port_id < sc->params.nports);

        pi = sc->port[port_id];
        /* Fall back to the interface MTU / 4x pktsize if no port defaults. */
        if (pi->sched_params->pktsize > 0)
                pktsize = pi->sched_params->pktsize;
        else
                pktsize = if_getmtu(pi->vi[0].ifp);
        if (pi->sched_params->burstsize > 0)
                burstsize = pi->sched_params->burstsize;
        else
                burstsize = pktsize * 4;
        tc = &pi->sched_params->cl_rl[0];

        update = false;
        fa = fa2 = -1;
        mtx_lock(&sc->tc_lock);
        for (i = 0; i < sc->params.nsched_cls; i++, tc++) {
                /* Reuse a class whose parameters already match exactly. */
                if (tc->state >= CS_PARAMS_SET &&
                    tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
                    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
                    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
                    tc->maxrate == maxrate && tc->pktsize == pktsize &&
                    tc->burstsize == burstsize) {
                        tc->refcount++;
                        *tc_idx = i;
                        if (tc->state == CS_PARAMS_SET) {
                                tc->state = CS_HW_UPDATE_REQUESTED;
                                update = true;
                        }
                        goto done;
                }

                if (fa < 0 && tc->state == CS_UNINITIALIZED) {
                        MPASS(tc->refcount == 0);
                        fa = i;         /* first available, never used. */
                }
                if (fa2 < 0 && tc->refcount == 0 && !(tc->flags & CF_USER)) {
                        fa2 = i;        /* first available, used previously.  */
                }
        }
        /* Not found */
        MPASS(i == sc->params.nsched_cls);
        /* Prefer a never-used class; recycle a previously-used one if not. */
        if (fa == -1)
                fa = fa2;
        if (fa == -1) {
                *tc_idx = -1;
                rc = ENOSPC;
        } else {
                MPASS(fa >= 0 && fa < sc->params.nsched_cls);
                tc = &pi->sched_params->cl_rl[fa];
                MPASS(!(tc->flags & CF_USER));
                MPASS(tc->refcount == 0);

                tc->refcount = 1;
                tc->state = CS_HW_UPDATE_REQUESTED;
                tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
                tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
                tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
                tc->maxrate = maxrate;
                tc->pktsize = pktsize;
                tc->burstsize = burstsize;
                *tc_idx = fa;
                update = true;
        }
done:
        mtx_unlock(&sc->tc_lock);
        if (update)
                t4_update_tx_sched(sc);
        return (rc);
}

/*
 * Drop a reference previously taken on a class-rate-limit traffic class by
 * t4_reserve_cl_rl_kbps() (or by a bind).
 */
void
t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
{
        struct tx_cl_rl_params *tc;

        MPASS(port_id >= 0 && port_id < sc->params.nports);
        MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);

        tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx];
        mtx_lock(&sc->tc_lock);
        MPASS(tc->refcount > 0);
        tc->refcount--;
        mtx_unlock(&sc->tc_lock);
}

/*
 * Sysctl handler for a txq's traffic class binding (arg1 = vi, arg2 = index
 * of the txq in sc->sge.txq[]).  Reads report the current binding; writes
 * bind the queue to the requested class.
 */
int
sysctl_tc(SYSCTL_HANDLER_ARGS)
{
        struct vi_info *vi = arg1;
        struct adapter *sc = vi->adapter;
        struct sge_txq *txq;
        int qidx = arg2, rc, tc_idx;

        MPASS(qidx >= vi->first_txq && qidx < vi->first_txq + vi->ntxq);

        txq = &sc->sge.txq[qidx];
        tc_idx = txq->tc_idx;
        rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
        if (rc != 0 || req->newptr == NULL)
                return (rc);    /* read-only access, or an error */

        /* VFs may not change queue scheduling. */
        if (sc->flags & IS_VF)
                return (EPERM);
        /*
         * NOTE(review): in_range() accepts any negative value, so -1 (unbind)
         * passes here as intended, but so would e.g. -5 — verify that
         * bind_txq_to_traffic_class handles negatives other than -1.
         */
        if (!in_range(tc_idx, 0, sc->params.nsched_cls - 1))
                return (EINVAL);

        return (bind_txq_to_traffic_class(sc, txq, tc_idx));
}

/*
 * Sysctl handler that renders one traffic class's parameters as text.
 * arg1 = adapter, arg2 = (port_id << 16) | class index.
 */
int
sysctl_tc_params(SYSCTL_HANDLER_ARGS)
{
        struct adapter *sc = arg1;
        struct tx_cl_rl_params tc;
        struct sbuf *sb;
        int i, rc, port_id, mbps, gbps;

        rc = sysctl_wire_old_buffer(req, 0);
        if (rc != 0)
                return (rc);

        sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
        if (sb == NULL)
                return (ENOMEM);

        port_id = arg2 >> 16;
        MPASS(port_id < sc->params.nports);
        MPASS(sc->port[port_id] != NULL);
        i = arg2 & 0xffff;
        MPASS(i < sc->params.nsched_cls);

        /* Snapshot the class under the lock, then format at leisure. */
        mtx_lock(&sc->tc_lock);
        tc = sc->port[port_id]->sched_params->cl_rl[i];
        mtx_unlock(&sc->tc_lock);

        if (tc.state < CS_PARAMS_SET) {
                sbuf_printf(sb, "uninitialized");
                goto done;
        }

        switch (tc.rateunit) {
        case SCHED_CLASS_RATEUNIT_BITS:
                switch (tc.ratemode) {
                case SCHED_CLASS_RATEMODE_REL:
                        /* XXX: top speed or actual link speed? */
                        gbps = port_top_speed(sc->port[port_id]);
                        sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
                        break;
                case SCHED_CLASS_RATEMODE_ABS:
                        /* Print in the largest unit that divides evenly. */
                        mbps = tc.maxrate / 1000;
                        gbps = tc.maxrate / 1000000;
                        if (tc.maxrate == gbps * 1000000)
                                sbuf_printf(sb, "%uGbps", gbps);
                        else if (tc.maxrate == mbps * 1000)
                                sbuf_printf(sb, "%uMbps", mbps);
                        else
                                sbuf_printf(sb, "%uKbps", tc.maxrate);
                        break;
                default:
                        rc = ENXIO;
                        goto done;
                }
                break;
        case SCHED_CLASS_RATEUNIT_PKTS:
                sbuf_printf(sb, "%upps", tc.maxrate);
                break;
        default:
                rc = ENXIO;
                goto done;
        }

        switch (tc.mode) {
        case SCHED_CLASS_MODE_CLASS:
                /* Note that pktsize and burstsize are not used in this mode. */
                sbuf_printf(sb, " aggregate");
                break;
        case SCHED_CLASS_MODE_FLOW:
                sbuf_printf(sb, " per-flow");
                if (tc.pktsize > 0)
                        sbuf_printf(sb, " pkt-size %u", tc.pktsize);
                if (tc.burstsize > 0)
                        sbuf_printf(sb, " burst-size %u", tc.burstsize);
                break;
        default:
                rc = ENXIO;
                goto done;
        }

done:
        if (rc == 0)
                rc = sbuf_finish(sb);
        sbuf_delete(sb);

        return (rc);
}

#ifdef RATELIMIT
void
t4_init_etid_table(struct adapter *sc)
{
        int i;
        struct tid_info *t;

        if (!is_ethoffload(sc))
                return;

        t = &sc->tids;
        MPASS(t->netids > 0);

        mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
        t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
                        M_ZERO | M_WAITOK);
        t->efree = t->etid_tab;
        t->etids_in_use = 0;
        for (i = 1; i < t->netids; i++)
                t->etid_tab[i - 1].next = &t->etid_tab[i];
        t->etid_tab[t->netids - 1].next = NULL;
}

/*
 * Release the etid table and its lock.  Mirrors t4_init_etid_table() and is
 * likewise a no-op without ethoffload support.
 */
void
t4_free_etid_table(struct adapter *sc)
{
        struct tid_info *t = &sc->tids;

        if (!is_ethoffload(sc))
                return;

        MPASS(t->netids > 0);

        free(t->etid_tab, M_CXGBE);
        t->etid_tab = NULL;

        if (mtx_initialized(&t->etid_lock))
                mtx_destroy(&t->etid_lock);
}

/* etid services */
static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *);
static void free_etid(struct adapter *, int);

static int
alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst)
{
        struct tid_info *t = &sc->tids;
        int etid = -1;

        mtx_lock(&t->etid_lock);
        if (t->efree) {
                union etid_entry *p = t->efree;

                etid = p - t->etid_tab + t->etid_base;
                t->efree = p->next;
                p->cst = cst;
                t->etids_in_use++;
        }
        mtx_unlock(&t->etid_lock);
        return (etid);
}

/*
 * Return the rate tag associated with a hardware etid.
 */
struct cxgbe_rate_tag *
lookup_etid(struct adapter *sc, int etid)
{

        /* Map the hw etid back into the table via the etid base. */
        return (sc->tids.etid_tab[etid - sc->tids.etid_base].cst);
}

/*
 * Push an etid's entry back onto the free list.
 */
static void
free_etid(struct adapter *sc, int etid)
{
        struct tid_info *t = &sc->tids;
        union etid_entry *e = &t->etid_tab[etid - t->etid_base];

        mtx_lock(&t->etid_lock);
        e->next = t->efree;
        t->efree = e;
        t->etids_in_use--;
        mtx_unlock(&t->etid_lock);
}

static int cxgbe_rate_tag_modify(struct m_snd_tag *,
    union if_snd_tag_modify_params *);
static int cxgbe_rate_tag_query(struct m_snd_tag *,
    union if_snd_tag_query_params *);
static void cxgbe_rate_tag_free(struct m_snd_tag *);

/* Method table for the kernel's rate-limit send tags backed by this driver. */
static const struct if_snd_tag_sw cxgbe_rate_tag_sw = {
        .snd_tag_modify = cxgbe_rate_tag_modify,
        .snd_tag_query = cxgbe_rate_tag_query,
        .snd_tag_free = cxgbe_rate_tag_free,
        .type = IF_SND_TAG_TYPE_RATE_LIMIT
};

/*
 * Allocate a rate-limit send tag for this ifnet: reserve a class-rate-limit
 * scheduling class for the requested rate (max_rate * 8 / 1000 — presumably
 * bytes/s converted to kbps, matching t4_reserve_cl_rl_kbps) and an etid,
 * then initialize the tag.  On failure everything reserved so far is
 * released.
 */
int
cxgbe_rate_tag_alloc(if_t ifp, union if_snd_tag_alloc_params *params,
    struct m_snd_tag **pt)
{
        int rc, schedcl;
        struct vi_info *vi = if_getsoftc(ifp);
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;
        struct cxgbe_rate_tag *cst;

        MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT);

        rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
            (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
        if (rc != 0)
                return (rc);
        MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);

        /* M_NOWAIT: this may be called from a non-sleepable context. */
        cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
        if (cst == NULL) {
failed:
                /* Undo the scheduling class reservation on any failure. */
                t4_release_cl_rl(sc, pi->port_id, schedcl);
                return (ENOMEM);
        }

        cst->etid = alloc_etid(sc, cst);
        if (cst->etid < 0) {
                free(cst, M_CXGBE);
                goto failed;
        }

        mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
        mbufq_init(&cst->pending_tx, INT_MAX);
        mbufq_init(&cst->pending_fwack, INT_MAX);
        m_snd_tag_init(&cst->com, ifp, &cxgbe_rate_tag_sw);
        cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
        cst->adapter = sc;
        cst->port_id = pi->port_id;
        cst->schedcl = schedcl;
        cst->max_rate = params->rate_limit.max_rate;
        cst->tx_credits = sc->params.eo_wr_cred;
        cst->tx_total = cst->tx_credits;
        cst->plen = 0;
        cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
            V_TXPKT_INTF(pi->hw_port) | V_TXPKT_PF(sc->pf) |
            V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));

        /*
         * Queues will be selected later when the connection flowid is available.
         */

        *pt = &cst->com;
        return (0);
}

/*
 * Change in parameters, no change in ifp.
 *
 * Reserves a class for the new rate before releasing the old one, so a
 * failure leaves the tag's original rate limit intact.
 */
static int
cxgbe_rate_tag_modify(struct m_snd_tag *mst,
    union if_snd_tag_modify_params *params)
{
        int rc, schedcl;
        struct cxgbe_rate_tag *cst = mst_to_crt(mst);
        struct adapter *sc = cst->adapter;

        /* XXX: is schedcl -1 ok here? */
        MPASS(cst->schedcl >= 0 && cst->schedcl < sc->params.nsched_cls);

        mtx_lock(&cst->lock);
        MPASS(cst->flags & EO_SND_TAG_REF);
        rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
            (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
        if (rc != 0) {
                /* Bug fix: the error path used to return with the lock held. */
                mtx_unlock(&cst->lock);
                return (rc);
        }
        MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
        t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
        cst->schedcl = schedcl;
        cst->max_rate = params->rate_limit.max_rate;
        mtx_unlock(&cst->lock);

        return (0);
}

/*
 * Report the tag's configured max rate and an estimate of queue fullness,
 * scaled from the fraction of tx credits currently outstanding into the
 * kernel's 0..IF_SND_QUEUE_LEVEL_MAX range.
 */
static int
cxgbe_rate_tag_query(struct m_snd_tag *mst,
    union if_snd_tag_query_params *params)
{
        struct cxgbe_rate_tag *cst = mst_to_crt(mst);

        params->rate_limit.max_rate = cst->max_rate;

#define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
        params->rate_limit.queue_level =
                (cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;

        return (0);
}

/*
 * Unlocks cst and frees it.
 *
 * Caller must hold cst->lock, must already have dropped the kernel's snd_tag
 * reference, and all tx must be quiesced (full credits, nothing pending), as
 * asserted below.
 */
void
cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst)
{
        struct adapter *sc = cst->adapter;

        mtx_assert(&cst->lock, MA_OWNED);
        MPASS((cst->flags & EO_SND_TAG_REF) == 0);
        MPASS(cst->tx_credits == cst->tx_total);
        MPASS(cst->plen == 0);
        MPASS(mbufq_first(&cst->pending_tx) == NULL);
        MPASS(mbufq_first(&cst->pending_fwack) == NULL);

        /* Return the etid and scheduling class reservations, if held. */
        if (cst->etid >= 0)
                free_etid(sc, cst->etid);
        if (cst->schedcl != -1)
                t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
        mtx_unlock(&cst->lock);
        mtx_destroy(&cst->lock);
        free(cst, M_CXGBE);
}

/*
 * Kernel callback to release a rate-limit send tag.  Frees the tag right
 * away when nothing is outstanding; otherwise the firmware is asked to
 * return the etid's credits and the free happens later.
 */
static void
cxgbe_rate_tag_free(struct m_snd_tag *mst)
{
        struct cxgbe_rate_tag *cst = mst_to_crt(mst);

        mtx_lock(&cst->lock);

        /* The kernel is done with the snd_tag.  Remove its reference. */
        MPASS(cst->flags & EO_SND_TAG_REF);
        cst->flags &= ~EO_SND_TAG_REF;

        if (cst->ncompl == 0) {
                /*
                 * No fw4_ack in flight.  Free the tag right away if there are
                 * no outstanding credits.  Request the firmware to return all
                 * credits for the etid otherwise.
                 */
                if (cst->tx_credits == cst->tx_total) {
                        cxgbe_rate_tag_free_locked(cst);
                        return; /* cst is gone. */
                }
                send_etid_flush_wr(cst);
        }
        mtx_unlock(&cst->lock);
}

/*
 * Report this interface's hardware rate-limiting capabilities (flow count,
 * number of distinct rates, burst granularity) to the kernel.
 */
void
cxgbe_ratelimit_query(if_t ifp, struct if_ratelimit_query_results *q)
{
        struct vi_info *vi = if_getsoftc(ifp);
        struct adapter *sc = vi->adapter;

        q->rate_table = NULL;
        q->flags = RT_IS_SELECTABLE;
        /*
         * Absolute max limits from the firmware configuration.  Practical
         * limits depend on the burstsize, pktsize (if_getmtu(ifp) ultimately) and
         * the card's cclk.
         */
        q->max_flows = sc->tids.netids;
        q->number_of_rates = sc->params.nsched_cls;
        q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */

#if 1
        if (chip_id(sc) < CHELSIO_T6) {
                /* Based on testing by rrs@ with a T580 at burstsize = 4. */
                MPASS(q->min_segment_burst == 4);
                q->max_flows = min(4000, q->max_flows);
        } else {
                /* XXX: TBD, carried forward from T5 for now. */
                q->max_flows = min(4000, q->max_flows);
        }

        /*
         * XXX: tcp_ratelimit.c grabs all available rates on link-up before it
         * even knows whether hw pacing will be used or not.  This prevents
         * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or
         * the private ioctls from using any of traffic classes.
         *
         * Underreport the number of rates to tcp_ratelimit so that it doesn't
         * hog all of them.  This can be removed if/when tcp_ratelimit switches
         * to making its allocations on first-use rather than link-up.  There is
         * nothing wrong with one particular consumer reserving all the classes
         * but it should do so only if it'll actually use hw rate limiting.
         */
        q->number_of_rates /= 4;
#endif
}
#endif