#include <linux/errno.h>
#include <linux/hrtimer.h>
#include <linux/if_vlan.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/gso.h>
#include <net/inet_ecn.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
/* Fixed-point probability scale: a probability of 1.0 maps to U32_MAX. */
#define MAX_PROB U32_MAX

/* alpha/beta parameters arrive from userspace scaled by 2^ALPHA_BETA_SHIFT;
 * internally ALPHA_BETA_GRANULARITY fractional bits are kept, so
 * ALPHA_BETA_SCALING is the net right-shift applied when converting.
 */
#define ALPHA_BETA_SHIFT 8
#define ALPHA_BETA_MAX ((1U << 31) - 1)	/* netlink upper bound for alpha/beta */
#define ALPHA_BETA_GRANULARITY 6
#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)

/* C-queue protection weight (wc) is a percentage; wl is derived as 100 - wc */
#define MAX_WC 100
/* Per-qdisc state for the DualQ Coupled AQM (dualpi2).
 *
 * The classic (C) queue lives directly in sch->q; the low-latency (L) queue
 * is the child qdisc l_queue. sch->q.qlen / sch->qstats.backlog account for
 * BOTH queues combined (see dualpi2_enqueue_skb/dequeue_packet).
 */
struct dualpi2_sched_data {
	struct Qdisc *l_queue;		/* L4S / low-latency queue */
	struct Qdisc *sch;		/* back-pointer for the PI2 timer */
	struct tcf_proto __rcu *tcf_filters;
	struct tcf_block *tcf_block;

	/* PI2 controller parameters and state */
	u64 pi2_target;			/* target queue delay, ns */
	u32 pi2_tupdate;		/* probability update interval, ns */
	u32 pi2_prob;			/* current base probability (0..MAX_PROB) */
	u32 pi2_alpha;			/* scaled integral gain */
	u32 pi2_beta;			/* scaled proportional gain */
	struct hrtimer pi2_timer;	/* periodic probability update */

	/* Step AQM on the L queue */
	u32 step_thresh;		/* threshold: ns or packets (see below) */
	bool step_in_packets;		/* step_thresh unit selector */

	/* C-queue starvation protection (WRR credit) */
	s32 c_protection_credit;
	s32 c_protection_init;
	u8 c_protection_wc;		/* C weight, percent */
	u8 c_protection_wl;		/* L weight = MAX_WC - wc */

	u32 memory_limit;		/* cap on total skb truesize, bytes */
	u8 coupling_factor;		/* L prob = coupling * base prob */
	u8 ecn_mask;			/* ECT codepoints steered to L queue */
	u32 min_qlen_step;		/* min L qlen before step marking */
	bool drop_early;		/* apply must_drop() at enqueue */
	bool drop_overload;		/* drop (vs mark) on overload */
	bool split_gso;			/* segment GSO skbs at enqueue */

	/* Head-of-queue enqueue timestamps, used to derive queue delays */
	u64 c_head_ts;
	u64 l_head_ts;

	u64 last_qdelay;		/* previous max queue delay sample */

	/* Statistics */
	u32 packets_in_c;
	u32 packets_in_l;
	u32 maxq;
	u32 ecn_mark;
	u32 step_marks;
	u32 memory_used;
	u32 max_memory_used;

	/* Drops deferred during dequeue, flushed via
	 * qdisc_tree_reduce_backlog() once per dequeue call.
	 */
	u32 deferred_drops_cnt;
	u32 deferred_drops_len;
};
/* Per-packet state stored in the qdisc skb control block. */
struct dualpi2_skb_cb {
	u64 ts;			/* enqueue timestamp (ktime_get_ns()) */
	u8 apply_step:1,	/* candidate for step AQM marking */
	   classified:2,	/* enum dualpi2_classification_results */
	   ect:2;		/* ECN field bits read from the IP header */
};
/* Queue selection outcome for a packet; values double as tc minor classids. */
enum dualpi2_classification_results {
	DUALPI2_C_CLASSIC = 0,	/* C queue */
	DUALPI2_C_L4S = 1,	/* L queue (scalable marking/classic drop) */
	DUALPI2_C_LLLL = 2,	/* L queue (no drops/marks) */
	__DUALPI2_C_MAX		/* must be last */
};
/* Return this qdisc's private cb area inside the skb, size-checked. */
static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
	return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
}
/* Time spent in the queue by @skb, relative to @reference (ns). */
static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
{
	return reference - dualpi2_skb_cb(skb)->ts;
}
/* Enqueue timestamp of the packet at the head of @q, or 0 if empty. */
static u64 head_enqueue_time(struct Qdisc *q)
{
	struct sk_buff *skb = qdisc_peek_head(q);

	if (!skb)
		return 0;
	return dualpi2_skb_cb(skb)->ts;
}
/* Convert a userspace alpha/beta value into the internal fixed-point
 * representation: scale to the MAX_PROB probability range per nanosecond.
 */
static u32 dualpi2_scale_alpha_beta(u32 param)
{
	u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);

	do_div(tmp, NSEC_PER_SEC);
	return tmp;
}
/* Inverse of dualpi2_scale_alpha_beta(), used when dumping parameters. */
static u32 dualpi2_unscale_alpha_beta(u32 param)
{
	u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);

	do_div(tmp, MAX_PROB);
	return tmp;
}
/* Absolute expiry time for the next PI2 probability update. */
static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
{
	return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
}
/* True if the packet was classified as L4S (scalable congestion control). */
static bool skb_is_l4s(struct sk_buff *skb)
{
	return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
}
/* True if the packet goes to the L queue (L4S or LL classification). */
static bool skb_in_l_queue(struct sk_buff *skb)
{
	return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
}
/* Step AQM applies only to L4S packets once the L queue is long enough. */
static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
{
	return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
}
/* Set CE on @skb; returns true (and counts the mark) on success. */
static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
{
	if (!INET_ECN_set_ce(skb))
		return false;

	q->ecn_mark++;
	return true;
}
/* Restore the WRR credit to its configured initial value. */
static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
{
	q->c_protection_credit = q->c_protection_init;
}
/* Derive the C-protection weights and initial credit from @wc (a percentage,
 * 0..MAX_WC). The initial credit is one MTU worth of the weight difference,
 * so it may be negative when the L queue is favoured (wc < wl).
 */
static void dualpi2_calculate_c_protection(struct Qdisc *sch,
					   struct dualpi2_sched_data *q, u32 wc)
{
	q->c_protection_wc = wc;
	q->c_protection_wl = MAX_WC - wc;
	q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
		((int)q->c_protection_wc - (int)q->c_protection_wl);
	dualpi2_reset_c_protection(q);
}
/* Bernoulli trial: true with probability prob / MAX_PROB. */
static bool dualpi2_roll(u32 prob)
{
	return get_random_u32() <= prob;
}
/* Classic-queue AQM: two independent rolls give an effective probability of
 * prob^2 (the "squared" coupling of PI2). Returns true if the packet must be
 * dropped; ECN-capable packets are CE-marked instead, unless overloaded.
 */
static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
				    struct sk_buff *skb, u32 prob,
				    bool overload)
{
	if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
		/* Non-ECT traffic, or any traffic under overload, is dropped */
		if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
			return true;
		dualpi2_mark(q, skb);
	}
	return false;
}
/* L-queue AQM: mark with the coupled probability local_l_prob (which may
 * exceed MAX_PROB under overload). Under overload, either drop with squared
 * probability (drop_overload) or unconditionally mark. Returns true if the
 * packet must be dropped.
 */
static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
				     struct sk_buff *skb,
				     u64 local_l_prob, u32 prob,
				     bool overload)
{
	if (overload) {
		/* Apply classic-style squared drop, or fall through to mark */
		if (!q->drop_overload ||
		    !(dualpi2_roll(prob) && dualpi2_roll(prob)))
			goto mark;
		return true;
	}

	/* local_l_prob <= MAX_PROB here, so the u64 compare in
	 * dualpi2_roll() behaves as a plain probability test.
	 */
	if (dualpi2_roll(local_l_prob)) {
		if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
			return true;
mark:
		dualpi2_mark(q, skb);
	}
	return false;
}
/* Decide whether @skb must be dropped (true) or may proceed, possibly after
 * being CE-marked. Packets are exempt while the total backlog is below two
 * MTUs, so very short queues never drop. DUALPI2_C_LLLL traffic is never
 * dropped or marked (default branch).
 */
static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
		      struct sk_buff *skb)
{
	u64 local_l_prob;
	bool overload;
	u32 prob;

	if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
		return false;

	prob = READ_ONCE(q->pi2_prob);
	/* The L queue sees the base probability amplified by the coupling
	 * factor; exceeding MAX_PROB signals overload.
	 */
	local_l_prob = (u64)prob * q->coupling_factor;
	overload = local_l_prob > MAX_PROB;

	switch (dualpi2_skb_cb(skb)->classified) {
	case DUALPI2_C_CLASSIC:
		return dualpi2_classic_marking(q, skb, prob, overload);
	case DUALPI2_C_L4S:
		return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
						overload);
	default:
		return false;
	}
}
/* Extract the ECN field from the IP header into the skb cb. The header must
 * be pullable and writable (a mark may follow later); anything else, or a
 * non-IP packet, is treated as Not-ECT.
 */
static void dualpi2_read_ect(struct sk_buff *skb)
{
	struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
	int wlen = skb_network_offset(skb);

	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		wlen += sizeof(struct iphdr);
		/* skb_try_make_writable() returns non-zero on failure */
		if (!pskb_may_pull(skb, wlen) ||
		    skb_try_make_writable(skb, wlen))
			goto not_ecn;
		cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
		break;
	case htons(ETH_P_IPV6):
		wlen += sizeof(struct ipv6hdr);
		if (!pskb_may_pull(skb, wlen) ||
		    skb_try_make_writable(skb, wlen))
			goto not_ecn;
		cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
		break;
	default:
		goto not_ecn;
	}
	return;

not_ecn:
	cb->ect = INET_ECN_NOT_ECT;
}
/* Choose the target queue for @skb, in priority order:
 *  1. ECN codepoint matching ecn_mask -> L4S;
 *  2. skb->priority addressing one of this qdisc's minor classids;
 *  3. attached tc filters (classid minor selects the queue);
 *  4. default -> classic.
 * Returns NET_XMIT_SUCCESS, possibly OR'ed with __NET_XMIT_STOLEN /
 * __NET_XMIT_BYPASS when a filter action consumed or shot the packet.
 */
static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
				struct sk_buff *skb)
{
	struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	dualpi2_read_ect(skb);
	if (cb->ect & q->ecn_mask) {
		cb->classified = DUALPI2_C_L4S;
		return NET_XMIT_SUCCESS;
	}

	if (TC_H_MAJ(skb->priority) == q->sch->handle &&
	    TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
		cb->classified = TC_H_MIN(skb->priority);
		return NET_XMIT_SUCCESS;
	}

	fl = rcu_dereference_bh(q->tcf_filters);
	if (!fl) {
		cb->classified = DUALPI2_C_CLASSIC;
		return NET_XMIT_SUCCESS;
	}

	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
		}
#endif
		/* Out-of-range classids fall back to the classic queue */
		cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
			TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
	}
	return NET_XMIT_SUCCESS;
}
/* Enqueue an already-classified skb into the C or L queue.
 *
 * sch->q.qlen and sch->qstats.backlog track BOTH queues: for L-queue packets
 * they are bumped here manually before handing the skb to l_queue, so the
 * parent qdisc sees a single combined length/backlog.
 */
static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
			       struct sk_buff **to_free)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);
	struct dualpi2_skb_cb *cb;

	/* Tail-drop on packet limit or memory (truesize) limit */
	if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
	    unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
		qdisc_qstats_overlimit(sch);
		if (skb_in_l_queue(skb))
			qdisc_qstats_overlimit(q->l_queue);
		return qdisc_drop_reason(skb, sch, to_free,
					 SKB_DROP_REASON_QDISC_OVERLIMIT);
	}

	/* Optional AQM drop decision at enqueue time */
	if (q->drop_early && must_drop(sch, q, skb)) {
		qdisc_drop_reason(skb, sch, to_free,
				  SKB_DROP_REASON_QDISC_CONGESTED);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	cb = dualpi2_skb_cb(skb);
	cb->ts = ktime_get_ns();
	q->memory_used += skb->truesize;
	if (q->memory_used > q->max_memory_used)
		q->max_memory_used = q->memory_used;

	if (qdisc_qlen(sch) > q->maxq)
		q->maxq = qdisc_qlen(sch);

	if (skb_in_l_queue(skb)) {
		/* Decide now whether step AQM will consider this packet */
		dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
		/* Keep the combined accounting on sch in sync */
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		++q->packets_in_l;
		if (!q->l_head_ts)
			q->l_head_ts = cb->ts;
		return qdisc_enqueue_tail(skb, q->l_queue);
	}

	++q->packets_in_c;
	if (!q->c_head_ts)
		q->c_head_ts = cb->ts;
	return qdisc_enqueue_tail(skb, sch);
}
/* Qdisc enqueue entry point: classify, optionally split GSO super-packets
 * into individually-AQM'ed segments, then enqueue.
 */
static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				 struct sk_buff **to_free)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);
	int err;

	err = dualpi2_skb_classify(q, skb);
	if (err != NET_XMIT_SUCCESS) {
		if (err & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return err;
	}

	if (q->split_gso && skb_is_gso(skb)) {
		netdev_features_t features;
		struct sk_buff *nskb, *next;
		int cnt, byte_len, orig_len;
		int err;

		features = netif_skb_features(skb);
		nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(nskb))
			return qdisc_drop(skb, sch, to_free);

		cnt = 1;
		byte_len = 0;
		orig_len = qdisc_pkt_len(skb);
		skb_list_walk_safe(nskb, nskb, next) {
			skb_mark_not_on_list(nskb);
			qdisc_skb_cb(nskb)->pkt_len = nskb->len;
			qdisc_skb_cb(nskb)->pkt_segs = 1;
			/* Segments inherit the super-packet's classification */
			dualpi2_skb_cb(nskb)->classified =
				dualpi2_skb_cb(skb)->classified;
			dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
			err = dualpi2_enqueue_skb(nskb, sch, to_free);
			if (err == NET_XMIT_SUCCESS) {
				/* Count segments/bytes actually enqueued */
				++cnt;
				byte_len += nskb->len;
			}
		}
		/* The caller accounts the original skb once; compensate the
		 * difference in the qdisc tree (negative args increase the
		 * recorded backlog). NOTE(review): when no segment could be
		 * enqueued (cnt == 1), this still credits one packet of
		 * orig_len == byte_len adjustments — verify against the
		 * parent accounting expectations.
		 */
		if (cnt > 1) {
			--cnt;
			byte_len -= orig_len;
		}
		qdisc_tree_reduce_backlog(sch, -cnt, -byte_len);
		consume_skb(skb);
		/* Return the status of the last segment */
		return err;
	}

	return dualpi2_enqueue_skb(skb, sch, to_free);
}
/* Pick the next packet, preferring the L queue unless the C-protection
 * credit says the C queue is being starved. @credit_change returns the
 * signed credit delta (weight * packet length) to apply once the packet is
 * actually sent; it is only non-zero while both queues are backlogged.
 */
static struct sk_buff *dequeue_packet(struct Qdisc *sch,
				      struct dualpi2_sched_data *q,
				      int *credit_change,
				      u64 now)
{
	struct sk_buff *skb = NULL;
	int c_len;

	*credit_change = 0;
	/* C-queue length = combined length minus L-queue length */
	c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
	if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
		skb = __qdisc_dequeue_head(&q->l_queue->q);
		WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
		if (c_len)
			*credit_change = q->c_protection_wc;
		qdisc_qstats_backlog_dec(q->l_queue, skb);
		/* Packets in the l_queue are also accounted on sch */
		--sch->q.qlen;
		q->memory_used -= skb->truesize;
	} else if (c_len) {
		skb = __qdisc_dequeue_head(&sch->q);
		WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
		if (qdisc_qlen(q->l_queue))
			/* Two's-complement negation: credit -= wl later */
			*credit_change = ~((s32)q->c_protection_wl) + 1;
		q->memory_used -= skb->truesize;
	} else {
		/* Both queues empty: re-arm the protection credit */
		dualpi2_reset_c_protection(q);
		return NULL;
	}
	*credit_change *= qdisc_pkt_len(skb);
	qdisc_qstats_backlog_dec(sch, skb);
	return skb;
}
/* Step AQM for L-queue packets at dequeue: once the queue (in packets or
 * sojourn time, per step_in_packets) exceeds step_thresh, mark eligible
 * ECN-capable packets and request a drop (return 1) for Not-ECT ones.
 */
static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
		       u64 now)
{
	u64 qdelay = 0;

	if (q->step_in_packets)
		qdelay = qdisc_qlen(q->l_queue);
	else
		qdelay = dualpi2_sojourn_time(skb, now);

	if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
		/* Not-ECT packets cannot be marked: drop instead */
		if (!dualpi2_skb_cb(skb)->ect) {
			return 1;
		}
		if (dualpi2_mark(q, skb))
			++q->step_marks;
	}
	qdisc_bstats_update(q->l_queue, skb);
	return 0;
}
/* Free a packet dropped during dequeue and record the drop so a single
 * qdisc_tree_reduce_backlog() call can fix up parent accounting afterwards
 * (see dualpi2_qdisc_dequeue()).
 */
static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
			   struct Qdisc *sch, enum skb_drop_reason reason)
{
	++q->deferred_drops_cnt;
	q->deferred_drops_len += qdisc_pkt_len(skb);
	kfree_skb_reason(skb, reason);
	qdisc_qstats_drop(sch);
}
/* Qdisc dequeue entry point: loop until a packet survives the (deferred)
 * AQM drop decision and step AQM, then apply the WRR credit change and
 * flush any deferred drop accounting to the qdisc tree.
 */
static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int credit_change;
	u64 now;

	now = ktime_get_ns();

	while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
		/* AQM drop at dequeue time (when drop_early is disabled) */
		if (!q->drop_early && must_drop(sch, q, skb)) {
			drop_and_retry(q, skb, sch,
				       SKB_DROP_REASON_QDISC_CONGESTED);
			continue;
		}

		if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
			qdisc_qstats_drop(q->l_queue);
			drop_and_retry(q, skb, sch,
				       SKB_DROP_REASON_DUALPI2_STEP_DROP);
			continue;
		}

		q->c_protection_credit += credit_change;
		qdisc_bstats_update(sch, skb);
		break;
	}

	/* Settle the deferred drops in one pass */
	if (q->deferred_drops_cnt) {
		qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
					  q->deferred_drops_len);
		q->deferred_drops_cnt = 0;
		q->deferred_drops_len = 0;
	}
	return skb;
}
/* Drop the ALPHA_BETA_GRANULARITY fractional bits from a (non-negative)
 * probability delta. Caller passes the magnitude; see calculate_probability().
 */
static s64 __scale_delta(u64 diff)
{
	do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
	return diff;
}
/* Current head-of-queue sojourn times (ns) for the C and L queues; an empty
 * queue (zero head timestamp) reports a delay of 0.
 */
static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
			     u64 *qdelay_l)
{
	u64 now = ktime_get_ns();
	u64 qc = READ_ONCE(q->c_head_ts);
	u64 ql = READ_ONCE(q->l_head_ts);

	*qdelay_c = 0;
	*qdelay_l = 0;
	if (qc)
		*qdelay_c = now - qc;
	if (ql)
		*qdelay_l = now - ql;
}
/* PI2 controller update: adjust the base probability from the current max
 * queue delay using the classic PI law
 *   delta = alpha * (qdelay - target) + beta * (qdelay - last_qdelay)
 * with saturating arithmetic at 0 and MAX_PROB.
 */
static u32 calculate_probability(struct Qdisc *sch)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);
	u32 new_prob;
	u64 qdelay_c;
	u64 qdelay_l;
	u64 qdelay;
	s64 delta;

	get_queue_delays(q, &qdelay_c, &qdelay_l);
	qdelay = max(qdelay_l, qdelay_c);
	delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
	delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
	q->last_qdelay = qdelay;
	if (delta > 0) {
		new_prob = __scale_delta(delta) + q->pi2_prob;
		/* Wrap-around means we overflowed: clamp to MAX_PROB */
		if (new_prob < q->pi2_prob)
			new_prob = MAX_PROB;
	} else {
		/* ~delta + 1 == -delta; pass the magnitude to __scale_delta */
		new_prob = q->pi2_prob - __scale_delta(~delta + 1);
		if (new_prob > q->pi2_prob)
			new_prob = 0;
	}
	/* If overload is handled by marking, keep the coupled L probability
	 * (prob * coupling_factor) from exceeding 100%.
	 */
	if (!q->drop_overload)
		return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
	return new_prob;
}
/* Default memory limit: @limit packets of twice the MTU each, saturated to
 * U32_MAX instead of wrapping.
 */
static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
{
	u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));

	return upper_32_bits(memlim) ? U32_MAX : lower_32_bits(memlim);
}
/* Microseconds to nanoseconds, saturating at U32_MAX. */
static u32 convert_us_to_nsec(u32 us)
{
	u64 ns = mul_u32_u32(us, NSEC_PER_USEC);

	return upper_32_bits(ns) ? U32_MAX : lower_32_bits(ns);
}
/* Nanoseconds to microseconds, saturating at U32_MAX. */
static u32 convert_ns_to_usec(u64 ns)
{
	do_div(ns, NSEC_PER_USEC);

	return upper_32_bits(ns) ? U32_MAX : lower_32_bits(ns);
}
/* Periodic PI2 probability update, run in hrtimer (softirq) context. The
 * qdisc root lock serializes against enqueue/dequeue; rcu_read_lock protects
 * the qdisc_root_sleeping() lookup.
 */
static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
{
	struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;

	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
	hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
	spin_unlock(root_lock);
	rcu_read_unlock();
	return HRTIMER_RESTART;
}
/* Netlink-accepted range for the (unscaled) alpha/beta attributes. */
static struct netlink_range_validation dualpi2_alpha_beta_range = {
	.min = 1,
	.max = ALPHA_BETA_MAX,
};
/* Validation policy for all TCA_DUALPI2_* configuration attributes. */
static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
	[TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
	[TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
	[TCA_DUALPI2_TARGET] = { .type = NLA_U32 },
	[TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
	[TCA_DUALPI2_ALPHA] =
		NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
	[TCA_DUALPI2_BETA] =
		NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
	/* The step threshold is either in packets or in microseconds */
	[TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 },
	[TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 },
	[TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 },
	[TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
	[TCA_DUALPI2_DROP_OVERLOAD] =
		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
	[TCA_DUALPI2_DROP_EARLY] =
		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
	[TCA_DUALPI2_C_PROTECTION] =
		NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
	[TCA_DUALPI2_ECN_MASK] =
		NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
				 TCA_DUALPI2_ECN_MASK_MAX),
	[TCA_DUALPI2_SPLIT_GSO] =
		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
};
static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_DUALPI2_MAX + 1];
struct dualpi2_sched_data *q;
int old_backlog;
int old_qlen;
int err;
if (!opt || !nla_len(opt)) {
NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
return -EINVAL;
}
err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
extack);
if (err < 0)
return err;
if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
return -EINVAL;
}
q = qdisc_priv(sch);
sch_tree_lock(sch);
if (tb[TCA_DUALPI2_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
WRITE_ONCE(sch->limit, limit);
WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
}
if (tb[TCA_DUALPI2_MEMORY_LIMIT])
WRITE_ONCE(q->memory_limit,
nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
if (tb[TCA_DUALPI2_TARGET]) {
u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
}
if (tb[TCA_DUALPI2_TUPDATE]) {
u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
}
if (tb[TCA_DUALPI2_ALPHA]) {
u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
}
if (tb[TCA_DUALPI2_BETA]) {
u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
}
if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
WRITE_ONCE(q->step_in_packets, true);
WRITE_ONCE(q->step_thresh, step_th);
} else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
WRITE_ONCE(q->step_in_packets, false);
WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
}
if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
WRITE_ONCE(q->min_qlen_step,
nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
if (tb[TCA_DUALPI2_COUPLING]) {
u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
WRITE_ONCE(q->coupling_factor, coupling);
}
if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
WRITE_ONCE(q->drop_overload, (bool)drop_overload);
}
if (tb[TCA_DUALPI2_DROP_EARLY]) {
u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
WRITE_ONCE(q->drop_early, (bool)drop_early);
}
if (tb[TCA_DUALPI2_C_PROTECTION]) {
u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
dualpi2_calculate_c_protection(sch, q, wc);
}
if (tb[TCA_DUALPI2_ECN_MASK]) {
u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
WRITE_ONCE(q->ecn_mask, ecn_mask);
}
if (tb[TCA_DUALPI2_SPLIT_GSO]) {
u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
WRITE_ONCE(q->split_gso, (bool)split_gso);
}
old_qlen = qdisc_qlen(sch);
old_backlog = sch->qstats.backlog;
while (qdisc_qlen(sch) > sch->limit ||
q->memory_used > q->memory_limit) {
struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
q->memory_used -= skb->truesize;
qdisc_qstats_backlog_dec(sch, skb);
rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
old_backlog - sch->qstats.backlog);
sch_tree_unlock(sch);
return 0;
}
/* Install the default configuration (applied at init before any netlink
 * options are parsed).
 */
static void dualpi2_reset_default(struct Qdisc *sch)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);

	q->sch->limit = 10000;				/* packets */
	q->memory_limit = get_memory_limit(sch, q->sch->limit);

	q->pi2_target = 15 * NSEC_PER_MSEC;
	q->pi2_tupdate = 16 * NSEC_PER_MSEC;
	q->pi2_alpha = dualpi2_scale_alpha_beta(41);	/* ~0.16 Hz * 256 */
	q->pi2_beta = dualpi2_scale_alpha_beta(819);	/* ~3.2 Hz * 256 */

	q->step_thresh = 1 * NSEC_PER_MSEC;		/* time-based step */
	q->step_in_packets = false;

	dualpi2_calculate_c_protection(q->sch, q, 10);	/* wc=10%, wl=90% */

	q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT;
	q->min_qlen_step = 0;
	q->coupling_factor = 2;				/* window fairness */
	q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP;
	q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE;
	q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO;
}
/* Qdisc init: create the L child queue, attach the filter block, apply
 * defaults (and any provided options), then start the PI2 timer. On error
 * the core calls ->destroy, which releases l_queue/tcf_block.
 */
static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);
	int err;

	q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				       TC_H_MAKE(sch->handle, 1), extack);
	if (!q->l_queue)
		return -ENOMEM;

	err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
	if (err)
		return err;

	q->sch = sch;
	dualpi2_reset_default(sch);
	hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
		      HRTIMER_MODE_ABS_PINNED_SOFT);

	if (opt && nla_len(opt)) {
		err = dualpi2_change(sch, opt, extack);
		if (err)
			return err;
	}

	hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
		      HRTIMER_MODE_ABS_PINNED_SOFT);
	return 0;
}
static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct dualpi2_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
bool step_in_pkts;
u32 step_th;
step_in_pkts = READ_ONCE(q->step_in_packets);
step_th = READ_ONCE(q->step_thresh);
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!opts)
goto nla_put_failure;
if (step_in_pkts &&
(nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
READ_ONCE(q->memory_limit)) ||
nla_put_u32(skb, TCA_DUALPI2_TARGET,
convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
nla_put_u32(skb, TCA_DUALPI2_ALPHA,
dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
nla_put_u32(skb, TCA_DUALPI2_BETA,
dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
READ_ONCE(q->min_qlen_step)) ||
nla_put_u8(skb, TCA_DUALPI2_COUPLING,
READ_ONCE(q->coupling_factor)) ||
nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
READ_ONCE(q->drop_overload)) ||
nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
READ_ONCE(q->drop_early)) ||
nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
READ_ONCE(q->c_protection_wc)) ||
nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
goto nla_put_failure;
if (!step_in_pkts &&
(nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
READ_ONCE(q->memory_limit)) ||
nla_put_u32(skb, TCA_DUALPI2_TARGET,
convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
nla_put_u32(skb, TCA_DUALPI2_ALPHA,
dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
nla_put_u32(skb, TCA_DUALPI2_BETA,
dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
convert_ns_to_usec(step_th)) ||
nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
READ_ONCE(q->min_qlen_step)) ||
nla_put_u8(skb, TCA_DUALPI2_COUPLING,
READ_ONCE(q->coupling_factor)) ||
nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
READ_ONCE(q->drop_overload)) ||
nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
READ_ONCE(q->drop_early)) ||
nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
READ_ONCE(q->c_protection_wc)) ||
nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -1;
}
/* Export extended statistics (struct tc_dualpi2_xstats) to userspace. */
static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);
	struct tc_dualpi2_xstats st = {
		.prob		= READ_ONCE(q->pi2_prob),
		.packets_in_c	= q->packets_in_c,
		.packets_in_l	= q->packets_in_l,
		.maxq		= q->maxq,
		.ecn_mark	= q->ecn_mark,
		.credit		= q->c_protection_credit,
		.step_marks	= q->step_marks,
		.memory_used	= q->memory_used,
		.max_memory_used = q->max_memory_used,
		.memory_limit	= q->memory_limit,
	};
	u64 qc, ql;

	get_queue_delays(q, &qc, &ql);
	st.delay_l = convert_ns_to_usec(ql);
	st.delay_c = convert_ns_to_usec(qc);
	return gnet_stats_copy_app(d, &st, sizeof(st));
}
/* Flush both queues and reset all runtime state and statistics.
 * Configuration (limits, gains, weights) is preserved.
 */
static void dualpi2_reset(struct Qdisc *sch)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	qdisc_reset_queue(q->l_queue);
	q->c_head_ts = 0;
	q->l_head_ts = 0;
	q->pi2_prob = 0;
	q->packets_in_c = 0;
	q->packets_in_l = 0;
	q->maxq = 0;
	q->ecn_mark = 0;
	q->step_marks = 0;
	q->memory_used = 0;
	q->max_memory_used = 0;
	dualpi2_reset_c_protection(q);
}
/* Qdisc teardown: stop the PI2 timer, release the L queue and filter block.
 * Also called by the core on a failed init, hence the l_queue NULL check.
 */
static void dualpi2_destroy(struct Qdisc *sch)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);

	/* Prevent re-arming before cancelling the timer */
	q->pi2_tupdate = 0;
	hrtimer_cancel(&q->pi2_timer);
	if (q->l_queue)
		qdisc_put(q->l_queue);
	tcf_block_put(q->tcf_block);
}
/* Classes are internal only; no graftable leaf qdiscs are exposed. */
static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}
/* No user-modifiable classes to look up. */
static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}
/* Filters cannot bind to the internal classes. */
static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
				  u32 classid)
{
	return 0;
}
/* Nothing to undo; see dualpi2_bind(). */
static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
{
}
/* Expose the filter block attached at the qdisc level (cl == 0) only. */
static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
					   struct netlink_ext_ack *extack)
{
	struct dualpi2_sched_data *q = qdisc_priv(sch);

	return cl ? NULL : q->tcf_block;
}
/* Walk the two internal classes (minor ids 1 and 2) for dump purposes. */
static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < 2; i++) {
		if (arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		/* Class ids are 1-based */
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
/* Minimal class ops: classes exist only so filters can address the queues. */
static const struct Qdisc_class_ops dualpi2_class_ops = {
	.leaf		= dualpi2_leaf,
	.find		= dualpi2_find,
	.tcf_block	= dualpi2_tcf_block,
	.bind_tcf	= dualpi2_bind,
	.unbind_tcf	= dualpi2_unbind,
	.walk		= dualpi2_walk,
};
/* Qdisc registration table for "dualpi2". */
static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
	.id		= "dualpi2",
	.cl_ops		= &dualpi2_class_ops,
	.priv_size	= sizeof(struct dualpi2_sched_data),
	.enqueue	= dualpi2_qdisc_enqueue,
	.dequeue	= dualpi2_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= dualpi2_init,
	.destroy	= dualpi2_destroy,
	.reset		= dualpi2_reset,
	.change		= dualpi2_change,
	.dump		= dualpi2_dump,
	.dump_stats	= dualpi2_dump_stats,
	.owner		= THIS_MODULE,
};
/* Module entry: register the qdisc with the traffic-control core. */
static int __init dualpi2_module_init(void)
{
	return register_qdisc(&dualpi2_qdisc_ops);
}
/* Module exit: unregister the qdisc. */
static void __exit dualpi2_module_exit(void)
{
	unregister_qdisc(&dualpi2_qdisc_ops);
}
module_init(dualpi2_module_init);
module_exit(dualpi2_module_exit);
MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION("1.0");