root/sys/dev/cxgbe/tom/t4_listen.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/domain.h>
#include <sys/fnv_hash.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
#include <netinet/tcp_timer.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "t4_clip.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

/* stid services */
static int alloc_stid(struct adapter *, bool, void *);
static struct listen_ctx *lookup_stid(struct adapter *, int);
static void free_stid(struct adapter *, int , bool);

/* lctx services */
static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
    struct vi_info *);
static int free_lctx(struct adapter *, struct listen_ctx *);
static void hold_lctx(struct listen_ctx *);
static void listen_hash_add(struct adapter *, struct listen_ctx *);
static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);

static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);

static int create_server6(struct adapter *, struct listen_ctx *);
static int create_server(struct adapter *, struct listen_ctx *);

/*
 * Allocate the server-tid (stid) table and its allocation bitmap, and set up
 * the lock that protects them.  Returns 0 on success or ENOMEM if either
 * allocation fails (both use M_NOWAIT).
 */
int
alloc_stid_tab(struct adapter *sc)
{
        struct tid_info *t = &sc->tids;

        MPASS(t->nstids > 0);
        MPASS(t->stid_tab == NULL);

        t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
            M_NOWAIT | M_ZERO);
        if (t->stid_tab == NULL)
                goto fail;
        t->stid_bitmap = bit_alloc(t->nstids, M_CXGBE, M_NOWAIT);
        if (t->stid_bitmap == NULL)
                goto fail_free_tab;

        mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
        t->stids_in_use = 0;
        return (0);

fail_free_tab:
        free(t->stid_tab, M_CXGBE);
        t->stid_tab = NULL;
fail:
        return (ENOMEM);
}

/*
 * Tear down the stid table.  All stids must have been freed already.  Safe to
 * call even if alloc_stid_tab failed partway (free(NULL) is a no-op and the
 * mutex is destroyed only if it was initialized).
 */
void
free_stid_tab(struct adapter *sc)
{
        struct tid_info *t = &sc->tids;

        KASSERT(t->stids_in_use == 0,
            ("%s: %d tids still in use.", __func__, t->stids_in_use));

        if (mtx_initialized(&t->stid_lock))
                mtx_destroy(&t->stid_lock);
        free(t->stid_bitmap, M_CXGBE);
        t->stid_bitmap = NULL;
        free(t->stid_tab, M_CXGBE);
        t->stid_tab = NULL;
}

/*
 * Quiesce listener state for an adapter stop.  Blocks new stid allocation,
 * clears the hardware-related flags on every listen context, and moves all
 * embryonic connections (synqes) to the stranded list after removing their
 * tids from the lookup table.
 */
void
stop_stid_tab(struct adapter *sc)
{
        struct tid_info *t = &sc->tids;
        struct tom_data *td = sc->tom_softc;
        struct listen_ctx *lctx;
        struct synq_entry *synqe;
        int i, ntids;

        /* alloc_stid checks this flag and fails new allocations. */
        mtx_lock(&t->stid_lock);
        t->stid_tab_stopped = true;
        mtx_unlock(&t->stid_lock);

        /* Hardware listener state is going away; clear the HW flags. */
        mtx_lock(&td->lctx_hash_lock);
        for (i = 0; i <= td->listen_mask; i++) {
                LIST_FOREACH(lctx, &td->listen_hash[i], link)
                        lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW);
        }
        mtx_unlock(&td->lctx_hash_lock);

        mtx_lock(&td->toep_list_lock);
        TAILQ_FOREACH(synqe, &td->synqe_list, link) {
                MPASS(sc->incarnation == synqe->incarnation);
                MPASS(synqe->tid >= 0);
                MPASS(synqe == lookup_tid(sc, synqe->tid));
                /* Remove tid from the lookup table immediately. */
                CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
                    __func__, synqe->tid, synqe->incarnation);
                /* IPv6 connections occupy 2 tid slots, IPv4 just 1. */
                ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1;
                remove_tid(sc, synqe->tid, ntids);
#if 0
                /* synqe->tid is stale now but left alone for debug. */
                synqe->tid = -1;
#endif
        }
        /* Move the entire synqe list to the stranded list in one shot. */
        MPASS(TAILQ_EMPTY(&td->stranded_synqe));
        TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link);
        MPASS(TAILQ_EMPTY(&td->synqe_list));
        mtx_unlock(&td->toep_list_lock);
}

/*
 * Re-establish hardware listeners after an adapter restart: re-sends a
 * passive open for every listen context in the hash and then re-enables
 * stid allocation.  Counterpart to stop_stid_tab.
 */
void
restart_stid_tab(struct adapter *sc)
{
        struct tid_info *t = &sc->tids;
        struct tom_data *td = sc->tom_softc;
        struct listen_ctx *lctx;
        int i;

        mtx_lock(&td->lctx_hash_lock);
        for (i = 0; i <= td->listen_mask; i++) {
                LIST_FOREACH(lctx, &td->listen_hash[i], link) {
                        /* stop_stid_tab cleared both flags earlier. */
                        MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0);
                        lctx->flags |= LCTX_RPL_PENDING;
                        /*
                         * NOTE(review): create_server{,6} errors are ignored
                         * here; presumably a failed re-open is tolerated —
                         * confirm intended behavior.
                         */
                        if (lctx->inp->inp_vflag & INP_IPV6)
                                create_server6(sc, lctx);
                        else
                                create_server(sc, lctx);
                }
        }
        mtx_unlock(&td->lctx_hash_lock);

        /* Allow alloc_stid to hand out stids again. */
        mtx_lock(&t->stid_lock);
        t->stid_tab_stopped = false;
        mtx_unlock(&t->stid_lock);

}

/*
 * Allocate a server tid for a new listener and bind ctx to it.  An IPv6
 * listener needs two consecutive stids starting on an even boundary; an IPv4
 * listener needs a single stid (preferably one that does not break up a free
 * even/odd pair).  Returns the absolute stid (relative index + stid_base),
 * or -1 if nothing suitable is free or the table is stopped.
 */
static int
alloc_stid(struct adapter *sc, bool isipv6, void *ctx)
{
        struct tid_info *t = &sc->tids;
        const u_int n = isipv6 ? 2 : 1;
        int stid, pair_stid;
        u_int i;
        ssize_t val;

        mtx_lock(&t->stid_lock);
        MPASS(t->stids_in_use <= t->nstids);
        if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) {
                mtx_unlock(&t->stid_lock);
                return (-1);
        }

        stid = -1;
        if (isipv6) {
                /*
                 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4
                 * cells) in the TCAM.  We know that the start of the stid
                 * region is properly aligned already (the chip requires each
                 * region to be 128-cell aligned).
                 */
                for (i = 0; i + 1 < t->nstids; i = roundup2(val + 1, 2)) {
                        /* Find 2 contiguous clear bits at or after i. */
                        bit_ffc_area_at(t->stid_bitmap, i, t->nstids, 2, &val);
                        if (val == -1)
                                break;
                        /* Only an even-aligned pair will do. */
                        if ((val & 1) == 0) {
                                stid = val;
                                break;
                        }
                }
        } else {
                /*
                 * An IPv4 server needs one stid without any alignment
                 * requirements.  But we try extra hard to find an available
                 * stid adjacent to a used stid so that free "stid-pairs" are
                 * left intact for IPv6.
                 */
                bit_ffc_at(t->stid_bitmap, 0, t->nstids, &val);
                while (val != -1) {
                        if (stid == -1) {
                                /*
                                 * First usable stid.  Look no further if it's
                                 * an ideal fit.
                                 */
                                stid = val;
                                if (val & 1 || bit_test(t->stid_bitmap, val + 1))
                                        break;
                        } else {
                                /*
                                 * We have an unused stid already but are now
                                 * looking for in-use stids because we'd prefer
                                 * to grab an unused stid adjacent to one that's
                                 * in use.
                                 *
                                 * Odd stids pair with the previous stid and
                                 * even ones pair with the next stid.
                                 */
                                pair_stid = val & 1 ? val - 1 : val + 1;
                                if (bit_test(t->stid_bitmap, pair_stid) == 0) {
                                        stid = pair_stid;
                                        break;
                                }
                        }
                        /* Advance to the next pair and find a set bit there. */
                        val = roundup2(val + 1, 2);
                        if (val >= t->nstids)
                                break;
                        bit_ffs_at(t->stid_bitmap, val, t->nstids, &val);
                }
        }

        if (stid >= 0) {
                MPASS(stid + n - 1 < t->nstids);
                MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 0));
                bit_nset(t->stid_bitmap, stid, stid + n - 1);
                t->stids_in_use += n;
                t->stid_tab[stid] = ctx;
#ifdef INVARIANTS
                if (n == 2) {
                        MPASS((stid & 1) == 0);
                        /* Second slot of an IPv6 pair stays NULL by design. */
                        t->stid_tab[stid + 1] = NULL;
                }
#endif
                stid += t->stid_base;
        }
        mtx_unlock(&t->stid_lock);
        return (stid);
}

/*
 * Translate an absolute stid (as carried in CPL messages) to the listen
 * context registered for it by alloc_stid.
 */
static struct listen_ctx *
lookup_stid(struct adapter *sc, int stid)
{
        /* stid_tab is indexed relative to stid_base. */
        return (sc->tids.stid_tab[stid - sc->tids.stid_base]);
}

/*
 * Release the stid(s) allocated by alloc_stid.  stid is the absolute value
 * that alloc_stid returned; isipv6 must match the allocation so the correct
 * number of slots (2 for IPv6, 1 for IPv4) is returned to the bitmap.
 */
static void
free_stid(struct adapter *sc, int stid, bool isipv6)
{
        struct tid_info *t = &sc->tids;
        const u_int n = isipv6 ? 2 : 1;

        mtx_lock(&t->stid_lock);
        MPASS(stid >= t->stid_base);
        /* Convert back to a table-relative index. */
        stid -= t->stid_base;
        MPASS(stid + n - 1 < t->nstids);
        MPASS(t->stids_in_use <= t->nstids);
        MPASS(t->stids_in_use >= n);
        MPASS(t->stid_tab[stid] != NULL);
#ifdef INVARIANTS
        if (n == 2) {
                MPASS((stid & 1) == 0);
                /* alloc_stid leaves the second slot of a pair NULL. */
                MPASS(t->stid_tab[stid + 1] == NULL);
        }
#endif
        MPASS(bit_ntest(t->stid_bitmap, stid, stid + n - 1, 1));
        bit_nclear(t->stid_bitmap, stid, stid + n - 1);
        t->stid_tab[stid] = NULL;
        t->stids_in_use -= n;
        mtx_unlock(&t->stid_lock);
}

static struct listen_ctx *
alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
{
        struct listen_ctx *lctx;

        INP_WLOCK_ASSERT(inp);

        lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
        if (lctx == NULL)
                return (NULL);

        lctx->isipv6 = inp->inp_vflag & INP_IPV6;
        lctx->stid = alloc_stid(sc, lctx->isipv6, lctx);
        if (lctx->stid < 0) {
                free(lctx, M_CXGBE);
                return (NULL);
        }

        if (lctx->isipv6 &&
            !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
                lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
                if (lctx->ce == NULL) {
                        free(lctx, M_CXGBE);
                        return (NULL);
                }
        }

        lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
        lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
        refcount_init(&lctx->refcount, 1);

        lctx->inp = inp;
        lctx->vnet = inp->inp_socket->so_vnet;
        in_pcbref(inp);

        return (lctx);
}

/*
 * Don't call this directly, use release_lctx instead.
 *
 * Frees the lctx's CLIP entry (if any), its stid(s), and the lctx itself,
 * then drops the lctx's reference on the inp.  Returns non-zero if that was
 * the last reference and the inp was freed.
 */
static int
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
        struct inpcb *inp = lctx->inp;

        INP_WLOCK_ASSERT(inp);
        KASSERT(lctx->refcount == 0,
            ("%s: refcount %d", __func__, lctx->refcount));
        KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

        CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
            __func__, lctx->stid, lctx, lctx->inp);

        if (lctx->ce)
                t4_release_clip_entry(sc, lctx->ce);
        free_stid(sc, lctx->stid, lctx->isipv6);
        free(lctx, M_CXGBE);

        /* Drop the reference taken by alloc_lctx's in_pcbref. */
        return (in_pcbrele_wlocked(inp));
}

/*
 * Take an additional reference on the lctx.  Paired with release_lctx.
 */
static void
hold_lctx(struct listen_ctx *lctx)
{
        refcount_acquire(&lctx->refcount);
}

/*
 * Hash bucket for a listen hash lookup.  Note that the key is the pointer
 * value itself (the inp's address), not the bytes it points to.
 */
static inline uint32_t
listen_hashfn(void *key, u_long mask)
{
        return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Insert a listen_ctx into the listen hash table, keyed by its inp.
 */
static void
listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
{
        struct tom_data *td = sc->tom_softc;
        const int bucket = listen_hashfn(lctx->inp, td->listen_mask);

        mtx_lock(&td->lctx_hash_lock);
        td->lctx_count++;
        LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
        mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct adapter *sc, struct inpcb *inp)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(inp, td->listen_mask);
        struct listen_ctx *lctx;

        mtx_lock(&td->lctx_hash_lock);
        LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
                if (lctx->inp == inp)
                        break;
        }
        mtx_unlock(&td->lctx_hash_lock);

        return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct adapter *sc, struct inpcb *inp)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(inp, td->listen_mask);
        struct listen_ctx *lctx, *l;

        mtx_lock(&td->lctx_hash_lock);
        LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
                if (lctx->inp == inp) {
                        LIST_REMOVE(lctx, link);
                        td->lctx_count--;
                        break;
                }
        }
        mtx_unlock(&td->lctx_hash_lock);

        return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's
 * inp locked.  If this was the last reference and dropping it also freed the
 * inp, NULL is returned to tell the caller not to unlock; otherwise the
 * still-valid inp is returned.
 */
static struct inpcb *
release_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
        struct inpcb *inp = lctx->inp;

        INP_WLOCK_ASSERT(inp);

        if (refcount_release(&lctx->refcount) && free_lctx(sc, lctx))
                return (NULL);  /* inp was freed along with the lctx */
        return (inp);
}

/*
 * Send a FLOWC work request for an embryonic connection, establishing the
 * flow's basic parameters (pf/vf, port, response iq, sndbuf, mss) with the
 * firmware for the synqe's tid.  Sets TPF_FLOWC_WR_SENT so this is done at
 * most once per synqe.  Panics if the WR cannot be allocated (XXX).
 */
static void
send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct mbuf *m = synqe->syn;
        if_t ifp = m->m_pkthdr.rcvif;
        struct vi_info *vi = if_getsoftc(ifp);
        struct port_info *pi = vi->pi;
        struct wrqe *wr;
        struct fw_flowc_wr *flowc;
        struct sge_ofld_txq *ofld_txq;
        struct sge_ofld_rxq *ofld_rxq;
        const int nparams = 6;
        const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
        const u_int pfvf = sc->pf << S_FW_VIID_PFN;

        INP_WLOCK_ASSERT(synqe->lctx->inp);
        MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);

        ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
        ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];

        wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        flowc = wrtod(wr);
        memset(flowc, 0, wr->wr_len);
        flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
            V_FW_FLOWC_WR_NPARAMS(nparams));
        flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
            V_FW_WR_FLOWID(synqe->tid));
        flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
        flowc->mnemval[0].val = htobe32(pfvf);
        /* Firmware expects hw port and will translate to channel itself. */
        flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
        flowc->mnemval[1].val = htobe32(pi->hw_port);
        flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
        flowc->mnemval[2].val = htobe32(pi->hw_port);
        flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
        flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
        /*
         * NOTE(review): sndbuf and mss are hardcoded to 512 here; presumably
         * minimal placeholder values for a connection that is aborted before
         * completing — confirm.
         */
        flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
        flowc->mnemval[4].val = htobe32(512);
        flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
        flowc->mnemval[5].val = htobe32(512);

        synqe->flags |= TPF_FLOWC_WR_SENT;
        t4_wrq_tx(sc, wr);
}

/*
 * Abort the tid of an embryonic connection.  Sends a FLOWC WR first if one
 * has not been sent for this tid yet, then a CPL_ABORT_REQ with the given
 * rst_status.  No-op if an abort is already in progress (TPF_ABORT_SHUTDOWN).
 * Panics if the WR cannot be allocated (XXX).
 */
static void
send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
    int rst_status)
{
        struct adapter *sc = tod->tod_softc;
        struct wrqe *wr;
        struct cpl_abort_req *req;

        INP_WLOCK_ASSERT(synqe->lctx->inp);

        CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
            __func__, synqe, synqe->flags, synqe->tid,
            synqe->flags & TPF_ABORT_SHUTDOWN ?
            " (abort already in progress)" : "");
        if (synqe->flags & TPF_ABORT_SHUTDOWN)
                return; /* abort already in progress */
        synqe->flags |= TPF_ABORT_SHUTDOWN;

        /* The abort must be preceded by a FLOWC WR for this tid. */
        if (!(synqe->flags & TPF_FLOWC_WR_SENT))
                send_flowc_wr_synqe(sc, synqe);

        wr = alloc_wrqe(sizeof(*req),
            &sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        req = wrtod(wr);
        INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
        req->rsvd0 = 0; /* don't have a snd_nxt */
        req->rsvd1 = 1; /* no data sent yet */
        req->cmd = rst_status;

        /* Sent via the L2 entry so it's ordered behind the SYNACK. */
        t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
}

/*
 * Send a CPL_PASS_OPEN_REQ asking hardware to start an IPv4 listening
 * server on lctx's stid, bound to the inp's local address and port with
 * wildcard peer.  Incoming SYNs are steered to lctx's offload rx queue
 * (SYN_RSS_QUEUE).  The reply arrives as CPL_PASS_OPEN_RPL.  Returns 0 or
 * ENOMEM.
 */
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_pass_open_req *req;
        struct inpcb *inp = lctx->inp;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                log(LOG_ERR, "%s: allocation failure", __func__);
                return (ENOMEM);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
        req->local_port = inp->inp_lport;       /* already in network order */
        req->peer_port = 0;                     /* wildcard */
        req->local_ip = inp->inp_laddr.s_addr;
        req->peer_ip = 0;                       /* wildcard */
        req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
        req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
            F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * IPv6 counterpart of create_server: sends a CPL_PASS_OPEN_REQ6 to start a
 * hardware listener on lctx's stid for the inp's local IPv6 address/port
 * with wildcard peer.  Returns 0 or ENOMEM.
 */
static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_pass_open_req6 *req;
        struct inpcb *inp = lctx->inp;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                log(LOG_ERR, "%s: allocation failure", __func__);
                return (ENOMEM);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
        req->local_port = inp->inp_lport;       /* already in network order */
        req->peer_port = 0;                     /* wildcard */
        /*
         * NOTE(review): copies the IPv6 address as two raw 64-bit loads;
         * assumes in6p_laddr is suitably aligned for uint64_t access —
         * confirm (this mirrors long-standing driver practice).
         */
        req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
        req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
        req->peer_ip_hi = 0;
        req->peer_ip_lo = 0;
        req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
        req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
            F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * Send a CPL_CLOSE_LISTSRV_REQ to tear down the hardware listener on lctx's
 * stid.  The reply (CPL_CLOSE_LISTSRV_RPL) is directed to the listener's
 * offload rx queue via reply_ctrl.  Panics if the WR cannot be allocated
 * (XXX).  Always returns 0.
 */
static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_close_listsvr_req *req;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        /* htobe32 (identical to htonl) for consistency with this file. */
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
            lctx->stid));
        req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
        req->rsvd = htobe16(0);

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * Can't take adapter lock here and access to sc->flags,
 * sc->offload_map, if_capenable are all race prone.
 *
 * Always returns 0 (the host stack's software listener is unaffected even
 * when no hardware listener is created).
 */
int
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
{
        struct adapter *sc = tod->tod_softc;
        struct vi_info *vi;
        struct port_info *pi;
        struct inpcb *inp = tptoinpcb(tp);
        struct listen_ctx *lctx;
        int i, rc, v;
        struct offload_settings settings;

        INP_WLOCK_ASSERT(inp);

        /* Consult the offload policy; bail out if listens aren't offloaded. */
        rw_rlock(&sc->policy_lock);
        settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
            EVL_MAKETAG(0xfff, 0, 0), inp);
        rw_runlock(&sc->policy_lock);
        if (!settings.offload)
                return (0);

        /* Don't start a hardware listener for any loopback address. */
        if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
                return (0);
        if (!(inp->inp_vflag & INP_IPV6) &&
            IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
                return (0);
        /* TOE and kernel TLS are mutually exclusive on this adapter. */
        if (sc->flags & KERN_TLS_ON)
                return (0);
#if 0
        ADAPTER_LOCK(sc);
        if (IS_BUSY(sc)) {
                log(LOG_ERR, "%s: listen request ignored, %s is busy",
                    __func__, device_get_nameunit(sc->dev));
                goto done;
        }

        KASSERT(uld_active(sc, ULD_TOM),
            ("%s: TOM not initialized", __func__));
#endif

        /*
         * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
         * such VI's queues to send the passive open and receive the reply to
         * it.
         *
         * XXX: need a way to mark a port in use by offload.  if_cxgbe should
         * then reject any attempt to bring down such a port (and maybe reject
         * attempts to disable IFCAP_TOE on that port too?).
         */
        for_each_port(sc, i) {
                pi = sc->port[i];
                for_each_vi(pi, v, vi) {
                        if (vi->flags & VI_INIT_DONE &&
                            if_getcapenable(vi->ifp) & IFCAP_TOE)
                                goto found;
                }
        }
        goto done;      /* no port that's UP with IFCAP_TOE enabled */
found:

        if (listen_hash_find(sc, inp) != NULL)
                goto done;      /* already setup */

        lctx = alloc_lctx(sc, inp, vi);
        if (lctx == NULL) {
                log(LOG_ERR,
                    "%s: listen request ignored, %s couldn't allocate lctx\n",
                    __func__, device_get_nameunit(sc->dev));
                goto done;
        }
        listen_hash_add(sc, lctx);

        CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
            __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
            inp->inp_vflag);

        if (inp->inp_vflag & INP_IPV6)
                rc = create_server6(sc, lctx);
        else
                rc = create_server(sc, lctx);
        if (rc != 0) {
                log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
                    __func__, device_get_nameunit(sc->dev), rc);
                (void) listen_hash_del(sc, inp);
                inp = release_lctx(sc, lctx);
                /* can't be freed, host stack has a reference */
                KASSERT(inp != NULL, ("%s: inp freed", __func__));
                goto done;
        }
        /* Cleared by do_pass_open_rpl when the reply arrives. */
        lctx->flags |= LCTX_RPL_PENDING;
done:
#if 0
        ADAPTER_UNLOCK(sc);
#endif
        return (0);
}

/*
 * Stop the hardware listener for the inp, if there is one.  Removes the lctx
 * from the hash; the actual teardown is either deferred (reply to the
 * passive open still pending), done via a close request to hardware, or
 * immediate (listener was never set up in hardware).
 */
int
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
        struct listen_ctx *lctx;
        struct adapter *sc = tod->tod_softc;
        struct inpcb *inp = tptoinpcb(tp);

        INP_WLOCK_ASSERT(inp);

        lctx = listen_hash_del(sc, inp);
        if (lctx == NULL)
                return (ENOENT);        /* no hardware listener for this inp */

        CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
            lctx, lctx->flags);

        /*
         * If the reply to the PASS_OPEN is still pending we'll wait for it to
         * arrive and clean up when it does.
         */
        if (lctx->flags & LCTX_RPL_PENDING) {
                return (EINPROGRESS);
        }

        if (lctx->flags & LCTX_SETUP_IN_HW)
                /* lctx is released in do_close_server_rpl. */
                destroy_server(sc, lctx);
        else
                /*
                 * NOTE(review): release_lctx's return value is ignored here;
                 * presumably the inp can't be freed because the host stack
                 * still holds a reference — confirm.
                 */
                inp = release_lctx(sc, lctx);
        return (0);
}

/*
 * Allocate a synq entry for an embryonic connection on the given listener.
 * The new synqe starts with one reference and holds a reference on its lctx.
 * flags is M_WAITOK or M_NOWAIT; returns NULL on allocation failure.
 */
static inline struct synq_entry *
alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags)
{
        struct synq_entry *synqe;

        INP_RLOCK_ASSERT(lctx->inp);
        MPASS(flags == M_WAITOK || flags == M_NOWAIT);

        synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
        if (__predict_false(synqe == NULL))
                return (NULL);

        synqe->flags = TPF_SYNQE;
        synqe->incarnation = sc->incarnation;
        refcount_init(&synqe->refcnt, 1);
        synqe->lctx = lctx;
        hold_lctx(lctx);        /* Every synqe has a ref on its lctx. */
        synqe->syn = NULL;

        return (synqe);
}

/*
 * Take an additional reference on the synqe.  Paired with release_synqe.
 */
static inline void
hold_synqe(struct synq_entry *synqe)
{
        refcount_acquire(&synqe->refcnt);
}

/*
 * Drop a reference on the synqe; on the last release the synqe and its SYN
 * mbuf are freed and the synqe's hold on its lctx is released too, which may
 * in turn free the inp.  Returns the (still locked) inp, or NULL if the inp
 * was freed.
 */
static inline struct inpcb *
release_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct inpcb *inp;

        MPASS(synqe->flags & TPF_SYNQE);
        MPASS(synqe->lctx != NULL);

        inp = synqe->lctx->inp;
        MPASS(inp != NULL);
        INP_WLOCK_ASSERT(inp);

        if (refcount_release(&synqe->refcnt)) {
                /* Last ref: release the lctx hold taken in alloc_synqe. */
                inp = release_lctx(sc, synqe->lctx);
                m_freem(synqe->syn);
                free(synqe, M_CXGBE);
        }

        return (inp);
}

/*
 * TOE callback: the syncache has taken a pointer to the synqe, so give it
 * its own reference (dropped in t4_syncache_removed).
 */
void
t4_syncache_added(struct toedev *tod __unused, void *arg)
{
        hold_synqe(arg);
}

/*
 * TOE callback: the syncache entry is gone; drop the reference taken in
 * t4_syncache_added.  This may be the synqe's last reference, in which case
 * the inp may be freed too (release_synqe returns NULL then).
 */
void
t4_syncache_removed(struct toedev *tod, void *arg)
{
        struct adapter *sc = tod->tod_softc;
        struct synq_entry *synqe = arg;
        struct inpcb *inp = synqe->lctx->inp;

        /*
         * XXX: this is a LOR but harmless when running from the softclock.
         */
        INP_WLOCK(inp);
        inp = release_synqe(sc, synqe);
        if (inp != NULL)
                INP_WUNLOCK(inp);
}

/*
 * TOE callback invoked when the syncache wants to respond to a SYN.  The
 * stack-built SYN|ACK in m is not transmitted; on the first call its TCP
 * header and options are parsed and iss/irs/timestamp are stashed in the
 * synqe for later use.  The mbuf is always freed.  Always returns 0.
 */
int
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
        struct synq_entry *synqe = arg;

        /* Only the first caller does the parsing. */
        if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
                struct tcpopt to;
                struct ip *ip = mtod(m, struct ip *);
                struct tcphdr *th;

                /* Skip the IPv4 or IPv6 header to reach the TCP header. */
                if (ip->ip_v == IPVERSION)
                        th = (void *)(ip + 1);
                else
                        th = (void *)((struct ip6_hdr *)ip + 1);
                bzero(&to, sizeof(to));
                tcp_dooptions(&to, (void *)(th + 1),
                    (th->th_off << 2) - sizeof(*th), TO_SYN);

                /* save these for later */
                synqe->iss = be32toh(th->th_seq);
                synqe->irs = be32toh(th->th_ack) - 1;
                synqe->ts = to.to_tsval;
        }

        m_freem(m);     /* don't need this any more */
        return (0);
}

/*
 * Handler for CPL_PASS_OPEN_RPL, the firmware's reply to the passive open
 * sent by create_server/create_server6.  Clears LCTX_RPL_PENDING and then
 * either marks the listener live in hardware, or cleans up — the exact path
 * depends on the reply status and on whether the listening socket has been
 * closed (INP_DROPPED) in the meantime.
 */
static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
        int stid = GET_TID(cpl);
        unsigned int status = cpl->status;
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_PASS_OPEN_RPL,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        INP_WLOCK(inp);

        CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
            __func__, stid, status, lctx->flags);

        lctx->flags &= ~LCTX_RPL_PENDING;
        if (status == CPL_ERR_NONE)
                lctx->flags |= LCTX_SETUP_IN_HW;
        else
                log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

#ifdef INVARIANTS
        /*
         * If the inp has been dropped (listening socket closed) then
         * listen_stop must have run and taken the inp out of the hash.
         */
        if (inp->inp_flags & INP_DROPPED) {
                KASSERT(listen_hash_del(sc, inp) == NULL,
                    ("%s: inp %p still in listen hash", __func__, inp));
        }
#endif

        /* Socket closed and hardware setup failed: just drop the lctx. */
        if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
                if (release_lctx(sc, lctx) != NULL)
                        INP_WUNLOCK(inp);
                return (status);
        }

        /*
         * Listening socket stopped listening earlier and now the chip tells us
         * it has started the hardware listener.  Stop it; the lctx will be
         * released in do_close_server_rpl.
         */
        if (inp->inp_flags & INP_DROPPED) {
                destroy_server(sc, lctx);
                INP_WUNLOCK(inp);
                return (status);
        }

        /*
         * Failed to start hardware listener.  Take inp out of the hash and
         * release our reference on it.  An error message has been logged
         * already.
         */
        if (status != CPL_ERR_NONE) {
                listen_hash_del(sc, inp);
                if (release_lctx(sc, lctx) != NULL)
                        INP_WUNLOCK(inp);
                return (status);
        }

        /* hardware listener open for business */

        INP_WUNLOCK(inp);
        return (status);
}

/*
 * Handler for CPL_CLOSE_LISTSRV_RPL, the firmware's reply to the close
 * request sent by destroy_server.  On success, drops the reference the
 * hardware listener held on the lctx (which may also free the inp).
 */
static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
        int stid = GET_TID(cpl);
        unsigned int status = cpl->status;
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

        /* Close failed: the lctx is left alone (listener still in HW). */
        if (status != CPL_ERR_NONE) {
                log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
                    __func__, status, stid);
                return (status);
        }

        INP_WLOCK(inp);
        inp = release_lctx(sc, lctx);
        if (inp != NULL)
                INP_WUNLOCK(inp);

        return (status);
}

/*
 * Final teardown of an embryonic (syncache) connection: remove the tid from
 * the lookup table (if one was assigned), take the synqe off the list,
 * return the tid to the chip, drop the L2T entry reference, and release the
 * synqe's hold on the listener's inp.
 *
 * Must be called with the listening socket's inp write-locked.  The lock is
 * released here: release_synqe returns the inp if it is still locked after
 * the reference drop, NULL otherwise.
 */
static void
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct tom_data *td = sc->tom_softc;
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
        struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
        int ntids;

        INP_WLOCK_ASSERT(inp);

        if (synqe->tid != -1) {
                /* An IPv6 connection occupies 2 tid slots, IPv4 just 1. */
                ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
                remove_tid(sc, synqe->tid, ntids);
                mtx_lock(&td->toep_list_lock);
                TAILQ_REMOVE(&td->synqe_list, synqe, link);
                mtx_unlock(&td->toep_list_lock);
                release_tid(sc, synqe->tid, lctx->ctrlq);
        }
        t4_l2t_release(e);
        inp = release_synqe(sc, synqe);
        if (inp)
                INP_WUNLOCK(inp);
}

/*
 * Clean up a synq_entry whose SYN/ACK could not be sent.  done_with_synqe()
 * expects the listener's inp to be write-locked on entry and releases the
 * lock itself before returning.
 */
void
synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe)
{
        struct inpcb *inp = synqe->lctx->inp;

        INP_WLOCK(inp);
        done_with_synqe(sc, synqe);
}

/*
 * CPL_ABORT_REQ_RSS for a tid that is still a synq_entry (connection not yet
 * fully established).  Unless this is negative advice, tear the embryonic
 * connection down (or let a previously initiated abort's reply do so) and
 * acknowledge the chip with a CPL_ABORT_RPL.
 */
int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
        struct sge_ofld_txq *ofld_txq;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_ABORT_REQ_RSS,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

        CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
            __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

        if (negative_advice(cpl->status))
                return (0);     /* Ignore negative advice */

        INP_WLOCK(inp);

        ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];

        /* The flowc WR must precede any other WR for this tid. */
        if (!(synqe->flags & TPF_FLOWC_WR_SENT))
                send_flowc_wr_synqe(sc, synqe);

        /*
         * If we'd initiated an abort earlier the reply to it is responsible for
         * cleaning up resources.  Otherwise we tear everything down right here
         * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
         */
        if (synqe->flags & TPF_ABORT_SHUTDOWN) {
                INP_WUNLOCK(inp);
                goto done;
        }

        done_with_synqe(sc, synqe);
        /* inp lock released by done_with_synqe */
done:
        send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
        return (0);
}

/*
 * CPL_ABORT_RPL_RSS for a synq_entry: the chip has acknowledged the abort we
 * requested earlier (TPF_ABORT_SHUTDOWN must be set).  Finish tearing down
 * the embryonic connection.
 */
int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_ABORT_RPL_RSS,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

        CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
            __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

        INP_WLOCK(inp);
        KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
            ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
            __func__, synqe, synqe->flags));

        done_with_synqe(sc, synqe);
        /* inp lock released by done_with_synqe */

        return (0);
}

/*
 * TOE callback (tod_offload_socket) invoked during syncache expansion: take
 * over the freshly created socket for the connection described by the
 * synq_entry.  Installs the toepcb on the socket, marks the connection
 * established, swaps the tid's lookup context from synqe to toepcb, moves
 * the entry from the synqe list to the active toepcb list, and seeds the
 * inp's flow id/type from the RSS hash of the original SYN.
 */
void
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
        struct adapter *sc = tod->tod_softc;
        struct tom_data *td = sc->tom_softc;
        struct synq_entry *synqe = arg;
        struct inpcb *inp = sotoinpcb(so);
        struct toepcb *toep = synqe->toep;

        NET_EPOCH_ASSERT();     /* prevents bad race with accept() */
        INP_WLOCK_ASSERT(inp);
        KASSERT(synqe->flags & TPF_SYNQE,
            ("%s: %p not a synq_entry?", __func__, arg));
        MPASS(toep->tid == synqe->tid);

        offload_socket(so, toep);
        make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
        toep->flags |= TPF_CPL_PENDING;
        /* tid now resolves to the toepcb instead of the synqe. */
        update_tid(sc, synqe->tid, toep);
        synqe->flags |= TPF_SYNQE_EXPANDED;
        mtx_lock(&td->toep_list_lock);
        /* Remove synqe from its list and add the TOE PCB to the active list. */
        TAILQ_REMOVE(&td->synqe_list, synqe, link);
        TAILQ_INSERT_TAIL(&td->toep_list, toep, link);
        toep->flags |= TPF_IN_TOEP_LIST;
        mtx_unlock(&td->toep_list_lock);
        inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
            M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
        inp->inp_flowid = synqe->rss_hash;
}

/*
 * Translate the TCP options the hardware parsed out of the SYN into the
 * kernel's struct tcpopt representation.
 */
static void
t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
{
        bzero(to, sizeof(*to));

        if (t4opt->mss != 0) {
                to->to_mss = be16toh(t4opt->mss);
                to->to_flags |= TOF_MSS;
        }
        /* Window scale factors of 15 and up are not valid. */
        if (t4opt->wsf > 0 && t4opt->wsf < 15) {
                to->to_wscale = t4opt->wsf;
                to->to_flags |= TOF_SCALE;
        }
        if (t4opt->tstamp)
                to->to_flags |= TOF_TS;
        if (t4opt->sack)
                to->to_flags |= TOF_SACKPERM;
}

static bool
encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
{
        u_int hlen = be32toh(cpl->hdr_len);

        if (chip_id(sc) >= CHELSIO_T6)
                return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
        else
                return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
}

/*
 * Parse the protocol headers that follow a CPL_PASS_ACCEPT_REQ (the original
 * SYN as received on the wire) and fill out, for each non-NULL output:
 *   inc   - the connection's 4-tuple (faddr/laddr, fport/lport, v4 or v6),
 *   th    - the TCP header, converted to host byte order,
 *   iptos - the IP TOS byte (DiffServ + ECN), used for AccECN.
 */
static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
{
        const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
        const struct ether_header *eh;
        unsigned int hlen = be32toh(cpl->hdr_len);
        uintptr_t l3hdr;
        const struct tcphdr *tcp;

        /* Headers are laid out back to back right after the CPL. */
        eh = (const void *)(cpl + 1);
        if (chip_id(sc) >= CHELSIO_T6) {
                l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
                tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
        } else {
                l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
                tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
        }

        /* extract TOS (DiffServ + ECN) byte for AccECN */
        if (iptos) {
                if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
                        const struct ip *ip = (const void *)l3hdr;
                        *iptos = ip->ip_tos;
                }
#ifdef INET6
                else
                if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
                        const struct ip6_hdr *ip6 = (const void *)l3hdr;
                        /* Traffic class is bits 20-27 of the flow word. */
                        *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
                }
#endif /* INET6 */
        }

        if (inc) {
                bzero(inc, sizeof(*inc));
                inc->inc_fport = tcp->th_sport;
                inc->inc_lport = tcp->th_dport;
                if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
                        const struct ip *ip = (const void *)l3hdr;

                        inc->inc_faddr = ip->ip_src;
                        inc->inc_laddr = ip->ip_dst;
                } else {
                        const struct ip6_hdr *ip6 = (const void *)l3hdr;

                        inc->inc_flags |= INC_ISIPV6;
                        inc->inc6_faddr = ip6->ip6_src;
                        inc->inc6_laddr = ip6->ip6_dst;
                }
        }

        if (th) {
                bcopy(tcp, th, sizeof(*th));
                tcp_fields_to_host(th);         /* just like tcp_input */
        }
}

/*
 * Find the L2 next hop for the peer described by inc and get an L2T entry
 * for it on the given port/ifnet.  Returns NULL if the route lookup fails or
 * the route doesn't go out of the expected ifnet.
 *
 * A sockaddr_in6 on the stack is used as scratch space for both address
 * families (it is large enough for either).
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, if_t ifp,
    struct in_conninfo *inc)
{
        struct l2t_entry *e;
        struct sockaddr_in6 sin6;
        struct sockaddr *dst = (void *)&sin6;
        struct nhop_object *nh;

        if (inc->inc_flags & INC_ISIPV6) {
                bzero(dst, sizeof(struct sockaddr_in6));
                dst->sa_len = sizeof(struct sockaddr_in6);
                dst->sa_family = AF_INET6;

                if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
                        /* no need for route lookup */
                        e = t4_l2t_get(pi, ifp, dst);
                        return (e);
                }

                nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
                if (nh == NULL)
                        return (NULL);
                if (nh->nh_ifp != ifp)
                        return (NULL);
                /* Next hop is the gateway if there is one, the peer if not. */
                if (nh->nh_flags & NHF_GATEWAY)
                        ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
                else
                        ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
        } else {
                /*
                 * NOTE(review): the IPv4 path doesn't bzero the sockaddr, so
                 * sin_port/sin_zero are left uninitialized; presumably
                 * t4_l2t_get only looks at sa_family and the address —
                 * confirm.
                 */
                dst->sa_len = sizeof(struct sockaddr_in);
                dst->sa_family = AF_INET;

                nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
                if (nh == NULL)
                        return (NULL);
                if (nh->nh_ifp != ifp)
                        return (NULL);
                /* An IPv4 route may still have an IPv6 (e.g. RFC 5549) gateway. */
                if (nh->nh_flags & NHF_GATEWAY)
                        if (nh->gw_sa.sa_family == AF_INET)
                                ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
                        else
                                *((struct sockaddr_in6 *)dst) = nh->gw6_sa;
                else
                        ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
        }

        e = t4_l2t_get(pi, ifp, dst);
        return (e);
}

/*
 * Queue the SYN/ACK for this embryonic connection: build a
 * CPL_PASS_ACCEPT_RPL work request (the T5+ variant additionally carries our
 * initial send sequence number) and hand it to the L2T layer for
 * transmission.  opt0/opt2 are passed through as supplied by the caller.
 * Returns 0 on success or ENOMEM if no work request could be allocated.
 */
static int
send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
    uint32_t opt2, int tid)
{
        struct cpl_pass_accept_rpl *rpl;
        struct wrqe *wr;
        struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
        const bool t4 = is_t4(sc);

        wr = alloc_wrqe(t4 ? sizeof(struct cpl_pass_accept_rpl) :
            sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
        if (wr == NULL)
                return (ENOMEM);
        rpl = wrtod(wr);

        if (!t4) {
                struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

                INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
                rpl5->iss = htobe32(synqe->iss);
        } else
                INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
        rpl->opt0 = opt0;
        rpl->opt2 = opt2;

        return (t4_l2t_send(sc, wr, e));
}

/*
 * Bail out of do_pass_accept_req: free the SYN mbuf unless it is to be
 * tunnelled to the kernel's software stack (m != NULL at the reject label
 * means "inject into if_input"), record the source line as the reject reason
 * for the trace, and jump to the reject label.
 */
#define REJECT_PASS_ACCEPT_REQ(tunnel)  do { \
        if (!tunnel) { \
                m_freem(m); \
                m = NULL; \
        } \
        reject_reason = __LINE__; \
        goto reject; \
} while (0)

/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));

/*
 * Incoming SYN on a listening socket (CPL_PASS_ACCEPT_REQ).
 *
 * Runs a series of checks (encapsulation, MAC/VLAN/vnet match, TOE ifcaps,
 * local address, route, 4-tuple, offload policy) and, if the connection
 * should be offloaded, allocates a synq_entry, feeds the SYN to the kernel
 * syncache, and has the hardware send the SYN/ACK.  Otherwise the SYN is
 * either dropped or injected into the kernel's software stack ("tunnelled",
 * the reject path with m != NULL).
 *
 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 * etc.
 */
static int
do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct tom_data *td = sc->tom_softc;
        struct toedev *tod;
        const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
        unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
        unsigned int tid = GET_TID(cpl);
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp;
        struct socket *so;
        struct in_conninfo inc;
        struct tcphdr th;
        struct tcpopt to;
        struct port_info *pi;
        struct vi_info *vi;
        if_t hw_ifp, ifp;
        struct l2t_entry *e = NULL;
        struct synq_entry *synqe = NULL;
        int reject_reason, v, ntids;
        uint16_t vid, l2info;
        struct epoch_tracker et;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
        struct offload_settings settings;
        uint8_t iptos;

        KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
            lctx);

        /*
         * Figure out the port the SYN arrived on.  We'll look for an exact VI
         * match in a bit but in case we don't find any we'll use the main VI as
         * the incoming ifnet.
         */
        l2info = be16toh(cpl->l2info);
        pi = sc->port[G_SYN_INTF(l2info)];
        hw_ifp = pi->vi[0].ifp;
        m->m_pkthdr.rcvif = hw_ifp;

        CURVNET_SET(lctx->vnet);        /* before any potential REJECT */

        /*
         * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
         * also hit the listener.  We don't want to offload those.
         */
        if (encapsulated_syn(sc, cpl)) {
                REJECT_PASS_ACCEPT_REQ(true);
        }

        /*
         * Use the MAC index to lookup the associated VI.  If this SYN didn't
         * match a perfect MAC filter, punt.
         */
        if (!(l2info & F_SYN_XACT_MATCH)) {
                REJECT_PASS_ACCEPT_REQ(true);
        }
        for_each_vi(pi, v, vi) {
                if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
                        goto found;
        }
        REJECT_PASS_ACCEPT_REQ(true);
found:
        hw_ifp = vi->ifp;       /* the cxgbe ifnet */
        m->m_pkthdr.rcvif = hw_ifp;
        tod = TOEDEV(hw_ifp);

        /*
         * Don't offload if the peer requested a TCP option that's not known to
         * the silicon.  Send the SYN to the kernel instead.
         */
        if (__predict_false(cpl->tcpopt.unknown))
                REJECT_PASS_ACCEPT_REQ(true);

        /*
         * Figure out if there is a pseudo interface (vlan, lagg, etc.)
         * involved.  Don't offload if the SYN had a VLAN tag and the vid
         * doesn't match anything on this interface.
         *
         * XXX: lagg support, lagg + vlan support.
         */
        vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
        if (vid != 0xfff && vid != 0) {
                ifp = VLAN_DEVAT(hw_ifp, vid);
                if (ifp == NULL)
                        REJECT_PASS_ACCEPT_REQ(true);
        } else
                ifp = hw_ifp;

        /*
         * Don't offload if the ifnet that the SYN came in on is not in the same
         * vnet as the listening socket.
         */
        if (lctx->vnet != if_getvnet(ifp))
                REJECT_PASS_ACCEPT_REQ(true);

        pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
        if (inc.inc_flags & INC_ISIPV6) {

                /* Don't offload if the ifcap isn't enabled */
                if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0)
                        REJECT_PASS_ACCEPT_REQ(true);

                /*
                 * SYN must be directed to an IP6 address on this ifnet.  This
                 * is more restrictive than in6_localip.
                 */
                NET_EPOCH_ENTER(et);
                if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
                        NET_EPOCH_EXIT(et);
                        REJECT_PASS_ACCEPT_REQ(true);
                }

                ntids = 2;
        } else {

                /* Don't offload if the ifcap isn't enabled */
                if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0)
                        REJECT_PASS_ACCEPT_REQ(true);

                /*
                 * SYN must be directed to an IP address on this ifnet.  This
                 * is more restrictive than in_localip.
                 */
                NET_EPOCH_ENTER(et);
                if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
                        NET_EPOCH_EXIT(et);
                        REJECT_PASS_ACCEPT_REQ(true);
                }

                ntids = 1;
        }

        /* Resolve the L2 next hop; released on the reject path if set. */
        e = get_l2te_for_nexthop(pi, ifp, &inc);
        if (e == NULL) {
                NET_EPOCH_EXIT(et);
                REJECT_PASS_ACCEPT_REQ(true);
        }

        /* Don't offload if the 4-tuple is already in use */
        if (toe_4tuple_check(&inc, &th, ifp) != 0) {
                NET_EPOCH_EXIT(et);
                REJECT_PASS_ACCEPT_REQ(false);
        }

        inp = lctx->inp;                /* listening socket, not owned by TOE */
        INP_RLOCK(inp);

        /* Don't offload if the listening socket has closed */
        if (__predict_false(inp->inp_flags & INP_DROPPED)) {
                INP_RUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                REJECT_PASS_ACCEPT_REQ(false);
        }
        so = inp->inp_socket;
        /* Consult the connection offload policy (COP). */
        rw_rlock(&sc->policy_lock);
        settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
            EVL_MAKETAG(0xfff, 0, 0), inp);
        rw_runlock(&sc->policy_lock);
        if (!settings.offload) {
                INP_RUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                REJECT_PASS_ACCEPT_REQ(true);   /* Rejected by COP. */
        }

        synqe = alloc_synqe(sc, lctx, M_NOWAIT);
        if (synqe == NULL) {
                INP_RUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                REJECT_PASS_ACCEPT_REQ(true);
        }
        MPASS(rss->hash_type == RSS_HASH_TCP);
        synqe->rss_hash = be32toh(rss->hash_val);
        atomic_store_int(&synqe->ok_to_respond, 0);

        init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
            &synqe->params);
        if (sc->params.tid_qid_sel_mask != 0)
                update_tid_qid_sel(vi, &synqe->params, tid);

        /*
         * If all goes well t4_syncache_respond will get called during
         * syncache_add.  Note that syncache_add releases the pcb lock.
         */
        t4opt_to_tcpopt(&cpl->tcpopt, &to);
        toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);

        /* ok_to_respond is set by t4_syncache_respond on success. */
        if (atomic_load_int(&synqe->ok_to_respond) > 0) {
                uint64_t opt0;
                uint32_t opt2;

                opt0 = calc_options0(vi, &synqe->params);
                opt2 = calc_options2(vi, &synqe->params);

                /* The synqe (and its mbuf) now own the tid and the SYN. */
                insert_tid(sc, tid, synqe, ntids);
                synqe->tid = tid;
                synqe->syn = m;
                m = NULL;
                mtx_lock(&td->toep_list_lock);
                TAILQ_INSERT_TAIL(&td->synqe_list, synqe, link);
                mtx_unlock(&td->toep_list_lock);

                if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
                        /* Undo everything; reclaim the SYN for the reject path. */
                        remove_tid(sc, tid, ntids);
                        m = synqe->syn;
                        synqe->syn = NULL;
                        mtx_lock(&td->toep_list_lock);
                        TAILQ_REMOVE(&td->synqe_list, synqe, link);
                        mtx_unlock(&td->toep_list_lock);
                        NET_EPOCH_EXIT(et);
                        REJECT_PASS_ACCEPT_REQ(true);
                }
                CTR6(KTR_CXGBE,
                    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
                    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
        } else {
                NET_EPOCH_EXIT(et);
                REJECT_PASS_ACCEPT_REQ(false);
        }

        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();
        return (0);
reject:
        CURVNET_RESTORE();
        CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
            reject_reason);

        if (e)
                t4_l2t_release(e);
        release_tid(sc, tid, lctx->ctrlq);
        if (synqe) {
                inp = synqe->lctx->inp;
                INP_WLOCK(inp);
                inp = release_synqe(sc, synqe);
                if (inp)
                        INP_WUNLOCK(inp);
        }

        if (m) {
                /*
                 * The connection request hit a TOE listener but is being passed
                 * on to the kernel sw stack instead of getting offloaded.
                 */
                m_adj(m, sizeof(*cpl));
                m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
                    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                m->m_pkthdr.csum_data = 0xffff;
                if_input(hw_ifp, m);
        }

        return (reject_reason);
}

/*
 * Synthesize the protocol headers for the peer's final ACK of the 3-way
 * handshake (which the hardware consumed) so that syncache_expand can be
 * given something that looks like real input: start with the saved SYN and
 * patch the flags, sequence numbers, and timestamp option.
 */
static void
synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
    const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
    struct tcphdr *th, struct tcpopt *to)
{
        uint16_t tcp_opt = be16toh(cpl->tcp_opt);
        uint8_t iptos;

        /* start off with the original SYN */
        pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);

        /* modify parts to make it look like the ACK to our SYN|ACK */
        tcp_set_flags(th, TH_ACK);
        th->th_ack = synqe->iss + 1;
        th->th_seq = be32toh(cpl->rcv_isn);
        bzero(to, sizeof(*to));
        if (G_TCPOPT_TSTAMP(tcp_opt)) {
                to->to_flags |= TOF_TS;
                to->to_tsecr = synqe->ts;
        }
}

/*
 * CPL_PASS_ESTABLISH: the peer has ACKed our SYN/ACK and the 3-way handshake
 * is complete.  Allocate a toepcb and expand the syncache entry into a full
 * connection.  If that's not possible (listener gone, allocation failure,
 * syncache refuses) the embryonic connection is aborted instead.
 */
static int
do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        struct vi_info *vi;
        if_t ifp;
        const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
#if defined(KTR) || defined(INVARIANTS)
        unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
#endif
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp, *new_inp;
        struct socket *so;
        struct tcphdr th;
        struct tcpopt to;
        struct in_conninfo inc;
        struct toepcb *toep;
        struct epoch_tracker et;
        int rstreason;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_PASS_ESTABLISH,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
        KASSERT(synqe->flags & TPF_SYNQE,
            ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));

        CURVNET_SET(lctx->vnet);
        NET_EPOCH_ENTER(et);    /* for syncache_expand */
        INP_WLOCK(inp);

        CTR6(KTR_CXGBE,
            "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
            __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);

        ifp = synqe->syn->m_pkthdr.rcvif;
        vi = if_getsoftc(ifp);
        KASSERT(vi->adapter == sc,
            ("%s: vi %p, sc %p mismatch", __func__, vi, sc));

        /* Listener has been closed; RST the embryonic connection. */
        if (__predict_false(inp->inp_flags & INP_DROPPED)) {
reset:
                send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
                INP_WUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                CURVNET_RESTORE();
                return (0);
        }

        KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
            ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
            synqe->params.rxq_idx,
            (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

        toep = alloc_toepcb(vi, M_NOWAIT);
        if (toep == NULL)
                goto reset;
        toep->tid = tid;
        toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
        toep->vnet = lctx->vnet;
        bcopy(&synqe->params, &toep->params, sizeof(toep->params));
        init_toepcb(vi, toep);

        MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
        MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
        synqe->tcp_opt = cpl->tcp_opt;
        synqe->toep = toep;

        /* Come up with something that syncache_expand should be ok with. */
        synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
        if (inc.inc_flags & INC_ISIPV6) {
                /* IPv6 needs a CLIP entry; reuse the listener's if it has one. */
                if (lctx->ce == NULL) {
                        toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
                        if (toep->ce == NULL) {
                                free_toepcb(toep);
                                goto reset;     /* RST without a CLIP entry? */
                        }
                } else {
                        t4_hold_clip_entry(sc, lctx->ce);
                        toep->ce = lctx->ce;
                }
        }
        so = inp->inp_socket;
        KASSERT(so != NULL, ("%s: socket is NULL", __func__));

        /*
         * rstreason < 0: drop quietly (abort without RST).
         * rstreason == 0 or no socket: expansion failed; abort with RST.
         * Otherwise so now points at the new connection's socket.
         */
        rstreason = toe_syncache_expand(&inc, &to, &th, &so);
        if (rstreason < 0) {
                free_toepcb(toep);
                send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
                INP_WUNLOCK(inp);
                NET_EPOCH_EXIT(et);
                CURVNET_RESTORE();
                return (0);
        } else if (rstreason == 0 || so == NULL) {
                free_toepcb(toep);
                goto reset;
        }

        /* New connection inpcb is already locked by syncache_expand(). */
        new_inp = sotoinpcb(so);
        INP_WLOCK_ASSERT(new_inp);
        MPASS(so->so_vnet == lctx->vnet);

        /*
         * This is for expansion from syncookies.
         *
         * XXX: we've held the tcbinfo lock throughout so there's no risk of
         * anyone accept'ing a connection before we've installed our hooks, but
         * this somewhat defeats the purpose of having a tod_offload_socket :-(
         */
        if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
                tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
                t4_offload_socket(TOEDEV(ifp), synqe, so);
        }

        INP_WUNLOCK(new_inp);

        /* Done with the synqe */
        inp = release_synqe(sc, synqe);
        if (inp != NULL)
                INP_WUNLOCK(inp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (0);
}

/*
 * Register the CPL handlers for the passive-open (server) side of the TOE.
 */
void
t4_init_listen_cpl_handlers(void)
{

        t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
        t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
        t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
        t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
}

/*
 * Unregister the passive-open CPL handlers (registering NULL removes them).
 */
void
t4_uninit_listen_cpl_handlers(void)
{

        t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
        t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
        t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
        t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
}
#endif