root/sys/dev/nvmf/host/nvmf.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>

/*
 * Character device switch installed on the per-controller device node
 * created in nvmf_attach().  NOTE(review): presumably initialized
 * further down in this file -- confirm.
 */
static struct cdevsw nvmf_cdevsw;
/*
 * Taskqueue on which the disconnect, reconnect-request, and
 * controller-loss tasks of every controller are queued.
 */
static struct taskqueue *nvmf_tq;

/* Backing variable for the fail_on_disconnection sysctl/tunable below. */
bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
    &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");

/* Malloc type used for the allocations made in this file. */
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");

/* Forward declarations for task and event handlers defined below. */
static void     nvmf_controller_loss_task(void *arg, int pending);
static void     nvmf_disconnect_task(void *arg, int pending);
static void     nvmf_request_reconnect(struct nvmf_softc *sc);
static void     nvmf_request_reconnect_task(void *arg, int pending);
static void     nvmf_shutdown_pre_sync(void *arg, int howto);
static void     nvmf_shutdown_post_sync(void *arg, int howto);

/*
 * Command completion callback.  'arg' is the
 * struct nvmf_completion_status supplied when the command was
 * submitted.  The completion queue entry is copied into it, the
 * 'done' flag is set under the pool mutex associated with the status
 * object, and any sleeper in nvmf_wait_for_reply() is woken.
 */
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
        struct nvmf_completion_status *st;
        struct mtx *lock;

        st = arg;

        /* Save the CQE before publishing completion to the waiter. */
        st->cqe = *cqe;

        lock = mtx_pool_find(mtxpool_sleep, st);
        mtx_lock(lock);
        st->done = true;
        mtx_unlock(lock);

        wakeup(st);
}

void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
        struct nvmf_completion_status *status = arg;
        struct mtx *mtx;

        status->io_error = error;
        mtx = mtx_pool_find(mtxpool_sleep, status);
        mtx_lock(mtx);
        status->io_done = true;
        mtx_unlock(mtx);
        wakeup(status);
}

/*
 * Sleep until both the command completion ('done') and the data
 * transfer completion ('io_done') have been reported for 'status'.
 * The flags are examined under the same pool mutex the completion
 * callbacks use to set them.
 */
void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
        struct mtx *lock = mtx_pool_find(mtxpool_sleep, status);

        mtx_lock(lock);
        for (;;) {
                if (status->done && status->io_done)
                        break;
                mtx_sleep(status, lock, 0, "nvmfcmd", 0);
        }
        mtx_unlock(lock);
}

static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t *value)
{
        const struct nvmf_fabric_prop_get_rsp *rsp;
        struct nvmf_completion_status status;

        nvmf_status_init(&status);
        if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
            M_WAITOK))
                return (ECONNABORTED);
        nvmf_wait_for_reply(&status);

        if (status.cqe.status != 0) {
                device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
                    le16toh(status.cqe.status));
                return (EIO);
        }

        rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
        if (size == 8)
                *value = le64toh(rsp->value.u64);
        else
                *value = le32toh(rsp->value.u32.low);
        return (0);
}

static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
    uint64_t value)
{
        struct nvmf_completion_status status;

        nvmf_status_init(&status);
        if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
            M_WAITOK))
                return (ECONNABORTED);
        nvmf_wait_for_reply(&status);

        if (status.cqe.status != 0) {
                device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
                    le16toh(status.cqe.status));
                return (EIO);
        }
        return (0);
}

/*
 * Request a normal controller shutdown by reading CC, setting the
 * SHN field to NVME_SHN_NORMAL, and writing CC back.  Failures are
 * logged and otherwise ignored.
 */
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
        uint64_t cc;

        if (nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc) != 0) {
                device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
                return;
        }

        cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

        if (nvmf_write_property(sc, NVMF_PROP_CC, 4, cc) != 0) {
                device_printf(sc->dev,
                    "Failed to set CC to trigger shutdown\n");
        }
}

/*
 * Receive-side KeepAlive timer.  If traffic was seen since the last
 * check (the flag is consumed atomically), re-arm the timer;
 * otherwise log the timeout and request a disconnect.
 */
static void
nvmf_check_keep_alive(void *arg)
{
        struct nvmf_softc *sc = arg;

        if (atomic_readandclear_int(&sc->ka_active_rx_traffic) != 0) {
                callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
                    C_HARDCLOCK);
                return;
        }

        device_printf(sc->dev,
            "disconnecting due to KeepAlive timeout\n");
        nvmf_disconnect(sc);
}

/*
 * Completion callback for a KeepAlive command.  Any response counts
 * as received traffic; a non-zero status is only logged.
 */
static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
        struct nvmf_softc *sc = arg;

        atomic_store_int(&sc->ka_active_rx_traffic, 1);

        if (cqe->status == 0)
                return;

        device_printf(sc->dev,
            "KeepAlive response reported status %#x\n",
            le16toh(cqe->status));
}

/*
 * Transmit-side KeepAlive timer.  Sends a KeepAlive command unless
 * transmit traffic was recorded during the interval, then clears the
 * traffic flag and re-arms the timer.
 */
static void
nvmf_send_keep_alive(void *arg)
{
        struct nvmf_softc *sc = arg;

        /*
         * Skip the explicit KeepAlive when TBKAS is active and some
         * other command was already sent during this interval.
         */
        if (atomic_load_int(&sc->ka_active_tx_traffic) == 0) {
                if (!nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc,
                    M_NOWAIT))
                        device_printf(sc->dev,
                            "Failed to allocate KeepAlive command\n");
        }

        /* Start a fresh interval once the KeepAlive has been handled. */
        atomic_store_int(&sc->ka_active_tx_traffic, 0);

        callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

/*
 * Unpack and validate a connection handoff nvlist supplied via 'nv'.
 *
 * The nvlist must contain the transport type, an admin queue pair,
 * at least one I/O queue pair, the controller data, and the reconnect
 * parameters.  All I/O queues must have the size recorded in the
 * reconnect parameters, and the subsystem NQN in the discovery log
 * entry must match the one in the controller data.
 *
 * Returns 0 and stores the nvlist in *nvlp (the caller then owns it),
 * the error from nvmf_unpack_ioc_nvlist(), or EINVAL after destroying
 * the nvlist if validation fails.
 */
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
        const struct nvme_discovery_log_entry *dle;
        const struct nvme_controller_data *cdata;
        const nvlist_t *const *ioq;
        const nvlist_t *adminq, *rparams;
        nvlist_t *handoff;
        size_t idx, nioq;
        uint32_t qsize;
        int error;

        error = nvmf_unpack_ioc_nvlist(nv, &handoff);
        if (error != 0)
                return (error);

        /* Required top-level keys. */
        if (!(nvlist_exists_number(handoff, "trtype") &&
            nvlist_exists_nvlist(handoff, "admin") &&
            nvlist_exists_nvlist_array(handoff, "io") &&
            nvlist_exists_binary(handoff, "cdata") &&
            nvlist_exists_nvlist(handoff, "rparams")))
                goto invalid;

        /* Required reconnect parameters. */
        rparams = nvlist_get_nvlist(handoff, "rparams");
        if (!(nvlist_exists_binary(rparams, "dle") &&
            nvlist_exists_string(rparams, "hostnqn") &&
            nvlist_exists_number(rparams, "num_io_queues") &&
            nvlist_exists_number(rparams, "io_qsize")))
                goto invalid;

        /* The admin queue must be well-formed and flagged as admin. */
        adminq = nvlist_get_nvlist(handoff, "admin");
        if (!nvmf_validate_qpair_nvlist(adminq, false) ||
            !nvlist_get_bool(adminq, "admin"))
                goto invalid;

        /* At least one I/O queue, and the count must match rparams. */
        ioq = nvlist_get_nvlist_array(handoff, "io", &nioq);
        if (nioq < 1)
                goto invalid;
        if (nioq != nvlist_get_number(rparams, "num_io_queues"))
                goto invalid;
        for (idx = 0; idx < nioq; idx++) {
                if (!nvmf_validate_qpair_nvlist(ioq[idx], false))
                        goto invalid;
        }

        /* Require all I/O queues to be the same size. */
        qsize = nvlist_get_number(rparams, "io_qsize");
        for (idx = 0; idx < nioq; idx++) {
                if (nvlist_get_number(ioq[idx], "qsize") != qsize)
                        goto invalid;
        }

        /* Binary blobs must have exactly the expected structure size. */
        cdata = nvlist_get_binary(handoff, "cdata", &idx);
        if (idx != sizeof(*cdata))
                goto invalid;
        dle = nvlist_get_binary(rparams, "dle", &idx);
        if (idx != sizeof(*dle))
                goto invalid;

        /* Both blobs must describe the same NVMe subsystem. */
        if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
                goto invalid;

        *nvlp = handoff;
        return (0);

invalid:
        nvlist_destroy(handoff);
        return (EINVAL);
}

/*
 * Probe method.  A device without a handoff nvlist in its ivars is
 * rejected with ENXIO; otherwise the description is set from the
 * subsystem NQN in the controller data and BUS_PROBE_DEFAULT is
 * returned.
 */
static int
nvmf_probe(device_t dev)
{
        const struct nvme_controller_data *cdata;
        const nvlist_t *handoff;

        handoff = device_get_ivars(dev);
        if (handoff == NULL)
                return (ENXIO);

        cdata = nvlist_get_binary(handoff, "cdata", NULL);
        device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
        return (BUS_PROBE_DEFAULT);
}

/*
 * Build the admin and I/O queue pairs for a new association described
 * by the handoff nvlist 'nvl', start the KeepAlive timers if a KATO
 * was supplied, and record the new controller data and reconnect
 * parameters in the softc.
 *
 * On success the "rparams" nvlist is taken out of 'nvl' and owned by
 * the softc (any previously saved parameters are destroyed).
 *
 * Returns 0 on success or ENXIO if a queue pair could not be set up.
 * On failure the queue pairs created so far are left in the softc;
 * this function does not tear them down.
 */
static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
        const struct nvme_controller_data *cdata;
        const nvlist_t *const *io;
        const nvlist_t *admin;
        uint64_t kato;
        size_t num_io_queues;
        enum nvmf_trtype trtype;
        char name[16];

        trtype = nvlist_get_number(nvl, "trtype");
        admin = nvlist_get_nvlist(nvl, "admin");
        io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
        cdata = nvlist_get_binary(nvl, "cdata", NULL);
        kato = dnvlist_get_number(nvl, "kato", 0);
        sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
        sc->controller_loss_timeout = dnvlist_get_number(nvl,
            "controller_loss_timeout", 0);

        /* Setup the admin queue. */
        sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
        if (sc->admin == NULL) {
                device_printf(sc->dev, "Failed to setup admin queue\n");
                return (ENXIO);
        }

        /* Setup I/O queues. */
        sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
            M_WAITOK | M_ZERO);
        sc->num_io_queues = num_io_queues;
        for (u_int i = 0; i < sc->num_io_queues; i++) {
                snprintf(name, sizeof(name), "I/O queue %u", i);
                sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
                if (sc->io[i] == NULL) {
                        device_printf(sc->dev, "Failed to setup I/O queue %u\n",
                            i);
                        return (ENXIO);
                }
        }

        /* Start KeepAlive timers. */
        if (kato != 0) {
                /*
                 * Consult the controller data of the new association:
                 * sc->cdata is not populated until the memcpy below
                 * (it is uninitialized during attach and stale during
                 * a reconnect).
                 */
                sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
                    cdata->ctratt) != 0;
                sc->ka_rx_sbt = mstosbt(kato);
                sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
                callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
                    nvmf_check_keep_alive, sc, C_HARDCLOCK);
                callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
                    nvmf_send_keep_alive, sc, C_HARDCLOCK);
        }

        memcpy(sc->cdata, cdata, sizeof(*sc->cdata));

        /* Save reconnect parameters. */
        nvlist_destroy(sc->rparams);
        sc->rparams = nvlist_take_nvlist(nvl, "rparams");

        return (0);
}

/*
 * Callback invoked by nvmf_scan_active_nslist() for each active
 * namespace with the softc, the namespace ID, the byte-swapped
 * IDENTIFY namespace data, and the caller's opaque argument.
 * Returning false aborts the scan.
 */
typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
    const struct nvme_namespace_data *, void *);

/*
 * Fetch one page of the active namespace list starting after *nsidp
 * and invoke 'cb' for every namespace on it, using 'nslist' and
 * 'data' as scratch buffers for the IDENTIFY commands.
 *
 * On success *nsidp is updated to the NSID the next page should start
 * after, or to 0 when the scan is complete.  Returns false if any
 * command fails or the callback returns false.
 */
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
        struct nvmf_completion_status st;
        uint32_t nsid;
        u_int slot;

        /* Fetch the next page of active namespace IDs. */
        nvmf_status_init(&st);
        nvmf_status_wait_io(&st);
        if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
            nvmf_complete, &st, nvmf_io_complete, &st, M_WAITOK)) {
                device_printf(sc->dev,
                    "failed to send IDENTIFY active namespaces command\n");
                return (false);
        }
        nvmf_wait_for_reply(&st);

        if (st.cqe.status != 0) {
                device_printf(sc->dev,
                    "IDENTIFY active namespaces failed, status %#x\n",
                    le16toh(st.cqe.status));
                return (false);
        }
        if (st.io_error != 0) {
                device_printf(sc->dev,
                    "IDENTIFY active namespaces failed with I/O error %d\n",
                    st.io_error);
                return (false);
        }

        for (slot = 0; slot < nitems(nslist->ns); slot++) {
                nsid = nslist->ns[slot];

                /* A zero entry terminates the list: the scan is done. */
                if (nsid == 0) {
                        *nsidp = 0;
                        return (true);
                }

                /* Fetch the namespace data for this entry. */
                nvmf_status_init(&st);
                nvmf_status_wait_io(&st);
                if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
                    &st, nvmf_io_complete, &st, M_WAITOK)) {
                        device_printf(sc->dev,
                            "failed to send IDENTIFY namespace %u command\n",
                            nsid);
                        return (false);
                }
                nvmf_wait_for_reply(&st);

                if (st.cqe.status != 0) {
                        device_printf(sc->dev,
                            "IDENTIFY namespace %u failed, status %#x\n", nsid,
                            le16toh(st.cqe.status));
                        return (false);
                }
                if (st.io_error != 0) {
                        device_printf(sc->dev,
                            "IDENTIFY namespace %u failed with I/O error %d\n",
                            nsid, st.io_error);
                        return (false);
                }

                nvme_namespace_data_swapbytes(data);
                if (!cb(sc, nsid, data, cb_arg))
                        return (false);
        }

        /* The page was full; nsid is its last (non-zero) entry. */
        MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

        /* No further page can exist once the largest NSIDs are reached. */
        *nsidp = (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1) ? 0 : nsid;
        return (true);
}

/*
 * Walk the controller's complete active namespace list, invoking
 * 'cb' for each namespace.  Returns false as soon as fetching a page
 * fails or the callback aborts the scan, true otherwise.
 */
static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
        struct nvme_ns_list *page;
        struct nvme_namespace_data *nsdata;
        uint32_t cursor;
        bool ok;

        page = malloc(sizeof(*page), M_NVMF, M_WAITOK);
        nsdata = malloc(sizeof(*nsdata), M_NVMF, M_WAITOK);

        /* Keep fetching pages until one fails or the cursor wraps to 0. */
        cursor = 0;
        do {
                ok = nvmf_scan_active_nslist(sc, page, nsdata, &cursor, cb,
                    cb_arg);
        } while (ok && cursor != 0);

        free(nsdata, M_NVMF);
        free(page, M_NVMF);
        return (ok);
}

/*
 * Scan callback used during attach: create the namespace object for
 * 'nsid' and notify the SIM.
 *
 * Returns false (aborting the scan) if the namespace was already
 * present.  Namespaces whose ID falls outside 1..cdata->nn or whose
 * size is zero are logged and skipped, returning true so that the
 * scan continues.
 */
static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
        /*
         * The NSID is reported by the remote controller while sc->ns[]
         * only has cdata->nn entries, so never index outside of it.
         */
        if (nsid == 0 || nsid > sc->cdata->nn) {
                device_printf(sc->dev,
                    "ignoring out-of-range active namespace %u\n", nsid);
                return (true);
        }

        if (sc->ns[nsid - 1] != NULL) {
                device_printf(sc->dev,
                    "duplicate namespace %u in active namespace list\n",
                    nsid);
                return (false);
        }

        /*
         * As in nvme_ns_construct, a size of zero indicates an
         * invalid namespace.
         */
        if (data->nsze == 0) {
                device_printf(sc->dev,
                    "ignoring active namespace %u with zero size\n", nsid);
                return (true);
        }

        sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);

        nvmf_sim_rescan_ns(sc, nsid);
        return (true);
}

/*
 * Allocate the zeroed namespace table (one slot per NSID up to
 * cdata->nn) and populate it from the active namespace list.
 * Returns the result of the scan.
 */
static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
        bool scanned;

        sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
            M_WAITOK | M_ZERO);

        scanned = nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL);
        return (scanned);
}

/*
 * Attach method.  Initializes the softc, establishes the association
 * described by the handoff nvlist in the device's ivars, reads the
 * CAP and VS properties, computes the transfer size limits, creates
 * the SIM, starts AER handling, enumerates namespaces, creates the
 * control device node, and registers the shutdown event handlers.
 *
 * Returns 0 on success.  On any failure everything set up so far is
 * torn down and a non-zero errno value is returned.
 */
static int
nvmf_attach(device_t dev)
{
        struct make_dev_args mda;
        struct nvmf_softc *sc = device_get_softc(dev);
        nvlist_t *nvl = device_get_ivars(dev);
        const nvlist_t * const *io;
        struct sysctl_oid *oid;
        uint64_t mpsmin, val;
        u_int i;
        int error;

        if (nvl == NULL)
                return (ENXIO);

        sc->dev = dev;
        sc->trtype = nvlist_get_number(nvl, "trtype");
        callout_init(&sc->ka_rx_timer, 1);
        callout_init(&sc->ka_tx_timer, 1);
        sx_init(&sc->connection_lock, "nvmf connection");
        TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
        TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
            nvmf_controller_loss_task, sc);
        TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
            nvmf_request_reconnect_task, sc);

        oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
            SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
            CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
        sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

        sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

        nvmf_init_aer(sc);

        error = nvmf_establish_connection(sc, nvl);
        if (error != 0)
                goto out;

        error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
        if (error != 0) {
                device_printf(sc->dev, "Failed to fetch CAP\n");
                error = ENXIO;
                goto out;
        }

        error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
        if (error != 0) {
                device_printf(sc->dev, "Failed to fetch VS\n");
                error = ENXIO;
                goto out;
        }
        sc->vs = val;

        /* Honor MDTS if it is set. */
        mpsmin = (uint64_t)1 << (NVME_MPS_SHIFT +
            NVME_CAP_HI_MPSMIN(sc->cap >> 32));
        sc->max_xfer_size = maxphys;
        if (sc->cdata->mdts != 0) {
                sc->max_xfer_size = ulmin(sc->max_xfer_size,
                    mpsmin << sc->cdata->mdts);
        }

        /* Honor any transfer size restriction imposed by the transport. */
        val = nvmf_max_xfer_size_qp(sc->io[0]);
        if (val >= mpsmin)
                sc->max_xfer_size = ulmin(sc->max_xfer_size,
                    rounddown2(val, mpsmin));

        /* All I/O queues share one size (see nvmf_copyin_handoff). */
        io = nvlist_get_nvlist_array(nvl, "io", NULL);
        sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
            sc->num_io_queues;

        error = nvmf_init_sim(sc);
        if (error != 0)
                goto out;

        error = nvmf_start_aer(sc);
        if (error != 0) {
                nvmf_destroy_sim(sc);
                goto out;
        }

        if (!nvmf_add_namespaces(sc)) {
                /*
                 * 'error' is still 0 from nvmf_start_aer(); report a
                 * real failure so that the torn-down device is not
                 * treated as attached.
                 */
                error = ENXIO;
                nvmf_destroy_sim(sc);
                goto out;
        }

        make_dev_args_init(&mda);
        mda.mda_devsw = &nvmf_cdevsw;
        mda.mda_uid = UID_ROOT;
        mda.mda_gid = GID_WHEEL;
        mda.mda_mode = 0600;
        mda.mda_si_drv1 = sc;
        error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
        if (error != 0) {
                nvmf_destroy_sim(sc);
                goto out;
        }

        sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
            nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
        sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
            nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);

        return (0);
out:
        /* Destroy any namespaces created before the failure. */
        if (sc->ns != NULL) {
                for (i = 0; i < sc->cdata->nn; i++) {
                        if (sc->ns[i] != NULL)
                                nvmf_destroy_ns(sc->ns[i]);
                }
                free(sc->ns, M_NVMF);
        }

        callout_drain(&sc->ka_tx_timer);
        callout_drain(&sc->ka_rx_timer);

        /* Shut the controller down while the admin queue still exists. */
        if (sc->admin != NULL)
                nvmf_shutdown_controller(sc);

        /* I/O queue setup may have stopped part way through. */
        for (i = 0; i < sc->num_io_queues; i++) {
                if (sc->io[i] != NULL)
                        nvmf_destroy_qp(sc->io[i]);
        }
        free(sc->io, M_NVMF);
        if (sc->admin != NULL)
                nvmf_destroy_qp(sc->admin);

        nvmf_destroy_aer(sc);

        taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
        taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
        taskqueue_drain(nvmf_tq, &sc->disconnect_task);
        sx_destroy(&sc->connection_lock);
        nvlist_destroy(sc->rparams);
        free(sc->cdata, M_NVMF);
        return (error);
}

/*
 * Request teardown of the current association.  The work is not done
 * here; it is deferred to nvmf_disconnect_task() on nvmf_tq.
 */
void
nvmf_disconnect(struct nvmf_softc *sc)
{
        taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
}

/*
 * Taskqueue handler that tears down the current association after a
 * transport error or KeepAlive timeout.  Runs with the connection
 * lock held exclusively.  If a reconnect delay or controller loss
 * timeout is configured, the corresponding follow-up work is
 * scheduled.
 */
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
        struct nvmf_softc *sc = arg;
        u_int qi, nsi;

        sx_xlock(&sc->connection_lock);

        /*
         * Ignore transport errors if there is no active
         * association.
         */
        if (sc->admin == NULL)
                goto unlock;

        /*
         * Two cases only shut down the admin queue:
         *
         * - A transport error during detach; this unsticks the
         *   detach process.
         * - A transport error during attach (nvmf_add_namespaces),
         *   before the device node exists.
         */
        if (sc->detaching || sc->cdev == NULL) {
                nvmf_shutdown_qp(sc->admin);
                goto unlock;
        }

        nanotime(&sc->last_disconnect);
        callout_drain(&sc->ka_tx_timer);
        callout_drain(&sc->ka_rx_timer);
        sc->ka_traffic = false;

        /* Quiesce namespace consumers. */
        nvmf_disconnect_sim(sc);
        for (nsi = 0; nsi < sc->cdata->nn; nsi++) {
                if (sc->ns[nsi] == NULL)
                        continue;
                nvmf_disconnect_ns(sc->ns[nsi]);
        }

        /* Shutdown the existing qpairs. */
        for (qi = 0; qi < sc->num_io_queues; qi++)
                nvmf_destroy_qp(sc->io[qi]);
        free(sc->io, M_NVMF);
        sc->io = NULL;
        sc->num_io_queues = 0;
        nvmf_destroy_qp(sc->admin);
        sc->admin = NULL;

        /* Arrange for a reconnect and/or eventual controller loss. */
        if (sc->reconnect_delay != 0)
                nvmf_request_reconnect(sc);
        if (sc->controller_loss_timeout != 0) {
                taskqueue_enqueue_timeout(nvmf_tq,
                    &sc->controller_loss_task, sc->controller_loss_timeout *
                    hz);
        }

unlock:
        sx_xunlock(&sc->connection_lock);
}

/*
 * Timeout task that fires once the controller loss timeout expires.
 * If the association is still down and the device is not already
 * detaching, mark the controller as timed out and delete the device.
 */
static void
nvmf_controller_loss_task(void *arg, int pending)
{
        struct nvmf_softc *sc = arg;
        device_t dev;
        int error;
        bool lost;

        bus_topo_lock();

        sx_xlock(&sc->connection_lock);
        /* Nothing to do if we reconnected or are already detaching. */
        lost = (sc->admin == NULL && !sc->detaching);
        if (lost)
                sc->controller_timedout = true;
        sx_xunlock(&sc->connection_lock);

        if (!lost) {
                bus_topo_unlock();
                return;
        }

        /*
         * XXX: Doing this from here is a bit ugly.  We don't have an
         * extra reference on `dev` but bus_topo_lock should block any
         * concurrent device_delete_child invocations.
         */
        dev = sc->dev;
        error = device_delete_child(root_bus, dev);
        if (error != 0) {
                device_printf(dev,
                    "failed to detach after controller loss: %d\n", error);
        }
        bus_topo_unlock();
}

/*
 * Post a "RECONNECT" devctl notification naming this device and
 * schedule the reconnect-request task to run again after the
 * reconnect delay.  The connection lock must be held.
 */
static void
nvmf_request_reconnect(struct nvmf_softc *sc)
{
        char event[64];

        sx_assert(&sc->connection_lock, SX_LOCKED);

        snprintf(event, sizeof(event), "name=\"%s\"",
            device_get_nameunit(sc->dev));
        devctl_notify("nvme", "controller", "RECONNECT", event);

        taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
            sc->reconnect_delay * hz);
}

/*
 * Timeout task that repeats the reconnect request for as long as the
 * association is down and the device is neither detaching nor timed
 * out.
 */
static void
nvmf_request_reconnect_task(void *arg, int pending)
{
        struct nvmf_softc *sc = arg;

        sx_xlock(&sc->connection_lock);
        /* Skip if reconnected, already detaching, or timed out. */
        if (sc->admin == NULL && !sc->detaching && !sc->controller_timedout)
                nvmf_request_reconnect(sc);
        sx_xunlock(&sc->connection_lock);
}

/*
 * Establish a new association for a disconnected controller from the
 * handoff data supplied in 'nv'.
 *
 * Returns 0 on success, the error from nvmf_copyin_handoff(), EINVAL
 * if the transport type or subsystem NQN does not match the existing
 * device, EBUSY if the device is connected, detaching, or timed out,
 * or the error from establishing the connection or starting AER.
 * The unpacked handoff nvlist is destroyed on every path.
 */
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
        const struct nvme_controller_data *cdata;
        nvlist_t *nvl;
        u_int i;
        int error;

        error = nvmf_copyin_handoff(nv, &nvl);
        if (error != 0)
                return (error);

        /* XXX: Should we permit changing the transport type? */
        if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
                device_printf(sc->dev,
                    "transport type mismatch on reconnect\n");
                /* We own nvl once the copyin succeeds; don't leak it. */
                nvlist_destroy(nvl);
                return (EINVAL);
        }

        sx_xlock(&sc->connection_lock);
        if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
                error = EBUSY;
                goto out;
        }

        /*
         * Ensure this is for the same controller.  Note that the
         * controller ID can vary across associations if the remote
         * system is using the dynamic controller model.  This merely
         * ensures the new association is connected to the same NVMe
         * subsystem.
         */
        cdata = nvlist_get_binary(nvl, "cdata", NULL);
        if (memcmp(sc->cdata->subnqn, cdata->subnqn,
            sizeof(cdata->subnqn)) != 0) {
                device_printf(sc->dev,
                    "controller subsystem NQN mismatch on reconnect\n");
                error = EINVAL;
                goto out;
        }

        /*
         * XXX: Require same number and size of I/O queues so that
         * max_pending_io is still correct?
         */

        error = nvmf_establish_connection(sc, nvl);
        if (error != 0)
                goto out;

        error = nvmf_start_aer(sc);
        if (error != 0)
                goto out;

        device_printf(sc->dev,
            "established new association with %u I/O queues\n",
            sc->num_io_queues);

        /* Restart namespace consumers. */
        for (i = 0; i < sc->cdata->nn; i++) {
                if (sc->ns[i] != NULL)
                        nvmf_reconnect_ns(sc->ns[i]);
        }
        nvmf_reconnect_sim(sc);

        nvmf_rescan_all_ns(sc);

        /* The association is back; stop the pending recovery timers. */
        taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
        taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
out:
        sx_xunlock(&sc->connection_lock);
        nvlist_destroy(nvl);
        return (error);
}

/*
 * shutdown_pre_sync event handler.  Does nothing when RB_NOSYNC is
 * set or the scheduler is stopped.
 */
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
        struct nvmf_softc *sc = arg;
        u_int idx;

        if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
                return;

        /*
         * If this association is disconnected, abort any pending
         * requests with an error to permit filesystems to unmount
         * without hanging.
         */
        sx_xlock(&sc->connection_lock);
        if (sc->admin == NULL && !sc->detaching) {
                for (idx = 0; idx < sc->cdata->nn; idx++) {
                        if (sc->ns[idx] != NULL)
                                nvmf_shutdown_ns(sc->ns[idx]);
                }
                nvmf_shutdown_sim(sc);
        }
        sx_xunlock(&sc->connection_lock);
}

/*
 * shutdown_post_sync event handler.  Does nothing when RB_NOSYNC is
 * set or the scheduler is stopped.  Otherwise a connected association
 * is shut down: the controller is told to shut down, consumers are
 * quiesced, and the queue pairs are destroyed.
 */
static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
        struct nvmf_softc *sc = arg;
        u_int idx;

        if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
                return;

        /*
         * If this association is connected, disconnect gracefully.
         */
        sx_xlock(&sc->connection_lock);
        if (sc->admin == NULL || sc->detaching)
                goto unlock;

        callout_drain(&sc->ka_tx_timer);
        callout_drain(&sc->ka_rx_timer);

        nvmf_shutdown_controller(sc);

        /*
         * Quiesce consumers so that any commands submitted after this
         * fail with an error.  Notably, nda(4) calls nda_flush() from
         * a post_sync handler that might be ordered after this one.
         */
        for (idx = 0; idx < sc->cdata->nn; idx++) {
                if (sc->ns[idx] == NULL)
                        continue;
                nvmf_shutdown_ns(sc->ns[idx]);
        }
        nvmf_shutdown_sim(sc);

        /* Tear down the I/O queues first, then the admin queue. */
        for (idx = 0; idx < sc->num_io_queues; idx++)
                nvmf_destroy_qp(sc->io[idx]);
        nvmf_destroy_qp(sc->admin);
        sc->admin = NULL;

unlock:
        sx_xunlock(&sc->connection_lock);
}

/*
 * Detach method.  Tears down everything nvmf_attach() created: the
 * device node, shutdown event handlers, SIM, namespaces, KeepAlive
 * timers, queue pairs, pending tasks, AER state, and the softc's
 * allocations.  Always returns 0.
 */
static int
nvmf_detach(device_t dev)
{
        struct nvmf_softc *sc = device_get_softc(dev);
        u_int i;

        destroy_dev(sc->cdev);

        /*
         * Publish the detaching flag under the connection lock; the
         * disconnect, reconnect, controller loss, and shutdown
         * handlers all test it while holding that lock.
         */
        sx_xlock(&sc->connection_lock);
        sc->detaching = true;
        sx_xunlock(&sc->connection_lock);

        EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
        EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

        /* Remove the SIM and every namespace object. */
        nvmf_destroy_sim(sc);
        for (i = 0; i < sc->cdata->nn; i++) {
                if (sc->ns[i] != NULL)
                        nvmf_destroy_ns(sc->ns[i]);
        }
        free(sc->ns, M_NVMF);

        callout_drain(&sc->ka_tx_timer);
        callout_drain(&sc->ka_rx_timer);

        /* Only possible while an association (admin queue) exists. */
        if (sc->admin != NULL)
                nvmf_shutdown_controller(sc);

        for (i = 0; i < sc->num_io_queues; i++) {
                nvmf_destroy_qp(sc->io[i]);
        }
        free(sc->io, M_NVMF);

        /*
         * Wait for queued work.  A timeout task is drained only when
         * the cancel attempt returns non-zero.
         */
        taskqueue_drain(nvmf_tq, &sc->disconnect_task);
        if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
            NULL) != 0)
                taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);

        /*
         * Don't cancel/drain the controller loss task if that task
         * has fired and is triggering the detach.
         */
        if (!sc->controller_timedout) {
                if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
                    NULL) != 0)
                        taskqueue_drain_timeout(nvmf_tq,
                            &sc->controller_loss_task);
        }

        /* The admin queue is destroyed only after the tasks are drained. */
        if (sc->admin != NULL)
                nvmf_destroy_qp(sc->admin);

        nvmf_destroy_aer(sc);

        sx_destroy(&sc->connection_lock);
        nvlist_destroy(sc->rparams);
        free(sc->cdata, M_NVMF);
        return (0);
}

/*
 * Reconcile the namespace object for 'nsid' with freshly fetched
 * IDENTIFY data, then notify the SIM:
 *
 * - no object, non-zero size:  create one
 * - object exists, zero size:  destroy it
 * - object exists otherwise:   update it, destroying it if the
 *                              update is rejected
 */
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
        struct nvmf_namespace **slot;
        struct nvmf_namespace *cur;

        /* XXX: Needs locking around sc->ns[]. */
        slot = &sc->ns[nsid - 1];
        cur = *slot;

        if (cur == NULL) {
                /* XXX: Needs locking */
                if (data->nsze != 0)
                        *slot = nvmf_init_ns(sc, nsid, data);
        } else if (data->nsze == 0 || !nvmf_update_ns(cur, data)) {
                /* XXX: Needs locking */
                nvmf_destroy_ns(cur);
                *slot = NULL;
        }

        nvmf_sim_rescan_ns(sc, nsid);
}

void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
        struct nvmf_completion_status status;
        struct nvme_namespace_data *data;

        data = malloc(sizeof(*data), M_NVMF, M_WAITOK);

        nvmf_status_init(&status);
        nvmf_status_wait_io(&status);
        if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
            &status, nvmf_io_complete, &status, M_WAITOK)) {
                device_printf(sc->dev,
                    "failed to send IDENTIFY namespace %u command\n", nsid);
                free(data, M_NVMF);
                return;
        }
        nvmf_wait_for_reply(&status);

        if (status.cqe.status != 0) {
                device_printf(sc->dev,
                    "IDENTIFY namespace %u failed, status %#x\n", nsid,
                    le16toh(status.cqe.status));
                free(data, M_NVMF);
                return;
        }

        if (status.io_error != 0) {
                device_printf(sc->dev,
                    "IDENTIFY namespace %u failed with I/O error %d\n",
                    nsid, status.io_error);
                free(data, M_NVMF);
                return;
        }

        nvme_namespace_data_swapbytes(data);

        nvmf_rescan_ns_1(sc, nsid, data);

        free(data, M_NVMF);
}

/*
 * Tear down every namespace whose ID lies in the half-open range
 * [first_nsid, next_valid_nsid).
 *
 * For each ID in the range that has a non-NULL entry in sc->ns[], the
 * namespace is destroyed, its slot is cleared, and
 * nvmf_sim_rescan_ns() is called for that ID.  IDs with no entry are
 * skipped.  Namespace IDs are 1-based while sc->ns[] is 0-based.
 */
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
        struct nvmf_namespace *stale;
        uint32_t nsid;

        for (nsid = first_nsid; nsid < next_valid_nsid; nsid++) {
                /* XXX: Needs locking around sc->ns[]. */
                stale = sc->ns[nsid - 1];
                if (stale == NULL)
                        continue;

                nvmf_destroy_ns(stale);
                sc->ns[nsid - 1] = NULL;

                nvmf_sim_rescan_ns(sc, nsid);
        }
}

/*
 * Per-namespace callback passed to nvmf_scan_active_namespaces() by
 * nvmf_rescan_all_ns().
 *
 * arg points to the ID of the previously reported namespace (0 before
 * the first call).  Namespaces with IDs strictly between that value
 * and nsid are purged, the tracker is advanced to nsid, and the
 * reported namespace is refreshed from data.
 *
 * Always returns true.
 */
static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
        uint32_t *prev_nsid = arg;
        uint32_t gap_start;

        /* Everything in [gap_start, nsid) was not reported by the scan. */
        gap_start = *prev_nsid + 1;
        *prev_nsid = nsid;
        nvmf_purge_namespaces(sc, gap_start, nsid);

        nvmf_rescan_ns_1(sc, nsid, data);
        return (true);
}

/*
 * Rescan every namespace of the controller.
 *
 * nvmf_scan_active_namespaces() invokes nvmf_rescan_ns_cb() for each
 * namespace it reports; that callback records the last ID seen.  If
 * the scan returns true, any namespace entries with IDs above the last
 * one seen, up to and including sc->cdata->nn, are purged.  If the
 * scan returns false nothing further is done.
 */
void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
        uint32_t highest_seen = 0;

        if (nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb,
            &highest_seen)) {
                /*
                 * Drop any namespace devices that lie beyond the last
                 * namespace reported by the scan.
                 */
                nvmf_purge_namespaces(sc, highest_seen + 1,
                    sc->cdata->nn + 1);
        }
}

/*
 * Submit a user-supplied NVMe passthrough command and wait for it to
 * complete.
 *
 * sc:    controller state; sc->max_xfer_size bounds the data length.
 * pt:    passthrough request.  pt->cmd supplies the opc, fuse, nsid
 *        and cdw10-cdw15 fields; pt->buf and pt->len describe the user
 *        data buffer and pt->is_read selects the direction.  Once the
 *        command has been submitted, pt->cpl is cleared and then
 *        filled with cdw0 and status from the completion entry.
 * admin: true to use the admin queue (sc->admin), false to use the
 *        queue returned by nvmf_select_io_queue().
 *
 * Returns EINVAL if pt->len exceeds sc->max_xfer_size, the copyin()
 * error if the user buffer cannot be read, and ECONNABORTED if there
 * is no admin queue, the controller is detaching, or no request could
 * be allocated.  Otherwise returns status.io_error, or the result of
 * copyout() for a read with no I/O error.
 *
 * A non-zero NVMe status in pt->cpl.status does not by itself produce
 * a non-zero return value.
 */
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
    bool admin)
{
        struct nvmf_completion_status status;
        struct nvme_command cmd;
        struct memdesc mem;
        struct nvmf_host_qpair *qp;
        struct nvmf_request *req;
        void *buf;
        int error;

        if (pt->len > sc->max_xfer_size)
                return (EINVAL);

        /* buf stays NULL when the command carries no data. */
        buf = NULL;
        if (pt->len != 0) {
                /*
                 * XXX: Depending on the size we may want to pin the
                 * user pages and use a memdesc with vm_page_t's
                 * instead.
                 */
                buf = malloc(pt->len, M_NVMF, M_WAITOK);
                if (pt->is_read == 0) {
                        /* Write: stage the user's data in the kernel buffer. */
                        error = copyin(pt->buf, buf, pt->len);
                        if (error != 0) {
                                free(buf, M_NVMF);
                                return (error);
                        }
                } else {
                        /* Ensure no kernel data is leaked to userland. */
                        memset(buf, 0, pt->len);
                }
        }

        /*
         * Build the command from scratch, copying only the fields
         * listed below from the user's request; everything else stays
         * zero.
         */
        memset(&cmd, 0, sizeof(cmd));
        cmd.opc = pt->cmd.opc;
        cmd.fuse = pt->cmd.fuse;
        cmd.nsid = pt->cmd.nsid;
        cmd.cdw10 = pt->cmd.cdw10;
        cmd.cdw11 = pt->cmd.cdw11;
        cmd.cdw12 = pt->cmd.cdw12;
        cmd.cdw13 = pt->cmd.cdw13;
        cmd.cdw14 = pt->cmd.cdw14;
        cmd.cdw15 = pt->cmd.cdw15;

        /*
         * Queue selection and request allocation happen under the
         * shared connection lock; the lock is dropped before the
         * request is submitted and waited on.
         */
        sx_slock(&sc->connection_lock);
        if (sc->admin == NULL || sc->detaching) {
                device_printf(sc->dev,
                    "failed to send passthrough command\n");
                error = ECONNABORTED;
                sx_sunlock(&sc->connection_lock);
                goto error;
        }
        if (admin)
                qp = sc->admin;
        else
                qp = nvmf_select_io_queue(sc);
        /* status must be initialized before it is handed to the request. */
        nvmf_status_init(&status);
        req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
        sx_sunlock(&sc->connection_lock);
        if (req == NULL) {
                device_printf(sc->dev, "failed to send passthrough command\n");
                error = ECONNABORTED;
                goto error;
        }

        if (pt->len != 0) {
                /*
                 * Attach the kernel buffer to the capsule; the fourth
                 * argument is true for writes (is_read == 0).  The
                 * status is also marked as expecting an I/O completion.
                 */
                mem = memdesc_vaddr(buf, pt->len);
                nvmf_capsule_append_data(req->nc, &mem, pt->len,
                    pt->is_read == 0, nvmf_io_complete, &status);
                nvmf_status_wait_io(&status);
        }

        nvmf_submit_request(req);
        nvmf_wait_for_reply(&status);

        /* Report only cdw0 and status back to the caller. */
        memset(&pt->cpl, 0, sizeof(pt->cpl));
        pt->cpl.cdw0 = status.cqe.cdw0;
        pt->cpl.status = status.cqe.status;

        /* Copy data back to userland only for reads with no I/O error. */
        error = status.io_error;
        if (error == 0 && pt->len != 0 && pt->is_read != 0)
                error = copyout(buf, pt->buf, pt->len);
        /* Common exit: the label shares its name with the error variable. */
error:
        free(buf, M_NVMF);
        return (error);
}

/*
 * Pack sc->rparams into the caller's nvmf_ioc_nv descriptor.
 *
 * Returns the result of nvmf_pack_ioc_nvlist().
 */
static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
        int rc;

        /* Hold the connection lock shared while sc->rparams is read. */
        sx_slock(&sc->connection_lock);
        rc = nvmf_pack_ioc_nvlist(sc->rparams, nv);
        sx_sunlock(&sc->connection_lock);

        return (rc);
}

/*
 * Report the current connection state to userland.
 *
 * Builds an nvlist containing:
 *   "connected"        bool, true when sc->admin is non-NULL
 *   "last_disconnect"  nested nvlist with "tv_sec" and "tv_nsec" taken
 *                      from sc->last_disconnect
 * and packs it into nv.
 *
 * Returns the result of nvmf_pack_ioc_nvlist().
 */
static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
        nvlist_t *reply = nvlist_create(0);
        nvlist_t *stamp = nvlist_create(0);
        int rc;

        /* Sample the admin queue and timestamp under the connection lock. */
        sx_slock(&sc->connection_lock);
        nvlist_add_bool(reply, "connected", sc->admin != NULL);
        nvlist_add_number(stamp, "tv_sec", sc->last_disconnect.tv_sec);
        nvlist_add_number(stamp, "tv_nsec", sc->last_disconnect.tv_nsec);
        sx_sunlock(&sc->connection_lock);

        /*
         * stamp is handed to reply by nvlist_move_nvlist(); only reply
         * is destroyed below.
         */
        nvlist_move_nvlist(reply, "last_disconnect", stamp);

        rc = nvmf_pack_ioc_nvlist(reply, nv);
        nvlist_destroy(reply);
        return (rc);
}

/*
 * ioctl handler for the character device described by nvmf_cdevsw.
 *
 * The softc is taken from cdev->si_drv1.  arg points to the
 * command-specific argument structure.  flag and td are unused.
 *
 * Returns 0 or the error from the command-specific helper, and ENOTTY
 * for any command not listed here.
 */
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
        struct nvmf_softc *sc = cdev->si_drv1;
        int error = 0;

        switch (cmd) {
        case NVME_PASSTHROUGH_CMD:
                /* Passthrough on this device always uses the admin queue. */
                error = nvmf_passthrough_cmd(sc,
                    (struct nvme_pt_command *)arg, true);
                break;
        case NVME_GET_NSID: {
                struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;

                /* Report this device's name with a namespace ID of 0. */
                strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
                    sizeof(gnsid->cdev));
                gnsid->nsid = 0;
                break;
        }
        case NVME_GET_MAX_XFER_SIZE:
                *(uint64_t *)arg = sc->max_xfer_size;
                break;
        case NVME_GET_CONTROLLER_DATA:
                memcpy(arg, sc->cdata, sizeof(*sc->cdata));
                break;
        case DIOCGIDENT:
                nvme_cdata_get_disk_ident(sc->cdata, (uint8_t *)arg);
                break;
        case NVMF_RECONNECT_PARAMS:
                error = nvmf_reconnect_params(sc, (struct nvmf_ioc_nv *)arg);
                break;
        case NVMF_RECONNECT_HOST:
                error = nvmf_reconnect_host(sc, (struct nvmf_ioc_nv *)arg);
                break;
        case NVMF_CONNECTION_STATUS:
                error = nvmf_connection_status(sc, (struct nvmf_ioc_nv *)arg);
                break;
        default:
                error = ENOTTY;
                break;
        }

        return (error);
}

/*
 * Character device switch for nvmf device nodes.  Only the ioctl entry
 * point is provided; all other cdevsw methods are left unset.
 */
static struct cdevsw nvmf_cdevsw = {
        .d_version = D_VERSION,
        .d_ioctl = nvmf_ioctl
};

/*
 * Module event handler.
 *
 * MOD_LOAD:    runs nvmf_ctl_load(); on success creates the nvmf
 *              taskqueue and starts one thread for it.
 * MOD_QUIESCE: always permitted.
 * MOD_UNLOAD:  runs nvmf_ctl_unload(), drains devices that use
 *              nvmf_cdevsw, and frees the taskqueue if it was created.
 *
 * Returns 0 on success, the error from nvmf_ctl_load() if loading
 * fails, or EOPNOTSUPP for any other event.
 */
static int
nvmf_modevent(module_t mod, int what, void *arg)
{
        int error = 0;

        switch (what) {
        case MOD_LOAD:
                error = nvmf_ctl_load();
                if (error == 0) {
                        nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
                            taskqueue_thread_enqueue, &nvmf_tq);
                        taskqueue_start_threads(&nvmf_tq, 1, PWAIT,
                            "nvmf taskq");
                }
                break;
        case MOD_QUIESCE:
                break;
        case MOD_UNLOAD:
                nvmf_ctl_unload();
                destroy_dev_drain(&nvmf_cdevsw);
                if (nvmf_tq != NULL)
                        taskqueue_free(nvmf_tq);
                break;
        default:
                error = EOPNOTSUPP;
                break;
        }

        return (error);
}

/*
 * Device method table for the driver: probe, attach and detach are the
 * only methods implemented.
 */
static device_method_t nvmf_methods[] = {
        /* Device interface */
        DEVMETHOD(device_probe,     nvmf_probe),
        DEVMETHOD(device_attach,    nvmf_attach),
        DEVMETHOD(device_detach,    nvmf_detach),
        DEVMETHOD_END
};

/*
 * Driver description: the driver name ("nvme"), its method table, and
 * the size of the per-device softc.  Not static, so it is visible
 * outside this file.
 */
driver_t nvme_nvmf_driver = {
        "nvme",
        nvmf_methods,
        sizeof(struct nvmf_softc),
};

/*
 * Register the driver under the name "nvme" on the root bus with
 * nvmf_modevent as its module event handler, and declare a dependency
 * on the nvmf_transport module (version 1).
 */
DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);