#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>
static struct cdevsw nvmf_cdevsw;
static struct taskqueue *nvmf_tq;
bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
&nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
static void nvmf_controller_loss_task(void *arg, int pending);
static void nvmf_disconnect_task(void *arg, int pending);
static void nvmf_request_reconnect(struct nvmf_softc *sc);
static void nvmf_request_reconnect_task(void *arg, int pending);
static void nvmf_shutdown_pre_sync(void *arg, int howto);
static void nvmf_shutdown_post_sync(void *arg, int howto);
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
struct nvmf_completion_status *status = arg;
struct mtx *mtx;
status->cqe = *cqe;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
status->done = true;
mtx_unlock(mtx);
wakeup(status);
}
void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
struct nvmf_completion_status *status = arg;
struct mtx *mtx;
status->io_error = error;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
status->io_done = true;
mtx_unlock(mtx);
wakeup(status);
}
void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
struct mtx *mtx;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
while (!status->done || !status->io_done)
mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
mtx_unlock(mtx);
}
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t *value)
{
const struct nvmf_fabric_prop_get_rsp *rsp;
struct nvmf_completion_status status;
nvmf_status_init(&status);
if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
M_WAITOK))
return (ECONNABORTED);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
if (size == 8)
*value = le64toh(rsp->value.u64);
else
*value = le32toh(rsp->value.u32.low);
return (0);
}
static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t value)
{
struct nvmf_completion_status status;
nvmf_status_init(&status);
if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
M_WAITOK))
return (ECONNABORTED);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
return (0);
}
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
uint64_t cc;
int error;
error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
if (error != 0) {
device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
return;
}
cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
if (error != 0)
device_printf(sc->dev,
"Failed to set CC to trigger shutdown\n");
}
static void
nvmf_check_keep_alive(void *arg)
{
struct nvmf_softc *sc = arg;
int traffic;
traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
if (traffic == 0) {
device_printf(sc->dev,
"disconnecting due to KeepAlive timeout\n");
nvmf_disconnect(sc);
return;
}
callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}
static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
struct nvmf_softc *sc = arg;
atomic_store_int(&sc->ka_active_rx_traffic, 1);
if (cqe->status != 0) {
device_printf(sc->dev,
"KeepAlive response reported status %#x\n",
le16toh(cqe->status));
}
}
static void
nvmf_send_keep_alive(void *arg)
{
struct nvmf_softc *sc = arg;
int traffic;
traffic = atomic_load_int(&sc->ka_active_tx_traffic);
if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
sc, M_NOWAIT))
device_printf(sc->dev,
"Failed to allocate KeepAlive command\n");
atomic_store_int(&sc->ka_active_tx_traffic, 0);
callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
const struct nvme_discovery_log_entry *dle;
const struct nvme_controller_data *cdata;
const nvlist_t *const *io;
const nvlist_t *admin, *rparams;
nvlist_t *nvl;
size_t i, num_io_queues;
uint32_t qsize;
int error;
error = nvmf_unpack_ioc_nvlist(nv, &nvl);
if (error != 0)
return (error);
if (!nvlist_exists_number(nvl, "trtype") ||
!nvlist_exists_nvlist(nvl, "admin") ||
!nvlist_exists_nvlist_array(nvl, "io") ||
!nvlist_exists_binary(nvl, "cdata") ||
!nvlist_exists_nvlist(nvl, "rparams"))
goto invalid;
rparams = nvlist_get_nvlist(nvl, "rparams");
if (!nvlist_exists_binary(rparams, "dle") ||
!nvlist_exists_string(rparams, "hostnqn") ||
!nvlist_exists_number(rparams, "num_io_queues") ||
!nvlist_exists_number(rparams, "io_qsize"))
goto invalid;
admin = nvlist_get_nvlist(nvl, "admin");
if (!nvmf_validate_qpair_nvlist(admin, false))
goto invalid;
if (!nvlist_get_bool(admin, "admin"))
goto invalid;
io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
if (num_io_queues < 1 ||
num_io_queues != nvlist_get_number(rparams, "num_io_queues"))
goto invalid;
for (i = 0; i < num_io_queues; i++) {
if (!nvmf_validate_qpair_nvlist(io[i], false))
goto invalid;
}
qsize = nvlist_get_number(rparams, "io_qsize");
for (i = 0; i < num_io_queues; i++) {
if (nvlist_get_number(io[i], "qsize") != qsize)
goto invalid;
}
cdata = nvlist_get_binary(nvl, "cdata", &i);
if (i != sizeof(*cdata))
goto invalid;
dle = nvlist_get_binary(rparams, "dle", &i);
if (i != sizeof(*dle))
goto invalid;
if (memcmp(dle->subnqn, cdata->subnqn, sizeof(cdata->subnqn)) != 0)
goto invalid;
*nvlp = nvl;
return (0);
invalid:
nvlist_destroy(nvl);
return (EINVAL);
}
static int
nvmf_probe(device_t dev)
{
const nvlist_t *nvl = device_get_ivars(dev);
const struct nvme_controller_data *cdata;
if (nvl == NULL)
return (ENXIO);
cdata = nvlist_get_binary(nvl, "cdata", NULL);
device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);
return (BUS_PROBE_DEFAULT);
}
static int
nvmf_establish_connection(struct nvmf_softc *sc, nvlist_t *nvl)
{
const nvlist_t *const *io;
const nvlist_t *admin;
uint64_t kato;
size_t num_io_queues;
enum nvmf_trtype trtype;
char name[16];
trtype = nvlist_get_number(nvl, "trtype");
admin = nvlist_get_nvlist(nvl, "admin");
io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
kato = dnvlist_get_number(nvl, "kato", 0);
sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
sc->controller_loss_timeout = dnvlist_get_number(nvl,
"controller_loss_timeout", 0);
sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
if (sc->admin == NULL) {
device_printf(sc->dev, "Failed to setup admin queue\n");
return (ENXIO);
}
sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
M_WAITOK | M_ZERO);
sc->num_io_queues = num_io_queues;
for (u_int i = 0; i < sc->num_io_queues; i++) {
snprintf(name, sizeof(name), "I/O queue %u", i);
sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
if (sc->io[i] == NULL) {
device_printf(sc->dev, "Failed to setup I/O queue %u\n",
i);
return (ENXIO);
}
}
if (kato != 0) {
sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
sc->cdata->ctratt) != 0;
sc->ka_rx_sbt = mstosbt(kato);
sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
nvmf_check_keep_alive, sc, C_HARDCLOCK);
callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
nvmf_send_keep_alive, sc, C_HARDCLOCK);
}
memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
sizeof(*sc->cdata));
nvlist_destroy(sc->rparams);
sc->rparams = nvlist_take_nvlist(nvl, "rparams");
return (0);
}
typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
const struct nvme_namespace_data *, void *);
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
struct nvme_namespace_data *data, uint32_t *nsidp,
nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
struct nvmf_completion_status status;
uint32_t nsid;
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY active namespaces command\n");
return (false);
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY active namespaces failed, status %#x\n",
le16toh(status.cqe.status));
return (false);
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY active namespaces failed with I/O error %d\n",
status.io_error);
return (false);
}
for (u_int i = 0; i < nitems(nslist->ns); i++) {
nsid = nslist->ns[i];
if (nsid == 0) {
*nsidp = 0;
return (true);
}
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
&status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY namespace %u command\n",
nsid);
return (false);
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed, status %#x\n", nsid,
le16toh(status.cqe.status));
return (false);
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed with I/O error %d\n",
nsid, status.io_error);
return (false);
}
nvme_namespace_data_swapbytes(data);
if (!cb(sc, nsid, data, cb_arg))
return (false);
}
MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1)
*nsidp = 0;
else
*nsidp = nsid;
return (true);
}
static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
void *cb_arg)
{
struct nvme_namespace_data *data;
struct nvme_ns_list *nslist;
uint32_t nsid;
bool retval;
nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
nsid = 0;
retval = true;
for (;;) {
if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb,
cb_arg)) {
retval = false;
break;
}
if (nsid == 0)
break;
}
free(data, M_NVMF);
free(nslist, M_NVMF);
return (retval);
}
static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
const struct nvme_namespace_data *data, void *arg __unused)
{
if (sc->ns[nsid - 1] != NULL) {
device_printf(sc->dev,
"duplicate namespace %u in active namespace list\n",
nsid);
return (false);
}
if (data->nsze == 0) {
device_printf(sc->dev,
"ignoring active namespace %u with zero size\n", nsid);
return (true);
}
sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
nvmf_sim_rescan_ns(sc, nsid);
return (true);
}
static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
M_WAITOK | M_ZERO);
return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL));
}
static int
nvmf_attach(device_t dev)
{
struct make_dev_args mda;
struct nvmf_softc *sc = device_get_softc(dev);
nvlist_t *nvl = device_get_ivars(dev);
const nvlist_t * const *io;
struct sysctl_oid *oid;
uint64_t mpsmin, val;
u_int i;
int error;
if (nvl == NULL)
return (ENXIO);
sc->dev = dev;
sc->trtype = nvlist_get_number(nvl, "trtype");
callout_init(&sc->ka_rx_timer, 1);
callout_init(&sc->ka_tx_timer, 1);
sx_init(&sc->connection_lock, "nvmf connection");
TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
nvmf_controller_loss_task, sc);
TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
nvmf_request_reconnect_task, sc);
oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
sc->ioq_oid_list = SYSCTL_CHILDREN(oid);
sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);
nvmf_init_aer(sc);
error = nvmf_establish_connection(sc, nvl);
if (error != 0)
goto out;
error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
if (error != 0) {
device_printf(sc->dev, "Failed to fetch CAP\n");
error = ENXIO;
goto out;
}
error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
if (error != 0) {
device_printf(sc->dev, "Failed to fetch VS\n");
error = ENXIO;
goto out;
}
sc->vs = val;
mpsmin = (uint64_t)1 << (NVME_MPS_SHIFT +
NVME_CAP_HI_MPSMIN(sc->cap >> 32));
sc->max_xfer_size = maxphys;
if (sc->cdata->mdts != 0) {
sc->max_xfer_size = ulmin(sc->max_xfer_size,
mpsmin << sc->cdata->mdts);
}
val = nvmf_max_xfer_size_qp(sc->io[0]);
if (val >= mpsmin)
sc->max_xfer_size = ulmin(sc->max_xfer_size,
rounddown2(val, mpsmin));
io = nvlist_get_nvlist_array(nvl, "io", NULL);
sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
sc->num_io_queues;
error = nvmf_init_sim(sc);
if (error != 0)
goto out;
error = nvmf_start_aer(sc);
if (error != 0) {
nvmf_destroy_sim(sc);
goto out;
}
if (!nvmf_add_namespaces(sc)) {
nvmf_destroy_sim(sc);
goto out;
}
make_dev_args_init(&mda);
mda.mda_devsw = &nvmf_cdevsw;
mda.mda_uid = UID_ROOT;
mda.mda_gid = GID_WHEEL;
mda.mda_mode = 0600;
mda.mda_si_drv1 = sc;
error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
if (error != 0) {
nvmf_destroy_sim(sc);
goto out;
}
sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_LAST);
return (0);
out:
if (sc->ns != NULL) {
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_destroy_ns(sc->ns[i]);
}
free(sc->ns, M_NVMF);
}
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
if (sc->admin != NULL)
nvmf_shutdown_controller(sc);
for (i = 0; i < sc->num_io_queues; i++) {
if (sc->io[i] != NULL)
nvmf_destroy_qp(sc->io[i]);
}
free(sc->io, M_NVMF);
if (sc->admin != NULL)
nvmf_destroy_qp(sc->admin);
nvmf_destroy_aer(sc);
taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
taskqueue_drain(nvmf_tq, &sc->disconnect_task);
sx_destroy(&sc->connection_lock);
nvlist_destroy(sc->rparams);
free(sc->cdata, M_NVMF);
return (error);
}
void
nvmf_disconnect(struct nvmf_softc *sc)
{
taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
}
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
struct nvmf_softc *sc = arg;
u_int i;
sx_xlock(&sc->connection_lock);
if (sc->admin == NULL) {
sx_xunlock(&sc->connection_lock);
return;
}
if (sc->detaching) {
if (sc->admin != NULL) {
nvmf_shutdown_qp(sc->admin);
}
sx_xunlock(&sc->connection_lock);
return;
}
if (sc->cdev == NULL) {
nvmf_shutdown_qp(sc->admin);
sx_xunlock(&sc->connection_lock);
return;
}
nanotime(&sc->last_disconnect);
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
sc->ka_traffic = false;
nvmf_disconnect_sim(sc);
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_disconnect_ns(sc->ns[i]);
}
for (i = 0; i < sc->num_io_queues; i++) {
nvmf_destroy_qp(sc->io[i]);
}
free(sc->io, M_NVMF);
sc->io = NULL;
sc->num_io_queues = 0;
nvmf_destroy_qp(sc->admin);
sc->admin = NULL;
if (sc->reconnect_delay != 0)
nvmf_request_reconnect(sc);
if (sc->controller_loss_timeout != 0)
taskqueue_enqueue_timeout(nvmf_tq,
&sc->controller_loss_task, sc->controller_loss_timeout *
hz);
sx_xunlock(&sc->connection_lock);
}
static void
nvmf_controller_loss_task(void *arg, int pending)
{
struct nvmf_softc *sc = arg;
device_t dev;
int error;
bus_topo_lock();
sx_xlock(&sc->connection_lock);
if (sc->admin != NULL || sc->detaching) {
sx_xunlock(&sc->connection_lock);
bus_topo_unlock();
return;
}
sc->controller_timedout = true;
sx_xunlock(&sc->connection_lock);
dev = sc->dev;
error = device_delete_child(root_bus, dev);
if (error != 0)
device_printf(dev,
"failed to detach after controller loss: %d\n", error);
bus_topo_unlock();
}
static void
nvmf_request_reconnect(struct nvmf_softc *sc)
{
char buf[64];
sx_assert(&sc->connection_lock, SX_LOCKED);
snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev));
devctl_notify("nvme", "controller", "RECONNECT", buf);
taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
sc->reconnect_delay * hz);
}
static void
nvmf_request_reconnect_task(void *arg, int pending)
{
struct nvmf_softc *sc = arg;
sx_xlock(&sc->connection_lock);
if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
sx_xunlock(&sc->connection_lock);
return;
}
nvmf_request_reconnect(sc);
sx_xunlock(&sc->connection_lock);
}
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
const struct nvme_controller_data *cdata;
nvlist_t *nvl;
u_int i;
int error;
error = nvmf_copyin_handoff(nv, &nvl);
if (error != 0)
return (error);
if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
device_printf(sc->dev,
"transport type mismatch on reconnect\n");
return (EINVAL);
}
sx_xlock(&sc->connection_lock);
if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
error = EBUSY;
goto out;
}
cdata = nvlist_get_binary(nvl, "cdata", NULL);
if (memcmp(sc->cdata->subnqn, cdata->subnqn,
sizeof(cdata->subnqn)) != 0) {
device_printf(sc->dev,
"controller subsystem NQN mismatch on reconnect\n");
error = EINVAL;
goto out;
}
error = nvmf_establish_connection(sc, nvl);
if (error != 0)
goto out;
error = nvmf_start_aer(sc);
if (error != 0)
goto out;
device_printf(sc->dev,
"established new association with %u I/O queues\n",
sc->num_io_queues);
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_reconnect_ns(sc->ns[i]);
}
nvmf_reconnect_sim(sc);
nvmf_rescan_all_ns(sc);
taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
out:
sx_xunlock(&sc->connection_lock);
nvlist_destroy(nvl);
return (error);
}
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
struct nvmf_softc *sc = arg;
if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
return;
sx_xlock(&sc->connection_lock);
if (sc->admin != NULL || sc->detaching) {
sx_xunlock(&sc->connection_lock);
return;
}
for (u_int i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_shutdown_ns(sc->ns[i]);
}
nvmf_shutdown_sim(sc);
sx_xunlock(&sc->connection_lock);
}
static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
struct nvmf_softc *sc = arg;
if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
return;
sx_xlock(&sc->connection_lock);
if (sc->admin == NULL || sc->detaching) {
sx_xunlock(&sc->connection_lock);
return;
}
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
nvmf_shutdown_controller(sc);
for (u_int i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_shutdown_ns(sc->ns[i]);
}
nvmf_shutdown_sim(sc);
for (u_int i = 0; i < sc->num_io_queues; i++) {
nvmf_destroy_qp(sc->io[i]);
}
nvmf_destroy_qp(sc->admin);
sc->admin = NULL;
sx_xunlock(&sc->connection_lock);
}
static int
nvmf_detach(device_t dev)
{
struct nvmf_softc *sc = device_get_softc(dev);
u_int i;
destroy_dev(sc->cdev);
sx_xlock(&sc->connection_lock);
sc->detaching = true;
sx_xunlock(&sc->connection_lock);
EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);
nvmf_destroy_sim(sc);
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_destroy_ns(sc->ns[i]);
}
free(sc->ns, M_NVMF);
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
if (sc->admin != NULL)
nvmf_shutdown_controller(sc);
for (i = 0; i < sc->num_io_queues; i++) {
nvmf_destroy_qp(sc->io[i]);
}
free(sc->io, M_NVMF);
taskqueue_drain(nvmf_tq, &sc->disconnect_task);
if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
NULL) != 0)
taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
if (!sc->controller_timedout) {
if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
NULL) != 0)
taskqueue_drain_timeout(nvmf_tq,
&sc->controller_loss_task);
}
if (sc->admin != NULL)
nvmf_destroy_qp(sc->admin);
nvmf_destroy_aer(sc);
sx_destroy(&sc->connection_lock);
nvlist_destroy(sc->rparams);
free(sc->cdata, M_NVMF);
return (0);
}
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
const struct nvme_namespace_data *data)
{
struct nvmf_namespace *ns;
ns = sc->ns[nsid - 1];
if (data->nsze == 0) {
if (ns != NULL) {
nvmf_destroy_ns(ns);
sc->ns[nsid - 1] = NULL;
}
} else {
if (ns == NULL) {
sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
} else {
if (!nvmf_update_ns(ns, data)) {
nvmf_destroy_ns(ns);
sc->ns[nsid - 1] = NULL;
}
}
}
nvmf_sim_rescan_ns(sc, nsid);
}
void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
struct nvmf_completion_status status;
struct nvme_namespace_data *data;
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
&status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY namespace %u command\n", nsid);
free(data, M_NVMF);
return;
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed, status %#x\n", nsid,
le16toh(status.cqe.status));
free(data, M_NVMF);
return;
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed with I/O error %d\n",
nsid, status.io_error);
free(data, M_NVMF);
return;
}
nvme_namespace_data_swapbytes(data);
nvmf_rescan_ns_1(sc, nsid, data);
free(data, M_NVMF);
}
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
uint32_t next_valid_nsid)
{
struct nvmf_namespace *ns;
for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++)
{
ns = sc->ns[nsid - 1];
if (ns != NULL) {
nvmf_destroy_ns(ns);
sc->ns[nsid - 1] = NULL;
nvmf_sim_rescan_ns(sc, nsid);
}
}
}
static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
const struct nvme_namespace_data *data, void *arg)
{
uint32_t *last_nsid = arg;
nvmf_purge_namespaces(sc, *last_nsid + 1, nsid);
*last_nsid = nsid;
nvmf_rescan_ns_1(sc, nsid, data);
return (true);
}
void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
uint32_t last_nsid;
last_nsid = 0;
if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid))
return;
nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1);
}
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
bool admin)
{
struct nvmf_completion_status status;
struct nvme_command cmd;
struct memdesc mem;
struct nvmf_host_qpair *qp;
struct nvmf_request *req;
void *buf;
int error;
if (pt->len > sc->max_xfer_size)
return (EINVAL);
buf = NULL;
if (pt->len != 0) {
buf = malloc(pt->len, M_NVMF, M_WAITOK);
if (pt->is_read == 0) {
error = copyin(pt->buf, buf, pt->len);
if (error != 0) {
free(buf, M_NVMF);
return (error);
}
} else {
memset(buf, 0, pt->len);
}
}
memset(&cmd, 0, sizeof(cmd));
cmd.opc = pt->cmd.opc;
cmd.fuse = pt->cmd.fuse;
cmd.nsid = pt->cmd.nsid;
cmd.cdw10 = pt->cmd.cdw10;
cmd.cdw11 = pt->cmd.cdw11;
cmd.cdw12 = pt->cmd.cdw12;
cmd.cdw13 = pt->cmd.cdw13;
cmd.cdw14 = pt->cmd.cdw14;
cmd.cdw15 = pt->cmd.cdw15;
sx_slock(&sc->connection_lock);
if (sc->admin == NULL || sc->detaching) {
device_printf(sc->dev,
"failed to send passthrough command\n");
error = ECONNABORTED;
sx_sunlock(&sc->connection_lock);
goto error;
}
if (admin)
qp = sc->admin;
else
qp = nvmf_select_io_queue(sc);
nvmf_status_init(&status);
req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
sx_sunlock(&sc->connection_lock);
if (req == NULL) {
device_printf(sc->dev, "failed to send passthrough command\n");
error = ECONNABORTED;
goto error;
}
if (pt->len != 0) {
mem = memdesc_vaddr(buf, pt->len);
nvmf_capsule_append_data(req->nc, &mem, pt->len,
pt->is_read == 0, nvmf_io_complete, &status);
nvmf_status_wait_io(&status);
}
nvmf_submit_request(req);
nvmf_wait_for_reply(&status);
memset(&pt->cpl, 0, sizeof(pt->cpl));
pt->cpl.cdw0 = status.cqe.cdw0;
pt->cpl.status = status.cqe.status;
error = status.io_error;
if (error == 0 && pt->len != 0 && pt->is_read != 0)
error = copyout(buf, pt->buf, pt->len);
error:
free(buf, M_NVMF);
return (error);
}
static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
int error;
sx_slock(&sc->connection_lock);
error = nvmf_pack_ioc_nvlist(sc->rparams, nv);
sx_sunlock(&sc->connection_lock);
return (error);
}
static int
nvmf_connection_status(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
nvlist_t *nvl, *nvl_ts;
int error;
nvl = nvlist_create(0);
nvl_ts = nvlist_create(0);
sx_slock(&sc->connection_lock);
nvlist_add_bool(nvl, "connected", sc->admin != NULL);
nvlist_add_number(nvl_ts, "tv_sec", sc->last_disconnect.tv_sec);
nvlist_add_number(nvl_ts, "tv_nsec", sc->last_disconnect.tv_nsec);
sx_sunlock(&sc->connection_lock);
nvlist_move_nvlist(nvl, "last_disconnect", nvl_ts);
error = nvmf_pack_ioc_nvlist(nvl, nv);
nvlist_destroy(nvl);
return (error);
}
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
struct nvmf_softc *sc = cdev->si_drv1;
struct nvme_get_nsid *gnsid;
struct nvme_pt_command *pt;
struct nvmf_ioc_nv *nv;
switch (cmd) {
case NVME_PASSTHROUGH_CMD:
pt = (struct nvme_pt_command *)arg;
return (nvmf_passthrough_cmd(sc, pt, true));
case NVME_GET_NSID:
gnsid = (struct nvme_get_nsid *)arg;
strlcpy(gnsid->cdev, device_get_nameunit(sc->dev),
sizeof(gnsid->cdev));
gnsid->nsid = 0;
return (0);
case NVME_GET_MAX_XFER_SIZE:
*(uint64_t *)arg = sc->max_xfer_size;
return (0);
case NVME_GET_CONTROLLER_DATA:
memcpy(arg, sc->cdata, sizeof(*sc->cdata));
return (0);
case DIOCGIDENT:
nvme_cdata_get_disk_ident(sc->cdata, (uint8_t *)arg);
return (0);
case NVMF_RECONNECT_PARAMS:
nv = (struct nvmf_ioc_nv *)arg;
return (nvmf_reconnect_params(sc, nv));
case NVMF_RECONNECT_HOST:
nv = (struct nvmf_ioc_nv *)arg;
return (nvmf_reconnect_host(sc, nv));
case NVMF_CONNECTION_STATUS:
nv = (struct nvmf_ioc_nv *)arg;
return (nvmf_connection_status(sc, nv));
default:
return (ENOTTY);
}
}
static struct cdevsw nvmf_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = nvmf_ioctl
};
static int
nvmf_modevent(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
error = nvmf_ctl_load();
if (error != 0)
return (error);
nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
taskqueue_thread_enqueue, &nvmf_tq);
taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
return (0);
case MOD_QUIESCE:
return (0);
case MOD_UNLOAD:
nvmf_ctl_unload();
destroy_dev_drain(&nvmf_cdevsw);
if (nvmf_tq != NULL)
taskqueue_free(nvmf_tq);
return (0);
default:
return (EOPNOTSUPP);
}
}
static device_method_t nvmf_methods[] = {
DEVMETHOD(device_probe, nvmf_probe),
DEVMETHOD(device_attach, nvmf_attach),
DEVMETHOD(device_detach, nvmf_detach),
DEVMETHOD_END
};
driver_t nvme_nvmf_driver = {
"nvme",
nvmf_methods,
sizeof(struct nvmf_softc),
};
DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);