#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <sys/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>
/*
 * Per-namespace state for a namespace exposed as a character device.
 */
struct nvmf_namespace {
/* Owning controller softc, as passed to nvmf_init_ns(). */
struct nvmf_softc *sc;
/* Media size in bytes (nsze * lba_size); reported by DIOCGMEDIASIZE. */
uint64_t size;
/* Namespace ID placed in the NSID field of submitted commands. */
uint32_t id;
/* NVME_NS_*_SUPPORTED bits set in nvmf_init_ns(). */
u_int flags;
/* Logical block size in bytes (1 << lbads); reported by DIOCGSECTORSIZE. */
uint32_t lba_size;
/* Set by nvmf_disconnect_ns(), cleared by nvmf_reconnect_ns(). */
bool disconnected;
/* Set by nvmf_shutdown_ns(); bios seen while disconnected are then failed. */
bool shutdown;
/* Bios parked while disconnected, awaiting resubmission or failure. */
TAILQ_HEAD(, bio) pending_bios;
/* Held in this file around accesses to disconnected, shutdown and pending_bios. */
struct mtx lock;
/*
 * Count of submitted bios plus one reference that is taken in
 * nvmf_init_ns() and dropped in nvmf_destroy_ns().
 */
volatile u_int active_bios;
/* Character device node; its si_drv2 holds the alias device, if any. */
struct cdev *cdev;
};
/* Declared early because nvmf_ns_biodone() resubmits aborted bios through it. */
static void nvmf_ns_strategy(struct bio *bio);
static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
char buf[128];
struct sbuf sb;
va_list ap;
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
ns->id);
va_start(ap, fmt);
sbuf_vprintf(&sb, fmt, ap);
va_end(ap);
sbuf_finish(&sb);
sbuf_delete(&sb);
}
static __inline u_int *
bio_refs(struct bio *bio)
{
return ((u_int *)&bio->bio_driver1);
}
/*
 * Drop one reference on a bio and, if it was the last one, finish the bio.
 *
 * A bio whose bio_error is ECONNABORTED is not failed (unless
 * nvmf_fail_disconnect is set): it is resubmitted through
 * nvmf_ns_strategy(), or parked on ns->pending_bios while the namespace is
 * disconnected, or failed if the namespace is shutting down.  Any other bio
 * is completed with biodone(), using an error stored in bio_driver2 in
 * preference to the one already in bio_error.
 */
static void
nvmf_ns_biodone(struct bio *bio)
{
struct nvmf_namespace *ns;
int error;
/* Only the holder of the final bio reference completes the bio. */
if (!refcount_release(bio_refs(bio)))
return;
ns = bio->bio_dev->si_drv1;
if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
/* Reset the per-attempt state before retrying or queueing. */
bio->bio_error = 0;
bio->bio_driver2 = 0;
mtx_lock(&ns->lock);
if (ns->disconnected) {
if (nvmf_fail_disconnect || ns->shutdown) {
/* Fail the bio; biodone() is called after dropping the lock. */
mtx_unlock(&ns->lock);
bio->bio_error = ECONNABORTED;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
} else {
/* Park the bio until nvmf_reconnect_ns() resubmits it. */
TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
bio_queue);
mtx_unlock(&ns->lock);
}
} else {
/* Still connected: resubmit with the lock dropped. */
mtx_unlock(&ns->lock);
nvmf_ns_strategy(bio);
}
} else {
/* An error recorded in bio_driver2 overrides bio_error. */
error = (intptr_t)bio->bio_driver2;
if (error != 0)
bio->bio_error = error;
if (bio->bio_error != 0)
bio->bio_flags |= BIO_ERROR;
biodone(bio);
}
/*
 * Drop the reference taken in nvmf_ns_submit_bio().  The wakeup pairs
 * with the mtx_sleep(ns, ...) in nvmf_destroy_ns().
 */
if (refcount_release(&ns->active_bios))
wakeup(ns);
}
/*
 * Data-transfer completion callback for BIO_READ and BIO_WRITE requests.
 *
 * arg is the bio, xfered the number of bytes transferred and error the
 * transfer status.  The error is stashed in bio_driver2 and the residual
 * count is updated before dropping this callback's bio reference.
 */
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bp;

	bp = arg;
	KASSERT(xfered <= bp->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bp->bio_resid = bp->bio_bcount - xfered;
	bp->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bp);
}
/*
 * Data-transfer completion callback for BIO_DELETE requests.
 *
 * While the request is in flight bio_driver2 points at the DSM range
 * buffer allocated in nvmf_ns_submit_bio(); it is freed here and the
 * field is reused to carry the transfer error.  The whole bio counts as
 * untransferred on error and as fully transferred otherwise.
 */
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bp = arg;
	void *range = bp->bio_driver2;

	bp->bio_resid = (error == 0) ? 0 : bp->bio_bcount;
	bp->bio_driver2 = (void *)(intptr_t)error;
	free(range, M_NVMF);

	nvmf_ns_biodone(bp);
}
/*
 * Command completion callback: translate the completion queue entry into
 * a bio error and drop this callback's bio reference.
 *
 * An aborted command yields ECONNABORTED, any other non-zero status
 * yields EIO, and a successful completion leaves bio_error untouched.
 */
static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bp = arg;
	int cqe_error;

	if (nvmf_cqe_aborted(cqe))
		cqe_error = ECONNABORTED;
	else
		cqe_error = (cqe->status != 0) ? EIO : 0;

	if (cqe_error != 0)
		bp->bio_error = cqe_error;

	nvmf_ns_biodone(bp);
}
/*
 * Translate a bio into an NVMe command and submit it on an I/O queue
 * chosen by nvmf_select_io_queue().
 *
 * Returns 0 if the bio was submitted, or if the namespace is disconnected
 * and the bio was parked on ns->pending_bios.  Otherwise returns an errno
 * value and leaves completion of the bio to the caller: EOPNOTSUPP for an
 * unhandled bio_cmd, ENOMEM if an allocation fails, or ECONNABORTED if the
 * namespace is disconnected and either nvmf_fail_disconnect or
 * ns->shutdown is set.
 */
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
struct nvme_command cmd;
struct nvmf_request *req;
struct nvme_dsm_range *dsm_range;
struct memdesc mem;
uint64_t lba, lba_count;
int error;
/* dsm_range stays NULL for everything except BIO_DELETE. */
dsm_range = NULL;
memset(&cmd, 0, sizeof(cmd));
/* Build the command; byte offsets and counts are converted to LBAs. */
switch (bio->bio_cmd) {
case BIO_READ:
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
break;
case BIO_WRITE:
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
break;
case BIO_FLUSH:
nvme_ns_flush_cmd(&cmd, ns->id);
break;
case BIO_DELETE:
/* A single-range Dataset Management (deallocate) command. */
dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
M_ZERO);
if (dsm_range == NULL)
return (ENOMEM);
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
dsm_range->starting_lba = htole64(lba);
dsm_range->length = htole32(lba_count);
cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
cmd.nsid = htole32(ns->id);
cmd.cdw10 = htole32(0);
cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
break;
default:
return (EOPNOTSUPP);
}
/*
 * The disconnected check, the request allocation and the submission
 * all happen with ns->lock held.
 */
mtx_lock(&ns->lock);
if (ns->disconnected) {
if (nvmf_fail_disconnect || ns->shutdown) {
error = ECONNABORTED;
} else {
/* Park the bio; the command is rebuilt on resubmission. */
TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
error = 0;
}
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
return (error);
}
req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
nvmf_ns_bio_complete, bio, M_NOWAIT);
if (req == NULL) {
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
return (ENOMEM);
}
/*
 * Commands that carry data get two bio references: one dropped by the
 * command completion (nvmf_ns_bio_complete) and one by the data
 * completion callback registered below.  Other commands get one.
 */
switch (bio->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
refcount_init(bio_refs(bio), 2);
mem = memdesc_bio(bio);
nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
break;
case BIO_DELETE:
refcount_init(bio_refs(bio), 2);
mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
true, nvmf_ns_delete_complete, bio);
/* Remember the buffer so nvmf_ns_delete_complete() can free it. */
bio->bio_driver2 = dsm_range;
break;
default:
refcount_init(bio_refs(bio), 1);
KASSERT(bio->bio_resid == 0,
("%s: input bio_resid != 0", __func__));
break;
}
/* Account for the in-flight bio; released in nvmf_ns_biodone(). */
refcount_acquire(&ns->active_bios);
nvmf_submit_request(req);
mtx_unlock(&ns->lock);
return (0);
}
/*
 * ioctl handler for the namespace character device.
 *
 * Handles NVMe passthrough commands (with the NSID forced to this
 * namespace's ID), NVME_GET_NSID, and the DIOCGIDENT, DIOCGMEDIASIZE and
 * DIOCGSECTORSIZE queries.  Returns 0 or the passthrough result on
 * success and ENOTTY for any other command.
 */
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns;
	int rv;

	ns = dev->si_drv1;
	rv = 0;
	switch (cmd) {
	case NVME_PASSTHROUGH_CMD: {
		struct nvme_pt_command *ptc;

		ptc = (struct nvme_pt_command *)arg;
		/* Overwrite whatever NSID the caller supplied. */
		ptc->cmd.nsid = htole32(ns->id);
		rv = nvmf_passthrough_cmd(ns->sc, ptc, false);
		break;
	}
	case NVME_GET_NSID: {
		struct nvme_get_nsid *out;

		out = (struct nvme_get_nsid *)arg;
		strlcpy(out->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(out->cdev));
		out->nsid = ns->id;
		break;
	}
	case DIOCGIDENT:
		nvme_cdata_get_disk_ident(ns->sc->cdata, (uint8_t *)arg);
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		break;
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		break;
	default:
		rv = ENOTTY;
		break;
	}
	return (rv);
}
/*
 * Open handler for the namespace character device.
 *
 * Opens that do not request write access always succeed.  Opens for
 * writing return the result of securelevel_gt() on the caller's
 * credentials with a level of 0.
 */
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	if ((oflags & FWRITE) == 0)
		return (0);

	return (securelevel_gt(td->td_ucred, 0));
}
/*
 * Strategy routine: hand the bio to nvmf_ns_submit_bio() and, if that
 * reports an error, complete the bio with that error and nothing
 * transferred.
 */
void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns = bio->bio_dev->si_drv1;
	int rc;

	rc = nvmf_ns_submit_bio(ns, bio);
	if (rc == 0)
		return;

	bio->bio_error = rc;
	bio->bio_flags |= BIO_ERROR;
	bio->bio_resid = bio->bio_bcount;
	biodone(bio);
}
/*
 * Character device switch for namespace devices.  Reads and writes go
 * through physread/physwrite, with nvmf_ns_strategy() as the strategy
 * routine.
 */
static struct cdevsw nvmf_ns_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_DISK,
.d_open = nvmf_ns_open,
.d_read = physread,
.d_write = physwrite,
.d_strategy = nvmf_ns_strategy,
.d_ioctl = nvmf_ns_ioctl
};
/*
 * Allocate the state for namespace 'id' of controller 'sc' and create its
 * character device ("<nameunit>n<id>") plus a "<nameunit>ns<id>" alias.
 *
 * 'data' is the Identify Namespace data for the namespace.  Namespaces
 * using end-to-end data protection or per-block metadata, or whose LBA
 * format is invalid or unrepresentable, are rejected.
 *
 * Returns the new namespace, or NULL on failure.
 */
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    const struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/*
	 * Hold one reference until nvmf_destroy_ns() so that the count
	 * cannot reach zero while the namespace is alive.
	 */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	/*
	 * lbads is an 8-bit value supplied by the controller while lba_size
	 * is 32 bits wide; shifting by 32 or more is undefined behaviour and
	 * could leave a bogus block size that is later used as a divisor.
	 */
	if (lbads > 31) {
		ns_printf(ns, "Unsupported LBA data size 2^%u\n", lbads);
		goto fail;
	}

	ns->lba_size = 1U << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;

	/* The alias is kept in si_drv2 so nvmf_destroy_ns() can remove it. */
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}
/*
 * Mark the namespace as disconnected.  While the flag is set,
 * nvmf_ns_submit_bio() parks or fails bios instead of submitting them.
 */
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	struct mtx *lk = &ns->lock;

	mtx_lock(lk);
	ns->disconnected = true;
	mtx_unlock(lk);
}
void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
mtx_lock(&ns->lock);
ns->disconnected = false;
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
nvmf_ns_strategy(bio);
}
}
void
nvmf_shutdown_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
mtx_lock(&ns->lock);
ns->shutdown = true;
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
bio->bio_error = ECONNABORTED;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
}
}
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
if (ns->cdev->si_drv2 != NULL)
destroy_dev(ns->cdev->si_drv2);
destroy_dev(ns->cdev);
mtx_lock(&ns->lock);
if (!refcount_release(&ns->active_bios)) {
while (ns->active_bios != 0)
mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
}
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
bio->bio_error = ECONNABORTED;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
}
mtx_destroy(&ns->lock);
free(ns, M_NVMF);
}
/*
 * Re-validate a namespace against fresh Identify Namespace data and
 * update its block size and media size.
 *
 * Returns true if the data describes a supported format and the sizes
 * were updated, false (leaving the namespace unchanged) otherwise.
 */
bool
nvmf_update_ns(struct nvmf_namespace *ns,
    const struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	/*
	 * lbads is an 8-bit value supplied by the controller while lba_size
	 * is 32 bits wide; shifting by 32 or more is undefined behaviour and
	 * could leave a bogus block size that is later used as a divisor.
	 */
	if (lbads > 31) {
		ns_printf(ns, "Unsupported LBA data size 2^%u\n", lbads);
		return (false);
	}

	ns->lba_size = 1U << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}