#include <sys/stddef.h>
#include <sys/nvme.h>
#include "nvme_reg.h"
#include "nvme_var.h"
/*
 * A lock counts as write-owned or write-contended when either a writer holds
 * it right now or at least one writer is queued waiting for it.
 */
static boolean_t
nvme_rwlock_wr_or_pend(nvme_lock_t *lock)
{
	if (lock->nl_writer != NULL) {
		return (B_TRUE);
	}

	return (list_is_empty(&lock->nl_pend_writers) == 0);
}
/*
 * A namespace read lock is blocked if either the controller lock or the
 * namespace's own lock is write-owned or has pending writers.
 */
static boolean_t
nvme_rwlock_block_ns_rdlock(nvme_t *nvme, nvme_namespace_t *ns)
{
	const boolean_t ctrl_busy = nvme_rwlock_wr_or_pend(&nvme->n_lock);
	const boolean_t ns_busy = nvme_rwlock_wr_or_pend(&ns->ns_lock);

	return (ctrl_busy || ns_busy);
}
/*
 * A namespace write lock is blocked by anything that blocks a namespace read
 * lock, and additionally by any active readers on the namespace itself.
 */
static boolean_t
nvme_rwlock_block_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
{
	if (nvme_rwlock_wr_or_pend(&nvme->n_lock)) {
		return (B_TRUE);
	}

	if (nvme_rwlock_wr_or_pend(&ns->ns_lock)) {
		return (B_TRUE);
	}

	return (list_is_empty(&ns->ns_lock.nl_readers) == 0);
}
/*
 * A controller read lock is blocked only by a controller writer (active or
 * pending); namespace locks do not factor in.
 */
static boolean_t
nvme_rwlock_block_ctrl_rdlock(nvme_t *nvme)
{
	boolean_t blocked;

	blocked = nvme_rwlock_wr_or_pend(&nvme->n_lock);
	return (blocked);
}
/*
 * A controller write lock is the most exclusive state: it is blocked by any
 * controller writer (active or pending), any controller reader, and any
 * holder — reader or writer — of any namespace lock.
 */
static boolean_t
nvme_rwlock_block_ctrl_wrlock(nvme_t *nvme)
{
	uint32_t nsid;

	if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
	    list_is_empty(&nvme->n_lock.nl_readers) == 0) {
		return (B_TRUE);
	}

	/* Namespace IDs are 1-based. */
	for (nsid = 1; nsid <= nvme->n_namespace_count; nsid++) {
		nvme_lock_t *nsl = &nvme_nsid2ns(nvme, nsid)->ns_lock;

		if (nsl->nl_writer != NULL ||
		    list_is_empty(&nsl->nl_readers) == 0) {
			return (B_TRUE);
		}
	}

	return (B_FALSE);
}
/*
 * Determine whether a pending controller write lock can be handed off right
 * now. The caller has already established there is no active controller
 * writer (asserted); the handoff fails if any controller readers remain or if
 * any namespace lock is held in any mode.
 */
static boolean_t
nvme_rwlock_handoff_ctrl_wrlock(nvme_t *nvme)
{
	uint32_t nsid;

	ASSERT3P(nvme->n_lock.nl_writer, ==, NULL);

	if (list_is_empty(&nvme->n_lock.nl_readers) == 0) {
		return (B_FALSE);
	}

	for (nsid = 1; nsid <= nvme->n_namespace_count; nsid++) {
		nvme_lock_t *nsl = &nvme_nsid2ns(nvme, nsid)->ns_lock;

		if (nsl->nl_writer != NULL ||
		    list_is_empty(&nsl->nl_readers) == 0) {
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}
/*
 * Determine whether a pending namespace write lock can be handed off: the
 * controller lock must be entirely idle (no writer, no pending writers, no
 * readers) and the namespace lock must have neither a writer nor readers.
 */
static boolean_t
nvme_rwlock_handoff_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
{
	const boolean_t ctrl_busy = nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
	    list_is_empty(&nvme->n_lock.nl_readers) == 0;
	const boolean_t ns_busy = ns->ns_lock.nl_writer != NULL ||
	    list_is_empty(&ns->ns_lock.nl_readers) == 0;

	if (ctrl_busy || ns_busy) {
		return (B_FALSE);
	}

	return (B_TRUE);
}
/*
 * Grant a read lock to 'info'. The caller must already have determined that
 * nothing blocks the grant; the asserts verify that invariant (no pending
 * writers, no active writer) and that 'info' is currently unlocked and not on
 * any list. Records acquisition bookkeeping (timestamp, thread, pid) and
 * appends 'info' to the reader list.
 */
static void
nvme_rwlock_rdlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
{
	ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
	ASSERT3P(lock->nl_writer, ==, NULL);
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
	ASSERT3U(list_link_active(&info->nli_node), ==, 0);
	ASSERT3P(info->nli_minor, !=, NULL);
	ASSERT3P(info->nli_nvme, !=, NULL);
	/* The caller set the requested level before calling us. */
	ASSERT3U(info->nli_curlevel, ==, NVME_LOCK_L_READ);
	info->nli_state = NVME_LOCK_STATE_ACQUIRED;
	info->nli_last_change = gethrtime();
	info->nli_acq_kthread = (uintptr_t)curthread;
	info->nli_acq_pid = (uint32_t)curproc->p_pid;
	list_insert_tail(&lock->nl_readers, info);
	/* Lifetime statistic: total read acquisitions on this lock. */
	lock->nl_nread_locks++;
}
/*
 * Grant a write lock to 'info'. The caller must already have determined the
 * grant is legal (no current writer; 'info' unlocked and unlinked). Unlike
 * the read path, the level is set here because write handoff can promote an
 * entry whose recorded level needs (re)establishing.
 */
static void
nvme_rwlock_wrlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
{
	ASSERT3P(lock->nl_writer, ==, NULL);
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
	ASSERT3U(list_link_active(&info->nli_node), ==, 0);
	ASSERT3P(info->nli_minor, !=, NULL);
	ASSERT3P(info->nli_nvme, !=, NULL);
	info->nli_state = NVME_LOCK_STATE_ACQUIRED;
	info->nli_curlevel = NVME_LOCK_L_WRITE;
	info->nli_last_change = gethrtime();
	info->nli_acq_kthread = (uintptr_t)curthread;
	info->nli_acq_pid = (uint32_t)curproc->p_pid;
	lock->nl_writer = info;
	/* Lifetime statistic: total write acquisitions on this lock. */
	lock->nl_nwrite_locks++;
}
#ifdef DEBUG
/*
 * DEBUG-only helper: walk the reader list and report whether 'info' is a
 * current reader of 'lock'.
 */
static boolean_t
nvme_rwlock_is_reader(nvme_lock_t *lock, const nvme_minor_lock_info_t *info)
{
	nvme_minor_lock_info_t *scan = list_head(&lock->nl_readers);

	while (scan != NULL) {
		if (scan == info) {
			return (B_TRUE);
		}
		scan = list_next(&lock->nl_readers, scan);
	}

	return (B_FALSE);
}
#endif
/*
 * Complete a waiter's pending ioctl with the given error (or success) and
 * wake the blocked thread via its minor's CV. By the time we are called the
 * waiter must have been moved out of the BLOCKED state (either granted the
 * lock or cleaned up entirely).
 */
static void
nvme_rwlock_signal_one(nvme_minor_lock_info_t *info, nvme_ioctl_errno_t err)
{
	ASSERT3P(info->nli_ioc, !=, NULL);
	ASSERT3P(info->nli_minor, !=, NULL);
	/*
	 * nli_state is an enum compared against an enum constant, so use the
	 * unsigned-integer assertion macro; ASSERT3P is reserved for pointer
	 * operands.
	 */
	ASSERT3U(info->nli_state, !=, NVME_LOCK_STATE_BLOCKED);
	if (err == NVME_IOCTL_E_OK) {
		nvme_ioctl_success(info->nli_ioc);
	} else {
		(void) nvme_ioctl_error(info->nli_ioc, err, 0, 0);
	}
	cv_signal(&info->nli_minor->nm_cv);
}
/*
 * Hand the lock to every pending reader at once (readers share the lock).
 * This may only run when no writer is active or pending; pending readers only
 * ever exist because a writer blocked them, so the current reader list must
 * be empty as well — all three conditions are asserted.
 */
static void
nvme_rwlock_wakeup_readers(nvme_lock_t *lock)
{
	nvme_minor_lock_info_t *info;
	/* Nothing to do if no readers are waiting. */
	if (list_is_empty(&lock->nl_pend_readers) != 0) {
		return;
	}
	ASSERT3U(list_is_empty(&lock->nl_readers), !=, 0);
	ASSERT3P(lock->nl_writer, ==, NULL);
	ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
	while ((info = list_remove_head(&lock->nl_pend_readers)) != NULL) {
		/*
		 * Reset to UNLOCKED first: nvme_rwlock_rdlock() asserts the
		 * entry is in that state before granting.
		 */
		info->nli_state = NVME_LOCK_STATE_UNLOCKED;
		nvme_rwlock_rdlock(info, lock);
		nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
	}
}
/*
 * Called after an unlock to decide who, if anyone, gets the lock next.
 * Policy: pending controller writers take absolute priority — if one exists
 * but cannot be handed the lock yet, we return without waking anyone (so the
 * writer is not starved by new readers). Otherwise controller readers are
 * woken, then each namespace lock is processed with the same
 * writers-before-readers preference.
 */
static void
nvme_rwlock_wakeup(nvme_t *nvme)
{
	nvme_lock_t *ctrl_lock = &nvme->n_lock;
	/* An unlock just happened, so no controller writer can be active. */
	VERIFY3P(ctrl_lock->nl_writer, ==, NULL);
	if (list_is_empty(&ctrl_lock->nl_pend_writers) == 0) {
		nvme_minor_lock_info_t *info;
		/*
		 * Handoff not possible yet (readers or namespace holders
		 * remain): wake nobody, preserving writer priority.
		 */
		if (!nvme_rwlock_handoff_ctrl_wrlock(nvme))
			return;
		info = list_remove_head(&ctrl_lock->nl_pend_writers);
		/* wrlock() asserts UNLOCKED, so reset the state first. */
		info->nli_state = NVME_LOCK_STATE_UNLOCKED;
		nvme_rwlock_wrlock(info, ctrl_lock);
		nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
		return;
	}
	nvme_rwlock_wakeup_readers(ctrl_lock);
	/* Namespace IDs are 1-based. */
	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
		nvme_lock_t *ns_lock = &ns->ns_lock;
		if (list_is_empty(&ns_lock->nl_pend_writers) == 0) {
			nvme_minor_lock_info_t *info;
			/*
			 * As above: if this namespace's writer cannot take
			 * over yet, skip it (and don't wake its readers).
			 */
			if (!nvme_rwlock_handoff_ns_wrlock(nvme, ns))
				continue;
			info = list_remove_head(&ns_lock->nl_pend_writers);
			info->nli_state = NVME_LOCK_STATE_UNLOCKED;
			nvme_rwlock_wrlock(info, ns_lock);
			nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
		} else {
			nvme_rwlock_wakeup_readers(ns_lock);
		}
	}
}
/*
 * Reset a minor's lock-tracking record to its quiescent state: no lock, no
 * namespace, no level, UNLOCKED. Used on unlock and on abandoned waits.
 */
static void
nvme_rwunlock_cleanup_minor(nvme_minor_lock_info_t *info)
{
	info->nli_state = NVME_LOCK_STATE_UNLOCKED;
	info->nli_curlevel = 0;
	info->nli_lock = NULL;
	info->nli_ns = NULL;
}
/*
 * Release the lock that 'info' holds on 'lock' (read or write, per its
 * recorded level), reset the minor's tracking state, and run the wakeup
 * logic to hand the lock to any waiters. Caller must hold n_minor_mutex.
 */
void
nvme_rwunlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
{
	nvme_t *const nvme = info->nli_nvme;
	boolean_t is_read;
	VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
	VERIFY3P(info->nli_lock, ==, lock);
	VERIFY(info->nli_curlevel == NVME_LOCK_L_READ ||
	    info->nli_curlevel == NVME_LOCK_L_WRITE);
	is_read = info->nli_curlevel == NVME_LOCK_L_READ;
	info->nli_last_change = gethrtime();
	if (is_read) {
		/* A reader must be linked on the reader list. */
		VERIFY3U(list_link_active(&info->nli_node), !=, 0);
		/* DEBUG-only cross-check that it is on *this* lock's list. */
		ASSERT3U(nvme_rwlock_is_reader(lock, info), ==, B_TRUE);
		list_remove(&lock->nl_readers, info);
	} else {
		/* A writer is never on a list; it is the nl_writer slot. */
		VERIFY3U(list_link_active(&info->nli_node), ==, 0);
		VERIFY3P(lock->nl_writer, ==, info);
		lock->nl_writer = NULL;
	}
	nvme_rwunlock_cleanup_minor(info);
	nvme_rwlock_wakeup(nvme);
}
/*
 * Handle a blocked lock waiter being interrupted by a signal. Because the
 * mutex may have been dropped and retaken, the waiter can be in any of three
 * states by now, each needing different cleanup:
 *  - UNLOCKED: someone else already tore it down; just report the error.
 *  - BLOCKED: still on a pending list; unlink it, reset it, report.
 *  - ACQUIRED: the lock was granted before we noticed the signal; undo the
 *    grant with a full unlock (which also wakes the next waiter).
 */
static void
nvme_rwlock_signal(nvme_minor_lock_info_t *info, nvme_lock_t *lock,
    boolean_t is_read)
{
	ASSERT3P(info->nli_ioc, !=, NULL);
	info->nli_last_change = gethrtime();
	/* Statistics: total signal-interrupted waits on this lock. */
	lock->nl_nsignals++;
	if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
		ASSERT3P(info->nli_lock, ==, NULL);
		(void) nvme_ioctl_error(info->nli_ioc,
		    NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
		lock->nl_nsig_unlock++;
		return;
	}
	ASSERT3P(info->nli_lock, ==, lock);
	if (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
		ASSERT3S(list_link_active(&info->nli_node), !=, 0);
		/* The pending list it sits on depends on the request level. */
		if (is_read) {
			list_remove(&lock->nl_pend_readers, info);
		} else {
			list_remove(&lock->nl_pend_writers, info);
		}
		nvme_rwunlock_cleanup_minor(info);
		(void) nvme_ioctl_error(info->nli_ioc,
		    NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
		lock->nl_nsig_blocks++;
		return;
	}
	/* Must be ACQUIRED: drop the lock we were just handed. */
	lock->nl_nsig_acq++;
	nvme_rwunlock(info, lock);
}
/*
 * ioctl entry point for taking a controller or namespace lock at the
 * requested level. If the lock is immediately available it is granted and the
 * ioctl completes synchronously; otherwise, unless NVME_LOCK_F_DONT_BLOCK was
 * set, the caller is queued on the appropriate pending list and blocks on the
 * minor's CV until granted or interrupted by a signal. Caller must hold
 * n_minor_mutex throughout.
 */
void
nvme_rwlock(nvme_minor_t *minor, nvme_ioctl_lock_t *req)
{
	nvme_t *const nvme = minor->nm_ctrl;
	const boolean_t is_nonblock = (req->nil_flags &
	    NVME_LOCK_F_DONT_BLOCK) != 0;
	const boolean_t is_read = req->nil_level == NVME_LOCK_L_READ;
	const boolean_t is_ctrl = req->nil_ent == NVME_LOCK_E_CTRL;
	nvme_minor_lock_info_t *info;
	nvme_lock_t *lock;
	boolean_t waiters;
	hrtime_t sleep_time;
	VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
	/*
	 * Select the per-minor tracking slot and the target lock, and compute
	 * whether anything currently blocks the requested grant.
	 */
	if (is_ctrl) {
		info = &minor->nm_ctrl_lock;
		lock = &nvme->n_lock;
		if (is_read) {
			waiters = nvme_rwlock_block_ctrl_rdlock(nvme);
		} else {
			waiters = nvme_rwlock_block_ctrl_wrlock(nvme);
		}
	} else {
		nvme_namespace_t *ns;
		const uint32_t nsid = req->nil_common.nioc_nsid;
		info = &minor->nm_ns_lock;
		VERIFY3U(req->nil_ent, ==, NVME_LOCK_E_NS);
		ns = nvme_nsid2ns(nvme, nsid);
		/* Record which namespace this minor is (or will be) locking. */
		minor->nm_ns_lock.nli_ns = ns;
		lock = &ns->ns_lock;
		if (is_read) {
			waiters = nvme_rwlock_block_ns_rdlock(nvme, ns);
		} else {
			waiters = nvme_rwlock_block_ns_wrlock(nvme, ns);
		}
	}
	info->nli_curlevel = is_read ? NVME_LOCK_L_READ : NVME_LOCK_L_WRITE;
	info->nli_lock = lock;
	/* Fast path: nothing in the way, grant immediately. */
	if (!waiters) {
		if (is_read) {
			nvme_rwlock_rdlock(info, lock);
		} else {
			nvme_rwlock_wrlock(info, lock);
		}
		(void) nvme_ioctl_success(&req->nil_common);
		return;
	}
	/* Blocked and caller refused to wait: undo the setup and fail. */
	if (is_nonblock) {
		nvme_rwunlock_cleanup_minor(info);
		lock->nl_nnonblock++;
		(void) nvme_ioctl_error(&req->nil_common,
		    NVME_IOCTL_E_LOCK_WOULD_BLOCK, 0, 0);
		return;
	}
	/*
	 * Stash the ioctl so whoever eventually grants or tears down this
	 * wait can complete it on our behalf.
	 */
	ASSERT3P(info->nli_ioc, ==, NULL);
	info->nli_ioc = &req->nil_common;
	if (is_read) {
		list_insert_tail(&lock->nl_pend_readers, info);
		lock->nl_npend_reads++;
	} else {
		list_insert_tail(&lock->nl_pend_writers, info);
		lock->nl_npend_writes++;
	}
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
	info->nli_state = NVME_LOCK_STATE_BLOCKED;
	sleep_time = gethrtime();
	info->nli_last_change = sleep_time;
	/*
	 * Wait until granted (state leaves BLOCKED) or a signal arrives, in
	 * which case nvme_rwlock_signal() sorts out whichever state we
	 * reached by then.
	 */
	while (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
		if (cv_wait_sig(&minor->nm_cv, &nvme->n_minor_mutex) == 0) {
			nvme_rwlock_signal(info, lock, is_read);
			break;
		}
	}
	info->nli_ioc = NULL;
#ifdef DEBUG
	/* Whoever finished the wait must have updated the timestamp. */
	ASSERT3S(info->nli_last_change, !=, sleep_time);
	if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
		ASSERT3S(list_link_active(&info->nli_node), ==, 0);
		ASSERT3P(info->nli_ns, ==, NULL);
		ASSERT3U(req->nil_common.nioc_drv_err, !=, NVME_IOCTL_E_OK);
	} else {
		ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_ACQUIRED);
		ASSERT3U(req->nil_common.nioc_drv_err, ==, NVME_IOCTL_E_OK);
		if (is_read) {
			ASSERT3S(list_link_active(&info->nli_node), !=, 0);
		} else {
			ASSERT3P(lock->nl_writer, ==, info);
		}
	}
	ASSERT3P(info->nli_minor, ==, minor);
	ASSERT3P(info->nli_nvme, ==, minor->nm_ctrl);
#endif
}
/*
 * Tear down a single blocked waiter because the controller has died: reset
 * its tracking state and complete its ioctl with the controller's dead
 * status. The caller has already removed 'info' from its pending list.
 */
static void
nvme_rwlock_ctrl_dead_cleanup_one(nvme_t *nvme, nvme_minor_lock_info_t *info)
{
	ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_BLOCKED);
	ASSERT3P(info->nli_ioc, !=, NULL);
	info->nli_last_change = gethrtime();
	nvme_rwunlock_cleanup_minor(info);
	nvme_rwlock_signal_one(info, nvme->n_dead_status);
}
/*
 * Callback run when the controller is marked dead: fail every blocked lock
 * waiter, namespace locks first and then the controller lock. Existing lock
 * holders are left alone; only pending waiters are completed with the dead
 * status.
 */
void
nvme_rwlock_ctrl_dead(void *arg)
{
	nvme_t *nvme = arg;
	nvme_lock_t *ctrl_lock = &nvme->n_lock;
	nvme_minor_lock_info_t *info;
	mutex_enter(&nvme->n_minor_mutex);
	/*
	 * If namespace initialization never happened, the per-namespace lock
	 * structures cannot be walked.
	 *
	 * NOTE(review): this early return also skips draining the controller
	 * lock's pending lists below — presumably no minor can be blocked
	 * before NVME_NS_INIT, but that is not provable from this file;
	 * confirm against the attach path.
	 */
	if ((nvme->n_progress & NVME_NS_INIT) == 0) {
		mutex_exit(&nvme->n_minor_mutex);
		return;
	}
	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
		nvme_lock_t *ns_lock = &ns->ns_lock;
		while ((info = list_remove_head(&ns_lock->nl_pend_readers)) !=
		    NULL) {
			nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
		}
		while ((info = list_remove_head(&ns_lock->nl_pend_writers)) !=
		    NULL) {
			nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
		}
	}
	while ((info = list_remove_head(&ctrl_lock->nl_pend_readers)) != NULL) {
		nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
	}
	while ((info = list_remove_head(&ctrl_lock->nl_pend_writers)) != NULL) {
		nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
	}
	mutex_exit(&nvme->n_minor_mutex);
}
/*
 * Destroy a lock's list resources. Requires no active writer; the lists are
 * presumably empty at this point as well (list_destroy will catch stragglers
 * in DEBUG kernels — TODO confirm callers quiesce waiters first).
 */
void
nvme_lock_fini(nvme_lock_t *lock)
{
	VERIFY3P(lock->nl_writer, ==, NULL);
	list_destroy(&lock->nl_pend_writers);
	list_destroy(&lock->nl_pend_readers);
	list_destroy(&lock->nl_readers);
}
/*
 * Initialize a lock's three waiter/holder lists. All of them link
 * nvme_minor_lock_info_t entries through nli_node (an entry is only ever on
 * one list at a time).
 */
void
nvme_lock_init(nvme_lock_t *lock)
{
	list_t *lists[] = {
		&lock->nl_readers,
		&lock->nl_pend_readers,
		&lock->nl_pend_writers
	};

	for (uint_t i = 0; i < sizeof (lists) / sizeof (lists[0]); i++) {
		list_create(lists[i], sizeof (nvme_minor_lock_info_t),
		    offsetof(nvme_minor_lock_info_t, nli_node));
	}
}