#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>
/*
 * AVL comparator for locked ranges: orders nodes by starting offset
 * only.  TREE_CMP yields -1, 0, or 1, as avl requires.
 */
static int
rangelock_compare(const void *arg1, const void *arg2)
{
	const locked_range_t *a = arg1;
	const locked_range_t *b = arg2;

	return (TREE_CMP(a->lr_offset, b->lr_offset));
}
/*
 * Initialize a rangelock: the protecting mutex, the AVL tree of held
 * ranges (ordered by offset), and the optional callback plus its opaque
 * argument (invoked on each writer-lock attempt).
 */
void
rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
{
	rl->rl_cb = cb;
	rl->rl_arg = arg;
	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&rl->rl_tree, rangelock_compare,
	    sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
}
/*
 * Tear down a rangelock.  The caller must ensure no ranges are held and
 * no threads are waiting on it.
 */
void
rangelock_fini(rangelock_t *rl)
{
	avl_destroy(&rl->rl_tree);
	mutex_destroy(&rl->rl_lock);
}
/*
 * Acquire an exclusive (writer) lock on new's range.  Called with
 * rl->rl_lock held; cv_wait() drops and reacquires that lock, so after
 * every wakeup the whole tree scan is restarted from scratch.
 */
static void
rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
{
avl_tree_t *tree = &rl->rl_tree;
locked_range_t *lr;
avl_index_t where;
/*
 * The callback may rewrite the requested range (NOTE(review): presumably
 * e.g. resolving RL_APPEND to a concrete offset — confirm with callers),
 * so save the original request to restore before each retry.
 */
uint64_t orig_off = new->lr_offset;
uint64_t orig_len = new->lr_length;
rangelock_type_t orig_type = new->lr_type;
for (;;) {
/* Let the callback adjust the range; it must leave it RL_WRITER. */
if (rl->rl_cb != NULL) {
rl->rl_cb(new, rl->rl_arg);
}
ASSERT3U(new->lr_type, ==, RL_WRITER);
/* Fast path: empty tree means no possible conflict. */
if (avl_numnodes(tree) == 0) {
avl_add(tree, new);
return;
}
/* An existing range starting at exactly our offset conflicts. */
lr = avl_find(tree, new, &where);
if (lr != NULL)
goto wait;
/* The next range conflicts if it starts before our end... */
lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
if (lr != NULL &&
lr->lr_offset < new->lr_offset + new->lr_length)
goto wait;
/* ...and the previous one if it extends past our start. */
lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
if (lr != NULL &&
lr->lr_offset + lr->lr_length > new->lr_offset)
goto wait;
/* No overlap: insert at the position avl_find computed. */
avl_insert(tree, new, where);
return;
wait:
/* Lazily create the cv the conflicting holder signals on release. */
if (!lr->lr_write_wanted) {
cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
lr->lr_write_wanted = B_TRUE;
}
cv_wait(&lr->lr_write_cv, &rl->rl_lock);
/* Undo any callback modification before retrying the scan. */
new->lr_offset = orig_off;
new->lr_length = orig_len;
new->lr_type = orig_type;
}
}
/*
 * Turn an ordinary, singly-held reader range into a shareable proxy.
 * If 'lr' already is a proxy it is returned as-is.  Otherwise 'lr' is
 * removed from the tree with its count zeroed (so its owner's eventual
 * exit walks the proxies instead), and a freshly allocated proxy
 * covering the identical span takes its place.  Returns the proxy.
 */
static locked_range_t *
rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
{
	locked_range_t *px;

	if (lr->lr_proxy)
		return (lr);

	/* Only an uncontended single-reference reader may be proxified. */
	ASSERT3U(lr->lr_count, ==, 1);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);

	avl_remove(tree, lr);
	lr->lr_count = 0;

	/* Build the replacement proxy over the same range. */
	px = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	px->lr_offset = lr->lr_offset;
	px->lr_length = lr->lr_length;
	px->lr_count = 1;
	px->lr_type = RL_READER;
	px->lr_proxy = B_TRUE;
	px->lr_write_wanted = B_FALSE;
	px->lr_read_wanted = B_FALSE;
	avl_add(tree, px);

	return (px);
}
/*
 * Split range 'lr' at offset 'off' into two adjacent proxy entries.
 * The front proxy keeps lr's starting offset, the rear proxy starts at
 * 'off' and inherits lr's reference count.  Returns the front proxy;
 * the rear one is inserted into the tree directly after it.
 */
static locked_range_t *
rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
{
	uint64_t end = lr->lr_offset + lr->lr_length;

	ASSERT3U(lr->lr_length, >, 1);
	ASSERT3U(off, >, lr->lr_offset);
	ASSERT3U(off, <, end);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);

	/* Rear half: [off, end), carrying the original reference count. */
	locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	rear->lr_offset = off;
	rear->lr_length = end - off;
	rear->lr_count = lr->lr_count;
	rear->lr_type = RL_READER;
	rear->lr_proxy = B_TRUE;
	rear->lr_write_wanted = B_FALSE;
	rear->lr_read_wanted = B_FALSE;

	/* Front half: proxify the original, then trim it to end at 'off'. */
	locked_range_t *front = rangelock_proxify(tree, lr);
	front->lr_length = off - lr->lr_offset;

	avl_insert_here(tree, rear, front, AVL_AFTER);
	return (front);
}
/*
 * Allocate a brand-new proxy reader range covering [off, off + len)
 * with a single reference and add it to the tree.
 */
static void
rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	locked_range_t *px;

	ASSERT(len != 0);

	px = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	px->lr_offset = off;
	px->lr_length = len;
	px->lr_count = 1;
	px->lr_type = RL_READER;
	px->lr_proxy = B_TRUE;
	px->lr_write_wanted = B_FALSE;
	px->lr_read_wanted = B_FALSE;
	avl_add(tree, px);
}
/*
 * Record the granted reader lock 'new' in the tree.  If it overlaps
 * nothing, it is inserted directly; otherwise the overlapped region is
 * carved into reference-counted proxy entries (splitting/creating
 * proxies as needed) so multiple readers can share sub-ranges.
 * 'prev'/'where' come from the caller's avl_find() of new's offset.
 */
static void
rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
locked_range_t *prev, avl_index_t where)
{
locked_range_t *next;
uint64_t off = new->lr_offset;
uint64_t len = new->lr_length;
/*
 * If a previous entry overlaps our start, split it so that a proxy
 * boundary falls exactly at 'off'; otherwise ignore it.
 */
if (prev != NULL) {
if (prev->lr_offset + prev->lr_length <= off) {
prev = NULL;
} else if (prev->lr_offset != off) {
/* Split at our start; step to the rear piece beginning at off. */
prev = rangelock_split(tree, prev, off);
prev = AVL_NEXT(tree, prev);
}
}
ASSERT((prev == NULL) || (prev->lr_offset == off));
/* First entry at or after our start offset. */
if (prev != NULL)
next = prev;
else
next = avl_nearest(tree, where, AVL_AFTER);
/* No overlap at all: insert 'new' itself as a plain reader range. */
if (next == NULL || off + len <= next->lr_offset) {
avl_insert(tree, new, where);
return;
}
/* Gap before the first overlapping entry gets a fresh proxy. */
if (off < next->lr_offset) {
rangelock_new_proxy(tree, off, next->lr_offset - off);
}
/*
 * 'new' itself stays out of the tree (count 0); the proxies below
 * carry its reference.  rangelock_exit_reader() walks them by length.
 */
new->lr_count = 0;
for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
if (off + len <= next->lr_offset)
break;
/* Fill any gap between consecutive overlapped entries. */
if (prev != NULL && prev->lr_offset + prev->lr_length <
next->lr_offset) {
ASSERT3U(next->lr_offset, >,
prev->lr_offset + prev->lr_length);
rangelock_new_proxy(tree,
prev->lr_offset + prev->lr_length,
next->lr_offset -
(prev->lr_offset + prev->lr_length));
}
/* Our range ends exactly with this entry: share it and stop. */
if (off + len == next->lr_offset + next->lr_length) {
next = rangelock_proxify(tree, next);
next->lr_count++;
return;
}
/* Our range ends inside this entry: split, share the front, stop. */
if (off + len < next->lr_offset + next->lr_length) {
next = rangelock_split(tree, next, off + len);
next->lr_count++;
return;
}
/* This entry is fully covered: share it and keep scanning. */
ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
next = rangelock_proxify(tree, next);
next->lr_count++;
}
/* Trailing gap past the last overlapped entry gets a final proxy. */
rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
(off + len) - (prev->lr_offset + prev->lr_length));
}
/*
 * Acquire a shared (reader) lock on new's range.  Called with
 * rl->rl_lock held; blocks while any overlapping entry is a writer or
 * has a writer waiting (writers get priority to avoid starvation).
 * cv_wait() drops the lock, so each wakeup restarts the scan.
 */
static void
rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
{
avl_tree_t *tree = &rl->rl_tree;
locked_range_t *prev, *next;
avl_index_t where;
uint64_t off = new->lr_offset;
uint64_t len = new->lr_length;
retry:
/* Find the entry at our offset, or the nearest one before it. */
prev = avl_find(tree, new, &where);
if (prev == NULL)
prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
if (prev && (off < prev->lr_offset + prev->lr_length)) {
/* Overlapping predecessor: wait if it is or wants a writer. */
if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
if (!prev->lr_read_wanted) {
cv_init(&prev->lr_read_cv,
NULL, CV_DEFAULT, NULL);
prev->lr_read_wanted = B_TRUE;
}
cv_wait(&prev->lr_read_cv, &rl->rl_lock);
goto retry;
}
/* Entirely inside a compatible reader range: done scanning. */
if (off + len < prev->lr_offset + prev->lr_length)
goto got_lock;
}
/* Walk every subsequent entry that overlaps [off, off + len). */
if (prev != NULL)
next = AVL_NEXT(tree, prev);
else
next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
for (; next != NULL; next = AVL_NEXT(tree, next)) {
if (off + len <= next->lr_offset)
goto got_lock;
/* Writer (or waiting writer) in the way: block and rescan. */
if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
if (!next->lr_read_wanted) {
cv_init(&next->lr_read_cv,
NULL, CV_DEFAULT, NULL);
next->lr_read_wanted = B_TRUE;
}
cv_wait(&next->lr_read_cv, &rl->rl_lock);
goto retry;
}
/* Our range ends within this compatible entry: done scanning. */
if (off + len <= next->lr_offset + next->lr_length)
goto got_lock;
}
got_lock:
/* No conflicting writers remain; materialize the reader lock. */
rangelock_add_reader(tree, new, prev, where);
}
/*
 * Lock the byte range [off, off + len) for the given access type
 * (RL_READER, RL_WRITER, or RL_APPEND).  Blocks until the lock is
 * granted and returns the held range, which the caller later releases
 * with rangelock_exit().
 */
locked_range_t *
rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
rangelock_type_t type)
{
	locked_range_t *new;

	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	new->lr_rangelock = rl;
	new->lr_offset = off;
	/* Clamp a length that would wrap around the 64-bit offset space. */
	if (len + off < off)
		len = UINT64_MAX - off;
	new->lr_length = len;
	new->lr_count = 1;
	new->lr_type = type;
	new->lr_proxy = B_FALSE;
	new->lr_write_wanted = B_FALSE;
	new->lr_read_wanted = B_FALSE;

	mutex_enter(&rl->rl_lock);
	if (type == RL_READER) {
		/* An empty tree has no conflicts: take the fast path. */
		if (avl_numnodes(&rl->rl_tree) == 0)
			avl_add(&rl->rl_tree, new);
		else
			rangelock_enter_reader(rl, new);
	} else {
		/* RL_WRITER and RL_APPEND both take the writer path. */
		rangelock_enter_writer(rl, new);
	}
	mutex_exit(&rl->rl_lock);

	return (new);
}
/*
 * Release the reader lock 'remove'.  If it was never shared (count 1)
 * it sits in the tree itself; otherwise (count 0) its reference lives
 * in a run of adjacent proxy entries that together cover its length,
 * each of which is unreferenced and freed when its count hits zero.
 * Called with rl->rl_lock held; wakes any waiters on freed entries.
 */
static void
rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove)
{
avl_tree_t *tree = &rl->rl_tree;
uint64_t len;
if (remove->lr_count == 1) {
/* Unshared reader: it is a direct tree entry. */
avl_remove(tree, remove);
if (remove->lr_write_wanted) {
cv_broadcast(&remove->lr_write_cv);
cv_destroy(&remove->lr_write_cv);
}
if (remove->lr_read_wanted) {
cv_broadcast(&remove->lr_read_cv);
cv_destroy(&remove->lr_read_cv);
}
} else {
/*
 * Shared reader: rangelock_add_reader() zeroed its count and
 * spread the reference over contiguous proxies.  Walk them,
 * consuming remove's length to know when the run ends.
 */
ASSERT0(remove->lr_count);
ASSERT0(remove->lr_write_wanted);
ASSERT0(remove->lr_read_wanted);
locked_range_t *lr = avl_find(tree, remove, NULL);
ASSERT3P(lr, !=, NULL);
ASSERT3U(lr->lr_count, !=, 0);
ASSERT3U(lr->lr_type, ==, RL_READER);
locked_range_t *next = NULL;
for (len = remove->lr_length; len != 0; lr = next) {
len -= lr->lr_length;
if (len != 0) {
/* Proxies must be contiguous readers with no gaps. */
next = AVL_NEXT(tree, lr);
ASSERT3P(next, !=, NULL);
ASSERT3U(lr->lr_offset + lr->lr_length, ==,
next->lr_offset);
ASSERT3U(next->lr_count, !=, 0);
ASSERT3U(next->lr_type, ==, RL_READER);
}
lr->lr_count--;
if (lr->lr_count == 0) {
/* Last reference to this proxy: wake waiters, free it. */
avl_remove(tree, lr);
if (lr->lr_write_wanted) {
cv_broadcast(&lr->lr_write_cv);
cv_destroy(&lr->lr_write_cv);
}
if (lr->lr_read_wanted) {
cv_broadcast(&lr->lr_read_cv);
cv_destroy(&lr->lr_read_cv);
}
kmem_free(lr, sizeof (locked_range_t));
}
}
}
kmem_free(remove, sizeof (locked_range_t));
}
/*
 * Release a held range lock and wake any threads waiting on it.
 * Writer locks are removed directly (they are never shared or split);
 * reader locks go through rangelock_exit_reader() to unwind proxies.
 */
void
rangelock_exit(locked_range_t *lr)
{
rangelock_t *rl = lr->lr_rangelock;
ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
ASSERT(!lr->lr_proxy);
mutex_enter(&rl->rl_lock);
if (lr->lr_type == RL_WRITER) {
avl_remove(&rl->rl_tree, lr);
mutex_exit(&rl->rl_lock);
/*
 * NOTE(review): cv_broadcast/cv_destroy/kmem_free run after the
 * mutex is dropped, so a newly woken waiter may still reference
 * the cv while it is destroyed and freed.  This relies on the
 * cv implementation tolerating that ordering — confirm, since
 * upstream OpenZFS later reworked this path to defer the frees.
 */
if (lr->lr_write_wanted) {
cv_broadcast(&lr->lr_write_cv);
cv_destroy(&lr->lr_write_cv);
}
if (lr->lr_read_wanted) {
cv_broadcast(&lr->lr_read_cv);
cv_destroy(&lr->lr_read_cv);
}
kmem_free(lr, sizeof (locked_range_t));
} else {
/* Reader release happens under the lock; it frees lr itself. */
rangelock_exit_reader(rl, lr);
mutex_exit(&rl->rl_lock);
}
}
/*
 * Shrink an already-held, whole-file writer lock (offset 0, length
 * UINT64_MAX) down to [off, off + len), then wake any waiters so they
 * can retry against the reduced range.  The assertions require that
 * this lock be the only entry in the tree.
 */
void
rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
{
rangelock_t *rl = lr->lr_rangelock;
/* The tree must contain exactly this one full-range writer lock. */
ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
ASSERT3U(lr->lr_offset, ==, 0);
ASSERT3U(lr->lr_type, ==, RL_WRITER);
ASSERT(!lr->lr_proxy);
ASSERT3U(lr->lr_length, ==, UINT64_MAX);
ASSERT3U(lr->lr_count, ==, 1);
mutex_enter(&rl->rl_lock);
lr->lr_offset = off;
lr->lr_length = len;
mutex_exit(&rl->rl_lock);
/*
 * NOTE(review): the wanted flags are read after dropping rl_lock;
 * this looks benign because lr stays held by the caller, but confirm
 * no waiter can set a flag concurrently and miss this broadcast.
 */
if (lr->lr_write_wanted)
cv_broadcast(&lr->lr_write_cv);
if (lr->lr_read_wanted)
cv_broadcast(&lr->lr_read_cv);
}