#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/kstat.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>
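
/*
 * VMM Memory Reservoir
 *
 * To provide predictable memory availability for guests, the vmm driver
 * maintains a reservoir: a pool of pre-allocated, non-relocatable kernel
 * pages from which guest memory regions are carved.  Reservoir capacity is
 * tracked as page-aligned spans, and all of the state below is protected by
 * vmmr_lock.
 */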
#define VMMR_TARGET_INACTIVE SIZE_MAX
static kmutex_t vmmr_lock;
static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;
static size_t vmmr_target_sz;
static uintptr_t vmmr_empty_last;
static size_t vmmr_total_limit;
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;
static kstat_t *vmmr_kstat;
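
/*
 * Spans of reservoir memory are tracked in a "treepair": twin AVL trees
 * holding the same spans, indexed by address (for coalescing with adjacent
 * spans) and by size (for best-fit allocation).
 */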
typedef struct vmmr_treepair {
avl_tree_t by_addr;
avl_tree_t by_size;
} vmmr_treepair_t;
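
/*
 * vmmr_free_tp holds spans backed by allocated pages which are ready for
 * use.  vmmr_empty_tp holds spans of reservoir address space whose backing
 * pages have been released to the system.  vmmr_alloc_regions lists all
 * currently outstanding regions.
 */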
static vmmr_treepair_t vmmr_free_tp;
static vmmr_treepair_t vmmr_empty_tp;
static list_t vmmr_alloc_regions;
struct vmmr_span {
uintptr_t vs_addr;
size_t vs_size;
avl_node_t vs_by_addr;
avl_node_t vs_by_size;
uintptr_t vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;
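
/*
 * A region is the unit handed out to consumers: vr_size bytes of reservoir
 * memory composed of one or more spans, indexed within the region by
 * vs_region_addr.
 */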
struct vmmr_region {
size_t vr_size;
avl_tree_t vr_spans;
list_node_t vr_node;
bool vr_transient;
};
typedef struct vmmr_kstats {
kstat_named_t vmrks_bytes_free;
kstat_named_t vmrks_bytes_alloc;
kstat_named_t vmrks_bytes_transient;
kstat_named_t vmrks_bytes_limit;
} vmmr_kstats_t;
static int vmmr_add(size_t, bool);
static int vmmr_remove(size_t, bool);
static int
vmmr_cmp_addr(const void *a, const void *b)
{
const vmmr_span_t *sa = a;
const vmmr_span_t *sb = b;
if (sa->vs_addr == sb->vs_addr) {
return (0);
} else if (sa->vs_addr < sb->vs_addr) {
return (-1);
} else {
return (1);
}
}
static int
vmmr_cmp_size(const void *a, const void *b)
{
const vmmr_span_t *sa = a;
const vmmr_span_t *sb = b;
if (sa->vs_size == sb->vs_size) {
return (vmmr_cmp_addr(a, b));
} else if (sa->vs_size < sb->vs_size) {
return (-1);
} else {
return (1);
}
}
static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
const vmmr_span_t *sa = a;
const vmmr_span_t *sb = b;
if (sa->vs_region_addr == sb->vs_region_addr) {
return (0);
} else if (sa->vs_region_addr < sb->vs_region_addr) {
return (-1);
} else {
return (1);
}
}
static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
offsetof(vmmr_span_t, vs_by_addr));
avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
offsetof(vmmr_span_t, vs_by_size));
}
static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
void *vcp = NULL;
vmmr_span_t *span;
while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
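		/* Spans are freed below, when tearing down the by-size tree */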
}
while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
kmem_free(span, sizeof (*span));
}
avl_destroy(&tree->by_addr);
avl_destroy(&tree->by_size);
}
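
/*
 * Insert a span into a treepair, concatenating it with any existing span
 * that directly precedes or follows it in the address space.
 */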
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
avl_tree_t *by_addr = &tree->by_addr;
avl_tree_t *by_size = &tree->by_size;
vmmr_span_t *node;
avl_index_t where;
node = avl_find(by_addr, to_add, &where);
ASSERT3P(node, ==, NULL);
node = avl_nearest(by_addr, where, AVL_BEFORE);
if (node != NULL &&
(node->vs_addr + node->vs_size) == to_add->vs_addr) {
avl_remove(by_addr, node);
avl_remove(by_size, node);
node->vs_size += to_add->vs_size;
kmem_free(to_add, sizeof (*to_add));
to_add = node;
}
node = avl_nearest(by_addr, where, AVL_AFTER);
if (node != NULL &&
(to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
avl_remove(by_addr, node);
avl_remove(by_size, node);
node->vs_addr = to_add->vs_addr;
node->vs_size += to_add->vs_size;
avl_add(by_addr, node);
avl_add(by_size, node);
kmem_free(to_add, sizeof (*to_add));
return;
}
avl_add(by_addr, to_add);
avl_add(by_size, to_add);
}
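
/*
 * Remove a span of up to target_sz bytes from a treepair.  If the best-fit
 * span is larger than the target, it is split: the tail of the span is
 * returned to the caller while the (shrunken) head remains in the trees.
 * Splitting from the tail leaves the by-address key unchanged.  The
 * treepair must not be empty.
 */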
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
avl_tree_t *by_addr = &tree->by_addr;
avl_tree_t *by_size = &tree->by_size;
vmmr_span_t *span;
avl_index_t where;
ASSERT3U(target_sz, !=, 0);
ASSERT(!avl_is_empty(by_addr));
ASSERT(!avl_is_empty(by_size));
vmmr_span_t search = { .vs_size = target_sz };
span = avl_find(by_size, &search, &where);
if (span == NULL) {
span = avl_nearest(by_size, where, AVL_AFTER);
if (span == NULL) {
span = avl_nearest(by_size, where, AVL_BEFORE);
ASSERT3P(span, !=, NULL);
}
}
if (span->vs_size <= target_sz) {
avl_remove(by_size, span);
avl_remove(by_addr, span);
return (span);
} else {
uintptr_t start = span->vs_addr + span->vs_size - target_sz;
avl_remove(by_size, span);
span->vs_size -= target_sz;
avl_add(by_size, span);
vmmr_span_t *split_span =
kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
split_span->vs_addr = start;
split_span->vs_size = target_sz;
return (split_span);
}
}
static int
vmmr_kstat_update(struct kstat *ksp, int rw)
{
vmmr_kstats_t *vkp = ksp->ks_data;
mutex_enter(&vmmr_lock);
vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
vkp->vmrks_bytes_transient.value.ui64 =
vmmr_alloc_transient_sz + vmmr_free_transient_sz;
vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
mutex_exit(&vmmr_lock);
return (0);
}
int
vmmr_init()
{
mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);
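
	/*
	 * Cap the reservoir at all of physmem less 120% of pages_pp_maximum
	 * (pages which must remain available and unlocked), as a guard
	 * against pushing the system into memory starvation.
	 */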
vmmr_total_limit =
(((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;
vmmr_empty_last = 0;
vmmr_free_sz = 0;
vmmr_alloc_sz = 0;
vmmr_empty_sz = 0;
vmmr_adding_sz = 0;
vmmr_free_transient_sz = 0;
vmmr_alloc_transient_sz = 0;
vmmr_target_sz = VMMR_TARGET_INACTIVE;
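
	/*
	 * Attempt the kstat allocation now, since it is the only fallible
	 * step of initialization.
	 */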
kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
if (ksp == NULL) {
mutex_destroy(&vmmr_lock);
return (ENOMEM);
}
vmmr_kstats_t *vkp = ksp->ks_data;
kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
KSTAT_DATA_UINT64);
kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
KSTAT_DATA_UINT64);
kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
KSTAT_DATA_UINT64);
kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
KSTAT_DATA_UINT64);
ksp->ks_private = NULL;
ksp->ks_update = vmmr_kstat_update;
vmmr_kstat = ksp;
vmmr_tp_init(&vmmr_free_tp);
vmmr_tp_init(&vmmr_empty_tp);
list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
offsetof(vmmr_region_t, vr_node));
vmmr_va_sz = physmem * PAGESIZE;
vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
kstat_install(vmmr_kstat);
return (0);
}
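
/*
 * Tear down the reservoir.  The caller must have already drained it of all
 * capacity, allocated and free alike.
 */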
void
vmmr_fini()
{
mutex_enter(&vmmr_lock);
VERIFY3U(vmmr_alloc_sz, ==, 0);
VERIFY3U(vmmr_free_sz, ==, 0);
VERIFY3U(vmmr_adding_sz, ==, 0);
VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
VERIFY3U(vmmr_free_transient_sz, ==, 0);
VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
VERIFY(list_is_empty(&vmmr_alloc_regions));
kstat_delete(vmmr_kstat);
vmmr_kstat = NULL;
vmmr_tp_destroy(&vmmr_free_tp);
vmmr_tp_destroy(&vmmr_empty_tp);
list_destroy(&vmmr_alloc_regions);
vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
vmmr_va = 0;
vmmr_va_sz = 0;
vmmr_total_limit = 0;
vmmr_empty_last = 0;
mutex_exit(&vmmr_lock);
mutex_destroy(&vmmr_lock);
}
bool
vmmr_is_empty()
{
mutex_enter(&vmmr_lock);
bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
mutex_exit(&vmmr_lock);
return (res);
}
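
/*
 * Allocate a region of sz (page-aligned) bytes from the reservoir.  A
 * non-transient request is satisfied only from existing free capacity; a
 * transient request grows the reservoir by sz on the spot, and that
 * capacity is removed again when the region is freed.
 *
 * A sketch of hypothetical consumer usage (not part of this file):
 *
 *	vmmr_region_t *region;
 *	if (vmmr_alloc(len, false, &region) == 0) {
 *		void *va = vmmr_region_mem_at(region, 0);
 *		...
 *		vmmr_free(region);
 *	}
 */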
int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
VERIFY3U(sz & PAGEOFFSET, ==, 0);
if (!transient) {
mutex_enter(&vmmr_lock);
if (sz > vmmr_free_sz) {
mutex_exit(&vmmr_lock);
return (ENOSPC);
}
} else {
int err;
mutex_enter(&vmmr_lock);
err = vmmr_add(sz, true);
if (err != 0) {
mutex_exit(&vmmr_lock);
return (err);
}
VERIFY3U(vmmr_free_transient_sz, >=, sz);
}
vmmr_region_t *region;
region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
region->vr_size = sz;
size_t remain = sz;
uintptr_t map_at = 0;
while (remain > 0) {
vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
VERIFY3P(span, !=, NULL);
ASSERT3U(span->vs_size, <=, remain);
span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
map_at += span->vs_size;
remain -= span->vs_size;
}
if (!transient) {
vmmr_free_sz -= sz;
vmmr_alloc_sz += sz;
} else {
vmmr_free_transient_sz -= sz;
vmmr_alloc_transient_sz += sz;
region->vr_transient = true;
}
list_insert_tail(&vmmr_alloc_regions, region);
mutex_exit(&vmmr_lock);
*resp = region;
return (0);
}
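
/*
 * Get a kernel-addressable mapping (via the kpm segment) for a page-aligned
 * offset within a region.
 */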
void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}
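
/* Look up the PFN backing a page-aligned offset within a region. */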
pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
VERIFY3U(off & PAGEOFFSET, ==, 0);
VERIFY3U(off, <, region->vr_size);
vmmr_span_t search = {
.vs_region_addr = off
};
avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);
if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
ASSERT3P(span, !=, NULL);
}
uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
VERIFY(pp != NULL);
return (pp->p_pagenum);
}
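
/*
 * Free a region: zero its contents (with vmmr_lock dropped), return its
 * spans to the free pool, and, for a transient region, shrink the reservoir
 * by the corresponding amount.
 */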
void
vmmr_free(vmmr_region_t *region)
{
mutex_enter(&vmmr_lock);
if (!region->vr_transient) {
VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
} else {
VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
}
list_remove(&vmmr_alloc_regions, region);
mutex_exit(&vmmr_lock);
for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
bzero(vmmr_region_mem_at(region, off), PAGESIZE);
}
mutex_enter(&vmmr_lock);
void *cookie = NULL;
vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
span->vs_region_addr = 0;
vmmr_tp_insert_concat(span, &vmmr_free_tp);
}
	avl_destroy(&region->vr_spans);
if (!region->vr_transient) {
vmmr_free_sz += region->vr_size;
vmmr_alloc_sz -= region->vr_size;
} else {
vmmr_free_transient_sz += region->vr_size;
vmmr_alloc_transient_sz -= region->vr_size;
}
if (region->vr_transient) {
VERIFY0(vmmr_remove(region->vr_size, true));
}
kmem_free(region, sizeof (*region));
mutex_exit(&vmmr_lock);
}
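
/*
 * Release the pages backing a span to the system.  Pages held in the
 * reservoir are kept under a shared lock, so each must be upgraded (or
 * re-acquired) to an exclusive lock before being destroyed.
 */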
static void
vmmr_destroy_pages(vmmr_span_t *span)
{
const uintptr_t end = span->vs_addr + span->vs_size;
struct vnode *vp = &kvps[KV_VVP];
for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
page_t *pp;
pp = page_find(vp, (u_offset_t)pos);
VERIFY(pp != NULL);
if (!page_tryupgrade(pp)) {
page_unlock(pp);
pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
}
pp->p_lckcnt = 0;
page_destroy(pp, 0);
}
}
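
/*
 * Allocate (and zero) non-relocatable pages to back a span.  On failure,
 * any pages already created for the span are destroyed before returning
 * ENOMEM.
 */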
static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
struct seg kseg = {
.s_as = &kas
};
struct vnode *vp = &kvps[KV_VVP];
const uintptr_t end = span->vs_addr + span->vs_size;
for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
page_t *pp;
pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));
if (pp == NULL) {
if (pos != span->vs_addr) {
vmmr_span_t destroy_span = {
.vs_addr = span->vs_addr,
.vs_size = pos - span->vs_addr,
};
vmmr_destroy_pages(&destroy_span);
}
return (ENOMEM);
}
ASSERT(PAGE_EXCL(pp));
page_io_unlock(pp);
pp->p_lckcnt = 1;
page_downgrade(pp);
bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
}
return (0);
}
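
/*
 * Waiting callback for page_xresv(): sleep for a quarter second between
 * availability checks, returning 0 (give up the wait) if interrupted by a
 * signal.
 */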
static int
vmmr_resv_wait()
{
if (delay_sig(hz >> 2) != 0) {
return (0);
}
return (1);
}
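
/*
 * Strip sz bytes from the free treepair, destroying the backing pages and
 * moving the emptied spans to vmmr_empty_tp.  The caller is responsible for
 * the matching page_unresv() and free-count adjustment.
 */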
static void
vmmr_remove_raw(size_t sz)
{
VERIFY3U(sz & PAGEOFFSET, ==, 0);
VERIFY(MUTEX_HELD(&vmmr_lock));
size_t remain = sz;
while (remain > 0) {
vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);
VERIFY3P(span, !=, NULL);
ASSERT3U(span->vs_size, <=, remain);
vmmr_destroy_pages(span);
remain -= span->vs_size;
vmmr_tp_insert_concat(span, &vmmr_empty_tp);
}
vmmr_empty_sz += sz;
}
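
/*
 * Grow the reservoir by sz bytes, preferring to re-use empty (previously
 * reserved) address space before extending it at vmmr_empty_last.  Called
 * and returning with vmmr_lock held, though the lock is dropped around the
 * blocking page reservation and the page allocations themselves.
 */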
static int
vmmr_add(size_t sz, bool transient)
{
VERIFY3U(sz & PAGEOFFSET, ==, 0);
VERIFY3U(sz, >, 0);
VERIFY(MUTEX_HELD(&vmmr_lock));
const size_t current_total =
vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
vmmr_alloc_transient_sz + vmmr_free_transient_sz;
if ((current_total + sz) < current_total) {
return (EOVERFLOW);
}
if ((current_total + sz) > vmmr_total_limit) {
return (ENOSPC);
}
vmmr_adding_sz += sz;
mutex_exit(&vmmr_lock);
if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
mutex_enter(&vmmr_lock);
vmmr_adding_sz -= sz;
return (EINTR);
}
mutex_enter(&vmmr_lock);
size_t added = 0;
size_t remain = sz;
while (added < sz) {
vmmr_span_t *span = NULL;
if (vmmr_empty_sz > 0) {
span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);
vmmr_empty_sz -= span->vs_size;
} else {
span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
span->vs_addr = vmmr_empty_last;
span->vs_size = remain;
vmmr_empty_last += remain;
}
VERIFY3P(span, !=, NULL);
mutex_exit(&vmmr_lock);
int err = vmmr_alloc_pages(span);
mutex_enter(&vmmr_lock);
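		/*
		 * If page allocation for the span failed, unwind everything:
		 * return the span (and any capacity added so far) to the
		 * empty pool and release the full page reservation.
		 */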
if (err != 0) {
vmmr_empty_sz += span->vs_size;
vmmr_tp_insert_concat(span, &vmmr_empty_tp);
if (added != 0) {
vmmr_remove_raw(added);
}
vmmr_adding_sz -= sz;
page_unresv(sz >> PAGESHIFT);
return (err);
}
added += span->vs_size;
remain -= span->vs_size;
vmmr_tp_insert_concat(span, &vmmr_free_tp);
}
if (!transient) {
vmmr_free_sz += added;
} else {
vmmr_free_transient_sz += added;
}
ASSERT3U(added, ==, sz);
vmmr_adding_sz -= added;
return (0);
}
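
/*
 * Shrink the reservoir by sz bytes, which must not exceed the relevant free
 * count.  Expects vmmr_lock to be held.
 */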
static int
vmmr_remove(size_t sz, bool transient)
{
VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY3U(sz, >, 0);
VERIFY(MUTEX_HELD(&vmmr_lock));
if ((!transient && sz > vmmr_free_sz) ||
(transient && sz > vmmr_free_transient_sz)) {
return (ENOSPC);
}
vmmr_remove_raw(sz);
if (!transient) {
vmmr_free_sz -= sz;
} else {
vmmr_free_transient_sz -= sz;
}
page_unresv(sz >> PAGESHIFT);
return (0);
}
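
/*
 * Grow or shrink the reservoir until its (allocated + free) capacity
 * matches target_sz, working in chunk_sz increments (when non-zero) so the
 * loop stays responsive to signals.  Only one resize may be in flight at a
 * time (else EALREADY); the resulting size is written to *resp even on
 * failure.
 */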
static int
vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
{
VERIFY(resp != NULL);
mutex_enter(&vmmr_lock);
size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;
*resp = current_sz;
if ((target_sz & PAGEOFFSET) != 0 ||
(chunk_sz & PAGEOFFSET) != 0) {
mutex_exit(&vmmr_lock);
return (EINVAL);
}
if (target_sz == VMMR_TARGET_INACTIVE) {
mutex_exit(&vmmr_lock);
return (EINVAL);
}
if (target_sz == current_sz) {
mutex_exit(&vmmr_lock);
return (0);
}
if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
mutex_exit(&vmmr_lock);
return (EALREADY);
}
vmmr_target_sz = target_sz;
int err = 0;
do {
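		/* Bail out if the calling process has been signaled */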
if (issig(JUSTLOOKING) != 0) {
mutex_exit(&vmmr_lock);
const bool sig_bail = issig(FORREAL) != 0;
mutex_enter(&vmmr_lock);
if (sig_bail) {
err = EINTR;
break;
}
}
if (current_sz > target_sz) {
size_t req_sz = current_sz - target_sz;
if (chunk_sz != 0) {
req_sz = MIN(req_sz, chunk_sz);
}
err = vmmr_remove(req_sz, false);
} else {
ASSERT(current_sz < target_sz);
size_t req_sz = target_sz - current_sz;
if (chunk_sz != 0) {
req_sz = MIN(req_sz, chunk_sz);
}
err = vmmr_add(req_sz, false);
}
current_sz = vmmr_alloc_sz + vmmr_free_sz;
} while (err == 0 && current_sz != target_sz);
vmmr_target_sz = VMMR_TARGET_INACTIVE;
mutex_exit(&vmmr_lock);
*resp = current_sz;
return (err);
}
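
/*
 * ioctl entry for reservoir operations: VMM_RESV_QUERY reports the current
 * sizes and limit; VMM_RESV_SET_TARGET (privileged) resizes the reservoir.
 *
 * A hypothetical userland caller (assuming the vmm control device path)
 * might resize the reservoir like so:
 *
 *	struct vmm_resv_target tgt = {
 *		.vrt_target_sz = 1024 * 1024 * 1024,
 *	};
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	if (ioctl(fd, VMM_RESV_SET_TARGET, &tgt) != 0) { ... }
 */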
int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
switch (cmd) {
case VMM_RESV_QUERY: {
struct vmm_resv_query res;
void *datap = (void *)(uintptr_t)arg;
mutex_enter(&vmmr_lock);
res.vrq_free_sz = vmmr_free_sz;
res.vrq_alloc_sz = vmmr_alloc_sz;
res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
res.vrq_limit = vmmr_total_limit;
mutex_exit(&vmmr_lock);
if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
return (EFAULT);
}
break;
}
case VMM_RESV_SET_TARGET: {
if (secpolicy_sys_config(cr, B_FALSE) != 0) {
return (EPERM);
}
struct vmm_resv_target tgt;
void *datap = (void *)(uintptr_t)arg;
if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
return (EFAULT);
}
int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
&tgt.vrt_result_sz);
if (err == 0 || err == EINTR) {
if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
err = EFAULT;
}
}
return (err);
}
default:
return (ENOTTY);
}
return (0);
}