#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>
/*
 * Balloon driver state.  Mutable state below is protected by bln_mutex
 * (see the MUTEX_HELD asserts in balloon_page_add()/balloon_page_sub()).
 */
static bln_stats_t bln_stats;		/* counters reported by balloon_values() */
static kthread_t *bln_thread;		/* worker; created once at xenstore-up */
static kmutex_t bln_mutex;		/* protects bln_stats and the spare list */
static kcondvar_t bln_cv;		/* signalled when a new target arrives */
static struct xenbus_watch bln_watch;	/* watch on the "memory/target" node */
/* Highest MFN recently granted; used as a hint for page_get_high_mfn(). */
static mfn_t new_high_mfn;
/*
 * Singly-linked list of spare page_t structures.  It can contain both
 * MFN-backed pages (kept at the front) and pages whose pagenum has no
 * backing frame yet (kept at the back); see balloon_page_add().
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;
/* Tunable: scrub page contents before returning frames to the hypervisor. */
int balloon_zero_memory = 1;
/* Tunable: refuse to balloon down when free kernel memory is below this. */
size_t balloon_minkmem = (8 * 1024 * 1024);
/*
 * Tunable: batches of this many pfn reassignments or more clear and lock
 * the contiguous-pfn list up front instead of fixing it up per page.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);
/*
 * Decide whether an upcoming batch of `count` pfn reassignments is large
 * enough to justify clearing and locking the contiguous-pfn list once,
 * rather than updating it per page.  Returns 1 if the list was locked
 * (caller must later call unlock_contig_pfnlist()), 0 otherwise.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
	if (count <= bln_contig_list_quota)
		return (0);

	clear_and_lock_contig_pfnlist();
	return (1);
}
/*
 * Add a page_t to the spare-page list.  The page must be exclusively
 * locked and the caller must hold bln_mutex.
 *
 * Pages whose pagenum is >= mfn_count have no backing MFN yet (such
 * page_t structures can be created by balloon_init_new_pages()); they
 * are appended at the back, in ascending pagenum order, so that
 * balloon_page_sub() can detect them and refuse to hand them out.
 * Normal, MFN-backed pages are pushed on the front.
 */
static void
balloon_page_add(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(MUTEX_HELD(&bln_mutex));

	pp->p_prev = NULL;
	if (bln_spare_list_front == NULL) {
		/* Empty list: pp becomes both front and back. */
		bln_spare_list_front = bln_spare_list_back = pp;
		pp->p_next = NULL;
	} else if (pp->p_pagenum >= mfn_count) {
		/*
		 * No MFN backs this pagenum - append, keeping the
		 * MFN-less pages together at the tail in ascending order.
		 */
		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
		bln_spare_list_back->p_next = pp;
		pp->p_next = NULL;
		bln_spare_list_back = pp;
	} else {
		/* MFN-backed page - push on the front for quick reuse. */
		pp->p_next = bln_spare_list_front;
		bln_spare_list_front = pp;
	}
}
/*
 * Remove and return a page_t from the front of the spare-page list, or
 * NULL if no usable page is available.  Caller must hold bln_mutex.
 *
 * MFN-less pages are kept at the back of the list in ascending pagenum
 * order (see balloon_page_add()); once the front page's pagenum reaches
 * mfn_count, everything remaining lacks a backing frame, so we return
 * NULL rather than hand one out.
 */
static page_t *
balloon_page_sub(void)
{
	page_t *pp;

	ASSERT(MUTEX_HELD(&bln_mutex));
	if (bln_spare_list_front == NULL) {
		return (NULL);
	}

	pp = bln_spare_list_front;
	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_pagenum <= mfn_count);
	if (pp->p_pagenum == mfn_count) {
		/* Only MFN-less pages remain; nothing usable. */
		return (NULL);
	}

	bln_spare_list_front = pp->p_next;
	if (bln_spare_list_front == NULL)
		bln_spare_list_back = NULL;
	pp->p_next = NULL;
	return (pp);
}
/*
 * Layout that balloon_init_new_pages() overlays on its freshly mapped
 * "meta" pages: a memseg and a memlist header followed by an array of
 * page_t structures.  pages[] is declared with one element but is really
 * variable-length (old-style flexible array member).
 */
typedef struct {
	struct memseg memseg;
	struct memlist memlist;
	page_t pages[1];
} mem_structs_t;

/* Fixed overhead preceding the page_t array within a mem_structs_t. */
#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))
/*
 * Create page_t structures (and the associated memseg/memlist plumbing)
 * for pages the system has never seen before.  Some of the MFNs just
 * granted by the hypervisor (framelist[0..count-1]) are consumed as
 * "meta" pages to hold the new page_t array; the remaining page_t
 * structures are placed on the balloon spare list so the caller can
 * attach the leftover frames to them.
 *
 * Returns the number of frames consumed as meta pages (<= count).
 * Caller holds bln_mutex (required by balloon_page_add()).
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t	metapgs, totalpgs, num_pages;
	paddr_t	metasz;
	pfn_t	meta_start;
	page_t	*page_array;
	caddr_t	va;
	int	i, rv, locked;
	mem_structs_t *mem;
	struct memseg *segp;

	/* Total pages we are ultimately trying to add to the system. */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * Meta pages needed to hold page_t structures for all new pages;
	 * algebraically this equals
	 * totalpgs * sizeof (page_t) / (PAGESIZE + sizeof (page_t)),
	 * truncated.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	/* Also need room for the memseg/memlist header; maybe one more. */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs derives from totalpgs, which may far exceed count.  If
	 * this batch can't cover it, use the whole batch as meta pages;
	 * a later call will supply the remaining frames.
	 */
	if (metapgs > count)
		metapgs = count;

	/* How many page_t structures actually fit in the meta pages. */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * Only `count` frames were granted in this call, but we may
	 * initialize more page_t structures than that (num_pages can
	 * exceed count).  mfn_count marks the boundary beyond which
	 * pagenums have no backing MFN; balloon_page_add()/
	 * balloon_page_sub() use it to keep such pages off-limits.
	 */
	mfn_count += count;

	/* VA space for the memseg/memlist header plus the page_t array. */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/* Bind the new pfns of the meta pages to the granted MFNs. */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/* Map and zero the meta pages before laying structures over them. */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * page_t structures for the meta pages themselves: exclusively
	 * locked and never placed on the spare list.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * The remaining page_t structures go on the balloon spare list;
	 * frames will be attached to them by the caller (or later calls).
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/* Describe the new range with a memseg appended to the list. */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/* Update memory-node and phys_install bookkeeping. */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}
/* Number of ulong_t-sized frame entries that fit in one page. */
#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))

/*
 * Scratch arrays for batched hypervisor reservation calls.  The worker
 * thread holds bln_mutex across both balloon_inc_reservation() and
 * balloon_dec_reservation(), which serializes use of these.
 */
static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t pfn_frames[FRAME_ARRAY_SIZE];
/*
 * Grow the reservation: ask the hypervisor for up to `credit` frames,
 * attach them to spare page_t structures (creating new ones via
 * balloon_init_new_pages() when the spare list runs dry), and free the
 * now-backed pages into the system.  Returns the number of pages added
 * (0 on failure); the worker retries until the target is reached.
 * Caller holds bln_mutex.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int	i, cnt, locked;
	int	meta_pg_start, meta_pg_end;
	long	rv;
	page_t	*pp;
	page_t	*new_list_front, *new_list_back;

	ASSERT(MUTEX_HELD(&bln_mutex));
	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;

	bzero(mfn_frames, PAGESIZE);

	/* One scratch-array batch per call. */
	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	/* Raw MFNs are in flight below; migration would invalidate them. */
	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	for (i = 0; i < rv; i++) {
		/* Remember the highest MFN seen - shrink hint (see dec). */
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * Out of spare page_t structures: consume the
			 * frames starting at i as meta pages to create
			 * more.  Those frames get no page_t of their own;
			 * record the gap [meta_pg_start, meta_pg_end) so
			 * the assignment loops below can skip it.
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			/* Still no page_t available - stop early. */
			break;
		}

		/* Queue the page_t on a private list, preserving order. */
		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
	cnt = i;

	/*
	 * Bind the granted MFNs to the queued page_t structures, skipping
	 * the frames that became meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(cnt);
	for (i = 0, pp = new_list_front; i < meta_pg_start;
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/* Every queued page must have received a frame. */
	ASSERT(pp == NULL);

	/* Hand the now-backed pages to the system free list. */
	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}

	ASSERT(cnt == i);

	/*
	 * Frames we couldn't attach (no page_t available) go straight
	 * back to the hypervisor.
	 */
	if (cnt < rv) {
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif
		(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);
		rv = cnt;
	}
	xen_allow_migrate();

	/*
	 * Meta pages never reach the freelist; adjust the page
	 * reservation accounting for the freed pages only.
	 */
	page_unresv(rv - (meta_pg_end - meta_pg_start));
	return (rv);
}
/*
 * Shrink the reservation: pull up to `debit` pages out of the system,
 * scrub them (if balloon_zero_memory), detach their MFNs, and return the
 * frames to the hypervisor.  The page_t structures are parked on the
 * spare list for reuse by a later balloon_inc_reservation().  Returns
 * the number of pages given back; the worker retries until the target
 * is reached.  Caller holds bln_mutex (balloon_page_add() asserts it).
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
	int	i, locked;
	long	rv;
	ulong_t	request;
	page_t	*pp;

	bzero(mfn_frames, sizeof (mfn_frames));
	bzero(pfn_frames, sizeof (pfn_frames));

	/* One scratch-array batch per call. */
	if (debit > FRAME_ARRAY_SIZE) {
		debit = FRAME_ARRAY_SIZE;
	}
	request = debit;

	/*
	 * Don't balloon down while kernel memory is already scarce; try
	 * a reap first and give up if that doesn't free enough.
	 */
	if (kmem_avail() < balloon_minkmem) {
		kmem_reap();
		if (kmem_avail() < balloon_minkmem)
			return (0);
	}

	if (page_resv(request, KM_NOSLEEP) == 0) {
		return (0);
	}
	xen_block_migrate();
	for (i = 0; i < debit; i++) {
		/*
		 * Prefer pages near the highest MFN recently granted
		 * (hint recorded by balloon_inc_reservation()); the hint
		 * is single-use.
		 */
		pp = page_get_high_mfn(new_high_mfn);
		new_high_mfn = 0;
		if (pp == NULL) {
			/*
			 * No page available: reap and retry once, but
			 * stop early rather than starve the kernel.
			 */
			kmem_reap();
			if (kmem_avail() < balloon_minkmem ||
			    (pp = page_get_high_mfn(0)) == NULL) {
				debit = i;
				break;
			}
		}
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!hat_page_is_mapped(pp));

		balloon_page_add(pp);
		pfn_frames[i] = pp->p_pagenum;
		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
	}

	/* Collected nothing - undo the reservation and bail. */
	if (debit == 0) {
		xen_allow_migrate();
		page_unresv(request);
		return (0);
	}

	/* Scrub page contents before the frames leave the domain. */
	if (balloon_zero_memory) {
		for (i = 0; i < debit; i++) {
			pfnzero(pfn_frames[i], 0, PAGESIZE);
		}
	}

	/* Break the pfn->mfn bindings for the departing pages. */
	locked = balloon_lock_contig_pfnlist(debit);
	for (i = 0; i < debit; i++) {
		reassign_pfn(pfn_frames[i], MFN_INVALID);
	}
	if (locked)
		unlock_contig_pfnlist();

	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

	if (rv < 0) {
		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
		rv = 0;
	} else if (rv != debit) {
		/* Partial success is not expected from this hypercall. */
		panic("Unexpected return value (%ld) from decrease reservation "
		    "hypervisor call", rv);
	}

	xen_allow_migrate();
	/* Release the reservation held for pages we failed to collect. */
	if (debit != request)
		page_unresv(request - debit);
	return (rv);
}
/*
 * xenbus watch callback, invoked when this domain's "memory/target"
 * xenstore node changes.  Reads the new target (in KB), converts it to
 * pages, clamps it to the original memory size, records it for the
 * worker thread, and wakes that thread.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	ulong_t new_target_kb;
	pgcnt_t new_target_pages;
	int rv;
	static uchar_t warning_cnt = 0;	/* non-zero once we have warned */

	rv = xenbus_scanf(0, "memory", "target", "%lu", &new_target_kb);
	if (rv != 0) {
		/* Node missing or unparseable - ignore this event. */
		return;
	}

	/* The xenstore value is in KB; we work in pages. */
	new_target_pages = kbtop(new_target_kb);

	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

	mutex_enter(&bln_mutex);
	if (new_target_pages > bln_stats.bln_max_pages) {
		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
		/*
		 * Growing beyond the original memory size is not
		 * supported; clamp to the maximum.  Warn exactly once,
		 * and only in a domU (dom0 targets are adjusted
		 * routinely, so stay quiet there).
		 *
		 * Bug fix: the guard previously read
		 * (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0),
		 * which skipped the first occurrence and then warned on
		 * every subsequent one - the opposite of the warn-once
		 * intent implied by setting warning_cnt below.
		 */
		if (!DOMAIN_IS_INITDOMAIN(xen_info) && warning_cnt == 0) {
			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
			    "larger than original memory size (0x%lx pages). "
			    "Ballooning beyond original memory size is not "
			    "allowed.",
			    new_target_pages, bln_stats.bln_max_pages);
		}
		warning_cnt = 1;
		bln_stats.bln_new_target = bln_stats.bln_max_pages;
	} else {
		bln_stats.bln_new_target = new_target_pages;
	}
	mutex_exit(&bln_mutex);

	/* Wake the worker thread to act on the new target. */
	cv_signal(&bln_cv);
}
/*
 * Tunables for the worker thread's retry back-off: after a pass that
 * makes no progress, the worker sleeps with a timeout that starts at
 * bln_wait_sec seconds and grows by << bln_wait_shift on each failure.
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;
/*
 * The balloon worker thread: sleeps until balloon_handler() posts a new
 * target (or a back-off timeout expires), then grows or shrinks the
 * reservation one batch at a time until bln_current_pages matches
 * bln_new_target.  Runs forever; marked CPR-safe while waiting.
 */
static void
balloon_worker_thread(void)
{
	uint_t		bln_wait;
	callb_cpr_t	cprinfo;
	spgcnt_t	rv;

	bln_wait = bln_wait_sec;

	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
	for (;;) {
		rv = 0;

		mutex_enter(&bln_mutex);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);

		/*
		 * If the previous pass left the request unfinished, sleep
		 * with a timeout so we retry even without a new-target
		 * signal; otherwise block until balloon_handler() signals.
		 */
		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			(void) cv_reltimedwait(&bln_cv, &bln_mutex,
			    (bln_wait * hz), TR_CLOCK_TICK);
		} else {
			cv_wait(&bln_cv, &bln_mutex);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

		/* One batch of work toward the target, under bln_mutex. */
		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			if (bln_stats.bln_new_target <
			    bln_stats.bln_current_pages) {
				/* Shrink: rv becomes a negative delta. */
				rv = -balloon_dec_reservation(
				    bln_stats.bln_current_pages -
				    bln_stats.bln_new_target);
			} else if (bln_stats.bln_new_target >
			    bln_stats.bln_current_pages) {
				/* Grow: rv becomes a positive delta. */
				rv = balloon_inc_reservation(
				    bln_stats.bln_new_target -
				    bln_stats.bln_current_pages);
			}
		}

		if (rv == 0) {
			/* No progress this pass: back off exponentially. */
			if (bln_wait == 0) {
				bln_wait = 1;
			} else {
				bln_wait <<= bln_wait_shift;
			}
		} else {
			/* Progress: apply the delta, reset the back-off. */
			bln_stats.bln_current_pages += rv;
			bln_wait = bln_wait_sec;
		}

		/* Maintain the historic low/high watermarks. */
		if (bln_stats.bln_current_pages < bln_stats.bln_low)
			bln_stats.bln_low = bln_stats.bln_current_pages;
		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
			bln_stats.bln_high = bln_stats.bln_current_pages;

		mutex_exit(&bln_mutex);
	}
}
/*
 * xenstore state-change callback: once the xenstore connection is up,
 * register a watch on "memory/target" and start the balloon worker
 * thread (first transition only; bln_thread is created at most once).
 */
static void
balloon_config_watch(int state)
{
	if (state == XENSTORE_UP) {
		bln_watch.node = "memory/target";
		bln_watch.callback = balloon_handler;

		if (register_xenbus_watch(&bln_watch) != 0) {
			/* Without the watch there is nothing to react to. */
			cmn_err(CE_WARN, "Failed to register balloon watcher; "
			    "balloon thread will be disabled");
			return;
		}

		if (bln_thread == NULL) {
			bln_thread = thread_create(NULL, 0,
			    balloon_worker_thread, NULL, 0, &p0, TS_RUN,
			    minclsyspri);
		}
	}
}
/*
 * One-time balloon initialization, called with the domain's starting
 * page count.  Seeds the statistics, queries the hypervisor for the
 * hard reservation limit, and arranges for balloon_config_watch() to
 * run when xenstore comes up.
 */
void
balloon_init(pgcnt_t nr_pages)
{
	domid_t self = DOMID_SELF;

	/* Every statistic starts out at the boot-time allocation. */
	bln_stats.bln_current_pages = nr_pages;
	bln_stats.bln_new_target = nr_pages;
	bln_stats.bln_low = nr_pages;
	bln_stats.bln_high = nr_pages;
	bln_stats.bln_max_pages = nr_pages;

	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

	/* Ask the hypervisor how far this domain may ever grow. */
	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
	    XENMEM_maximum_reservation, &self);

	(void) xs_register_xenbus_callback(balloon_config_watch);
}
/*
 * Account for `delta` pages that arrived in this domain from the
 * hypervisor outside this module.  Same sign convention as
 * balloon_alloc_pages(): pages obtained from the hypervisor decrement
 * bln_hv_pages.  NOTE(review): caller semantics inferred from the
 * name - presumably invoked by balloon-aware drivers; confirm.
 */
void
balloon_drv_added(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}
/*
 * Account for `delta` pages this domain gave back to the hypervisor
 * outside this module.  Same sign convention as balloon_free_pages():
 * pages returned to the hypervisor increment bln_hv_pages.
 * NOTE(review): caller semantics inferred from the name - confirm.
 */
void
balloon_drv_subtracted(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}
/*
 * Ask the hypervisor for page_cnt frames, storing the granted MFNs in
 * the caller-supplied mfns array.  Returns the number of frames actually
 * granted (possibly fewer than requested, or a negative error).  On
 * success, bln_hv_pages is decremented by the number obtained.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
	xen_memory_reservation_t res;
	long granted;

	bzero(&res, sizeof (res));
	res.domid = DOMID_SELF;
	res.nr_extents = page_cnt;
	set_xen_guest_handle(res.extent_start, mfns);

	granted = HYPERVISOR_memory_op(XENMEM_increase_reservation, &res);
	if (granted > 0) {
		/* These frames now belong to us, not the hypervisor. */
		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -granted);
	}
	return (granted);
}
/*
 * Give page_cnt pages back to the hypervisor.  The pages may be named
 * in several ways, used in combination:
 *
 *   mfns - array of machine frame numbers; when supplied, all frames
 *          are released with a single XENMEM_decrease_reservation call.
 *   kva  - kernel VA of a contiguous mapping of the pages; when
 *          supplied, the memory is scrubbed through the mapping (if
 *          balloon_zero_memory) and unloaded here.
 *   pfns - array of page frame numbers; used to locate pfn/mfn pairs
 *          when no kva is given.
 *
 * When mfns is NULL, each page's mfn is looked up from its pfn and the
 * frames are released one hypercall at a time.  Panics if the
 * hypervisor refuses a frame.  Always returns page_cnt.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;

#if DEBUG
	/* Sanity-check that kva and pfns (when both given) agree. */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* Scrub through the VA mapping when we have one. */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	if ((kva != NULL) || (pfns != NULL)) {
		/*
		 * Per-page reassign_pfn() without locking the contiguous
		 * pfn list is only permitted for small batches.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);
		for (i = 0; i < page_cnt; i++) {
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}
			/* No mapping to scrub through - zero via the pfn. */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}
			/* Capture the mfn before breaking the binding. */
			mfn = pfn_to_mfn(pfn);
			reassign_pfn(pfn, MFN_INVALID);
			/*
			 * Without an mfns array, release this frame
			 * individually right away.
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/* With the MFNs in hand, release everything in one hypercall. */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	/* These frames now belong to the hypervisor. */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}
/*
 * Trade the page_cnt (== nextents << order) pages in pp[] for up to
 * nextents machine-contiguous extents of 2^order frames each, with MFNs
 * addressable in addr_bits bits.  dom0 only.
 *
 * The original frames are first returned to the hypervisor one page at
 * a time; one XENMEM_increase_reservation then requests the contiguous
 * extents, whose MFNs are bound back onto the pages' pfns in order.  If
 * fewer than nextents extents are granted, the shortfall is re-acquired
 * as ordinary order-0 frames so no pages are lost.
 *
 * mfns receives the starting MFN of each granted extent.  Returns the
 * number of contiguous extents obtained (possibly 0).
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long fallback_cnt;
	long cnt;
	uint_t i, j, page_cnt, extlen;
	long e;
	int locked;

	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

	extlen = 1 << order;
	page_cnt = nextents * extlen;

	/* Hand every original frame back, one page at a time. */
	for (i = 0; i < page_cnt; i++) {
		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
		if (cnt != 1) {
			cmn_err(CE_PANIC, "balloon: unable to give a page back "
			    "to the hypervisor.\n");
		}
	}

	/*
	 * Request nextents contiguous extents of 2^order frames,
	 * restricted to addr_bits-addressable machine memory.
	 */
	bzero(&memres, sizeof (memres));
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = nextents;
	memres.mem_flags = XENMEMF_address_bits(addr_bits);
	memres.extent_order = order;
	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);

	/* Bind the granted extents back onto the pages' pfns, in order. */
	locked = balloon_lock_contig_pfnlist(cnt * extlen);
	for (i = 0; i < cnt; i++) {
		for (j = 0; j < extlen; j++) {
			reassign_pfn(pp[i * extlen + j]->p_pagenum,
			    mfns[i] + j);
		}
	}
	if (locked)
		unlock_contig_pfnlist();

	if (cnt != nextents) {
		/*
		 * Partial (or failed, cnt < 0) grant: re-acquire the
		 * remaining pages as ordinary frames so none are lost.
		 */
		if (cnt < 0) {
			cnt = 0;
		}
		fallback_cnt = page_cnt - cnt * extlen;
		bzero(&memres, sizeof (memres));
		set_xen_guest_handle(memres.extent_start, mfns);
		memres.domid = DOMID_SELF;
		memres.nr_extents = fallback_cnt;
		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
		if (e != fallback_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to recover from "
			    "failed increase_reservation.\n");
		}
		/* Bind the fallback frames to the still-unbacked pages. */
		locked = balloon_lock_contig_pfnlist(fallback_cnt);
		for (i = 0; i < fallback_cnt; i++) {
			uint_t offset = page_cnt - fallback_cnt;
			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
		}
		if (locked)
			unlock_contig_pfnlist();
	}

	/*
	 * balloon_free_pages() above bumped bln_hv_pages by page_cnt in
	 * total; the frames re-acquired here cancel that out, leaving
	 * the net hypervisor page count unchanged.
	 */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);
	return (cnt);
}
size_t
balloon_values(int cmd)
{
switch (cmd) {
case BLN_IOCTL_CURRENT:
return (ptokb(bln_stats.bln_current_pages));
case BLN_IOCTL_TARGET:
return (ptokb(bln_stats.bln_new_target));
case BLN_IOCTL_LOW:
return (ptokb(bln_stats.bln_low));
case BLN_IOCTL_HIGH:
return (ptokb(bln_stats.bln_high));
case BLN_IOCTL_LIMIT:
return (ptokb(bln_stats.bln_hard_limit));
default:
panic("Unexpected cmd %d in balloon_values()\n", cmd);
}
}