#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>
#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>
/*
 * vpm_enable: master switch for the VPM (kernel virtual page mapping)
 * facility.  VPM is layered on segkpm, so it is only possible when
 * SEGKPM_SUPPORT is compiled in.
 */
#ifdef SEGKPM_SUPPORT
int vpm_enable = 1;
#else
int vpm_enable = 0;
#endif

#ifdef SEGKPM_SUPPORT

int vpm_cache_enable = 1;	/* use the vpmap cache (tunable) */
long vpm_cache_percent = 12;	/* cache size as a percentage of physmem */
long vpm_cache_size;		/* computed cache size in bytes */
int vpm_nfreelist = 0;		/* number of freelists; 0 = auto-size */
int vpmd_freemsk = 0;		/* vpm_nfreelist - 1 (power-of-two mask) */

/*
 * Per-CPU state, padded to VPM_S_PAD bytes to reduce false sharing
 * between CPUs' cache lines.
 */
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;	/* rotor: next freelist to use */
		ulong_t	vcpu_hits;	/* vpmap cache hits */
		ulong_t	vcpu_misses;	/* vpmap cache misses */
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;	/* indexed by CPU->cpu_seqid */

#define	vfree_ndx	vcpu.vcpu_free_ndx

int vpm_cachemode = VPMCACHE_LRU;	/* freelist selection policy */

/* Mutex protecting a page's p_vpmref field. */
#define	PPMTX(pp)	(&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* array of vpmap cache entries */
static struct vpmfree *vpmd_free;	/* vpm_nfreelist freelist heads */

#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
/* Map a vpmap entry to its freelist (and to its freelist index). */
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
/* vpmap ids are 1-based so that p_vpmref == 0 means "no mapping". */
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)

#ifdef	DEBUG
struct vpm_debug {
	int vpmd_steals;		/* entry reclaimed from another page */
	int vpmd_contend;		/* lost race installing p_vpmref */
	int vpmd_prevpagelocked;	/* old page locked during reuse */
	int vpmd_getpagefailed;		/* VOP_GETPAGE failures */
	int vpmd_zerostart;		/* leading partial-page zero fills */
	int vpmd_emptyfreelist;		/* freelist found empty */
	int vpmd_nofreevpms;		/* had to sleep for a free entry */
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

/*
 * Fault-injection counters: VPM_MTBF(v, f) evaluates false once every
 * (f + 1) increments of v, forcing the rare code path periodically.
 */
int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif	/* DEBUG */
/*
 * Initialize the VPM cache: size it from physmem, seed the per-CPU
 * freelist rotors, and thread every vpmap entry onto the release queue
 * of its freelist.  Called once at boot.  Disables VPM entirely when
 * segkpm is not enabled.
 *
 * Fix: the CE_WARN message for an out-of-range vpm_nfreelist tunable
 * previously printed max_ncpus while the code assigned 2 * max_ncpus;
 * the message now reports the value actually used.
 */
void
vpm_init()
{
	long npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	/* VPM is layered on segkpm mappings; without kpm it cannot work. */
	if (!kpm_enable) {
		vpm_enable = 0;
	}

	if (!vpm_enable || !vpm_cache_enable) {
		return;
	}

	/*
	 * Size the cache as a tunable percentage of physical memory,
	 * clamped to [VPMAP_MINCACHE, VPMAP_MAXCACHE].
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	if (vpm_cache_size > VPMAP_MAXCACHE) {
		vpm_cache_size = VPMAP_MAXCACHE;
	}

	/*
	 * Determine the number of freelists.  Out-of-range tunable
	 * values are clamped to 2 * max_ncpus; warn with the value
	 * that is actually assigned.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpmap create : number of freelist "
		"vpm_nfreelist %d using %d", vpm_nfreelist, 2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round up to a power of two so vpmd_freemsk works as a mask.
	 */
	if (!ISP2(vpm_nfreelist)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}

	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Spread the initial per-CPU freelist rotors across all the
	 * freelists so CPUs start on different lists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {
		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
	    KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {
		vpmflp = &vpmd_free[i];
		/* queue 0 starts as the alloc queue, queue 1 as release */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);
	vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP);

	/*
	 * Walk the vpmap array backwards, initializing each entry and
	 * inserting it into the circular list that forms its freelist's
	 * release queue.
	 */
	for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		prefetch_smap_w((void *)vpm);

		vpm->vpm_vp = NULL;
		vpm->vpm_off = 0;
		vpm->vpm_pp = NULL;
		vpm->vpm_refcnt = 0;
		mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL);
		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;
		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			/* empty queue: entry becomes a singleton ring */
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			/* insert before the current head of the ring */
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}
#define VPMAP_RMFREELIST(vpm) \
{ \
if (vpm->vpm_next != NULL) { \
union vpm_freeq *freeq; \
struct vpmfree *vpmflp; \
vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
mutex_enter(&freeq->vpmq_mtx); \
if (freeq->vpmq_free != vpm) { \
vpm->vpm_prev->vpm_next = vpm->vpm_next; \
vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
} else if (vpm == vpm->vpm_next) { \
freeq->vpmq_free = NULL; \
} else { \
freeq->vpmq_free = vpm->vpm_next; \
vpm->vpm_prev->vpm_next = vpm->vpm_next; \
vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
} \
mutex_exit(&freeq->vpmq_mtx); \
vpm->vpm_next = vpm->vpm_prev = NULL; \
} \
}
/*
 * Pick a freelist index for the calling CPU.  Each CPU keeps a rotor
 * (vcpu_free_ndx) that is advanced on every call under VPMCACHE_LRU
 * (the only policy implemented; the default case handles any other
 * mode value identically), spreading allocations across freelists.
 *
 * The per-CPU slot is accessed via CPU->cpu_seqid without locking and
 * CPU is re-read for the increment; preemption between the two reads
 * is harmless since the rotor is only a distribution hint.
 */
static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {

	case VPMCACHE_LRU:
	default:
		vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
		break;
	}
	return (ndx);
}
/*
 * Allocate a vpmap cache entry for newpage and install the mapping
 * (newpage->p_vpmref = VPMID(vpm)).  Starts at the caller's preferred
 * freelist and searches all freelists, swapping a freelist's alloc and
 * release queues when the alloc queue is empty, and sleeping on
 * vpm_free_cv as a last resort.  If another thread installs a mapping
 * for newpage first, that thread's entry is reused instead (or the
 * stale reference is cleared and the whole search retried).
 *
 * Returns with the chosen entry's vpm_mtx held.
 *
 * mutex_tryenter() is used where acquiring a lock while holding the
 * queue mutex could otherwise deadlock against free_vpmap() or other
 * allocators; every failed trylock path drops its locks and retries.
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * Pick the starting freelist; end_ndx remembers where the
	 * round-robin search began so we know when we have wrapped.
	 */
	free_ndx = get_freelndx(vpm_cachemode);
	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc queue is empty (or every entry on it was
		 * locked).  First check whether the queues toggled under
		 * us while we were acquiring the mutex.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queues swapped; start over */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/*
			 * Cannot take releq while holding allocq without
			 * risking deadlock.  Drop allocq, wait for releq
			 * to become available, and retry from the top.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is entirely empty.  Move to the
			 * next one; once we have wrapped back to where we
			 * started, sleep until free_vpmap() queues an
			 * entry here.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Both queue mutexes are held here.  Advertise
			 * that we are waiting, drop freeq[1]'s mutex, and
			 * sleep under freeq[0]'s — matching the signal
			 * protocol in free_vpmap().
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
			    &vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * The release queue has entries: swap the alloc
			 * and release queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				/*
				 * We previously failed to lock an old
				 * page; back off briefly to give its
				 * holder a chance to release it.
				 */
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Scan the circular alloc queue for an entry whose
		 * vpm_mtx can be taken without blocking.
		 */
		first = vpm;

next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Entry is busy; try the next one.  If we have
			 * seen them all, fall back to the release queue.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Try to claim newpage by installing our id in its
		 * p_vpmref; if it is already nonzero, another thread
		 * mapped the page concurrently.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * Unlink vpm from the alloc queue.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}
			if (allocq->vpmq_free == vpm)
				/* vpm was the only entry on the ring */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page, if any, that
			 * this entry still maps.  Its p_vpmref is cleared
			 * only if it still points back at this entry.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
			    vpm->vpm_vp == pp->p_vnode &&
			    vpm->vpm_off == pp->p_offset) {
				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
					    vpm->vpm_vp != pp->p_vnode ||
					    vpm->vpm_off != pp->p_offset ||
					    pp->p_vpmref != VPMID(vpm)) {
						/* page identity changed */
						mutex_exit(pmtx);
						page_unlock(pp);
					} else {
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						/* drop lock and page hold */
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * Old page is locked elsewhere;
					 * just sever the back-reference.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/* associate this entry with newpage */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;
		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);

			/*
			 * newpage already has an entry (vpmref).  Reuse
			 * it if it still maps newpage; otherwise — or
			 * when DEBUG fault-injects a "steal" on an idle
			 * entry — clear the stale reference and retry
			 * the search from the top.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);
				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/* idle on a freelist: pull it off */
				VPMAP_RMFREELIST(vpm);
			}
		}
		/* vpm_mtx is still held on both paths that reach here */
		return (vpm);
	}
}
/*
 * Return an idle vpmap entry (vpm_refcnt == 0) to the release queue of
 * its freelist.  Called with the entry's vpm_mtx held.  If an allocator
 * is waiting on this freelist (vpm_want != 0), wake it via vpm_free_cv;
 * the condition variable is signalled under freeq[0]'s mutex to match
 * the cv_wait() in get_free_vpmap().
 */
static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Record which queue (0 or 1) this entry is placed on so that
	 * VPMAP_RMFREELIST can find it again later.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		/* queue was empty: entry becomes a singleton ring */
		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Sample vpm_want before dropping the queue mutex; the
		 * actual wakeup is done under freeq[0]'s mutex, with
		 * vpm_want re-checked there.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		/* insert at the tail (just before the head) of the ring */
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}
/*
 * Get (and hold) a vpmap entry for pp, which the caller must have
 * SE-locked.  Fast path: pp->p_vpmref already names an entry that
 * still maps pp — count a hit and reuse it, pulling it off a freelist
 * if it was idle.  Otherwise fall back to get_free_vpmap() and count
 * a miss.  Returns with vpm_refcnt incremented and vpm_mtx released.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	/* VPM_MTBF periodically forces the slow path on DEBUG kernels */
	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		if (vpm->vpm_pp != pp) {
			/* stale reference: the entry was re-targeted */
			pmtx = PPMTX(pp);
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);
			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/* entry was idle on a freelist; claim it */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/* slow path: returns with vpm_mtx held */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}
/*
 * Look up or create pages of vp covering [baseoff, baseoff + len) and
 * fill vml[] with their kpm mappings; vml[] is NULL-terminated.  If
 * any page had to be created, *newpage is set to 1 (when newpage is
 * non-NULL).  With the vpm cache enabled, vs_data points at the
 * entry's vpm_pp slot so vpm_unmap_pages() can recover the entry via
 * offsetof(); otherwise vs_data is the page itself.  Returns 0.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;

	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;

		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * Create the page, sleeping for memory if
			 * necessary (PG_WAIT), so failure is fatal.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("segmap_pagecreate_vpm: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Record the mapping; see the block comment above for
		 * the two vs_data encodings.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}
		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}
/*
 * Map the pages of vp backing [off, off + len) into kernel virtual
 * addresses via kpm, filling vml[] (NULL-terminated).  The range is
 * clamped to VPMMAXPGS pages and to nseg - 1 vml entries.  When
 * fetchpage is clear this delegates to vpm_pagecreate().  Otherwise
 * pages are looked up individually; if any page is absent, unknown to
 * the vpm cache, or (for S_WRITE) not both modified and referenced,
 * all pages gathered so far are dropped and the whole range is fetched
 * with a single VOP_GETPAGE() call.  Returns 0 or the VOP_GETPAGE
 * error; on success every mapped page stays held SE_SHARED until
 * vpm_unmap_pages().
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;
	size_t tlen;

	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;

	tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
	/*
	 * Restrict the range to VPMMAXPGS pages.
	 */
	if (tlen > (VPMMAXPGS * PAGESIZE)) {
		tlen = VPMMAXPGS * PAGESIZE;
	}
	/*
	 * Also restrict it to the vml[] capacity; one slot is reserved
	 * for the NULL terminator.
	 */
	if (tlen > ((nseg - 1) * PAGESIZE)) {
		tlen = ((nseg - 1) * PAGESIZE);
	}

	len = tlen;

	/*
	 * Block devices must be mapped through the common block device
	 * vnode so all accesses share one page cache identity.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	/*
	 * Note: the for-increment clause NULL-terminates pplist[] after
	 * every completed iteration (it runs i++ first).
	 */
	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * Fall back to VOP_GETPAGE for the whole range when a
		 * page is missing, not referenced by the vpm cache, or
		 * lacks the mod+ref bits required for a write.
		 */
		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
		    != (P_MOD | P_REF))) {
			int j;

			if (pp != NULL) {
				page_unlock(pp);
			}
			/* release the pages gathered so far */
			for (j = 0; j < i; j++) {
				page_unlock(pplist[j]);
			}

			/* give VOP_GETPAGE a valid advisory address */
			baseoff = off & (offset_t)PAGEMASK;
			base = segkpm_create_va(baseoff);
			error = VOP_GETPAGE(vp, baseoff, tlen, &prot, pplist,
			    tlen, segkmap, base, rw, CRED(), NULL);
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[0] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		/* drop any pages VOP_GETPAGE returned before failing */
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Map each page and record it in vml[]; see vpm_pagecreate()
	 * for the two vs_data encodings.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}
		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}
/*
 * Tear down the mappings created by vpm_map_pages()/vpm_pagecreate().
 * For each vml[] entry: set the hat ref bit (and mod for S_WRITE),
 * unmap the kpm address, drop the page's shared lock, and release the
 * vpmap entry — freeing it back to its freelist once vpm_refcnt
 * reaches zero.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			/* vs_data points at the entry's vpm_pp slot */
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Propagate the accessed/modified state to the hat layer.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			/*
			 * Recover the vpmap entry from the address of
			 * its vpm_pp field stored in vs_data.
			 */
			vpm = (struct vpmap *)((char *)vml[i].vs_data
			    - offsetof(struct vpmap, vpm_pp));
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			page_unlock(pp);
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			/* drop the shared lock and the hold on the page */
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}
/*
 * Copy data between uio and vp's pages over [off, off + len) through
 * vpm mappings.  fetchpage selects read-before-write behavior; when it
 * is clear, pages are created instead, and with zerostart set the
 * partial page preceding off is zeroed first.  If the uiomove stops
 * short on newly created pages (e.g. a fault), the remainder of the
 * last touched page is zeroed so no uninitialized data is exposed.
 * Returns 0 or an error from vpm_map_pages()/uiomove().
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * Map the pages starting at the enclosing page boundary; the
	 * I/O itself starts at offset 'off' within the first page.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
	    fetchpage, vml, MINVMAPS, &npages, rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Zero from the beginning of the first page up to the
		 * start offset when asked to (new pages only).
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr, (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		/* copy page by page; pon applies to the first page only */
		for (i = 0; !error && slen > 0 &&
		    vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
			    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * If new pages were created and the uiomove ended before
		 * the last page boundary, zero the tail of the page the
		 * copy stopped in.
		 */
		if (!fetchpage && npages &&
		    uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}
/*
 * Sync/flush the pages of vp over [off, off + len), translating the
 * segmap-style SM_* flags into B_* flags for VOP_PUTPAGE().  When the
 * flag word contains nothing besides SM_DONTNEED (including zero), no
 * putpage is issued.  Returns the VOP_PUTPAGE() result, or 0.
 */
int
vpm_sync_pages(struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t flags)
{
	extern struct vnode *common_specvp();
	int err = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * Block devices must be synced through the common block device
	 * vnode.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		int bits = 0;

		if (flags & SM_ASYNC)
			bits |= B_ASYNC;
		if (flags & SM_INVAL)
			bits |= B_INVAL;
		if (flags & SM_DESTROY)
			bits |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bits |= B_FREE;
		if (flags & SM_DONTNEED)
			bits |= B_DONTNEED;

		err = VOP_PUTPAGE(vp, off, psize, bits, CRED(), NULL);
	}

	return (err);
}
#else
/*
 * Stub for kernels built without SEGKPM_SUPPORT: VPM stays disabled
 * (vpm_enable is 0) and initialization is a no-op.
 */
void
vpm_init()
{
}
/*ARGSUSED*/
/*
 * Stub: VPM is unavailable without SEGKPM_SUPPORT.  Callers must check
 * vpm_enable (always 0 in this build) before using these interfaces.
 */
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}
/*ARGSUSED*/
/*
 * Stub: no-op when SEGKPM_SUPPORT is not compiled in.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}
/*ARGSUSED*/
/*
 * Stub: no-op when SEGKPM_SUPPORT is not compiled in.
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}
/*ARGSUSED*/
/*
 * Stub: no-op when SEGKPM_SUPPORT is not compiled in.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
/*
 * Stub: no-op when SEGKPM_SUPPORT is not compiled in.
 */
int
vpm_sync_pages(struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t flags)
{
	return (0);
}
#endif