#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/swap.h>
#include <sys/dumphdr.h>
#include <sys/random.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>
#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/secflags.h>
#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>
#include <util/qsort.h>
#include <sys/taskq.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/balloon_impl.h>
/*
 * State for the Xen (__xpv) I/O page pools.  add_page_to_pool() places
 * pages whose machine frame number is below PFN_16MEG on io_pool_16m and
 * all other pooled pages on io_pool_4g; both lists and io_pool_cnt are
 * updated with io_pool_lock held.
 */
static kmutex_t io_pool_lock;
/* Held by create_contig_pfnlist() while it builds contig_pfn_list. */
static kmutex_t contig_list_lock;
static page_t *io_pool_4g;
static page_t *io_pool_16m;
/* Number of pages currently held on the two pools combined. */
static long io_pool_cnt;
/* add_page_to_pool() only grows io_pool_4g freely below this count. */
static long io_pool_cnt_max = 0;
#define DEFAULT_IO_POOL_MIN 128
/* page_io_pool_shrink() never trims the pools below this many pages. */
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
/* NOTE(review): presumably a low-water mark; not used in this part of file. */
static long io_pool_cnt_lowater = 0;
/* Statistics: shrink calls made, and shrink calls that had pages to give. */
static long io_pool_shrink_attempts;
static long io_pool_shrinks;
/* NOTE(review): presumably counts pool growth; confirm at the use site. */
static long io_pool_grows;
/* Lowered by page_io_pool_shrink() while it returns pages from a pool. */
static mfn_t start_mfn = 1;
/* NOTE(review): presumably a kernel VA used by the pool code; confirm. */
static caddr_t io_pool_kva;
static int create_contig_pfnlist(uint_t);
#define DEFAULT_IO_POOL_PCT 2
/* NOTE(review): presumably pool size as a percentage of physmem; confirm. */
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);
/* NOTE(review): looks like a debug switch; not referenced in this part. */
int ioalloc_dbg = 0;
#endif
/*
 * Miscellaneous VM globals.  Neither variable below is modified in the
 * visible part of this file.
 */
/* NOTE(review): presumably the virtual-address-cache color count; confirm. */
uint_t vac_colors = 1;
/* NOTE(review): presumably a large page support flag; confirm at use site. */
int largepagesupport = 0;
/* Counters and tunables defined elsewhere in the kernel. */
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
extern int use_sse_pagecopy, use_sse_pagezero;
/*
 * One entry per (memory node, memranges[] slot) intersection.  Entries
 * live in the mnoderanges[] array; the index of an entry is its "mtype".
 * Entries are chained through mnr_next from mtypetop downwards in pfn
 * order (see mnode_range_setup() and mnode_range_add()).
 */
typedef struct {
/* Lowest and highest page frame number covered by this entry. */
pfn_t mnr_pfnlo;
pfn_t mnr_pfnhi;
/* Memory node the range belongs to. */
int mnr_mnode;
/* Index into memranges[] (one of the MRI_* values). */
int mnr_memrange;
/* Index of the next entry with a lower mnr_pfnlo, or -1 at the end. */
int mnr_next;
/* Non-zero when this array slot is in use. */
int mnr_exists;
/* Page counters maintained by plcnt_inc_dec(). */
/* Pages on the cache list. */
pgcnt_t mnr_mt_clpgcnt;
/* Pages on the free lists, indexed by page size code. */
pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES];
/* Sum of the cache list and free list counts. */
pgcnt_t mnr_mt_totcnt;
#ifdef DEBUG
/* DEBUG only: per page size totals and per color counts. */
struct mnr_mts {
pgcnt_t mnr_mts_pgcnt;
int mnr_mts_colors;
pgcnt_t *mnr_mtsc_pgcnt;
} *mnr_mts;
#endif
} mnoderange_t;
/*
 * memranges[] holds the lowest pfn of each physical memory range, in
 * descending order.  MEMRANGELO(i) is that lowest pfn; MEMRANGEHI(i) is
 * the highest pfn of range i: one below the start of the next higher
 * range, or physmax for range 0.
 */
#define MEMRANGEHI(mtype) \
((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
#define MEMRANGELO(mtype) (memranges[mtype])
/* Free plus cache list page count of mnoderanges[mt]. */
#define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt)
/* Page frame numbers of the 4G and 16M boundaries (4K pages). */
#define PFN_4GIG 0x100000
#define PFN_16MEG 0x1000
/* Indices into arch_memranges[] below. */
#define MRI_4G 0
#define MRI_2G 1
#define MRI_16M 2
#define MRI_0 3
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
PFN_4GIG,
0x80000,
PFN_16MEG,
0x00000,
};
/*
 * memranges/nranges start out describing all of arch_memranges[];
 * page_coloring_init() advances memranges and reduces nranges to drop
 * ranges that lie above the highest physical page.
 */
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;
/* Array of mnoderange_t, carved out in page_coloring_setup(). */
static mnoderange_t *mnoderanges;
/* Number of slots in mnoderanges[], computed in page_coloring_init(). */
static int mnoderangecnt;
/* mtype holding pfn 0xfffff; only set when physmax4g is non-zero. */
static int mtype4g;
/* mtype holding pfn PFN_16MEG - 1, as returned by pfn_2_mtype(). */
static int mtype16m;
/* mtype with the highest mnr_pfnlo; head of the mnr_next chain. */
static int mtypetop;
/*
 * RESTRICT4G_ALLOC is true when memory exists above 4G (physmax4g), the
 * free count below 4G is under DESFREE4G and is also less than half of
 * freemem.  mtype_init() then restricts the allocation to PGI_MT_RANGE4G.
 */
#define DESFREE4G (maxmem4g >> desfree4gshift)
#define RESTRICT4G_ALLOC \
(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
/* Adjusted by mtype_modify_max() for ranges other than MRI_4G. */
static pgcnt_t maxmem4g;
/* Adjusted by plcnt_inc_dec() for ranges other than MRI_4G. */
static pgcnt_t freemem4g;
/* Set by page_coloring_init() when the highest pfn is in the MRI_4G range. */
static int physmax4g;
static int desfree4gshift = 4;
/*
 * RESTRICT16M_ALLOC is true when a 16M range exists, freemem is non-zero,
 * PG_PANIC is not set in flags, and either freemem does not exceed the
 * 16M range's free count or that count is below DESFREE16M + pgcnt.
 */
#define FREEMEM16M MTYPE_FREEMEM(mtype16m)
#define DESFREE16M desfree16m
#define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
(mtype16m != -1 && (freemem != 0) && ((flags & PG_PANIC) == 0) && \
((freemem >= (FREEMEM16M)) || \
(FREEMEM16M < (DESFREE16M + pgcnt))))
/* Number of pages in the 16M range to keep in reserve (see DESFREE16M). */
static pgcnt_t desfree16m = 0x380;
/* NOTE(review): not referenced in the visible part of this file. */
int restricted_kmemalloc = 0;
#ifdef VM_STATS
/*
 * Statistics counters compiled in only with VM_STATS.  None of them is
 * updated in the visible part of this file.
 * NOTE(review): the pga_/pgma_ prefixes presumably refer to two page
 * allocation entry points; confirm against the code that updates them.
 */
struct {
ulong_t pga_alloc;
ulong_t pga_notfullrange;
ulong_t pga_nulldmaattr;
ulong_t pga_allocok;
ulong_t pga_allocfailed;
ulong_t pgma_alloc;
ulong_t pgma_allocok;
ulong_t pgma_allocfailed;
ulong_t pgma_allocempty;
} pga_vmstats;
#endif
/*
 * Page size bookkeeping.  mmu_page_sizes bounds the per page size loops in
 * this file; mmu_exported_page_sizes bounds the search in map_szcvec().
 */
uint_t mmu_page_sizes;
uint_t mmu_exported_page_sizes;
/* NOTE(review): not referenced in the visible part of this file. */
uint_t mmu_legacy_page_sizes;
/*
 * Minimum physmem (in pages, 1G worth) before large pages are considered
 * for shared/text mappings and for private mappings respectively; see
 * map_pgszcvec() and map_pgsz().
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
/*
 * Largest/default page sizes per mapping type, consulted by map_pgsz() and
 * map_pgszcvec().  All default to MMU_PAGESIZE, which disables large page
 * selection for that type.  mcntl0_lpsize is the limit used when the
 * request comes in with memcntl set.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;
/* Page coloring parameters, all set up by page_coloring_init(). */
uint_t page_colors;
uint_t page_colors_mask;
uint_t page_coloring_shift;
/* Set to l2_colors when that is smaller than page_colors, else left 0. */
int cpu_page_colors;
static uint_t l2_colors;
/* Lower bound on page_colors. */
#define PAGE_COLORS_MIN 16
/*
 * Free lists indexed [mtype][szc][color] and cache lists indexed
 * [mtype][color]; the pointer arrays are laid out by
 * page_coloring_setup().
 */
page_t ****page_freelists;
page_t ***page_cachelists;
/* Per page level size, shift, base page count and color count. */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
/* Arrays of max_mem_nodes mutexes each, set up in page_coloring_setup(). */
kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];
/* Held by mnode_range_add() while it edits mnoderanges[]. */
static kmutex_t mnoderange_lock;
/* Serializes page_get_contigpage(). */
static kmutex_t contig_lock;
/* Note: both macros already end in ';'. */
#define CONTIG_LOCK() mutex_enter(&contig_lock);
#define CONTIG_UNLOCK() mutex_exit(&contig_lock);
/* Page frame number of the 16M boundary, derived via mmu_btop(). */
#define PFN_16M (mmu_btop((uint64_t)0x1000000))
/*
 * Map pgcnt consecutive page frames, starting at pf, into kernel virtual
 * address space allocated from heap_arena and return the base of that
 * mapping.
 *
 * Frames that have a page_t are loaded with hat_memload(); frames without
 * one are loaded with hat_devload().  Every translation is loaded with
 * HAT_LOAD_LOCK and with HAT_NOSYNC added to prot.  The virtual address
 * allocation uses VM_SLEEP.
 */
caddr_t
i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
{
	caddr_t	mapbase;
	caddr_t	va;
	page_t	*pp;

	mapbase = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
	va = mapbase;

	while (pgcnt != 0) {
		pp = page_numtopp_nolock(pf);
		if (pp != NULL) {
			/* ordinary memory page */
			hat_memload(kas.a_hat, va, pp,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		} else {
			/* no page_t for this frame */
			hat_devload(kas.a_hat, va, MMU_PAGESIZE, pf,
			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
		}
		va += MMU_PAGESIZE;
		pf++;
		pgcnt--;
	}

	return (mapbase);
}
/*
 * Try to claim the free page with page frame number pfnum.
 *
 * Returns the page_t, held SE_EXCL, once page_reclaim() has succeeded on
 * it.  Returns NULL when there is no page_t for pfnum, the exclusive lock
 * cannot be taken, the page is not free, or page_reclaim() fails.
 */
page_t *
page_numtopp_alloc(pfn_t pfnum)
{
page_t *pp;
retry:
pp = page_numtopp_nolock(pfnum);
if (pp == NULL) {
return (NULL);
}
/* Do not block: give up if somebody else holds the page. */
if (!page_trylock(pp, SE_EXCL)) {
return (NULL);
}
/* The lookup was done without a lock; make sure the identity held. */
if (page_pptonum(pp) != pfnum) {
page_unlock(pp);
goto retry;
}
if (!PP_ISFREE(pp)) {
page_unlock(pp);
return (NULL);
}
/* Part of a free large page: demote it and start over. */
if (pp->p_szc) {
page_demote_free_pages(pp);
page_unlock(pp);
goto retry;
}
/*
 * The free page still has a vnode identity: destroy it, then take the
 * lock again and re-validate the page.
 */
if (pp->p_vnode) {
page_destroy_free(pp);
if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
return (NULL);
}
if (page_pptonum(pp) != pfnum) {
page_unlock(pp);
goto retry;
}
}
if (!PP_ISFREE(pp)) {
page_unlock(pp);
return (NULL);
}
/* NOTE(review): no page_unlock() here; presumably page_reclaim() drops the lock on failure - confirm. */
if (!page_reclaim(pp, (kmutex_t *)NULL))
return (NULL);
return (pp);
}
/*
 * Suggest a page size for a mapping of the given type.
 *
 * maptype  MAPPGSZ_HEAP, MAPPGSZ_STK or MAPPGSZ_ISM (MAPPGSZ_VA is not
 *          allowed); any other value yields MMU_PAGESIZE
 * p        process whose heap/stack sizes and current page size codes
 *          are consulted
 * addr     when 0, the result is never smaller than the page size the
 *          heap/stack is currently using
 * len      mapping length; 0 means "use the current heap/stack size"
 * memcntl  non-zero to use mcntl0_lpsize as the upper limit instead of the
 *          per-type maximum
 *
 * Except for ISM, MMU_PAGESIZE is returned when physmem is below
 * privm_lpg_min_physmem.
 */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t	lvl;
	size_t	result = MMU_PAGESIZE;
	size_t	limit;
	uint_t	cur_szc;
	int	is_heap;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	if (maptype == MAPPGSZ_ISM) {
		/* largest user page level that len can fill */
		for (lvl = mmu.umax_page_level; lvl > 0; --lvl) {
			if (len >= LEVEL_SIZE(lvl))
				return (LEVEL_SIZE(lvl));
		}
		return (LEVEL_SIZE(0));
	}

	if (maptype != MAPPGSZ_HEAP && maptype != MAPPGSZ_STK) {
		return (result);
	}

	is_heap = (maptype == MAPPGSZ_HEAP);

	if (memcntl)
		limit = mcntl0_lpsize;
	else
		limit = is_heap ? max_uheap_lpsize : max_ustack_lpsize;
	if (limit == MMU_PAGESIZE) {
		return (MMU_PAGESIZE);
	}

	if (len == 0) {
		if (is_heap)
			len = p->p_brkbase + p->p_brksize - p->p_bssbase;
		else
			len = p->p_stksize;
	}
	if (is_heap)
		len = MAX(len, default_uheap_lpsize);
	else
		len = MAX(len, default_ustack_lpsize);

	/* pick the largest level within the limit that len can fill */
	for (lvl = mmu.umax_page_level; lvl > 0; --lvl) {
		if (LEVEL_SIZE(lvl) <= limit && len >= LEVEL_SIZE(lvl)) {
			result = LEVEL_SIZE(lvl);
			break;
		}
	}

	/* with no address given, never go below the size already in use */
	cur_szc = is_heap ? p->p_brkpageszc : p->p_stkpageszc;
	if (addr == 0 && result < hw_page_array[cur_szc].hp_size) {
		result = hw_page_array[cur_szc].hp_size;
	}
	return (result);
}
/*
 * Compute a bit vector of page size codes usable for mapping
 * [addr, addr + size) at offset off.
 *
 * The largest exported page size that does not exceed max_lpsize, fits at
 * least once (aligned) inside the range, and for which addr and off have
 * the same phase is located; the result then has the bits for that size
 * code and every smaller non-zero size code set.  Bit 0 is never set.
 *
 * Returns 0 when physmem is below min_physmem, when max_lpsize does not
 * exceed MMU_PAGESIZE, or when no large page size qualifies.
 */
static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t	end = addr + size;
	int	szc;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (szc = mmu_exported_page_sizes - 1; szc > 0; szc--) {
		size_t	pgsz = page_get_pagesize(szc);
		caddr_t	first;
		caddr_t	last;

		if (pgsz > max_lpsize)
			continue;

		/* first and last pgsz-aligned addresses inside the range */
		first = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		last = (caddr_t)P2ALIGN((uintptr_t)end, pgsz);
		if (first < addr || first >= last)
			continue;

		/* address and offset must share the same phase */
		if (P2PHASE((uintptr_t)addr ^ off, pgsz) != 0)
			continue;

		/* this size code and all smaller ones, minus size code 0 */
		return (((1 << (szc + 1)) - 1) & ~1);
	}
	return (0);
}
/*
 * Return the page size code vector (see map_szcvec()) for a mapping of
 * [addr, addr + size) at offset off.
 *
 * The kind of mapping selects both the large page size limit and the
 * minimum physmem threshold:
 *   MAP_TEXT in flags      max_utext_lpsize,   shm_lpg_min_physmem
 *   MAP_INITDATA in flags  max_uidata_lpsize,  privm_lpg_min_physmem
 *   type MAPPGSZC_SHM      max_shm_lpsize,     shm_lpg_min_physmem
 *   type MAPPGSZC_HEAP     max_uheap_lpsize,   privm_lpg_min_physmem
 *   type MAPPGSZC_STACK    max_ustack_lpsize,  privm_lpg_min_physmem
 *   anything else          max_privmap_lpsize, privm_lpg_min_physmem
 * When memcntl is non-zero the size limit is mcntl0_lpsize instead.
 *
 * Returns 0 when the MMU has no large page level.
 */
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t	lpsize_cap;
	size_t	physmem_floor;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		lpsize_cap = max_utext_lpsize;
		physmem_floor = shm_lpg_min_physmem;
	} else if (flags & MAP_INITDATA) {
		lpsize_cap = max_uidata_lpsize;
		physmem_floor = privm_lpg_min_physmem;
	} else if (type == MAPPGSZC_SHM) {
		lpsize_cap = max_shm_lpsize;
		physmem_floor = shm_lpg_min_physmem;
	} else if (type == MAPPGSZC_HEAP) {
		lpsize_cap = max_uheap_lpsize;
		physmem_floor = privm_lpg_min_physmem;
	} else if (type == MAPPGSZC_STACK) {
		lpsize_cap = max_ustack_lpsize;
		physmem_floor = privm_lpg_min_physmem;
	} else {
		lpsize_cap = max_privmap_lpsize;
		physmem_floor = privm_lpg_min_physmem;
	}

	/* a memcntl request overrides the per-type size limit */
	if (memcntl)
		lpsize_cap = mcntl0_lpsize;

	return (map_szcvec(addr, size, off, lpsize_cap, physmem_floor));
}
/*
 * Resolve a page fault at addr.
 *
 * addr      faulting virtual address
 * type, rw  passed through to as_fault()
 * iskernel  non-zero to fault against the kernel address space (kas),
 *           zero to fault against the current process's address space
 *
 * Returns the faultcode_t produced by as_fault(), FC_NOMAP for an address
 * rejected by INVALID_VADDR(), or FC_MAKE_ERR(err) when creating the
 * zero-fill mapping described below fails.
 */
faultcode_t
pagefault(
caddr_t addr,
enum fault_type type,
enum seg_rw rw,
int iskernel)
{
struct as *as;
struct hat *hat;
struct proc *p;
kthread_t *t;
faultcode_t res;
caddr_t base;
size_t len;
int err;
int mapped_red;
uintptr_t ea;
ASSERT_STACK_ALIGNED();
if (INVALID_VADDR(addr))
return (FC_NOMAP);
/* Remember whether segkp_map_red() did something so it can be undone. */
mapped_red = segkp_map_red();
if (iskernel) {
as = &kas;
hat = as->a_hat;
} else {
t = curthread;
p = ttoproc(t);
as = p->p_as;
hat = as->a_hat;
}
res = as_fault(hat, as, addr, 1, type, rw);
/* Only a user fault that found no mapping is examined further. */
if (res != FC_NOMAP || iskernel)
goto out;
/*
 * See whether addr lies inside the process's break (heap) area or,
 * failing that, inside its stack area.  If it is in neither, the
 * fault stays FC_NOMAP.
 */
base = p->p_brkbase;
len = p->p_brksize;
if (addr < base || addr >= base + len) {
base = (caddr_t)p->p_usrstack - p->p_stksize;
len = p->p_stksize;
if (addr < base || addr >= p->p_usrstack) {
res = FC_NOMAP;
goto out;
}
}
/*
 * For 32-bit (ILP32) processes only: map zero-fill memory over the
 * unmapped gap around addr and fault again.  For other data models
 * res is left as FC_NOMAP.
 */
if (p->p_model == DATAMODEL_ILP32) {
/* Expand the region outward to page boundaries on both sides. */
ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
len = ea - (uintptr_t)base;
as_rangelock(as);
if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
0) {
err = as_map(as, base, len, segvn_create, zfod_argsp);
as_rangeunlock(as);
if (err) {
res = FC_MAKE_ERR(err);
goto out;
}
} else {
/* No gap containing addr was found; just retry the fault. */
as_rangeunlock(as);
}
res = as_fault(hat, as, addr, 1, F_INVAL, rw);
}
out:
if (mapped_red)
segkp_unmap_red();
return (res);
}
/*
 * Choose a virtual address for a new mapping in the current process.
 *
 * Thin front end for map_addr_proc(): the search limit is _userlimit32
 * when _MAP_LOW32 is set in flags, otherwise the address space's own
 * a_userlimit.  All other arguments are passed through unchanged; the
 * result is returned through *addrp.
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc	*curp = curproc;
	caddr_t		limit;

	if (flags & _MAP_LOW32)
		limit = (caddr_t)_userlimit32;
	else
		limit = curp->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, limit, curp, flags);
}
/*
 * Check addr/off for virtual address cache alignment.  This
 * implementation ignores both arguments and always returns 0.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
return (0);
}
/* Upper bound, in bytes, on the random slew applied for _MAP_RANDOMIZE. */
size_t aslr_max_map_skew = 256 * 1024 * 1024;
/*
 * Choose a user virtual address for a new mapping of len bytes in
 * process p.
 *
 * addrp      out: the chosen address, or NULL when no space was found.
 *            With MAP_ALIGN set in flags, *addrp holds the requested
 *            alignment on input.
 * len        mapping length; rounded up to a multiple of PAGESIZE here
 * off        offset; its bits below the alignment must match the address
 * vacalign   not examined by this implementation
 * userlimit  upper bound of the search
 * flags      MAP_ALIGN and _MAP_RANDOMIZE are examined
 *
 * The search runs from p->p_brkbase upwards with AH_HI and a PAGESIZE
 * redzone, and the address is taken from the top of the gap found.
 */
void
map_addr_proc(
caddr_t *addrp,
size_t len,
offset_t off,
int vacalign,
caddr_t userlimit,
struct proc *p,
uint_t flags)
{
struct as *as = p->p_as;
caddr_t addr;
caddr_t base;
size_t slen;
size_t align_amount;
ASSERT32(userlimit == as->a_userlimit);
base = p->p_brkbase;
/* Work out slen, the size of the region to search above base. */
if (p->p_model == DATAMODEL_NATIVE) {
if (userlimit < as->a_userlimit) {
/* Caller asked for a limit below the address space's own limit. */
ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
if (userlimit > base)
slen = userlimit - base;
else {
*addrp = NULL;
return;
}
} else {
/*
 * Stop the search below the stack: exclude p_stk_ctl bytes
 * (rounded up to a page) beneath p_usrstack.
 */
slen = p->p_usrstack - base -
((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
}
} else {
slen = userlimit - base;
}
/* Make len a multiple of PAGESIZE. */
len = (len + PAGEOFFSET) & PAGEMASK;
/*
 * Pick the alignment: ELF_386_MAXPGSZ for small mappings, otherwise
 * the largest page level size that len can fill.  ILP32 processes
 * start that search at level 1 instead of mmu.umax_page_level.
 */
if (len <= ELF_386_MAXPGSZ) {
align_amount = ELF_386_MAXPGSZ;
} else {
int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
mmu.umax_page_level;
while (lvl && len < LEVEL_SIZE(lvl))
--lvl;
align_amount = LEVEL_SIZE(lvl);
}
/* A larger caller-requested alignment (MAP_ALIGN) takes precedence. */
if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
align_amount = (uintptr_t)*addrp;
ASSERT(ISP2(align_amount));
ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
/* Only the offset bits below the alignment matter from here on. */
off = off & (align_amount - 1);
if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
PAGESIZE, off) == 0) {
caddr_t as_addr;
/* Highest start address that leaves a PAGESIZE redzone above. */
addr = base + slen - (PAGESIZE + len);
as_addr = addr;
/*
 * Round down to the alignment and add the offset phase back in.
 * If that moved addr above as_addr, step down one alignment unit.
 */
addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
addr += (uintptr_t)off;
if (addr > as_addr) {
addr -= align_amount;
}
/*
 * Randomization: move the address down by a random multiple of
 * the alignment, bounded by aslr_max_map_skew and by the distance
 * to base.
 */
if (flags & _MAP_RANDOMIZE) {
uint32_t slew;
(void) random_get_pseudo_bytes((uint8_t *)&slew,
sizeof (slew));
slew = slew % MIN(aslr_max_map_skew, (addr - base));
addr -= P2ALIGN(slew, align_amount);
}
ASSERT(addr > base);
ASSERT(addr + len < base + slen);
ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
((uintptr_t)(off)));
*addrp = addr;
} else {
/* No gap large enough was found. */
*addrp = NULL;
}
}
/* Counts how often *basep + *lenp wrapped past the top of the address space. */
int valid_va_range_aligned_wraparound;
/*
 * Decide whether [*basep, *basep + *lenp) contains a usable virtual range
 * of at least minlen bytes plus a redzone on each side, lying outside the
 * hole_start..hole_end address hole.
 *
 * dir      AH_LO prefers the part below the hole, any other value the
 *          part above it, when the range spans the whole hole
 * align    0, or a power of two that is at least PAGESIZE (asserted)
 * redzone  bytes kept free on both sides of the minlen region
 * off      required phase of the start address relative to align
 *
 * Returns 1 and narrows *basep and *lenp to the chosen range, or 0 when
 * no suitable range exists.  *lenp may already have been trimmed for
 * wraparound when 0 is returned.
 */
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
size_t align, size_t redzone, size_t off)
{
uintptr_t hi, lo;
size_t tot_len;
ASSERT(align == 0 ? off == 0 : off < align);
ASSERT(ISP2(align));
ASSERT(align == 0 || align >= PAGESIZE);
lo = (uintptr_t)*basep;
hi = lo + *lenp;
/* Total space needed: the region itself plus both redzones. */
tot_len = minlen + 2 * redzone;
/* hi wrapped around: cut the length back so the range ends at the top. */
if (hi < lo) {
*lenp = 0UL - lo - 1UL;
valid_va_range_aligned_wraparound++;
hi = lo + *lenp;
}
if (*lenp < tot_len) {
return (0);
}
/* Trim [lo, hi) so that it does not overlap the address hole. */
if (lo < hole_start) {
if (hi > hole_start) {
if (hi < hole_end) {
/* Range ends inside the hole: keep the part below it. */
hi = hole_start;
} else {
/* Range spans the whole hole: pick a side according to dir. */
if (dir == AH_LO) {
if (hole_start - lo >= tot_len)
hi = hole_start;
else if (hi - hole_end >= tot_len)
lo = hole_end;
else
return (0);
} else {
if (hi - hole_end >= tot_len)
lo = hole_end;
else if (hole_start - lo >= tot_len)
hi = hole_start;
else
return (0);
}
}
}
} else {
/* lo is at or above hole_start. */
if (hi < hole_end)
return (0);
if (lo < hole_end)
lo = hole_end;
}
if (hi - lo < tot_len)
return (0);
/*
 * With an alignment, make sure an aligned (phase off) start address
 * past the low redzone still leaves minlen bytes before the high one.
 */
if (align > 1) {
uintptr_t tlo = lo + redzone;
uintptr_t thi = hi - redzone;
tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
/* Rounding up wrapped around. */
if (tlo < lo + redzone) {
return (0);
}
if (thi < tlo || thi - tlo < minlen) {
return (0);
}
}
*basep = (caddr_t)lo;
*lenp = hi - lo;
return (1);
}
/*
 * Same as valid_va_range_aligned() with no alignment, no redzone and no
 * offset constraint.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}
/*
 * Addresses at or below this value are refused by valid_usr_range() for
 * processes that have PROC_SEC_FORBIDNULLMAP enabled.
 */
uintptr_t forbidden_null_mapping_sz = 0x10000;

/*
 * Check whether [addr, addr + len) is an acceptable user address range.
 *
 * Returns RANGE_BADADDR when the range is empty or wraps, when it is not
 * entirely below userlimit, when it starts in the forbidden low region of
 * a process with PROC_SEC_FORBIDNULLMAP enabled, or when it overlaps the
 * hole_start..hole_end address hole.  Otherwise returns RANGE_OKAY.
 * prot is not examined.
 */
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t	end = addr + len;

	/* empty or wrapped range */
	if (end <= addr)
		return (RANGE_BADADDR);

	/* must lie entirely below the user limit */
	if (addr >= userlimit || end > userlimit)
		return (RANGE_BADADDR);

	/* low mappings may be forbidden by the process security flags */
	if (addr <= (caddr_t)forbidden_null_mapping_sz) {
		if (as->a_proc != NULL &&
		    secflag_enabled(as->a_proc, PROC_SEC_FORBIDNULLMAP))
			return (RANGE_BADADDR);
	}

	/* must not overlap the address hole */
	if (end > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);

	return (RANGE_OKAY);
}
/*
 * Report whether page frame pf is ordinary memory: returns 0 for a frame
 * that pfn_is_foreign() flags, otherwise the result of looking up the
 * frame's physical address in the phys_install memory list.
 */
int
pf_is_memory(pfn_t pf)
{
if (pfn_is_foreign(pf))
return (0);
return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}
/*
 * Return the index of the memranges[] slot that contains pfn.
 *
 * memranges[] is scanned from index 0; the first slot whose starting pfn
 * does not exceed pfn wins.  If none of the first nranges - 1 slots
 * matches, the last index (nranges - 1) is returned.
 */
int
memrange_num(pfn_t pfn)
{
	int	idx = 0;

	while (idx < nranges - 1 && pfn < memranges[idx])
		idx++;

	return (idx);
}
/*
 * Return the mtype (index into mnoderanges[]) for pfn.
 *
 * The mnr_next chain is walked from mtypetop and the first entry whose
 * mnr_pfnlo does not exceed pfn is returned; -1 is returned when the
 * chain is exhausted.  Under __xpv the answer is always 0.
 */
int
pfn_2_mtype(pfn_t pfn)
{
#if defined(__xpv)
	return (0);
#else
	int	mt = mtypetop;

	while (mt != -1 && pfn < mnoderanges[mt].mnr_pfnlo)
		mt = mnoderanges[mt].mnr_next;

	return (mt);
#endif
}
#if !defined(__xpv)
/*
 * Try to take a run of free, physically contiguous pages starting at
 * *pfnp.
 *
 * pfnp    in: first pfn to try; out: pfn at which to resume the search
 * pgcnt   in: maximum number of pages wanted; reduced by the number of
 *         pages taken on success
 * minctg  minimum acceptable run length
 * pfnseg  segment mask; a run is never extended across a segment boundary
 * iolock  non-zero to also take the page I/O lock on every page
 *
 * Returns the list of pages (each held SE_EXCL and removed from its
 * free or cache list) when at least minctg pages were found, else NULL
 * with every page taken so far put back on the free list and unlocked.
 */
static page_t *
is_contigpage_free(
pfn_t *pfnp,
pgcnt_t *pgcnt,
pgcnt_t minctg,
uint64_t pfnseg,
int iolock)
{
int i = 0;
pfn_t pfn = *pfnp;
page_t *pp;
page_t *plist = NULL;
/*
 * A run of minctg pages from *pfnp would cross a segment boundary:
 * fail, and restart the caller's search at the next boundary.
 */
if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
*pfnp = roundup(*pfnp, pfnseg + 1);
return (NULL);
}
do {
retry:
pp = page_numtopp_nolock(pfn + i);
/* No page_t, a dump page, or a page that is locked ends the run. */
if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
(page_trylock(pp, SE_EXCL) == 0)) {
(*pfnp)++;
break;
}
/* The lookup was unlocked; re-check the page's identity. */
if (page_pptonum(pp) != pfn + i) {
page_unlock(pp);
goto retry;
}
if (!(PP_ISFREE(pp))) {
page_unlock(pp);
(*pfnp)++;
break;
}
/* Take the page off whichever list it is on. */
if (!PP_ISAGED(pp)) {
page_list_sub(pp, PG_CACHE_LIST);
page_hashout(pp, (kmutex_t *)NULL);
} else {
page_list_sub(pp, PG_FREE_LIST);
}
if (iolock)
page_io_lock(pp);
page_list_concat(&plist, &pp);
/* Stop when pgcnt is satisfied or a segment boundary is reached. */
} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
/* Next pfn to search. */
*pfnp += i;
if (i >= minctg) {
*pgcnt -= i;
return (plist);
}
/*
 * Failure: the run was too short.  If the next attempt would cross a
 * segment boundary, move the restart point to that boundary.
 */
if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
*pfnp = roundup(*pfnp, pfnseg + 1);
/* Give back the pages collected so far. */
while (plist) {
pp = plist;
page_sub(&plist, pp);
page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
if (iolock)
page_io_unlock(pp);
page_unlock(pp);
}
return (NULL);
}
#endif
#ifndef DEBUG
#define check_dma(a, b, c) (void)(0)
#else
/*
 * DEBUG-only sanity check: walk cnt pages starting at pp (following
 * p_next) and panic if the machine address of any page lies below
 * dma_attr_addr_lo or at/above dma_attr_addr_hi.  A NULL dma_attr
 * disables the check.  Non-DEBUG builds replace this with a no-op macro.
 */
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	int	remaining;

	if (dma_attr == NULL)
		return;

	for (remaining = cnt; remaining > 0; remaining--, pp = pp->p_next) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo) {
			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
		}
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi) {
			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
		}
	}
}
#endif
#if !defined(__xpv)
/*
 * Allocate physically contiguous free pages that satisfy the DMA
 * attributes in mattr.
 *
 * pgcnt   in: number of pages wanted; decremented as runs are found
 * mattr   DMA constraints (address range, scatter/gather length, segment
 *         boundary, alignment), or NULL for a single unconstrained run
 * iolock  passed to is_contigpage_free()
 *
 * Returns the list of pages, or NULL when the request cannot be met.
 * The whole search runs under CONTIG_LOCK.  startpfn and lastctgcnt are
 * static and remember where the previous search stopped and its minimum
 * run length.
 *
 * NOTE(review): when a request needs several runs and a later run cannot
 * be found, the NULL returns below do not release the pages already
 * gathered in pplist - confirm whether that can happen in practice.
 */
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
pfn_t pfn;
int sgllen;
uint64_t pfnseg;
pgcnt_t minctg;
page_t *pplist = NULL, *plist;
uint64_t lo, hi;
pgcnt_t pfnalign = 0;
static pfn_t startpfn;
static pgcnt_t lastctgcnt;
uintptr_t align;
CONTIG_LOCK();
if (mattr) {
/* Translate the DMA attributes into pfn terms. */
lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
hi = mmu_btop(mattr->dma_attr_addr_hi);
if (hi >= physmax)
hi = physmax - 1;
sgllen = mattr->dma_attr_sgllen;
pfnseg = mmu_btop(mattr->dma_attr_seg);
align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
if (align > MMU_PAGESIZE)
pfnalign = mmu_btop(align);
/* Each of the sgllen runs must hold at least this many pages. */
minctg = howmany(*pgcnt, sgllen);
ASSERT(hi >= lo);
/*
 * Resume from the previous stopping point unless this request needs
 * shorter runs than the last one or that point is out of range.
 */
if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
startpfn = lo;
} else {
/* No constraints: one run anywhere in physical memory. */
hi = physmax - 1;
lo = 0;
sgllen = 1;
pfnseg = mmu.highest_pfn;
minctg = *pgcnt;
if (minctg < lastctgcnt)
startpfn = lo;
}
lastctgcnt = minctg;
ASSERT(pfnseg + 1 >= (uint64_t)minctg);
/* Start above 16M when the range allows it. */
if (hi > PFN_16M && startpfn < PFN_16M)
startpfn = PFN_16M;
pfn = startpfn;
if (pfnalign)
pfn = P2ROUNDUP(pfn, pfnalign);
/* First pass: from startpfn up to hi. */
while (pfn + minctg - 1 <= hi) {
plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
if (plist) {
page_list_concat(&pplist, &plist);
sgllen--;
/*
 * Done when nothing is left, or (without an alignment
 * requirement) when the remainder is no larger than the
 * remaining scatter/gather entries.
 */
if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
startpfn = pfn;
CONTIG_UNLOCK();
check_dma(mattr, pplist, *pgcnt);
return (pplist);
}
minctg = howmany(*pgcnt, sgllen);
}
if (pfnalign)
pfn = P2ROUNDUP(pfn, pfnalign);
}
/* The first pass already started at lo: nothing more to try. */
if (startpfn == lo) {
CONTIG_UNLOCK();
return (NULL);
}
/* Second pass: from lo up to where the first pass started. */
pfn = lo;
if (pfnalign)
pfn = P2ROUNDUP(pfn, pfnalign);
while (pfn < startpfn) {
plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
if (plist != NULL) {
page_list_concat(&pplist, &plist);
sgllen--;
if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
startpfn = pfn;
CONTIG_UNLOCK();
check_dma(mattr, pplist, *pgcnt);
return (pplist);
}
minctg = howmany(*pgcnt, sgllen);
}
if (pfnalign)
pfn = P2ROUNDUP(pfn, pfnalign);
}
CONTIG_UNLOCK();
return (NULL);
}
#endif
/*
 * Count how many memranges[] slots memory node mnode overlaps, i.e. how
 * many mnoderanges[] entries the node needs.
 *
 * Returns 0 for a node that does not exist.  Under __xpv there is a
 * single node (asserted to be 0) with exactly one range.
 */
int
mnode_range_cnt(int mnode)
{
#if defined(__xpv)
	ASSERT(mnode == 0);
	return (1);
#else
	int	count = 0;
	int	idx;

	if (mem_node_config[mnode].exists != 0) {
		/* skip the ranges that lie entirely below the node */
		for (idx = nranges - 1;
		    MEMRANGEHI(idx) < mem_node_config[mnode].physbase;
		    idx--)
			continue;

		/* one count per range the node reaches into */
		for (; idx >= 0 &&
		    mem_node_config[mnode].physmax >= MEMRANGELO(idx);
		    idx--) {
			count++;
			if (mem_node_config[mnode].physmax <= MEMRANGEHI(idx))
				break;
		}
	}
	ASSERT(count <= MAX_MNODE_MRANGES);
	return (count);
#endif
}
/*
 * qsort() comparison callback for mnoderange_t: orders entries by
 * ascending mnr_pfnlo.  Returns -1, 0 or 1.
 */
static int
mnoderange_cmp(const void *v1, const void *v2)
{
	const mnoderange_t *lhs = v1;
	const mnoderange_t *rhs = v2;

	if (lhs->mnr_pfnlo == rhs->mnr_pfnlo)
		return (0);
	return ((lhs->mnr_pfnlo < rhs->mnr_pfnlo) ? -1 : 1);
}
/*
 * Fill in the mnoderanges array: one entry for every intersection of an
 * existing memory node with a memranges[] slot, sorted by ascending
 * mnr_pfnlo and chained from high to low through mnr_next.  Also sets
 * mtypetop, mtype16m and (when physmax4g is set) mtype4g.
 *
 * Note that the parameter shadows the file-level mnoderanges pointer.
 */
void
mnode_range_setup(mnoderange_t *mnoderanges)
{
mnoderange_t *mp;
ssize_t nr_ranges;
size_t mnode;
for (mnode = 0, nr_ranges = 0, mp = mnoderanges;
mnode < max_mem_nodes; mnode++) {
/* Signed so that the "mri >= 0" test below can become false. */
ssize_t mri = nranges - 1;
if (mem_node_config[mnode].exists == 0)
continue;
/* Skip the ranges that lie entirely below this node. */
while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
mri--;
/* Emit one entry per range the node reaches into. */
while (mri >= 0 && mem_node_config[mnode].physmax >=
MEMRANGELO(mri)) {
/* Clip the range to the node's own extent. */
mp->mnr_pfnlo = MAX(MEMRANGELO(mri),
mem_node_config[mnode].physbase);
mp->mnr_pfnhi = MIN(MEMRANGEHI(mri),
mem_node_config[mnode].physmax);
mp->mnr_mnode = mnode;
mp->mnr_memrange = mri;
mp->mnr_next = -1;
mp->mnr_exists = 1;
mp++;
nr_ranges++;
if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
mri--;
else
break;
}
}
/* The array was sized from mnoderangecnt; never overrun it. */
VERIFY3U(nr_ranges, <=, mnoderangecnt);
qsort(mnoderanges, nr_ranges, sizeof (mnoderange_t), mnoderange_cmp);
/* After sorting, each entry's successor is simply the one before it. */
for (size_t i = 1; i < nr_ranges; i++)
mnoderanges[i].mnr_next = i - 1;
mtypetop = nr_ranges - 1;
/* pfn_2_mtype() may return -1 here. */
mtype16m = pfn_2_mtype(PFN_16MEG - 1);
if (physmax4g)
mtype4g = pfn_2_mtype(0xfffff);
}
#ifndef __xpv
/*
 * Update mnoderanges[] after memory node mnode has gained memory: for
 * every memranges[] slot the node now overlaps, either refresh the pfn
 * bounds of the existing entry or claim an unused slot and link it into
 * the mnr_next chain in descending mnr_pfnlo order.  Runs with
 * mnoderange_lock held.
 */
static void
mnode_range_add(int mnode)
{
int *prev;
int n, mri;
pfn_t start, end;
extern void membar_sync(void);
ASSERT(0 <= mnode && mnode < max_mem_nodes);
ASSERT(mem_node_config[mnode].exists);
start = mem_node_config[mnode].physbase;
end = mem_node_config[mnode].physmax;
ASSERT(start <= end);
mutex_enter(&mnoderange_lock);
#ifdef DEBUG
/* The node's extent must not overlap any entry of another node. */
for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
ASSERT(mnoderanges[n].mnr_exists);
if (mnoderanges[n].mnr_mnode == mnode)
continue;
ASSERT(start > mnoderanges[n].mnr_pfnhi ||
end < mnoderanges[n].mnr_pfnlo);
}
#endif
/* Skip the ranges that lie entirely below this node. */
mri = nranges - 1;
while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
mri--;
while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
/* Look for an existing entry for (mnode, mri) and refresh it. */
for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
if (mnoderanges[n].mnr_mnode == mnode &&
mnoderanges[n].mnr_memrange == mri) {
mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
start);
mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
end);
break;
}
}
/* None found: create a new entry. */
if (n == -1) {
/* Find an unused slot in the array. */
for (n = 0; n < mnoderangecnt; n++) {
if (mnoderanges[n].mnr_exists == 0)
break;
}
ASSERT(n < mnoderangecnt);
mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
mnoderanges[n].mnr_mnode = mnode;
mnoderanges[n].mnr_memrange = mri;
mnoderanges[n].mnr_exists = 1;
/*
 * Find the link to insert at: walk past every entry that starts
 * above the new one.  The ASSERT expects the walk to stop before
 * running off the end of the chain.
 */
for (prev = &mtypetop;
mnoderanges[*prev].mnr_pfnlo > start;
prev = &mnoderanges[*prev].mnr_next) {
ASSERT(mnoderanges[*prev].mnr_next >= 0);
ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
}
/*
 * The entry is filled in completely, followed by membar_sync(),
 * before it is linked into the chain.
 */
mnoderanges[n].mnr_next = *prev;
membar_sync();
*prev = n;
}
if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
mri--;
else
break;
}
mutex_exit(&mnoderange_lock);
}
/*
 * Counterpart of mnode_range_add() for memory removal.  Not implemented:
 * the body only asserts, and ASSERT(0) fails whenever assertions are
 * enabled.
 */
static void
mnode_range_del(int mnode)
{
_NOTE(ARGUNUSED(mnode));
ASSERT(0 <= mnode && mnode < max_mem_nodes);
ASSERT(0);
}
/*
 * Register the physical memory slice [start, end] with the memory node
 * layer and, when dynamic reconfiguration is enabled, update the
 * mnoderanges[] entries of the node that contains start.
 */
void
plat_slice_add(pfn_t start, pfn_t end)
{
mem_node_add_slice(start, end);
if (plat_dr_enabled()) {
mnode_range_add(PFN_2_MEM_NODE(start));
}
}
/*
 * Remove the physical memory slice [start, end], which must lie within a
 * single memory node; only expected with dynamic reconfiguration enabled
 * (both asserted).  mnode_range_del() is called before the slice is
 * removed from the memory node layer.
 */
void
plat_slice_del(pfn_t start, pfn_t end)
{
ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
ASSERT(plat_dr_enabled());
mnode_range_del(PFN_2_MEM_NODE(start));
mem_node_del_slice(start, end);
}
#endif
/*
 * Choose the starting mtype and the memory range restriction for a page
 * allocation of pgsz bytes.
 *
 * Always returns mtypetop.  Outside __xpv, exactly one of the range
 * flags is OR-ed into *flags:
 *   PGI_MT_RANGE4G   when RESTRICT4G_ALLOC holds
 *   PGI_MT_RANGE16M  else when RESTRICT16M_ALLOC holds
 *   PGI_MT_RANGE0    otherwise (no restriction)
 * vp and vaddr are not examined.
 */
int
mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
{
	const int	top = mtypetop;

#if !defined(__xpv)
	uint_t		range;

	if (RESTRICT4G_ALLOC) {
		VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
		range = PGI_MT_RANGE4G;
	} else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
		range = PGI_MT_RANGE16M;
	} else {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
		range = PGI_MT_RANGE0;
	}
	*flags |= range;
#endif
	return (top);
}
/*
 * Like mtype_init(), for a request of pgcnt pages.  Always returns
 * mtypetop; outside __xpv it ORs PGI_MT_RANGE16M into *flags when
 * RESTRICT16M_ALLOC holds and PGI_MT_RANGE0 otherwise.  pp is not
 * examined.
 */
int
mtype_pgr_init(int *flags, page_t *pp, pgcnt_t pgcnt)
{
	const int	top = mtypetop;

#if !defined(__xpv)
	if (!RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
		VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
		*flags |= PGI_MT_RANGE0;
	} else {
		*flags |= PGI_MT_RANGE16M;
	}
#endif
	return (top);
}
/*
 * Find an mtype that belongs to memory node mnode.
 *
 * Without PGI_MT_RANGE in flags, mtype itself is returned if it belongs
 * to mnode, else -1.
 *
 * With PGI_MT_RANGE, the mnr_next chain is walked starting at mtype (or
 * at its successor when PGI_MT_NEXT is set) and the first entry of mnode
 * is returned.  The walk stops, returning -1, at the end of the chain or
 * at the first entry whose memrange index exceeds the limit selected by
 * the flags: MRI_4G for PGI_MT_RANGE4G, MRI_16M for PGI_MT_RANGE16M,
 * MRI_0 otherwise.
 */
int
mtype_func(int mnode, int mtype, uint_t flags)
{
	int	limit;

	if ((flags & PGI_MT_RANGE) == 0) {
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
		return (-1);
	}

	if (flags & PGI_MT_NEXT)
		mtype = mnoderanges[mtype].mnr_next;

	if (flags & PGI_MT_RANGE4G)
		limit = MRI_4G;
	else if (flags & PGI_MT_RANGE16M)
		limit = MRI_16M;
	else
		limit = MRI_0;

	for (; mtype != -1; mtype = mnoderanges[mtype].mnr_next) {
		if (mnoderanges[mtype].mnr_memrange > limit)
			break;
		if (mnoderanges[mtype].mnr_mnode == mnode)
			return (mtype);
	}
	return (-1);
}
/*
 * Adjust maxmem4g for |cnt| pages starting at startpfn: the pages are
 * added when cnt is positive and subtracted otherwise.  Only the part of
 * [startpfn, startpfn + |cnt|) that falls into mnoderanges[] entries
 * whose memrange is not MRI_4G is accounted.  Does nothing unless
 * physmax4g is set.
 */
void
mtype_modify_max(pfn_t startpfn, long cnt)
{
	spgcnt_t	delta = (spgcnt_t)(cnt);
	pgcnt_t		npages = ABS(delta);
	pfn_t		cur;
	pfn_t		rangelo;
	pgcnt_t		step;
	int		mt;

	if (!physmax4g)
		return;

	/* walk the pfn interval from its top end down, range by range */
	cur = startpfn + npages;
	mt = mtypetop;
	while (cur > startpfn) {
		ASSERT(mt != -1);
		rangelo = mnoderanges[mt].mnr_pfnlo;
		if (cur > rangelo) {
			/* portion of the interval that lies in this range */
			step = cur - ((startpfn >= rangelo) ?
			    startpfn : rangelo);
			if (mnoderanges[mt].mnr_memrange != MRI_4G) {
				if (delta > 0)
					maxmem4g += step;
				else
					maxmem4g -= step;
			}
			cur -= step;
		}
		mt = mnoderanges[mt].mnr_next;
	}
}
/* Return the memranges[] index (MRI_*) recorded for mtype. */
int
mtype_2_mrange(int mtype)
{
return (mnoderanges[mtype].mnr_memrange);
}
/*
 * Return, through *pfnlo and *pfnhi, the lowest and highest pfn covered
 * by mtype.  mnode is used only to assert that mtype belongs to it.
 */
void
mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
{
_NOTE(ARGUNUSED(mnode));
ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
*pfnlo = mnoderanges[mtype].mnr_pfnlo;
*pfnhi = mnoderanges[mtype].mnr_pfnhi;
}
/*
 * Add to ctrs_sz the number of bytes plcnt_init() needs for the DEBUG
 * page counters and return the total: for every mnoderanges[] slot, one
 * struct mnr_mts per page size plus one pgcnt_t per color of each page
 * size.  Non-DEBUG builds return ctrs_sz unchanged.
 */
size_t
plcnt_sz(size_t ctrs_sz)
{
#ifdef DEBUG
	int	szc = 0;
	int	colors;

	ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;

	while (szc < mmu_page_sizes) {
		colors = page_get_pagecolors(szc);
		ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
		szc++;
	}
#endif
	return (ctrs_sz);
}
/*
 * Carve the DEBUG page counters out of the memory at addr and return the
 * first address past what was used.  For every mnoderanges[] slot an
 * array of struct mnr_mts (one per page size) is laid down, followed by
 * the per-color pgcnt_t array of each page size.  The layout matches the
 * size computed by plcnt_sz().  Non-DEBUG builds return addr unchanged.
 */
caddr_t
plcnt_init(caddr_t addr)
{
#ifdef DEBUG
	int	mt;

	for (mt = 0; mt < mnoderangecnt; mt++) {
		struct mnr_mts	*mts = (struct mnr_mts *)addr;
		int		szc;

		mnoderanges[mt].mnr_mts = mts;
		addr += (sizeof (struct mnr_mts) * mmu_page_sizes);

		for (szc = 0; szc < mmu_page_sizes; szc++) {
			int	colors = page_get_pagecolors(szc);

			mts[szc].mnr_mts_colors = colors;
			mts[szc].mnr_mtsc_pgcnt = (pgcnt_t *)addr;
			addr += (sizeof (pgcnt_t) * colors);
		}
	}
#endif
	return (addr);
}
/*
 * Add cnt (which may be negative) to the page counters of mtype.
 *
 * pp     page being moved; used for the color bin in DEBUG builds and to
 *        assert that mtype matches the page
 * szc    page size code, selects the free list counter
 * flags  PG_CACHE_LIST selects the cache list counter, otherwise the
 *        free list counter for szc is used
 *
 * freemem4g is adjusted as well when physmax4g is set and the range is
 * not MRI_4G.  All updates use atomic_add_long().
 */
void
plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
{
_NOTE(ARGUNUSED(pp));
#ifdef DEBUG
int bin = PP_2_BIN(pp);
/* DEBUG only: per page size total and per color count. */
atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
cnt);
#endif
ASSERT(mtype == PP_2_MTYPE(pp));
if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
atomic_add_long(&freemem4g, cnt);
if (flags & PG_CACHE_LIST)
atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
else
atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
/* The total covers both the cache list and the free lists. */
atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
}
int
mnode_pgcnt(int mnode)
{
int mtype = mtypetop;
int flags = PGI_MT_RANGE0;
pgcnt_t pgcnt = 0;
mtype = mtype_func(mnode, mtype, flags);
while (mtype != -1) {
pgcnt += MTYPE_FREEMEM(mtype);
mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
}
return (pgcnt);
}
/*
 * Initialize the page coloring parameters from the L2 cache geometry and
 * return the number of bytes page_coloring_setup() needs for
 * mnoderanges[], the fpc/cpc mutex arrays, page_freelists and
 * page_cachelists.
 *
 * l2_sz      L2 cache size in bytes (asserted to exceed MMU_PAGESIZE)
 * l2_linesz  L2 line size; only asserted to be a power of two
 * l2_assoc   L2 associativity; 0 results in a single L2 color
 *
 * Side effects: trims memranges/nranges to the ranges at or below the
 * highest pfn, may set physmax4g, and sets l2_colors, page_colors,
 * page_colors_mask, cpu_page_colors, page_coloring_shift, hw_page_array[],
 * colorequivszc[] and mnoderangecnt.
 */
size_t
page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
{
_NOTE(ARGUNUSED(l2_linesz));
size_t colorsz = 0;
int i;
int colors;
#if defined(__xpv)
i = memrange_num(1);
#else
/* Find the memranges[] slot holding the highest pfn that can exist. */
if (plat_dr_physmax > physmax)
i = memrange_num(plat_dr_physmax);
else
i = memrange_num(physmax);
/* Memory reaches into the topmost range. */
if (i == MRI_4G)
physmax4g = 1;
#endif
/* Drop the ranges above the highest pfn. */
memranges += i;
nranges -= i;
ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
ASSERT(ISP2(l2_linesz));
ASSERT(l2_sz > MMU_PAGESIZE);
/* Number of page-sized colors in the L2 cache. */
if (l2_assoc)
l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
else
l2_colors = 1;
ASSERT(ISP2(l2_colors));
/* Use at least PAGE_COLORS_MIN color bins. */
page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
/* Remember the real color count when there are more bins than colors. */
if (l2_colors < page_colors)
cpu_page_colors = l2_colors;
ASSERT(ISP2(page_colors));
page_colors_mask = page_colors - 1;
ASSERT(ISP2(CPUSETSIZE()));
page_coloring_shift = lowbit(CPUSETSIZE());
/* Per page level: size, shift, base page count and number of colors. */
for (i = 0; i <= mmu.max_page_level; i++) {
hw_page_array[i].hp_size = LEVEL_SIZE(i);
hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
hw_page_array[i].hp_colors = (page_colors_mask >>
(hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
+ 1;
colorequivszc[i] = 0;
}
/*
 * More bins than real colors: record in colorequivszc[] how many low
 * color bits are equivalent for each page size.
 */
if (cpu_page_colors != 0) {
int a = lowbit(page_colors) - lowbit(cpu_page_colors);
ASSERT(a > 0);
ASSERT(a < 16);
for (i = 0; i <= mmu.max_page_level; i++) {
if ((colors = hw_page_array[i].hp_colors) <= 1) {
colorequivszc[i] = 0;
continue;
}
/* Do not claim more equivalent bits than the size has colors. */
while ((colors >> a) == 0)
a--;
ASSERT(a >= 0);
/* The value is stored in the upper four bits. */
colorequivszc[i] = (a << 4);
}
}
/* Factor in colorequiv when it asks for a larger equivalence. */
if (colorequiv > 1) {
int a = lowbit(colorequiv) - 1;
if (a > 15)
a = 15;
for (i = 0; i <= mmu.max_page_level; i++) {
if ((colors = hw_page_array[i].hp_colors) <= 1) {
continue;
}
while ((colors >> a) == 0)
a--;
if ((a << 4) > colorequivszc[i]) {
colorequivszc[i] = (a << 4);
}
}
}
/* Number of mnoderanges[] slots needed. */
for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
mnoderangecnt += mnode_range_cnt(i);
/* Reserve extra slots when memory can be added at run time. */
if (plat_dr_support_memory()) {
mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
}
/* Size of mnoderanges[]. */
colorsz = mnoderangecnt * sizeof (mnoderange_t);
/* Size of the fpc_mutex and cpc_mutex arrays. */
colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
/* Size of page_freelists: three levels of pointers. */
colorsz += mnoderangecnt * sizeof (page_t ***);
colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
for (i = 0; i < mmu_page_sizes; i++) {
colors = page_get_pagecolors(i);
colorsz += mnoderangecnt * colors * sizeof (page_t *);
}
/* Size of page_cachelists: two levels of pointers. */
colorsz += mnoderangecnt * sizeof (page_t **);
colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
return (colorsz);
}
/*
 * Lay out, in the memory at pcmemaddr, the structures sized by
 * page_coloring_init(): mnoderanges[] (filled in by mnode_range_setup()),
 * the fpc_mutex[] and cpc_mutex[] arrays, and the pointer arrays behind
 * page_freelists and page_cachelists.  The carving order below must
 * account for exactly the bytes page_coloring_init() counted.
 */
void
page_coloring_setup(caddr_t pcmemaddr)
{
int i;
int j;
int k;
caddr_t addr;
int colors;
addr = pcmemaddr;
mnoderanges = (mnoderange_t *)addr;
addr += (mnoderangecnt * sizeof (mnoderange_t));
mnode_range_setup(mnoderanges);
/* One array of max_mem_nodes mutexes per NPC_MUTEX slot, twice. */
for (k = 0; k < NPC_MUTEX; k++) {
fpc_mutex[k] = (kmutex_t *)addr;
addr += (max_mem_nodes * sizeof (kmutex_t));
}
for (k = 0; k < NPC_MUTEX; k++) {
cpc_mutex[k] = (kmutex_t *)addr;
addr += (max_mem_nodes * sizeof (kmutex_t));
}
/* Top-level arrays, indexed by mtype. */
page_freelists = (page_t ****)addr;
addr += (mnoderangecnt * sizeof (page_t ***));
page_cachelists = (page_t ***)addr;
addr += (mnoderangecnt * sizeof (page_t **));
for (i = 0; i < mnoderangecnt; i++) {
/* Free lists: per page size, then per color of that size. */
page_freelists[i] = (page_t ***)addr;
addr += (mmu_page_sizes * sizeof (page_t **));
for (j = 0; j < mmu_page_sizes; j++) {
colors = page_get_pagecolors(j);
page_freelists[i][j] = (page_t **)addr;
addr += (colors * sizeof (page_t *));
}
/* Cache lists: per color only. */
page_cachelists[i] = (page_t **)addr;
addr += (page_colors * sizeof (page_t *));
}
}
#if defined(__xpv)
/*
 * Return roughly a tenth of the I/O pool pages to the system, without
 * taking the pool below io_pool_cnt_min.  Pages are taken from the tail
 * of io_pool_4g first and then, if more are needed, from io_pool_16m.
 * A pool's last remaining page is never freed.  Runs under io_pool_lock.
 */
static void
page_io_pool_shrink()
{
int retcnt;
page_t *pp, *pp_first, *pp_last, **curpool;
mfn_t mfn;
int bothpools = 0;
mutex_enter(&io_pool_lock);
io_pool_shrink_attempts++;
/* Aim for 10% of the pool, limited by the configured minimum. */
retcnt = io_pool_cnt / 10;
if (io_pool_cnt - retcnt < io_pool_cnt_min)
retcnt = io_pool_cnt - io_pool_cnt_min;
if (retcnt <= 0)
goto done;
io_pool_shrinks++;
curpool = &io_pool_4g;
domore:
for (pp = *curpool; pp && retcnt > 0; ) {
/* The tail of the circular list: the page before the head. */
pp_first = pp_last = pp->p_prev;
/* Only one page left in this pool: leave it alone. */
if (pp_first == *curpool)
break;
retcnt--;
io_pool_cnt--;
page_io_pool_sub(curpool, pp_first, pp_last);
/*
 * NOTE(review): the mfn compared against start_mfn is that of pp (the
 * pool head), not of pp_first (the page being freed) - confirm that
 * this is intended.
 */
if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
start_mfn = mfn;
page_free(pp_first, 1);
pp = *curpool;
}
/* Still short: make one more pass over the 16M pool. */
if (retcnt != 0 && !bothpools) {
curpool = &io_pool_16m;
bothpools = 1;
goto domore;
}
done:
mutex_exit(&io_pool_lock);
}
#endif
/*
 * Adjust page creation flags and return the result.
 *
 * Under __xpv the flags are returned unchanged, but the I/O page pool is
 * shrunk first when the caller is not willing to wait (no PG_WAIT) and
 * freemem is below desfree.  Otherwise PGI_PGCPSZC0 and PGI_PGCPHIPRI
 * are added when physmax4g is set.
 */
uint_t
page_create_update_flags_x86(uint_t flags)
{
#if defined(__xpv)
if (!(flags & PG_WAIT) && freemem < desfree)
page_io_pool_shrink();
#else
if (physmax4g)
flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
#endif
return (flags);
}
/* Return the color for buffer bp; this implementation always returns 0. */
int
bp_color(struct buf *bp)
{
return (0);
}
#if defined(__xpv)
/*
 * Unlink the run of pages pp_first..pp_last (linked through p_next) from
 * the circular, doubly linked pool list *poolp.  On return the removed
 * run forms its own circular list and *poolp points at the remaining
 * pages, or is NULL when the pool is now empty.
 */
static void
page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
{
/* Removing the head: advance it past the run, or empty the pool. */
if (*poolp == pp_first) {
*poolp = pp_last->p_next;
if (*poolp == pp_first)
*poolp = NULL;
}
/* Join the neighbours of the run to each other ... */
pp_first->p_prev->p_next = pp_last->p_next;
pp_last->p_next->p_prev = pp_first->p_prev;
/* ... and close the run onto itself. */
pp_first->p_prev = pp_last;
pp_last->p_next = pp_first;
}
/*
 * Insert page pp into the circular pool list *poolp, which is kept
 * sorted by ascending machine frame number with *poolp pointing at the
 * page with the lowest mfn.
 */
static void
page_io_pool_add(page_t **poolp, page_t *pp)
{
page_t *look;
mfn_t mfn = mfn_list[pp->p_pagenum];
/* Empty pool: pp becomes a one-element circular list. */
if (*poolp == NULL) {
*poolp = pp;
pp->p_next = pp;
pp->p_prev = pp;
return;
}
/* Start at the tail (highest mfn) and walk back to the insert point. */
look = (*poolp)->p_prev;
while (mfn < mfn_list[look->p_pagenum]) {
look = look->p_prev;
/* Wrapped all the way around: pp has the lowest mfn. */
if (look == (*poolp)->p_prev)
break;
}
/* Insert pp after look. */
pp->p_prev = look;
pp->p_next = look->p_next;
pp->p_next->p_prev = pp;
look->p_next = pp;
/* pp is the new lowest mfn: make it the head. */
if (mfn < mfn_list[(*poolp)->p_pagenum]) {
*poolp = pp;
}
}
/*
 * Offer page pp to the I/O page pools.
 *
 * Pages with an mfn below PFN_16MEG always go onto io_pool_16m.  Other
 * pages go onto io_pool_4g while the pool is below io_pool_cnt_max, when
 * force is set, or when io_pool_4g is empty.  Otherwise pp replaces the
 * pool's highest-mfn page if pp's mfn is lower.  Whichever page ends up
 * outside the pool is freed with page_free() after io_pool_lock has been
 * dropped.
 */
static void
add_page_to_pool(page_t *pp, int force)
{
page_t *highest;
page_t *freep = NULL;
mutex_enter(&io_pool_lock);
/* Pages below 16M are always kept. */
if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
++io_pool_cnt;
page_io_pool_add(&io_pool_16m, pp);
goto done;
}
if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
++io_pool_cnt;
page_io_pool_add(&io_pool_4g, pp);
} else {
/* Pool is full: keep whichever page has the lower mfn. */
highest = io_pool_4g->p_prev;
if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
page_io_pool_sub(&io_pool_4g, highest, highest);
page_io_pool_add(&io_pool_4g, pp);
freep = highest;
} else {
freep = pp;
}
}
done:
mutex_exit(&io_pool_lock);
/* Free the rejected page outside the lock. */
if (freep)
page_free(freep, 1);
}
/*
 * State of the list of free pfns used to find machine-contiguous pages.
 * The list is built by create_contig_pfnlist() and compacted by
 * compact_contig_pfn_list().
 */
/* Number of valid entries in contig_pfn_list. */
int contig_pfn_cnt;
/* Number of entries contig_pfn_list was allocated for. */
int contig_pfn_max;
/* NOTE(review): presumably the next list index to allocate from; not used in this part of the file. */
int next_alloc_pfn;
/* NOTE(review): statistics counters; only buildfailed is updated in the visible code. */
int contig_pfnlist_updates;
int contig_pfnlist_builds;
int contig_pfnlist_buildfailed;
/* Set while a taskq job to build the list has been dispatched. */
int create_contig_pending;
/* The list itself; NULL until it has been built. */
pfn_t *contig_pfn_list = NULL;
static int
mfn_compare(const void *pfnp1, const void *pfnp2)
{
mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
if (mfn1 > mfn2)
return (1);
if (mfn1 < mfn2)
return (-1);
return (0);
}
/*
 * Compact contig_pfn_list in place so that it only holds pfns that take part
 * in at least one pair of consecutive mfns.
 *
 * For each neighbouring pair (cur, next) in the list, the pair is kept only
 * when next's mfn is exactly cur's mfn + 1.  cur is not stored again if it
 * was the "next" of the previously kept pair.  Slots past the new end are
 * zeroed and contig_pfn_cnt is set to the new count.
 */
static void
compact_contig_pfn_list(void)
{
	pfn_t cur, next;
	pfn_t last_kept = 0;
	int rd, wr = 0;

	for (rd = 0; rd < contig_pfn_cnt - 1; rd++) {
		cur = contig_pfn_list[rd];
		next = contig_pfn_list[rd + 1];

		if (mfn_list[next] == mfn_list[cur] + 1) {
			if (cur != last_kept)
				contig_pfn_list[wr++] = cur;
			contig_pfn_list[wr++] = next;
			last_kept = next;
		}
	}

	/* clear the now-unused tail of the array */
	for (rd = wr; rd < contig_pfn_cnt; rd++)
		contig_pfn_list[rd] = 0;

	contig_pfn_cnt = wr;
}
static void
call_create_contiglist(void *arg)
{
(void) create_contig_pfnlist(PG_WAIT);
}
/*
 * Build contig_pfn_list if it does not already exist.
 *
 * flags: only PG_WAIT is examined; it selects KM_SLEEP versus KM_NOSLEEP
 * for the array allocation.
 *
 * Returns 1 when a list exists on return (already present, or newly built),
 * and 0 when the allocation failed or fewer than two free pfns were found.
 * contig_list_lock is acquired and released inside this function.
 */
static int
create_contig_pfnlist(uint_t flags)
{
pfn_t pfn;
page_t *pp;
int ret = 1;
mutex_enter(&contig_list_lock);
/* somebody already built the list; nothing to do */
if (contig_pfn_list != NULL)
goto out;
/* size the array at freemem plus ten percent */
contig_pfn_max = freemem + (freemem / 10);
contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
(flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
if (contig_pfn_list == NULL) {
/*
 * Allocation failed.  Unless a job is already pending, dispatch
 * call_create_contiglist() on system_taskq; it retries with PG_WAIT.
 */
if (!create_contig_pending) {
if (taskq_dispatch(system_taskq, call_create_contiglist,
NULL, TQ_NOSLEEP) != TASKQID_INVALID)
create_contig_pending = 1;
}
contig_pfnlist_buildfailed++;
ret = 0;
goto out;
}
create_contig_pending = 0;
ASSERT(contig_pfn_cnt == 0);
/* collect every pfn whose page is currently free, up to the array size */
for (pfn = 0; pfn < mfn_count; pfn++) {
pp = page_numtopp_nolock(pfn);
if (pp == NULL || !PP_ISFREE(pp))
continue;
contig_pfn_list[contig_pfn_cnt] = pfn;
if (++contig_pfn_cnt == contig_pfn_max)
break;
}
/* fewer than two entries cannot form a contiguous pair: discard the list */
if (contig_pfn_cnt < 2) {
contig_pfn_cnt = 0;
contig_pfnlist_buildfailed++;
kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
contig_pfn_list = NULL;
contig_pfn_max = 0;
ret = 0;
goto out;
}
/* order by mfn, then drop pfns without an mfn-adjacent neighbour */
qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
compact_contig_pfn_list();
/* the next search of the fresh list starts at its beginning */
next_alloc_pfn = 0;
contig_pfnlist_builds++;
out:
mutex_exit(&contig_list_lock);
return (ret);
}
/*
 * Acquire contig_list_lock and discard the contiguous pfn list, if any.
 *
 * The lock is deliberately NOT released here: this function returns with
 * contig_list_lock held, and unlock_contig_pfnlist() is the routine that
 * drops it.
 */
void
clear_and_lock_contig_pfnlist()
{
	pfn_t *old_list;
	size_t old_bytes;

	mutex_enter(&contig_list_lock);

	old_list = contig_pfn_list;
	if (old_list == NULL)
		return;		/* no list to free; lock stays held */

	old_bytes = contig_pfn_max * sizeof (pfn_t);
	contig_pfn_list = NULL;
	contig_pfn_max = contig_pfn_cnt = 0;
	kmem_free(old_list, old_bytes);
}
void
unlock_contig_pfnlist()
{
mutex_exit(&contig_list_lock);
}
/*
 * Update contig_pfn_list after pfn's machine frame changed from oldmfn to
 * newmfn.
 *
 * The pfn is first located in the list (searching by oldmfn) and removed if
 * present.  Then, unless newmfn is MFN_INVALID, the pfn is re-inserted next
 * to an entry whose mfn is adjacent to newmfn, provided such an entry is
 * found and the array has room.
 *
 * contig_list_lock is taken here unless the calling thread already owns it,
 * and is only dropped on exit if it was taken here.
 */
void
update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
{
int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
pfn_t probe_pfn;
mfn_t probe_mfn;
int drop_lock = 0;
/* the caller may already hold the lock; only take it if this thread does not */
if (mutex_owner(&contig_list_lock) != curthread) {
drop_lock = 1;
mutex_enter(&contig_list_lock);
}
if (contig_pfn_list == NULL)
goto done;
contig_pfnlist_updates++;
/*
 * Binary search for pfn.  The list is ordered by mfn, so the probe is
 * steered by comparing each probed entry's mfn with oldmfn.
 */
probe_hi = contig_pfn_cnt - 1;
probe_lo = 0;
probe_pos = (probe_hi + probe_lo) / 2;
while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
/* the interval cannot shrink further: pfn is not in the list */
if (probe_pos == probe_lo) {
probe_pos = -1;
break;
}
if (pfn_to_mfn(probe_pfn) <= oldmfn)
probe_lo = probe_pos;
else
probe_hi = probe_pos;
probe_pos = (probe_hi + probe_lo) / 2;
}
if (probe_pos >= 0) {
/* remove the entry, keeping next_alloc_pfn inside the shrunken list */
if (--contig_pfn_cnt <= next_alloc_pfn)
next_alloc_pfn = 0;
/* fewer than two entries left: free the whole list */
if (contig_pfn_cnt < 2) {
contig_pfn_cnt = 0;
kmem_free(contig_pfn_list,
contig_pfn_max * sizeof (pfn_t));
contig_pfn_list = NULL;
contig_pfn_max = 0;
goto done;
}
/* close the gap by sliding the following entries down one slot */
ovbcopy(&contig_pfn_list[probe_pos + 1],
&contig_pfn_list[probe_pos],
(contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
}
/* no new machine frame: removal was all that was needed */
if (newmfn == MFN_INVALID)
goto done;
/*
 * Binary search for an entry whose mfn is newmfn - 1 or newmfn + 1.
 * insert_after stays at the sentinel -2 until such a neighbour is found.
 */
probe_hi = contig_pfn_cnt - 1;
probe_lo = 0;
insert_after = -2;
do {
probe_pos = (probe_hi + probe_lo) / 2;
probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
if (newmfn == probe_mfn + 1)
insert_after = probe_pos;
else if (newmfn == probe_mfn - 1)
insert_after = probe_pos - 1;
if (probe_pos == probe_lo)
break;
if (probe_mfn <= newmfn)
probe_lo = probe_pos;
else
probe_hi = probe_pos;
} while (insert_after == -2);
/* insert only if a neighbour was found and the array has a free slot */
if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
insert_point = insert_after + 1;
/* open a slot by sliding the entries from insert_point up by one */
ovbcopy(&contig_pfn_list[insert_point],
&contig_pfn_list[insert_point + 1],
(contig_pfn_cnt - insert_point) * sizeof (pfn_t));
contig_pfn_list[insert_point] = pfn;
contig_pfn_cnt++;
}
done:
if (drop_lock)
mutex_exit(&contig_list_lock);
}
/*
 * Sweep free pages with low mfns into the I/O pool.
 *
 * On the first call (io_pool_cnt_max == 0) the pool limit is derived from
 * physmem and io_pool_physmem_pct, the low-water mark is set to that limit,
 * and a page of kernel VA is allocated into io_pool_kva.  When the pool is
 * empty and its limit is below physmem / 4, the limit grows by 1/20th.
 *
 * The scan starts at start_mfn and stops at the smaller of cached_max_mfn
 * and PFN_4GIG, or once the pool reaches io_pool_cnt_max.  start_mfn is
 * advanced as the scan proceeds so a later call resumes from there.
 *
 * Returns the resulting io_pool_cnt.
 */
long
populate_io_pool(void)
{
	mfn_t cur_mfn, limit_mfn;
	pfn_t cand_pfn;
	page_t *newpp;

	if (io_pool_cnt_max == 0) {
		io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
		io_pool_cnt_lowater = io_pool_cnt_max;
		io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
	}

	if (io_pool_cnt == 0 && io_pool_cnt_max < physmem / 4)
		io_pool_cnt_max += io_pool_cnt_max / 20;

	io_pool_grows++;

	(void) mfn_to_pfn(start_mfn);
	limit_mfn = MIN(cached_max_mfn, PFN_4GIG);

	for (cur_mfn = start_mfn; cur_mfn < limit_mfn;
	    start_mfn = ++cur_mfn) {
		cand_pfn = mfn_to_pfn(cur_mfn);
		if ((cand_pfn & PFN_IS_FOREIGN_MFN) != 0)
			continue;

		/* take the page only if it can be pulled from the free pages */
		newpp = page_numtopp_alloc(cand_pfn);
		if (newpp != NULL) {
			PP_CLRFREE(newpp);
			add_page_to_pool(newpp, 1);
			if (io_pool_cnt >= io_pool_cnt_max)
				break;
		}
	}

	return (io_pool_cnt);
}
/*
 * Dispose of a page obtained through the I/O allocation paths.
 *
 * Releases one page reservation, force-unloads any translations and hashes
 * the page out.  In the initial domain, a page whose mfn is below PFN_4GIG
 * is offered back to the I/O pool (non-forced); every other page is handed
 * to page_free().
 */
void
page_destroy_io(page_t *pp)
{
	mfn_t mfn = mfn_list[pp->p_pagenum];

	page_unresv(1);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	page_hashout(pp, NULL);

	if (DOMAIN_IS_INITDOMAIN(xen_info) && mfn < PFN_4GIG) {
		add_page_to_pool(pp, 0);
	} else {
		page_free(pp, 1);
	}
}
/* Statistics maintained by find_contig_free(). */
/* number of searches started on an existing contig pfn list */
long contig_searches;
/* number of times a partially collected run was abandoned and restarted */
long contig_search_restarts;
/* number of searches that ended without enough pages */
long contig_search_failed;
/*
 * Empty the circular page list headed by *pplist: each page is unlinked with
 * page_io_pool_sub() and handed to page_free().  *pplist is NULL on return.
 */
static void
free_partial_list(page_t **pplist)
{
	page_t *head;

	while ((head = *pplist) != NULL) {
		page_io_pool_sub(pplist, head, head);
		page_free(head, 1);
	}
}
/*
 * Search contig_pfn_list for npages free pages with consecutive mfns.
 *
 * npages:   number of pages wanted.
 * flags:    passed to create_contig_pfnlist() if the list must be built.
 * pfnseg:   segment mask; a run is abandoned when (mfn & pfnseg) drops below
 *           (first mfn of the run & pfnseg).
 * pfnalign: when non-zero, the first mfn of a run must be a multiple of it.
 *
 * Returns a circular, mfn-ordered list of the pages found, or NULL if the
 * list could not be created or no suitable run exists.
 */
page_t *
find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
pgcnt_t pfnalign)
{
page_t *pp, *plist = NULL;
/* NOTE: this local start_mfn shadows the file-scope start_mfn */
mfn_t mfn, prev_mfn, start_mfn;
pfn_t pfn;
int pages_needed, pages_requested;
int search_start;
/* make sure the contig pfn list exists, building it if necessary */
retry:
mutex_enter(&contig_list_lock);
if (contig_pfn_list == NULL) {
/* create_contig_pfnlist() takes the lock itself, so drop it first */
mutex_exit(&contig_list_lock);
if (!create_contig_pfnlist(flags)) {
return (NULL);
}
goto retry;
}
contig_searches++;
/* resume where the previous search stopped; remember it to detect a full lap */
pages_requested = pages_needed = npages;
search_start = next_alloc_pfn;
/* prev_mfn == 0 means "no run in progress" */
start_mfn = prev_mfn = 0;
while (pages_needed) {
pfn = contig_pfn_list[next_alloc_pfn];
mfn = pfn_to_mfn(pfn);
/*
 * Accept the page if it starts a run or extends the current one, it can
 * be allocated, and the run does not cross a segment boundary.
 */
if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
(pp = page_numtopp_alloc(pfn)) != NULL &&
!((mfn & pfnseg) < (start_mfn & pfnseg))) {
PP_CLRFREE(pp);
page_io_pool_add(&plist, pp);
pages_needed--;
if (prev_mfn == 0) {
/* first page of a run: enforce the alignment requirement */
if (pfnalign &&
mfn != P2ROUNDUP(mfn, pfnalign)) {
contig_search_restarts++;
free_partial_list(&plist);
pages_needed = pages_requested;
start_mfn = prev_mfn = 0;
goto skip;
}
start_mfn = mfn;
}
prev_mfn = mfn;
} else {
/* run broken: release what was collected and start over */
contig_search_restarts++;
free_partial_list(&plist);
pages_needed = pages_requested;
start_mfn = prev_mfn = 0;
}
skip:
/* advance with wrap-around; stop once every entry has been visited */
if (++next_alloc_pfn == contig_pfn_cnt)
next_alloc_pfn = 0;
if (next_alloc_pfn == search_start)
break;
}
mutex_exit(&contig_list_lock);
if (pages_needed) {
/* not enough pages found: free any partial run (plist becomes NULL) */
contig_search_failed++;
free_partial_list(&plist);
}
return (plist);
}
/*
 * Take minctg pages from the I/O pools that satisfy the DMA attributes.
 *
 * mattr:  supplies the address range (dma_attr_addr_lo/hi), the segment
 *         (dma_attr_seg) and the alignment (larger of dma_attr_align and
 *         dma_attr_minxfer as computed by maxbit()).
 * contig: non-zero if the pages must have consecutive mfns; forced to 0
 *         when minctg is 1.
 * minctg: number of pages required.
 *
 * Returns the first page of the removed run (a circular list), or NULL if
 * the request could not be met after the retries below.
 */
page_t *
page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
{
page_t *pp_first, *pp_last;
page_t *pp, **poolp;
pgcnt_t nwanted, pfnalign;
uint64_t pfnseg;
mfn_t mfn, tmfn, hi_mfn, lo_mfn;
int align, attempt = 0;
if (minctg == 1)
contig = 0;
lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
pfnseg = mmu_btop(mattr->dma_attr_seg);
align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
/* alignment only matters when it exceeds a page */
if (align > MMU_PAGESIZE)
pfnalign = mmu_btop(align);
else
pfnalign = 0;
try_again:
/* requests limited to below PFN_16MEG can only use the 16m pool */
if (hi_mfn < PFN_16MEG)
poolp = &io_pool_16m;
else
poolp = &io_pool_4g;
try_smaller:
pp_first = pp_last = NULL;
mutex_enter(&io_pool_lock);
nwanted = minctg;
/*
 * Walk the pool backwards from its tail.  The pool is kept in ascending
 * mfn order by page_io_pool_add(), so this visits the highest mfns first.
 */
for (pp = *poolp; pp && nwanted > 0; ) {
pp = pp->p_prev;
mfn = mfn_list[pp->p_pagenum];
/* above the allowed range: keep walking down */
if (hi_mfn < mfn)
goto skip;
/* below the allowed range: nothing further down can match */
if (lo_mfn > mfn)
break;
restart:
if (pp_last == NULL) {
/*
 * Candidate for the top of a run: tmfn is where the run would start.
 * Check alignment of that start and that the run stays in one segment.
 */
tmfn = mfn - (minctg - 1);
if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
goto skip;
if ((mfn & pfnseg) < (tmfn & pfnseg))
goto skip;
pp_first = pp_last = pp;
nwanted--;
} else {
/* extending a run downwards: enforce contiguity if requested */
if (contig &&
mfn_list[pp_first->p_pagenum] != mfn + 1) {
/* gap found: drop the run and retry with pp as a new top */
pp_last = NULL;
nwanted = minctg;
goto restart;
} else {
pp_first = pp;
nwanted--;
}
}
skip:
/* reached the head of the pool: the whole list has been examined */
if (pp == *poolp)
break;
}
if (nwanted != 0) {
mutex_exit(&io_pool_lock);
/* fall back from the 4g pool to the 16m pool */
if (poolp == &io_pool_4g) {
poolp = &io_pool_16m;
goto try_smaller;
}
/* both pools failed: reap kmem, refill the pool, retry a few times */
kmem_reap();
if (++attempt < 4) {
(void) populate_io_pool();
goto try_again;
}
return (NULL);
}
/* unlink the run pp_first..pp_last from the pool and account for it */
page_io_pool_sub(poolp, pp_first, pp_last);
io_pool_cnt -= minctg;
if (io_pool_cnt < io_pool_cnt_lowater)
io_pool_cnt_lowater = io_pool_cnt;
mutex_exit(&io_pool_lock);
return (pp_first);
}
/*
 * Create minctg pages and exchange their machine frames with the hypervisor
 * (via balloon_replace_pages()) for frames addressable within
 * highbit(mattr->dma_attr_addr_hi) bits.
 *
 * vp, off, vaddr: identity and address used when creating the pages.
 * flags:  PG_PHYSCONTIG requests one contiguous extent (ignored when minctg
 *         is 1); PG_WAIT selects sleeping allocations.
 * minctg: number of pages wanted.
 *
 * For contiguous requests the extent is rounded up to a power of two; the
 * surplus pages are reserved, created, swapped along with the rest and then
 * freed again.
 *
 * Returns the list of minctg pages, or NULL on any failure (with pages,
 * temporary arrays and the extra reservation released).
 */
page_t *
page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
{
uint_t kflags;
int order, extra, extpages, i, contig, nbits, extents;
page_t *pp, *expp, *pp_first, **pplist = NULL;
mfn_t *mfnlist = NULL;
extra = 0;
contig = flags & PG_PHYSCONTIG;
if (minctg == 1)
contig = 0;
flags &= ~PG_PHYSCONTIG;
kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
if (contig) {
/* order = ceil(log2(minctg)); the extent holds 2^order pages */
order = highbit(minctg) - 1;
if (minctg & ((1 << order) - 1))
order++;
extpages = 1 << order;
} else {
order = 0;
extpages = minctg;
}
/* reserve the surplus pages needed to fill out the extent */
if (extpages > minctg) {
extra = extpages - minctg;
if (!page_resv(extra, kflags))
return (NULL);
}
pp_first = NULL;
pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
if (pplist == NULL)
goto balloon_fail;
mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
if (mfnlist == NULL)
goto balloon_fail;
pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
if (pp == NULL)
goto balloon_fail;
pp_first = pp;
if (extpages > minctg) {
/*
 * Create the surplus pages one at a time at io_pool_kva, strip their
 * identity (unload + hashout) and append each to the tail of the list.
 */
for (i = 0; i < extra; i++) {
expp = page_create_va(vp,
(u_offset_t)(uintptr_t)io_pool_kva,
PAGESIZE, flags, &kvseg, io_pool_kva);
if (expp == NULL)
goto balloon_fail;
(void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
page_io_unlock(expp);
page_hashout(expp, NULL);
page_io_lock(expp);
expp->p_prev = pp_first->p_prev;
expp->p_next = pp_first;
expp->p_prev->p_next = expp;
pp_first->p_prev = expp;
}
}
/* flatten the circular list into the array handed to the balloon code */
for (i = 0; i < extpages; i++) {
pplist[i] = pp;
pp = pp->p_next;
}
nbits = highbit(mattr->dma_attr_addr_hi);
/* one extent of 2^order pages if contiguous, else minctg single pages */
extents = contig ? 1 : minctg;
if (balloon_replace_pages(extents, pplist, nbits, order,
mfnlist) != extents) {
if (ioalloc_dbg)
cmn_err(CE_NOTE, "request to hypervisor"
" for %d pages, maxaddr %" PRIx64 " failed",
extpages, mattr->dma_attr_addr_hi);
goto balloon_fail;
}
kmem_free(pplist, extpages * sizeof (page_t *));
kmem_free(mfnlist, extpages * sizeof (mfn_t));
if (extpages > minctg) {
/* give back the surplus pages (taken from the tail) and their reservations */
for (i = 0; i < extra; i++) {
pp = pp_first->p_prev;
page_sub(&pp_first, pp);
page_io_unlock(pp);
page_unresv(1);
page_free(pp, 1);
}
}
return (pp_first);
balloon_fail:
/* free every page created so far; hash out those that still have an identity */
while (pp_first != NULL) {
pp = pp_first;
page_sub(&pp_first, pp);
page_io_unlock(pp);
if (pp->p_vnode != NULL)
page_hashout(pp, NULL);
page_free(pp, 1);
}
if (pplist)
kmem_free(pplist, extpages * sizeof (page_t *));
if (mfnlist)
kmem_free(mfnlist, extpages * sizeof (mfn_t));
/* undo the reservation made for the surplus pages (zero when none) */
page_unresv(extpages - minctg);
return (NULL);
}
/*
 * Undo a partially completed allocation: every page on plist is removed
 * from the list, has its i/o lock dropped and is passed to
 * page_destroy_io().
 */
static void
return_partial_alloc(page_t *plist)
{
	page_t *victim;

	for (victim = plist; victim != NULL; victim = plist) {
		page_sub(&plist, victim);
		page_io_unlock(victim);
		page_destroy_io(victim);
	}
}
/*
 * Allocate pages that meet the DMA attributes in mattr, in runs of
 * contiguous pages sized so the request fits in dma_attr_sgllen runs.
 *
 * vp, off: identity given to the pages (off advances per page).
 * npagesp: in: number of pages wanted, or -1 meaning "one page, ignore
 *          alignment"; out: number of pages still not allocated.
 * flags:   PG_PHYSCONTIG requests contiguous runs; also passed on to the
 *          underlying allocators.
 * vaddr:   address passed to page_create_va()/page_swap_with_hypervisor().
 *
 * Each run is sought first with find_contig_free() (only when any address
 * is acceptable and contiguity is wanted), then in the I/O pool, and last
 * by swapping pages with the hypervisor.
 *
 * Returns the list of pages, or NULL on failure (pages already obtained are
 * released through return_partial_alloc()).
 */
static page_t *
page_get_contigpages(
struct vnode *vp,
u_offset_t off,
int *npagesp,
uint_t flags,
caddr_t vaddr,
ddi_dma_attr_t *mattr)
{
mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
page_t *plist;
page_t *pp, *mcpl;
int contig, anyaddr, npages, getone = 0;
mfn_t lo_mfn;
mfn_t hi_mfn;
pgcnt_t pfnalign = 0;
int align, sgllen;
uint64_t pfnseg;
pgcnt_t minctg;
npages = *npagesp;
ASSERT(mattr != NULL);
lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
sgllen = mattr->dma_attr_sgllen;
pfnseg = mmu_btop(mattr->dma_attr_seg);
align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
if (align > MMU_PAGESIZE)
pfnalign = mmu_btop(align);
contig = flags & PG_PHYSCONTIG;
/* -1 is the caller's request for a single page with no alignment check */
if (npages == -1) {
npages = 1;
pfnalign = 0;
}
/* a single page is trivially contiguous */
if (npages == 1) {
getone = 1;
contig = 0;
}
/* true when the attributes allow every machine address */
anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
/* unconstrained request: try the ordinary allocator first */
if (!contig && anyaddr && !pfnalign) {
flags &= ~PG_PHYSCONTIG;
plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
flags, &kvseg, vaddr);
if (plist != NULL) {
*npagesp = 0;
return (plist);
}
}
plist = NULL;
/* smallest run length that lets npages fit in sgllen runs */
minctg = howmany(npages, sgllen);
while (npages > sgllen || getone) {
if (minctg > npages)
minctg = npages;
mcpl = NULL;
if (anyaddr && contig) {
mcpl = find_contig_free(minctg, flags, pfnseg,
pfnalign);
}
/* fall back to the I/O pool */
if (mcpl == NULL)
mcpl = page_io_pool_alloc(mattr, contig, minctg);
if (mcpl != NULL) {
/* give each page its identity and take its i/o lock */
pp = mcpl;
do {
if (!page_hashin(pp, vp, off, NULL)) {
panic("page_get_contigpages:"
" hashin failed"
" pp %p, vp %p, off %llx",
(void *)pp, (void *)vp, off);
}
off += MMU_PAGESIZE;
PP_CLRFREE(pp);
PP_CLRAGED(pp);
page_set_props(pp, P_REF);
page_io_lock(pp);
pp = pp->p_next;
} while (pp != mcpl);
} else {
/* the hypervisor swap is not attempted with segment or alignment limits */
if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
pfnalign)
goto fail;
mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
flags, minctg);
if (mcpl == NULL)
goto fail;
off += minctg * MMU_PAGESIZE;
}
check_dma(mattr, mcpl, minctg);
/* append this run to the result and update the remaining counts */
page_list_concat(&plist, &mcpl);
npages -= minctg;
*npagesp = npages;
sgllen--;
if (getone)
break;
}
return (plist);
fail:
return_partial_alloc(plist);
return (NULL);
}
/*
 * Hypervisor (__xpv) version: create the pages covering `bytes' bytes at
 * <vp, off> that satisfy the DMA attributes in mattr.
 *
 * flags: PG_PHYSCONTIG requests physically contiguous pages; it is dropped
 *        when one page suffices or dma_attr_sgllen >= the page count.
 * as:    not referenced in this version.
 *
 * Domains other than the initial domain always use page_create_va().  In
 * the initial domain, unconstrained non-contiguous requests try
 * page_create_va() first; otherwise pages come from
 * page_get_contigpages().
 *
 * Returns the page list, or NULL on failure.
 */
page_t *
page_create_io(
struct vnode *vp,
u_offset_t off,
uint_t bytes,
uint_t flags,
struct as *as,
caddr_t vaddr,
ddi_dma_attr_t *mattr)
{
page_t *plist = NULL, *pp;
int npages = 0, contig, anyaddr, pages_req;
mfn_t lo_mfn;
mfn_t hi_mfn;
pgcnt_t pfnalign = 0;
int align;
int is_domu = 0;
int dummy, bytes_got;
mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
ASSERT(mattr != NULL);
lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
if (align > MMU_PAGESIZE)
pfnalign = mmu_btop(align);
pages_req = npages = mmu_btopr(bytes);
contig = (flags & PG_PHYSCONTIG);
bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
/* contiguity is moot for one page or when the sgl can hold every page */
if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
contig = 0;
is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
/* true when any machine address and any alignment is acceptable */
anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
if ((!contig && anyaddr) || is_domu) {
flags &= ~PG_PHYSCONTIG;
plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
if (plist != NULL)
return (plist);
/* a domU has no other source of pages */
else if (is_domu)
return (NULL);
}
if (contig) {
/* npages is updated to the count still outstanding after this call */
plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
mattr);
if (plist == NULL)
goto fail;
bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
vaddr += bytes_got;
off += bytes_got;
}
/* collect the remaining pages one at a time (-1 selects single-page mode) */
while (npages--) {
dummy = -1;
pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
if (pp == NULL)
goto fail;
page_add(&plist, pp);
vaddr += MMU_PAGESIZE;
off += MMU_PAGESIZE;
}
return (plist);
fail:
return_partial_alloc(plist);
return (NULL);
}
/*
 * Allocate a free page, scanning machine frame numbers downwards.
 *
 * The scan position is kept in a function-static variable across calls; it
 * is raised to new_high when new_high is larger, and wraps to
 * cached_max_mfn when it reaches 0.  At most mfn_count candidates are tried
 * per call.  Foreign mfns and pages that cannot be allocated are skipped.
 *
 * Returns the page (with its free flag cleared), or NULL if no candidate
 * could be allocated.
 */
page_t *
page_get_high_mfn(mfn_t new_high)
{
	static mfn_t last_mfn = 0;
	ulong_t tries;
	pfn_t cand_pfn;
	page_t *found;

	if (new_high > last_mfn)
		last_mfn = new_high;

	for (tries = 0; tries < mfn_count; tries++, last_mfn--) {
		if (last_mfn == 0)
			last_mfn = cached_max_mfn;

		cand_pfn = mfn_to_pfn(last_mfn);
		if ((cand_pfn & PFN_IS_FOREIGN_MFN) != 0)
			continue;

		found = page_numtopp_alloc(cand_pfn);
		if (found != NULL) {
			PP_CLRFREE(found);
			ASSERT(PAGE_EXCL(found));
			ASSERT(found->p_vnode == NULL);
			ASSERT(!hat_page_is_mapped(found));
			/* the next call starts below the frame just taken */
			last_mfn--;
			return (found);
		}
	}

	return (NULL);
}
#else
/*
 * Find one base page (szc 0) in memory node `mnode' whose machine address
 * lies within [dma_attr_addr_lo, dma_attr_addr_hi] of dma_attr.
 *
 * origbin: color bin at which each search starts.
 * szc:     must be 0 (asserted).
 * flags:   must not contain PG_MATCH_COLOR (asserted); used by the MTYPE_*
 *          macros.
 * mtype:   starting memory type; the MTYPE_START/MTYPE_NEXT macros update
 *          it.
 *
 * The free lists are searched first, bin by bin for every mtype, then the
 * cache lists.  Returns the page, exclusively locked and removed from its
 * list, or NULL if none qualifies.
 */
static page_t *
page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
int mnode, int mtype, ddi_dma_attr_t *dma_attr)
{
kmutex_t *pcm;
int i;
page_t *pp;
page_t *first_pp;
uint64_t pgaddr;
ulong_t bin;
int mtypestart;
int plw_initialized;
page_list_walker_t plw;
VM_STAT_ADD(pga_vmstats.pgma_alloc);
ASSERT((flags & PG_MATCH_COLOR) == 0);
ASSERT(szc == 0);
ASSERT(dma_attr != NULL);
MTYPE_START(mnode, mtype, flags);
/* a negative mtype after MTYPE_START means there is nothing to search */
if (mtype < 0) {
VM_STAT_ADD(pga_vmstats.pgma_allocempty);
return (NULL);
}
/* remembered so the cache list pass can start from the same mtype */
mtypestart = mtype;
bin = origbin;
/* pass 1: free lists */
do {
plw_initialized = 0;
for (plw.plw_count = 0;
plw.plw_count < page_colors; plw.plw_count++) {
/* unlocked peek: skip empty bins without taking the mutex */
if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
goto nextfreebin;
pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
mutex_enter(pcm);
pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
first_pp = pp;
/* walk the circular list once, looking for a lockable in-range page */
while (pp != NULL) {
if (IS_DUMP_PAGE(pp) || page_trylock(pp,
SE_EXCL) == 0) {
pp = pp->p_next;
if (pp == first_pp) {
pp = NULL;
}
continue;
}
ASSERT(PP_ISFREE(pp));
ASSERT(PP_ISAGED(pp));
ASSERT(pp->p_vnode == NULL);
ASSERT(pp->p_hash == NULL);
ASSERT(pp->p_offset == (u_offset_t)-1);
ASSERT(pp->p_szc == szc);
ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
/* the whole page must lie inside the permitted address range */
pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
(pgaddr + MMU_PAGESIZE - 1 <=
dma_attr->dma_attr_addr_hi)) {
break;
}
/* out of range: unlock and try the next page */
page_unlock(pp);
pp = pp->p_next;
if (pp == first_pp)
pp = NULL;
}
if (pp != NULL) {
ASSERT(mtype == PP_2_MTYPE(pp));
ASSERT(pp->p_szc == 0);
/* take the page off the free list and fix up the counters */
page_sub(&PAGE_FREELISTS(mnode, szc, bin,
mtype), pp);
page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
if ((PP_ISFREE(pp) == 0) ||
(PP_ISAGED(pp) == 0)) {
cmn_err(CE_PANIC, "page %p is not free",
(void *)pp);
}
mutex_exit(pcm);
check_dma(dma_attr, pp, 1);
VM_STAT_ADD(pga_vmstats.pgma_allocok);
return (pp);
}
mutex_exit(pcm);
nextfreebin:
/* set up the bin walker the first time a bin comes up empty-handed */
if (plw_initialized == 0) {
page_list_walk_init(szc, 0, bin, 1, 0, &plw);
ASSERT(plw.plw_ceq_dif == page_colors);
plw_initialized = 1;
}
/* if the walker asks for it, try page_freelist_split() within the pfn range */
if (plw.plw_do_split) {
pp = page_freelist_split(szc, bin, mnode,
mtype,
mmu_btop(dma_attr->dma_attr_addr_lo),
mmu_btop(dma_attr->dma_attr_addr_hi + 1),
&plw);
if (pp != NULL) {
check_dma(dma_attr, pp, 1);
return (pp);
}
}
bin = page_list_walk_next_bin(szc, bin, &plw);
}
MTYPE_NEXT(mnode, mtype, flags);
} while (mtype >= 0);
/* pass 2: cache lists, restarting from the original mtype and bin */
mtype = mtypestart;
ASSERT(mtype >= 0);
bin = origbin;
do {
for (i = 0; i <= page_colors; i++) {
/* unlocked peek: skip empty bins without taking the mutex */
if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
goto nextcachebin;
pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
mutex_enter(pcm);
pp = PAGE_CACHELISTS(mnode, bin, mtype);
first_pp = pp;
while (pp != NULL) {
if (IS_DUMP_PAGE(pp) || page_trylock(pp,
SE_EXCL) == 0) {
pp = pp->p_next;
if (pp == first_pp)
pp = NULL;
continue;
}
ASSERT(pp->p_vnode);
ASSERT(PP_ISAGED(pp) == 0);
ASSERT(pp->p_szc == 0);
ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
/* same address range test as for the free lists */
pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
(pgaddr + MMU_PAGESIZE - 1 <=
dma_attr->dma_attr_addr_hi)) {
break;
}
page_unlock(pp);
pp = pp->p_next;
if (pp == first_pp)
pp = NULL;
}
if (pp != NULL) {
ASSERT(mtype == PP_2_MTYPE(pp));
ASSERT(pp->p_szc == 0);
/* take the page off the cache list; it still has its vnode identity */
page_sub(&PAGE_CACHELISTS(mnode, bin,
mtype), pp);
page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
mutex_exit(pcm);
ASSERT(pp->p_vnode);
ASSERT(PP_ISAGED(pp) == 0);
check_dma(dma_attr, pp, 1);
VM_STAT_ADD(pga_vmstats.pgma_allocok);
return (pp);
}
mutex_exit(pcm);
nextcachebin:
/* first step jumps by BIN_STEP, later steps by one; wrap with the mask */
bin += (i == 0) ? BIN_STEP : 1;
bin &= page_colors_mask;
}
MTYPE_NEXT(mnode, mtype, flags);
} while (mtype >= 0);
VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
return (NULL);
}
/*
 * Get a single page of any color from the free or cache lists, optionally
 * restricted to the address range in dma_attr.
 *
 * size:     must be MMU_PAGESIZE, otherwise NULL is returned.
 * dma_attr: NULL means no address restriction.  Otherwise the request is
 *           rejected (NULL) if dma_attr_align exceeds a page or the low
 *           address is above the high address.
 * lgrp:     locality group to start from; the home lgroup is used when
 *           LGRP_EXISTS() says the one given does not exist.
 *
 * Returns the page, or NULL if none was found.
 */
static page_t *
page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
{
uint_t bin;
int mtype;
page_t *pp;
int n;
int m;
int szc;
int fullrange;
int mnode;
int local_failed_stat = 0;
lgrp_mnode_cookie_t lgrp_cookie;
VM_STAT_ADD(pga_vmstats.pga_alloc);
/* only single base pages are handled here */
if (size != MMU_PAGESIZE)
return (NULL);
if (!LGRP_EXISTS(lgrp))
lgrp = lgrp_home_lgrp();
/*
 * NOTE(review): `seg' is not declared in this function; presumably the
 * AS_2_BIN macro does not expand that argument -- confirm.
 */
AS_2_BIN(as, seg, vp, vaddr, bin, 0);
if (dma_attr == NULL) {
/* no restriction: cover everything from mtype16m to mtypetop */
n = mtype16m;
m = mtypetop;
fullrange = 1;
VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
} else {
pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
/* alignment beyond a page cannot be honored here */
if (dma_attr->dma_attr_align > MMU_PAGESIZE)
return (NULL);
/* reject an inverted address range */
if (pfnlo > pfnhi)
return (NULL);
n = pfn_2_mtype(pfnlo);
m = pfn_2_mtype(pfnhi);
/* fullrange: the request spans whole mnoderanges[] entries */
fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
(pfnhi >= mnoderanges[m].mnr_pfnhi));
}
VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
szc = 0;
/* when the low end is mtype16m, set PGI_MT_RANGE0 and search only mtype m */
if (n == mtype16m) {
flags |= PGI_MT_RANGE0;
n = m;
}
/* iterate over the memory nodes chosen for this lgroup */
LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
/* start at the mtype of the high address and follow mnr_next until n */
mtype = m;
do {
if (fullrange != 0) {
/* whole ranges: the generic free/cache list allocators suffice */
pp = page_get_mnode_freelist(mnode,
bin, mtype, szc, flags);
if (pp == NULL) {
pp = page_get_mnode_cachelist(
bin, flags, mnode, mtype);
}
} else {
/* partial range: each candidate page's address must be checked */
pp = page_get_mnode_anylist(bin, szc,
flags, mnode, mtype, dma_attr);
}
if (pp != NULL) {
VM_STAT_ADD(pga_vmstats.pga_allocok);
check_dma(dma_attr, pp, 1);
return (pp);
}
} while (mtype != n &&
(mtype = mnoderanges[mtype].mnr_next) != -1);
/* record the allocation failure statistic once per call */
if (!local_failed_stat) {
lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
local_failed_stat = 1;
}
}
VM_STAT_ADD(pga_vmstats.pga_allocfailed);
return (NULL);
}
/*
 * Walk the page_hash[] chain at `index' (linked through p_hash) looking for
 * the page whose p_vnode and p_offset equal vp and off.  On completion `pp'
 * is the matching page, or NULL if the chain holds no match.
 */
#define PAGE_HASH_SEARCH(index, pp, vp, off) { \
for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
break; \
} \
}
/*
 * Non-hypervisor version: create the pages covering `bytes' bytes at
 * <vp, off> that satisfy the DMA attributes in mattr.
 *
 * flags: only PG_EXCL, PG_WAIT and PG_PHYSCONTIG are allowed (asserted).
 *        With PG_PHYSCONTIG, page_get_contigpage() is tried first and any
 *        pages it leaves outstanding are then collected individually.
 * mattr: DMA attributes; may be NULL, in which case failing to find a page
 *        panics.
 *
 * Returns the list of pages, each with its i/o lock held, or NULL on
 * failure (pages already collected are disposed of and the page_create
 * accounting is put back).
 */
page_t *
page_create_io(
struct vnode *vp,
u_offset_t off,
uint_t bytes,
uint_t flags,
struct as *as,
caddr_t vaddr,
ddi_dma_attr_t *mattr)
{
page_t *plist = NULL;
uint_t plist_len = 0;
pgcnt_t npages;
page_t *npp = NULL;
uint_t pages_req;
page_t *pp;
kmutex_t *phm = NULL;
uint_t index;
TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
"page_create_start:vp %p off %llx bytes %u flags %x",
vp, off, bytes, flags);
ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
pages_req = npages = mmu_btopr(bytes);
/* account for npages up front; fails if they are not available */
if (!page_create_wait(npages, flags)) {
return (NULL);
}
TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
"page_create_success:vp %p off %llx", vp, off);
/* memory is low: wake the pageout scanner */
if (nscan < desscan && freemem < minfree) {
TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
"pageout_cv_signal:freemem %ld", freemem);
WAKE_PAGEOUT_SCANNER(page__create__io);
}
if (flags & PG_PHYSCONTIG) {
/* npages is updated to the number of pages still outstanding */
plist = page_get_contigpage(&npages, mattr, 1);
if (plist == NULL) {
page_create_putback(npages);
return (NULL);
}
/* give every contiguous page its <vp, off> identity */
pp = plist;
do {
if (!page_hashin(pp, vp, off, NULL)) {
panic("pg_creat_io: hashin failed %p %p %llx",
(void *)pp, (void *)vp, off);
}
VM_STAT_ADD(page_create_new);
off += MMU_PAGESIZE;
PP_CLRFREE(pp);
PP_CLRAGED(pp);
page_set_props(pp, P_REF);
pp = pp->p_next;
} while (pp != plist);
if (!npages) {
check_dma(mattr, plist, pages_req);
return (plist);
} else {
/* skip vaddr past the pages already obtained, then fall through */
vaddr += (pages_req - npages) << MMU_PAGESHIFT;
}
}
/*
 * Collect the remaining pages one at a time.  The page is taken from the
 * free/cache lists before the hash mutex is acquired; if it turns out not
 * to be needed it is put back at `fail'.
 */
while (npages--) {
phm = NULL;
index = PAGE_HASH_FUNC(vp, off);
top:
ASSERT(phm == NULL);
ASSERT(index == PAGE_HASH_FUNC(vp, off));
ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
if (npp == NULL) {
npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
flags & ~PG_MATCH_COLOR, mattr, NULL);
if (npp == NULL) {
/* with no DMA restriction there is no excuse for finding nothing */
if (mattr == NULL) {
panic("no page found %d", (int)npages);
}
/* restricted request with PG_WAIT: pause and retry */
if ((mattr != NULL) && (flags & PG_WAIT)) {
delay(10);
goto top;
}
goto fail;
}
/* a non-aged page still has an old identity: remove it */
if (PP_ISAGED(npp) == 0) {
page_hashout(npp, (kmutex_t *)NULL);
}
}
ASSERT(PAGE_EXCL(npp));
ASSERT(npp->p_vnode == NULL);
ASSERT(!hat_page_is_mapped(npp));
PP_CLRFREE(npp);
PP_CLRAGED(npp);
/* under the hash mutex, verify that <vp, off> does not already exist */
phm = PAGE_HASH_MUTEX(index);
mutex_enter(phm);
PAGE_HASH_SEARCH(index, pp, vp, off);
if (pp == NULL) {
VM_STAT_ADD(page_create_new);
pp = npp;
npp = NULL;
if (!page_hashin(pp, vp, off, phm)) {
ASSERT(MUTEX_HELD(phm));
panic("page_create: hashin fail %p %p %llx %p",
(void *)pp, (void *)vp, off, (void *)phm);
}
ASSERT(MUTEX_HELD(phm));
mutex_exit(phm);
phm = NULL;
page_set_props(pp, P_REF);
} else {
/* a page with this identity already exists: give up */
ASSERT(MUTEX_HELD(phm));
mutex_exit(phm);
phm = NULL;
ASSERT(!VN_ISKAS(vp));
if (VN_ISKAS(vp))
cmn_err(CE_NOTE,
"page_create: page not expected "
"in hash list for kernel vnode - pp 0x%p",
(void *)pp);
VM_STAT_ADD(page_create_exists);
goto fail;
}
/* take the i/o lock, link the page in, and advance the list head */
page_io_lock(pp);
page_add(&plist, pp);
plist = plist->p_next;
off += MMU_PAGESIZE;
vaddr += MMU_PAGESIZE;
}
check_dma(mattr, plist, pages_req);
return (plist);
fail:
if (npp != NULL) {
/* an unused page is still in hand: return it to the free list tail */
VM_STAT_ADD(page_create_putbacks);
PP_SETFREE(npp);
PP_SETAGED(npp);
npp->p_offset = (u_offset_t)-1;
page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
page_unlock(npp);
}
/* dispose of the pages already collected, counting them in plist_len */
while (plist != NULL) {
pp = plist;
page_sub(&plist, pp);
page_io_unlock(pp);
plist_len++;
VN_DISPOSE(pp, B_INVAL, 0, kcred);
}
VM_STAT_ADD(page_create_putbacks);
/* put back the accounting for the pages not disposed of above */
page_create_putback(pages_req - plist_len);
return (NULL);
}
#endif
/*
 * Copy the contents of one page to another.
 *
 * frompp: source page (must be locked).
 * topp:   destination page (must be locked).
 *
 * With kpm_enable set, both pages are accessed through their kpm addresses.
 * Otherwise the current CPU's two private mapping windows (cpu_caddr1 and
 * cpu_caddr2) are remapped onto the pages while that CPU's
 * cpu_ppaddr_mutex is held.
 *
 * Returns 1 on success, 0 if a fault was caught (via on_fault()) during the
 * copy.
 */
int
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t		pp_addr1;
	caddr_t		pp_addr2;
	hat_mempte_t	pte1;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex = NULL;
	label_t		ljb;
	int		ret;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = CPU->cpu_caddr1pte;
		pte2 = CPU->cpu_caddr2pte;

		/*
		 * Capture the mutex that belongs to the same CPU as the
		 * windows read above, and use that one pointer for both
		 * enter and exit (as pfnzero() does).  Evaluating CPU again
		 * at exit time could name a different CPU's mutex if this
		 * thread blocked in mutex_enter() and resumed elsewhere.
		 */
		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	/* a fault during the copy lands here with a non-zero return */
	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	ret = 1;

	if (use_sse_pagecopy) {
#ifdef __xpv
		page_copy_no_xmm(pp_addr2, pp_addr1);
#else
		hwblkpagecopy(pp_addr1, pp_addr2);
#endif
	} else {
		bcopy(pp_addr1, pp_addr2, PAGESIZE);
	}

	no_fault();
faulted:
	if (!kpm_enable) {
#ifdef __xpv
		/* tear down both window mappings before releasing them */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}
	kpreempt_enable();
	return (ret);
}
/*
 * Zero `len' bytes starting at byte offset `off' within the page pp.  The
 * page must be locked; the work is done by pfnzero() on the page's frame
 * number.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	pfn_t target;

	ASSERT(PAGE_LOCKED(pp));

	target = page_pptonum(pp);
	pfnzero(target, off, len);
}
/*
 * Zero `len' bytes starting at byte offset `off' within the page frame pfn.
 * off, len and off + len must each be at most MMU_PAGESIZE (asserted).
 *
 * When kpm is enabled and the frame is not foreign, the frame is accessed
 * through its kpm address.  Otherwise the current CPU's cpu_caddr2 window
 * is remapped onto the frame while that CPU's cpu_ppaddr_mutex is held.
 */
void
pfnzero(pfn_t pfn, uint_t off, uint_t len)
{
	caddr_t		pp_addr2;
	hat_mempte_t	pte2;
	kmutex_t	*ppaddr_mutex = NULL;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);

	if (kpm_enable && !pfn_is_foreign(pfn)) {
		pp_addr2 = hat_kpm_pfn2va(pfn);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = CPU->cpu_caddr2pte;

		/* remember which CPU's mutex was taken so the exit matches */
		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(pfn, pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero) {
#ifdef __xpv
		uint_t rem;

		/*
		 * Zero a byte at a time until off is aligned for
		 * block_zero_no_xmm().  P2NPHASE() is 0 once off is aligned,
		 * so the loop runs only while it is non-zero.  len is
		 * decremented inside the body so that a zero len is never
		 * wrapped around.
		 */
		while (P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) != 0 &&
		    len > 0) {
			pp_addr2[off++] = 0;
			len--;
		}

		/*
		 * Hand the aligned, block-multiple middle part to
		 * block_zero_no_xmm().
		 */
		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
		len -= rem;
		if (len != 0) {
			block_zero_no_xmm(pp_addr2 + off, len);
			off += len;
		}

		/* zero whatever is left with byte stores */
		while (rem-- > 0)
			pp_addr2[off++] = 0;
#else
		hwblkclr(pp_addr2 + off, len);
#endif
	} else {
		bzero(pp_addr2 + off, len);
	}

	if (!kpm_enable || pfn_is_foreign(pfn)) {
#ifdef __xpv
		/* remove the window mapping before releasing the window */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}

	kpreempt_enable();
}
/*
 * Scrub part of a page.  In this implementation scrubbing is the same as
 * zeroing: the request is forwarded unchanged to pagezero().
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	uint_t start = off;
	uint_t nbytes = len;

	pagezero(pp, start, nbytes);
}
/*
 * Prepare the per-CPU state used by ppcopy() and pfnzero(): two one-page
 * virtual address windows allocated from heap_arena, the handle
 * hat_mempte_setup() returns for each, and the mutex that serializes their
 * use.
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *window1, *window2;
	hat_mempte_t window1_pte, window2_pte;

	/* first window */
	window1 = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	window1_pte = hat_mempte_setup(window1);
	cpup->cpu_caddr1 = window1;
	cpup->cpu_caddr1pte = window1_pte;

	/* second window */
	window2 = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	window2_pte = hat_mempte_setup(window2);
	cpup->cpu_caddr2 = window2;
	cpup->cpu_caddr2pte = window2_pte;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}
/*
 * Undo setup_vaddr_for_ppcopy(): destroy the window mutex, then for each
 * window (second one first) release its mempte handle, free its page of
 * virtual address space back to heap_arena and clear the cpu fields.
 */
void
teardown_vaddr_for_ppcopy(struct cpu *cpup)
{
	mutex_destroy(&cpup->cpu_ppaddr_mutex);

	/* second window */
	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
	cpup->cpu_caddr2pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
	cpup->cpu_caddr2 = 0;

	/* first window */
	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
	cpup->cpu_caddr1pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
	cpup->cpu_caddr1 = 0;
}
/*
 * Flush the entire data cache.  This implementation has nothing to do and
 * returns immediately.
 */
void
dcache_flushall()
{
	/* intentionally empty */
}
/*
 * Allocate a single physical page, named in the kernel vnode (kvp) at an
 * offset derived from `seed'.
 *
 * seed: value used to build a page offset: kernelbase is subtracted when
 *       the seed is above it, the result is shifted left by MMU_PAGESHIFT
 *       and mmu.hole_start is added.
 *
 * One page is reserved with page_resv(..., KM_NOSLEEP) before the page is
 * created.  On success the page is returned with its i/o lock released and
 * its lock downgraded via page_downgrade(); the reservation stays with the
 * page.  Returns NULL if either the reservation or the page creation fails;
 * in the latter case the reservation is given back.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
	offset += mmu.hole_start;

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", (void *)pp);
#endif

	/* ctr advances a page per call so each request uses a different vaddr */
	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));
	if (pp == NULL) {
		/*
		 * No page was created, so nothing will ever release the
		 * reservation taken above; drop it here to avoid leaking it.
		 */
		page_unresv(1);
		return (NULL);
	}

	page_io_unlock(pp);
	page_downgrade(pp);
	return (pp);
}