#include <sys/t_lock.h>
#include <sys/memlist.h>
#include <sys/cpuvar.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vm_machparam.h>
#include <sys/tss.h>
#include <sys/vnode.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/hat_i86.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/sunddi.h>
#include <sys/ddidmareq.h>
#include <sys/controlregs.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#endif
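/*
 * ON_USER_HAT() is true when this CPU is currently running on a user
 * process's hat rather than the kernel's (kas.a_hat).
 */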
#define	ON_USER_HAT(cpu) \
	((cpu)->cpu_m.mcpu_current_hat != NULL && \
	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
uint_t khat_running = 0;	/* set at end of hat_kern_setup() */
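
/*
 * Translate a virtual address to a physical page number while the boot
 * loader's page tables are still in charge. Once khat_running is set, the
 * kernel HAT owns the page tables and kbm_probe() may no longer be used,
 * hence the panic below.
 */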
pfn_t
va_to_pfn(void *vaddr)
{
	uintptr_t des_va = ALIGN2PAGE(vaddr);
	uintptr_t va = des_va;
	size_t len;
	uint_t prot;
	pfn_t pfn;

	if (khat_running)
		panic("va_to_pfn(): called too late\n");

	if (kbm_probe(&va, &len, &pfn, &prot) == 0)
		return (PFN_INVALID);

	/*
	 * kbm_probe() finds the first mapping at or above the desired VA;
	 * if it starts beyond des_va there is no mapping for des_va itself.
	 */
	if (va > des_va)
		return (PFN_INVALID);
	if (va < des_va)
		pfn += mmu_btop(des_va - va);
	return (pfn);
}
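
/*
 * Initialize a special area in the kernel that always holds some PTEs for
 * faster performance in segmap. The window into the page tables backing
 * segmap is kept permanently mapped at mmu.kmap_ptes.
 */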
void
hat_kmap_init(uintptr_t base, size_t len)
{
	uintptr_t map_addr;	/* base rounded down to large page size */
	uintptr_t map_eaddr;	/* base + len rounded up */
	size_t map_len;
	caddr_t ptes;		/* mapping area for page table entries */
	size_t window_size;	/* size of mapping area for ptes */
	ulong_t htable_cnt;	/* # of page tables to cover map_len */
	ulong_t i;
	htable_t *ht;
	uintptr_t va;

	/*
	 * We have to map in an area that matches an entire page table.
	 * The PTEs are large page aligned to avoid spurious pagefaults
	 * on the hypervisor.
	 */
	map_addr = base & LEVEL_MASK(1);
	map_eaddr = (base + len + LEVEL_SIZE(1) - 1) & LEVEL_MASK(1);
	map_len = map_eaddr - map_addr;
	window_size = mmu_btop(map_len) * mmu.pte_size;
	window_size = (window_size + LEVEL_SIZE(1)) & LEVEL_MASK(1);
	htable_cnt = map_len >> LEVEL_SHIFT(1);

	/*
	 * Allocate VA for the window of PTEs and the array of htable
	 * pointers that back it.
	 */
	ptes = vmem_xalloc(heap_arena, window_size, LEVEL_SIZE(1), 0,
	    0, NULL, NULL, VM_SLEEP);
	mmu.kmap_htables =
	    kmem_alloc(htable_cnt * sizeof (htable_t *), KM_SLEEP);

	/*
	 * Map the page tables that cover kmap into the allocated range.
	 * Note we don't ever htable_release() the kmap page tables - they
	 * can't ever be stolen or freed.
	 */
	for (va = map_addr, i = 0; i < htable_cnt; va += LEVEL_SIZE(1), ++i) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		if (ht == NULL)
			panic("hat_kmap_init: ht == NULL");
		mmu.kmap_htables[i] = ht;

		hat_devload(kas.a_hat, ptes + i * MMU_PAGESIZE,
		    MMU_PAGESIZE, ht->ht_pfn,
#ifdef __xpv
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
#else
		    PROT_READ | PROT_WRITE | HAT_NOSYNC | HAT_UNORDERED_OK,
#endif
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}

	/*
	 * Set information in mmu to activate handling of kmap.
	 */
	mmu.kmap_addr = map_addr;
	mmu.kmap_eaddr = map_eaddr;
	mmu.kmap_ptes = (x86pte_t *)ptes;
}
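
/*
 * Defined in the startup code: base VA and size of the kernel's direct
 * physical mapping (kpm) segment.
 */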
extern caddr_t	kpm_vbase;
extern size_t	kpm_size;
#ifdef __xpv
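/*
 * Under the hypervisor the kpm area is created read-only at first, since
 * pages that are in use as page tables may not be mapped writable. The
 * mappings are selectively upgraded to writable later, in
 * xen_kpm_finish_init().
 */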
static void
xen_kpm_create(paddr_t paddr, level_t lvl)
{
	ulong_t pg_off;

	for (pg_off = 0; pg_off < LEVEL_SIZE(lvl); pg_off += MMU_PAGESIZE) {
		/*
		 * The initial kbm_map() of PA 0 primes the intermediate
		 * page tables so that kbm_read_only() has a PTE to change
		 * when it installs the real read-only translation.
		 */
		kbm_map((uintptr_t)kpm_vbase + paddr, (paddr_t)0, 0, 1);
		kbm_read_only((uintptr_t)kpm_vbase + paddr + pg_off,
		    paddr + pg_off);
	}
}
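
/*
 * Walk all pfns and make the kpm mapping for each page writable, except
 * for the GDT's page and any page flagged as a page table (p_index is
 * used as that hint and is cleared here).
 */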
static void
xen_kpm_finish_init(void)
{
	pfn_t gdtpfn = mmu_btop(CPU->cpu_m.mcpu_gdtpa);
	pfn_t pfn;
	page_t *pp;

	for (pfn = 0; pfn < mfn_count; ++pfn) {
		/*
		 * skip gdt
		 */
		if (pfn == gdtpfn)
			continue;

		/*
		 * p_index is a hint that this is a pagetable
		 */
		pp = page_numtopp_nolock(pfn);
		if (pp && pp->p_index) {
			pp->p_index = 0;
			continue;
		}
		(void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);
	}
}
#endif
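
/*
 * Routine to pre-allocate data structures for the kernel HAT. It computes
 * how many page tables and mapping entries will be needed by walking the
 * boot loader's page tables, then reserves them before the real HAT takes
 * over in hat_kern_setup().
 */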
void
hat_kern_alloc(
	caddr_t segmap_base,
	size_t segmap_size,
	caddr_t ekernelheap)
{
	uintptr_t last_va = (uintptr_t)-1;	/* catch 1st time */
	uintptr_t va = 0;
	size_t size;
	pfn_t pfn;
	uint_t prot;
	uint_t table_cnt = 1;
	uint_t mapping_cnt;
	level_t start_level;
	level_t l;
	struct memlist *pmem;
	level_t lpagel = mmu.max_page_level;
	uint64_t paddr;
	int64_t psize;
	int nwindows;

	if (kpm_size > 0) {
		/*
		 * Create the kpm page tables. When running on the
		 * hypervisor these are made read-only at first; write
		 * permission is added later where possible.
		 */
		for (pmem = phys_install; pmem; pmem = pmem->ml_next) {
			paddr = pmem->ml_address;
			psize = pmem->ml_size;
			while (psize >= MMU_PAGESIZE) {
				/* find the largest page size */
				for (l = lpagel; l > 0; l--) {
					if ((paddr & LEVEL_OFFSET(l)) == 0 &&
					    psize > LEVEL_SIZE(l))
						break;
				}

#if defined(__xpv)
				xen_kpm_create(paddr, l);
#else
				kbm_map((uintptr_t)kpm_vbase + paddr, paddr,
				    l, 1);
#endif
				paddr += LEVEL_SIZE(l);
				psize -= LEVEL_SIZE(l);
			}
		}
	}

	/*
	 * If this machine doesn't have a kpm segment, we need to allocate
	 * a small number of 'windows' which can be used to map page tables.
	 */
	nwindows = (kpm_size == 0) ? 2 * NCPU : 0;

#if defined(__xpv)
	/*
	 * On a hypervisor, we need at least one window per page table
	 * level for the xpv panic code.
	 */
	nwindows = MAX(nwindows, mmu.max_level);
#endif

	if (nwindows != 0) {
		/*
		 * Create the page windows and 1 page of VA in which we
		 * map the PTEs of those windows.
		 */
		mmu.pwin_base = vmem_xalloc(heap_arena, nwindows * MMU_PAGESIZE,
		    LEVEL_SIZE(1), 0, 0, NULL, NULL, VM_SLEEP);
		ASSERT(nwindows <= MMU_PAGESIZE / mmu.pte_size);
		mmu.pwin_pte_va = vmem_xalloc(heap_arena, MMU_PAGESIZE,
		    MMU_PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);

		/*
		 * Find/Create the page table window mappings.
		 */
		paddr = 0;
		(void) find_pte((uintptr_t)mmu.pwin_base, &paddr, 0, 0);
		ASSERT(paddr != 0);
		ASSERT((paddr & MMU_PAGEOFFSET) == 0);
		mmu.pwin_pte_pa = paddr;
#ifdef __xpv
		(void) find_pte((uintptr_t)mmu.pwin_pte_va, NULL, 0, 0);
		kbm_read_only((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa);
#else
		kbm_map((uintptr_t)mmu.pwin_pte_va, mmu.pwin_pte_pa, 0, 1);
#endif
	}

	/*
	 * Walk the boot loader's page tables and figure out how many
	 * page tables and mappings will be needed.
	 */
	while (kbm_probe(&va, &size, &pfn, &prot) != 0) {
		/*
		 * The size of the mapping determines its level.
		 */
		start_level = 0;
		while (start_level <= mmu.max_page_level) {
			if (size == LEVEL_SIZE(start_level))
				break;
			start_level++;
		}

		/*
		 * At each level, if this VA falls into a new page table
		 * relative to the last one seen, count another table.
		 * We can stop at the first level where they share a table.
		 */
		for (l = start_level; l < mmu.max_level; ++l) {
			if (va >> LEVEL_SHIFT(l + 1) ==
			    last_va >> LEVEL_SHIFT(l + 1))
				break;
			++table_cnt;
		}
		last_va = va;
		l = (start_level == 0) ? 1 : start_level;
		va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
	}

	/*
	 * Besides the boot loader mappings, we're going to fill in the
	 * entire top level page table for the kernel. Make sure there's
	 * enough reserve for that too.
	 */
	table_cnt += mmu.top_level_count - ((kernelbase >>
	    LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));

	/*
	 * Add 1/4 more into table_cnt for extra slop. Any unused reserve
	 * is returned later by htable_adjust_reserve().
	 */
	table_cnt += table_cnt >> 2;

	/*
	 * We only need mapping entries (hments) for shared pages, which
	 * should be far fewer than the total possible. Allocate enough
	 * for 1/16 of all possible PTEs.
	 */
	mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;

	/*
	 * Now create the initial htable and hment reserves.
	 */
	htable_initial_reserve(table_cnt);
	hment_reserve(mapping_cnt);
	x86pte_cpu_init(CPU);
}
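
/*
 * This routine handles the work of creating the kernel's initial mappings
 * by deciphering the mappings in the page tables created by the boot
 * program.
 *
 * We maintain large page mappings, but only to a level 1 pagesize.
 * The boot loader can only add new mappings once this function starts.
 * In particular it can not change the pagesize used for any existing
 * mappings or this code breaks!
 */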
void
hat_kern_setup(void)
{
	/*
	 * Attach htables to the existing page tables.
	 */
	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
#ifdef __xpv
	    mmu_btop(xen_info->pt_base - ONE_GIG));
#else
	    mmu_btop(getcr3_pa()));
#endif

#if defined(__xpv)
	/*
	 * Try to make the kpm mappings r/w. Failures here are OK, as
	 * it's probably just a page table.
	 */
	xen_kpm_finish_init();
#endif

	/*
	 * The kernel HAT is now officially open for business.
	 */
	khat_running = 1;

	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
	CPU->cpu_current_hat = kas.a_hat;
}
#ifndef __xpv
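
/*
 * Use the invpcid instruction if available; otherwise emulate its effect.
 * Note that the INVPCID_ALL* variants can be emulated even when PCIDs are
 * not enabled, but INVPCID_ADDR cannot.
 */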
static void
invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
{
	ulong_t flag;
	uint64_t cr4;

	if (x86_use_invpcid == 1) {
		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
		invpcid_insn(type, pcid, addr);
		return;
	}

	switch (type) {
	case INVPCID_ALL_GLOBAL:
		/*
		 * Toggling CR4.PGE flushes the entire TLB, including
		 * global entries, for all PCIDs.
		 */
		flag = intr_clear();
		cr4 = getcr4();
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4 | CR4_PGE);
		intr_restore(flag);
		break;

	case INVPCID_ALL_NONGLOBAL:
		/*
		 * Without PCIDs enabled, reloading %cr3 flushes all
		 * non-global entries. With PCIDs enabled, that would only
		 * flush the current PCID, so do a full flush instead.
		 */
		if (!(getcr4() & CR4_PCIDE)) {
			reload_cr3();
		} else {
			flag = intr_clear();
			cr4 = getcr4();
			setcr4(cr4 & ~(ulong_t)CR4_PGE);
			setcr4(cr4 | CR4_PGE);
			intr_restore(flag);
		}
		break;

	case INVPCID_ADDR:
		if (pcid == PCID_USER) {
			flag = intr_clear();
			ASSERT(addr < kernelbase);
			ASSERT(ON_USER_HAT(CPU));
			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
			/*
			 * Invalidating a user mapping requires switching
			 * to the user %cr3 via the KPTI trampoline code.
			 */
			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
			intr_restore(flag);
		} else {
			mmu_invlpg((caddr_t)addr);
		}
		break;

	default:
		panic("unsupported invpcid(%lu)", type);
		break;
	}
}
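
/*
 * Flush one kernel mapping.
 *
 * We want to assert on kernel space here mainly for reasoning about the
 * PCIDE case: this flush should never need to flush a non-current PCID
 * mapping. This presumes we never have reason to flush the kernel regions
 * available to PCID_USER (the trampolines and so on), and that
 * PCID_KERNEL == PCID_NONE.
 */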
void
mmu_flush_tlb_kpage(uintptr_t va)
{
	ASSERT(va >= kernelbase);
	ASSERT(getpcid() == PCID_KERNEL);

	mmu_invlpg((caddr_t)va);
}
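
/*
 * Flush one mapping on the local CPU.
 *
 * If this is a user address and PCIDs are enabled, we need two
 * invalidations: one for any potentially stale PCID_USER mapping, and one
 * for any mapping established while in the kernel under PCID_KERNEL.
 */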
void
mmu_flush_tlb_page(uintptr_t va)
{
	ASSERT(getpcid() == PCID_KERNEL);

	if (va >= kernelbase) {
		mmu_flush_tlb_kpage(va);
		return;
	}

	if (!(getcr4() & CR4_PCIDE)) {
		mmu_invlpg((caddr_t)va);
		return;
	}

	if (ON_USER_HAT(CPU))
		invpcid(INVPCID_ADDR, PCID_USER, va);
	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
}
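
/*
 * Flush a range of mappings on the local CPU, one page at a time.
 */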
static void
mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
{
	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
	ASSERT(len > 0);
	ASSERT(pgsz != 0);

	/*
	 * If PCIDs are not enabled, or we have the real invpcid
	 * instruction, flushing page by page is cheap enough.
	 */
	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
			mmu_flush_tlb_page(va);
		return;
	}

	/*
	 * As an emulated invpcid() in the PCIDE case requires jumping
	 * %cr3s, we batch the invalidations. We should only need to flush
	 * the user range if we're on a user-space HAT.
	 */
	if (addr < kernelbase && ON_USER_HAT(CPU)) {
		ulong_t flag = intr_clear();

		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
		tr_mmu_flush_user_range(addr, len, pgsz,
		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
		intr_restore(flag);
	}

	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
		mmu_invlpg((caddr_t)va);
}
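
/*
 * MMU TLB (and PT cache) flushing on this CPU.
 *
 * FLUSH_TLB_ALL: invalidate everything, all PCIDs, including PT_GLOBAL.
 * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL.
 * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 * mappings as appropriate.
 */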
void
mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
{
	ASSERT(getpcid() == PCID_KERNEL);

	switch (type) {
	case FLUSH_TLB_ALL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
		break;

	case FLUSH_TLB_NONGLOBAL:
		ASSERT(range == NULL);
		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
		break;

	case FLUSH_TLB_RANGE:
		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
		    LEVEL_SIZE(range->tr_level));
		break;

	default:
		panic("invalid call mmu_flush_tlb(%d)", type);
		break;
	}
}
#endif	/* !__xpv */