usr/src/uts/sun4u/os/ppage.c

root/usr/src/uts/sun4u/os/ppage.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/t_lock.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/cpu.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
#include <vm/as.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat_sfmmu.h>
#include <sys/debug.h>
#include <sys/cpu_module.h>
#include <sys/mem_cage.h>

/*
 * A quick way to generate a cache consistent address to map in a page.
 * users: ppcopy, pagezero, /proc, dev/mem
 *
 * The ppmapin/ppmapout routines provide a quick way of generating a cache
 * consistent address by reserving a given amount of kernel address space.
 * The base is PPMAPBASE and its size is PPMAPSIZE.  This memory is divided
 * into x number of sets, where x is the number of colors for the virtual
 * cache. The number of colors is how many times a page can be mapped
 * simulatenously in the cache.  For direct map caches this translates to
 * the number of pages in the cache.
 * Each set will be assigned a group of virtual pages from the reserved memory
 * depending on its virtual color.
 * When trying to assign a virtual address we will find out the color for the
 * physical page in question (if applicable).  Then we will try to find an
 * available virtual page from the set of the appropiate color.
 */

#define clsettoarray(color, set) ((color * nsets) + set)

int pp_slots = 4;               /* small default, tuned by cpu module */

/* tuned by cpu module, default is "safe" */
int pp_consistent_coloring = PPAGE_STORES_POLLUTE | PPAGE_LOADS_POLLUTE;

static caddr_t  ppmap_vaddrs[PPMAPSIZE / MMU_PAGESIZE];
static int      nsets;                  /* number of sets */
static int      ppmap_pages;            /* generate align mask */
static int      ppmap_shift;            /* set selector */

#ifdef PPDEBUG
#define         MAXCOLORS       16      /* for debug only */
static int      ppalloc_noslot = 0;     /* # of allocations from kernelmap */
static int      align_hits[MAXCOLORS];
static int      pp_allocs;              /* # of ppmapin requests */
#endif /* PPDEBUG */

/*
 * There are only 64 TLB entries on spitfire, 16 on cheetah
 * (fully-associative TLB) so we allow the cpu module to tune the
 * number to use here via pp_slots.
 */
static struct ppmap_va {
        caddr_t ppmap_slots[MAXPP_SLOTS];
} ppmap_va[NCPU];

void
ppmapinit(void)
{
        int color, nset, setsize;
        caddr_t va;

        ASSERT(pp_slots <= MAXPP_SLOTS);

        va = (caddr_t)PPMAPBASE;
        if (cache & CACHE_VAC) {
                int a;

                ppmap_pages = mmu_btop(shm_alignment);
                nsets = PPMAPSIZE / shm_alignment;
                setsize = shm_alignment;
                ppmap_shift = MMU_PAGESHIFT;
                a = ppmap_pages;
                while (a >>= 1)
                        ppmap_shift++;
        } else {
                /*
                 * If we do not have a virtual indexed cache we simply
                 * have only one set containing all pages.
                 */
                ppmap_pages = 1;
                nsets = mmu_btop(PPMAPSIZE);
                setsize = MMU_PAGESIZE;
                ppmap_shift = MMU_PAGESHIFT;
        }
        for (color = 0; color < ppmap_pages; color++) {
                for (nset = 0; nset < nsets; nset++) {
                        ppmap_vaddrs[clsettoarray(color, nset)] =
                            (caddr_t)((uintptr_t)va + (nset * setsize));
                }
                va += MMU_PAGESIZE;
        }
}

/*
 * Allocate a cache consistent virtual address to map a page, pp,
 * with protection, vprot; and map it in the MMU, using the most
 * efficient means possible.  The argument avoid is a virtual address
 * hint which when masked yields an offset into a virtual cache
 * that should be avoided when allocating an address to map in a
 * page.  An avoid arg of -1 means you don't care, for instance pagezero.
 *
 * machine dependent, depends on virtual address space layout,
 * understands that all kernel addresses have bit 31 set.
 *
 * NOTE: For sun4 platforms the meaning of the hint argument is opposite from
 * that found in other architectures.  In other architectures the hint
 * (called avoid) was used to ask ppmapin to NOT use the specified cache color.
 * This was used to avoid virtual cache trashing in the bcopy.  Unfortunately
 * in the case of a COW,  this later on caused a cache aliasing conflict.  In
 * sun4, the bcopy routine uses the block ld/st instructions so we don't have
 * to worry about virtual cache trashing.  Actually, by using the hint to choose
 * the right color we can almost guarantee a cache conflict will not occur.
 */

caddr_t
ppmapin(page_t *pp, uint_t vprot, caddr_t hint)
{
        int color, nset, index, start;
        caddr_t va;

#ifdef PPDEBUG
        pp_allocs++;
#endif /* PPDEBUG */
        if (cache & CACHE_VAC) {
                color = sfmmu_get_ppvcolor(pp);
                if (color == -1) {
                        if ((intptr_t)hint != -1L) {
                                color = addr_to_vcolor(hint);
                        } else {
                                color = addr_to_vcolor(mmu_ptob(pp->p_pagenum));
                        }
                }

        } else {
                /*
                 * For physical caches, we can pick any address we want.
                 */
                color = 0;
        }

        start = color;
        do {
                for (nset = 0; nset < nsets; nset++) {
                        index = clsettoarray(color, nset);
                        va = ppmap_vaddrs[index];
                        if (va != NULL) {
#ifdef PPDEBUG
                                align_hits[color]++;
#endif /* PPDEBUG */
                                if (atomic_cas_ptr(&ppmap_vaddrs[index],
                                    va, NULL) == va) {
                                        hat_memload(kas.a_hat, va, pp,
                                            vprot | HAT_NOSYNC,
                                            HAT_LOAD_LOCK);
                                        return (va);
                                }
                        }
                }
                /*
                 * first pick didn't succeed, try another
                 */
                if (++color == ppmap_pages)
                        color = 0;
        } while (color != start);

#ifdef PPDEBUG
        ppalloc_noslot++;
#endif /* PPDEBUG */

        /*
         * No free slots; get a random one from the kernel heap area.
         */
        va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

        hat_memload(kas.a_hat, va, pp, vprot | HAT_NOSYNC, HAT_LOAD_LOCK);

        return (va);

}

void
ppmapout(caddr_t va)
{
        int color, nset, index;

        if (va >= kernelheap && va < ekernelheap) {
                /*
                 * Space came from kernelmap, flush the page and
                 * return the space.
                 */
                hat_unload(kas.a_hat, va, PAGESIZE,
                    (HAT_UNLOAD_NOSYNC | HAT_UNLOAD_UNLOCK));
                vmem_free(heap_arena, va, PAGESIZE);
        } else {
                /*
                 * Space came from ppmap_vaddrs[], give it back.
                 */
                color = addr_to_vcolor(va);
                ASSERT((cache & CACHE_VAC)? (color < ppmap_pages) : 1);

                nset = ((uintptr_t)va >> ppmap_shift) & (nsets - 1);
                index = clsettoarray(color, nset);
                hat_unload(kas.a_hat, va, PAGESIZE,
                    (HAT_UNLOAD_NOSYNC | HAT_UNLOAD_UNLOCK));

                ASSERT(ppmap_vaddrs[index] == NULL);
                ppmap_vaddrs[index] = va;
        }
}

#ifdef DEBUG
#define PP_STAT_ADD(stat)       (stat)++
uint_t pload, ploadfail;
uint_t ppzero, ppzero_short;
#else
#define PP_STAT_ADD(stat)
#endif /* DEBUG */

/*
 * Find a slot in per CPU page copy area. Load up a locked TLB in the
 * running cpu. We don't call hat layer to load up the tte since the
 * mapping is only temporary. If the thread migrates it'll get a TLB
 * miss trap and TLB/TSB miss handler will panic since there is no
 * official hat record of this mapping.
 */
static caddr_t
pp_load_tlb(processorid_t cpu, caddr_t **pslot, page_t *pp, uint_t prot)
{
        struct ppmap_va *ppmap;
        tte_t           tte;
        caddr_t         *myslot;
        caddr_t         va;
        long            i, start, stride;
        int             vcolor;
        uint_t          flags, strict_flag;

        PP_STAT_ADD(pload);

        ppmap = &ppmap_va[cpu];
        va = (caddr_t)(PPMAP_FAST_BASE + (MMU_PAGESIZE * MAXPP_SLOTS) * cpu);
        myslot = ppmap->ppmap_slots;
        ASSERT(addr_to_vcolor(va) == 0);

        if (prot & TTE_HWWR_INT) {
                flags = PPAGE_STORE_VCOLORING | PPAGE_STORES_POLLUTE;
                strict_flag = PPAGE_STORES_POLLUTE;
        } else {
                flags = PPAGE_LOAD_VCOLORING | PPAGE_LOADS_POLLUTE;
                strict_flag = PPAGE_LOADS_POLLUTE;
        }

        /*
         * If consistent handling is required then keep the current
         * vcolor of the page.  Furthermore, if loads or stores can
         * pollute the VAC then using a "new" page (unassigned vcolor)
         * won't work and we have to return a failure.
         */
        if (pp_consistent_coloring & flags) {
                vcolor = sfmmu_get_ppvcolor(pp);
                if ((vcolor == -1) &&
                    (pp_consistent_coloring & strict_flag))
                        return (NULL);
                /* else keep the current vcolor of the page */
        } else {
                vcolor = -1;
        }

        if (vcolor != -1) {
                va += MMU_PAGESIZE * vcolor;
                start = vcolor;
                stride = ppmap_pages; /* number of colors */
                myslot += vcolor;
        } else {
                start = 0;
                stride = 1;
        }

        for (i = start; i < pp_slots; i += stride) {
                if (*myslot == NULL) {
                        if (atomic_cas_ptr(myslot, NULL, va) == NULL)
                                break;
                }
                myslot += stride;
                va += MMU_PAGESIZE * stride;
        }

        if (i >= pp_slots) {
                PP_STAT_ADD(ploadfail);
                return (NULL);
        }

        ASSERT(vcolor == -1 || addr_to_vcolor(va) == vcolor);

        /*
         * Now we have a slot we can use, make the tte.
         */
        tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(pp->p_pagenum);
        tte.tte_intlo = TTE_PFN_INTLO(pp->p_pagenum) | TTE_CP_INT |
            TTE_CV_INT | TTE_PRIV_INT | TTE_LCK_INT | prot;

        ASSERT(CPU->cpu_id == cpu);
        sfmmu_dtlb_ld_kva(va, &tte);

        *pslot = myslot;        /* Return ptr to the slot we used. */

        return (va);
}

static void
pp_unload_tlb(caddr_t *pslot, caddr_t va)
{
        ASSERT(*pslot == va);

        vtag_flushpage(va, (uint64_t)ksfmmup);
        *pslot = NULL;                          /* release the slot */
}

/*
 * Common copy routine which attempts to use hwblkpagecopy.  If this routine
 * can't be used, failure (0) will be returned.  Otherwise, a PAGESIZE page
 * will be copied and success (1) will be returned.
 */
int
ppcopy_common(page_t *fm_pp, page_t *to_pp)
{
        caddr_t fm_va, to_va;
        caddr_t *fm_slot, *to_slot;
        processorid_t cpu;
        label_t ljb;
        int ret = 1;

        ASSERT(fm_pp != NULL && PAGE_LOCKED(fm_pp));
        ASSERT(to_pp != NULL && PAGE_LOCKED(to_pp));

        /*
         * If we can't use VIS block loads and stores we can't use
         * pp_load_tlb/pp_unload_tlb due to the possibility of
         * d$ aliasing.
         */
        if (!use_hw_bcopy && (cache & CACHE_VAC))
                return (0);

        kpreempt_disable();
        cpu = CPU->cpu_id;
        fm_va = pp_load_tlb(cpu, &fm_slot, fm_pp, 0);
        if (fm_va == NULL) {
                kpreempt_enable();
                return (0);
        }
        to_va = pp_load_tlb(cpu, &to_slot, to_pp, TTE_HWWR_INT);
        if (to_va == NULL) {
                pp_unload_tlb(fm_slot, fm_va);
                kpreempt_enable();
                return (0);
        }
        if (on_fault(&ljb)) {
                ret = 0;
                goto faulted;
        }
        hwblkpagecopy(fm_va, to_va);
        no_fault();
faulted:
        ASSERT(CPU->cpu_id == cpu);
        pp_unload_tlb(fm_slot, fm_va);
        pp_unload_tlb(to_slot, to_va);
        kpreempt_enable();
        return (ret);
}

/*
 * Routine to copy kernel pages during relocation.  It will copy one
 * PAGESIZE page to another PAGESIZE page.  This function may be called
 * above LOCK_LEVEL so it should not grab any locks.
 */
void
ppcopy_kernel__relocatable(page_t *fm_pp, page_t *to_pp)
{
        uint64_t fm_pa, to_pa;
        size_t nbytes;

        fm_pa = (uint64_t)(fm_pp->p_pagenum) << MMU_PAGESHIFT;
        to_pa = (uint64_t)(to_pp->p_pagenum) << MMU_PAGESHIFT;

        nbytes = MMU_PAGESIZE;

        for (; nbytes > 0; fm_pa += 32, to_pa += 32, nbytes -= 32)
                hw_pa_bcopy32(fm_pa, to_pa);
}

/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp".
 *
 * Try to use per cpu mapping first, if that fails then call pp_mapin
 * to load it.
 *
 * Returns one on success or zero on some sort of fault while doing the copy.
 */
int
ppcopy(page_t *fm_pp, page_t *to_pp)
{
        caddr_t fm_va, to_va;
        label_t ljb;
        int ret = 1;
        boolean_t       use_kpm = B_FALSE;

        /* Try the fast path first */
        if (ppcopy_common(fm_pp, to_pp))
                return (1);

        /*
         * Try to map using KPM if enabled and we are the cageout thread.
         * If it fails, fall back to ppmapin/ppmaput
         */

        if (kpm_enable) {
                if (curthread == kcage_cageout_thread)
                        use_kpm = B_TRUE;
        }

        if (use_kpm) {
                if ((fm_va = hat_kpm_mapin(fm_pp, NULL)) == NULL ||
                    (to_va = hat_kpm_mapin(to_pp, NULL)) == NULL) {
                        if (fm_va != NULL)
                                hat_kpm_mapout(fm_pp, NULL, fm_va);
                        use_kpm = B_FALSE;
                }
        }

        if (use_kpm == B_FALSE) {
                /* do the slow path */
                fm_va = ppmapin(fm_pp, PROT_READ, (caddr_t)-1);
                to_va = ppmapin(to_pp, PROT_READ | PROT_WRITE, fm_va);
                if (on_fault(&ljb)) {
                        ret = 0;
                        goto faulted;
                }
        }
        bcopy(fm_va, to_va, PAGESIZE);
        no_fault();
faulted:
        /* unmap */
        if (use_kpm == B_TRUE) {
                hat_kpm_mapout(fm_pp, NULL, fm_va);
                hat_kpm_mapout(to_pp, NULL, to_va);
        } else {
                ppmapout(fm_va);
                ppmapout(to_va);
        }
        return (ret);
}

/*
 * Zero the physical page from off to off + len given by `pp'
 * without changing the reference and modified bits of page.
 *
 * Again, we'll try per cpu mapping first.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
        caddr_t va;
        caddr_t *slot;
        int fast = 1;
        processorid_t cpu;
        extern int hwblkclr(void *, size_t);
        extern int use_hw_bzero;

        ASSERT((int)len > 0 && (int)off >= 0 && off + len <= PAGESIZE);
        ASSERT(PAGE_LOCKED(pp));

        PP_STAT_ADD(ppzero);

        if (len != MMU_PAGESIZE || !use_hw_bzero) {
                /*
                 * Since the fast path doesn't do anything about
                 * VAC coloring, we make sure bcopy h/w will be used.
                 */
                fast = 0;
                va = NULL;
                PP_STAT_ADD(ppzero_short);
        }

        kpreempt_disable();

        if (fast) {
                cpu = CPU->cpu_id;
                va = pp_load_tlb(cpu, &slot, pp, TTE_HWWR_INT);
        }

        if (va == NULL) {
                /*
                 * We are here either length != MMU_PAGESIZE or pp_load_tlb()
                 * returns NULL or use_hw_bzero is disabled.
                 */
                va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
                fast = 0;
        }

        if (hwblkclr(va + off, len)) {
                /*
                 * We may not have used block commit asi.
                 * So flush the I-$ manually
                 */

                ASSERT(fast == 0);

                sync_icache(va + off, len);
        } else {
                /*
                 * We have used blk commit, and flushed the I-$. However we
                 * still may have an instruction in the pipeline. Only a flush
                 * instruction will invalidate that.
                 */
                doflush(va);
        }

        if (fast) {
                ASSERT(CPU->cpu_id == cpu);
                pp_unload_tlb(slot, va);
        } else {
                ppmapout(va);
        }

        kpreempt_enable();
}
Illumos