root/usr/src/uts/i86pc/os/memscrub.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * i86pc Memory Scrubbing
 *
 * On detection of a correctable memory ECC error, the i86pc hardware
 * returns the corrected data to the requester and may re-write it
 * to memory (DRAM or NVRAM). Machines which do not re-write this to
 * memory should add an NMI handler to correct and rewrite.
 *
 * Scrubbing thus reduces the likelyhood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
 *
 * Thus is born the desire that every memory location be periodically
 * accessed.
 *
 * This file implements a memory scrubbing thread.  This scrubber
 * guarantees that all of physical memory is accessed periodically
 * (memscrub_period_sec -- 12 hours).
 *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (4MB) on each wakeup, it will read all of physical
 * memory in in memscrub_period_sec (12 hours).
 *
 * The scrubber uses the REP LODS so it reads 4MB in 0.15 secs (on P5-200).
 * When it completes a span, if all the CPUs are idle, it reads another span.
 * Typically it soaks up idle time this way to reach its deadline early
 * -- and sleeps until the next period begins.
 *
 * Maximal Cost Estimate:  8GB @ xxMB/s = xxx seconds spent in 640 wakeups
 * that run for 0.15 seconds at intervals of 67 seconds.
 *
 * In practice, the scrubber finds enough idle time to finish in a few
 * minutes, and sleeps until its 12 hour deadline.
 *
 * The scrubber maintains a private copy of the phys_install memory list
 * to keep track of what memory should be scrubbed.
 *
 * The following parameters can be set via /etc/system
 *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (4MB)
 * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
 * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (0)
 * memscrub_delay_start_sec = (10 seconds)
 * disable_memscrub = (0)
 *
 * the scrubber will exit (or never be started) if it finds the variable
 * "disable_memscrub" set.
 *
 * MEMSCRUB_DFL_SPAN_PAGES  is based on the guess that 0.15 sec
 * is a "good" amount of minimum time for the thread to run at a time.
 *
 * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
 * twice the frequency the hardware folk estimated would be necessary.
 *
 * MEMSCRUB_DFL_THREAD_PRI (0) is based on the assumption that nearly
 * any other use of the system should be higher priority than scrubbing.
 */

#include <sys/types.h>
#include <sys/systm.h>          /* timeout, types, t_lock */
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>      /* MIN */
#include <sys/memlist.h>        /* memlist */
#include <sys/kmem.h>           /* KMEM_NOSLEEP */
#include <sys/cpuvar.h>         /* ncpus_online */
#include <sys/debug.h>          /* ASSERTs */
#include <sys/vmem.h>
#include <sys/mman.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat_i86.h>
#include <sys/callb.h>          /* CPR callback */

static caddr_t  memscrub_window;
static hat_mempte_t memscrub_pte;

/*
 * Global Data:
 */
/*
 * scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
 */
#define MEMSCRUB_DFL_PERIOD_SEC (12 * 60 * 60)  /* 12 hours */

/*
 * start only if at least MEMSCRUB_MIN_PAGES in system
 */
#define MEMSCRUB_MIN_PAGES      ((32 * 1024 * 1024) / PAGESIZE)

/*
 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
 */
#define MEMSCRUB_DFL_SPAN_PAGES ((4 * 1024 * 1024) / PAGESIZE)

/*
 * almost anything is higher priority than scrubbing
 */
#define MEMSCRUB_DFL_THREAD_PRI 0

/*
 * we can patch these defaults in /etc/system if necessary
 */
uint_t disable_memscrub = 0;
static uint_t disable_memscrub_quietly = 0;
pgcnt_t memscrub_min_pages = MEMSCRUB_MIN_PAGES;
pgcnt_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
time_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
time_t memscrub_delay_start_sec = 10;

/*
 * Static Routines
 */
static void memscrubber(void);
static int system_is_idle(void);
static int memscrub_add_span(uint64_t, uint64_t);

/*
 * Static Data
 */
static struct memlist *memscrub_memlist;
static uint_t memscrub_phys_pages;

static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;

/*
 * memscrub_lock protects memscrub_memlist
 */
uint_t memscrub_scans_done;

uint_t memscrub_done_early;
uint_t memscrub_early_sec;

uint_t memscrub_done_late;
time_t memscrub_late_sec;

/*
 * create memscrub_memlist from phys_install list
 * initialize locks, set memscrub_phys_pages.
 */
void
memscrub_init()
{
        struct memlist *src;

        if (physmem < memscrub_min_pages)
                return;

        if (!kpm_enable) {
                memscrub_window = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
                memscrub_pte = hat_mempte_setup(memscrub_window);
        }

        /*
         * copy phys_install to memscrub_memlist
         */
        for (src = phys_install; src; src = src->ml_next) {
                if (memscrub_add_span(src->ml_address, src->ml_size)) {
                        cmn_err(CE_WARN,
                            "Software memory scrubber failed to initialize\n");
                        return;
                }
        }

        mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);

        /*
         * create memscrubber thread
         */
        (void) thread_create(NULL, 0, (void (*)())memscrubber, NULL, 0, &p0,
            TS_RUN, memscrub_thread_pri);
}

/*
 * Function to cause the software memscrubber to exit quietly if the
 * platform support has located a hardware scrubber and enabled it.
 */
void
memscrub_disable(void)
{
        disable_memscrub_quietly = 1;
}

#ifdef MEMSCRUB_DEBUG
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
        struct memlist *list;

        cmn_err(CE_CONT, "%s:\n", title);

        for (list = listp; list; list = list->next) {
                cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
                    list->address, list->size);
        }
}
#endif /* MEMSCRUB_DEBUG */

/* ARGSUSED */
static void
memscrub_wakeup(void *c)
{
        /*
         * grab mutex to guarantee that our wakeup call
         * arrives after we go to sleep -- so we can't sleep forever.
         */
        mutex_enter(&memscrub_lock);
        cv_signal(&memscrub_cv);
        mutex_exit(&memscrub_lock);
}

/*
 * this calculation doesn't account for the time that the actual scan
 * consumes -- so we'd fall slightly behind schedule with this
 * interval_sec.  but the idle loop optimization below usually makes us
 * come in way ahead of schedule.
 */
static int
compute_interval_sec()
{
        if (memscrub_phys_pages <= memscrub_span_pages)
                return (memscrub_period_sec);
        else
                return (memscrub_period_sec/
                    (memscrub_phys_pages/memscrub_span_pages));
}

static void
memscrubber()
{
        time_t deadline;
        uint64_t mlp_last_addr;
        uint64_t mlp_next_addr;
        int reached_end = 1;
        time_t interval_sec = 0;
        struct memlist *mlp;

        extern void scan_memory(caddr_t, size_t);
        callb_cpr_t cprinfo;

        /*
         * notify CPR of our existence
         */
        CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");

        if (memscrub_memlist == NULL) {
                cmn_err(CE_WARN, "memscrub_memlist not initialized.");
                goto memscrub_exit;
        }

        mlp = memscrub_memlist;
        mlp_next_addr = mlp->ml_address;
        mlp_last_addr = mlp->ml_address + mlp->ml_size;

        deadline = gethrestime_sec() + memscrub_delay_start_sec;

        for (;;) {
                if (disable_memscrub || disable_memscrub_quietly)
                        break;

                mutex_enter(&memscrub_lock);

                /*
                 * did we just reach the end of memory?
                 */
                if (reached_end) {
                        time_t now = gethrestime_sec();

                        if (now >= deadline) {
                                memscrub_done_late++;
                                memscrub_late_sec += (now - deadline);
                                /*
                                 * past deadline, start right away
                                 */
                                interval_sec = 0;

                                deadline = now + memscrub_period_sec;
                        } else {
                                /*
                                 * we finished ahead of schedule.
                                 * wait till previous dealine before re-start.
                                 */
                                interval_sec = deadline - now;
                                memscrub_done_early++;
                                memscrub_early_sec += interval_sec;
                                deadline += memscrub_period_sec;
                        }
                } else {
                        interval_sec = compute_interval_sec();
                }

                /*
                 * it is safe from our standpoint for CPR to
                 * suspend the system
                 */
                CALLB_CPR_SAFE_BEGIN(&cprinfo);

                /*
                 * hit the snooze bar
                 */
                (void) timeout(memscrub_wakeup, NULL, interval_sec * hz);

                /*
                 * go to sleep
                 */
                cv_wait(&memscrub_cv, &memscrub_lock);

                /* we need to goto work */
                CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);

                mutex_exit(&memscrub_lock);

                do {
                        pgcnt_t pages = memscrub_span_pages;
                        uint64_t address = mlp_next_addr;

                        if (disable_memscrub || disable_memscrub_quietly)
                                break;

                        mutex_enter(&memscrub_lock);

                        /*
                         * Make sure we don't try to scan beyond the end of
                         * the current memlist.  If we would, then resize
                         * our scan target for this iteration, and prepare
                         * to read the next memlist entry on the next
                         * iteration.
                         */
                        reached_end = 0;
                        if (address + mmu_ptob(pages) >= mlp_last_addr) {
                                pages = mmu_btop(mlp_last_addr - address);
                                mlp = mlp->ml_next;
                                if (mlp == NULL) {
                                        reached_end = 1;
                                        mlp = memscrub_memlist;
                                }
                                mlp_next_addr = mlp->ml_address;
                                mlp_last_addr = mlp->ml_address + mlp->ml_size;
                        } else {
                                mlp_next_addr += mmu_ptob(pages);
                        }

                        mutex_exit(&memscrub_lock);

                        while (pages--) {
                                pfn_t pfn = btop(address);

                                /*
                                 * Without segkpm, the memscrubber cannot
                                 * be allowed to migrate across CPUs, as
                                 * the CPU-specific mapping of
                                 * memscrub_window would be incorrect.
                                 * With segkpm, switching CPUs is legal, but
                                 * inefficient.  We don't use
                                 * kpreempt_disable as it might hold a
                                 * higher priority thread (eg, RT) too long
                                 * off CPU.
                                 */
                                thread_affinity_set(curthread, CPU_CURRENT);
                                if (kpm_enable)
                                        memscrub_window = hat_kpm_pfn2va(pfn);
                                else
                                        hat_mempte_remap(pfn, memscrub_window,
                                            memscrub_pte,
                                            PROT_READ, HAT_LOAD_NOCONSIST);

                                scan_memory(memscrub_window, PAGESIZE);

                                thread_affinity_clear(curthread);
                                address += MMU_PAGESIZE;
                        }

                        memscrub_scans_done++;
                } while (!reached_end && system_is_idle());
        }

memscrub_exit:

        if (!disable_memscrub_quietly)
                cmn_err(CE_NOTE, "Software memory scrubber exiting.");
        /*
         * We are about to bail, but don't have the memscrub_lock,
         * and it is needed for CALLB_CPR_EXIT.
         */
        mutex_enter(&memscrub_lock);
        CALLB_CPR_EXIT(&cprinfo);

        cv_destroy(&memscrub_cv);

        thread_exit();
}


/*
 * return 1 if we're MP and all the other CPUs are idle
 */
static int
system_is_idle()
{
        int cpu_id;
        int found = 0;

        if (1 == ncpus_online)
                return (0);

        for (cpu_id = 0; cpu_id < NCPU; ++cpu_id) {
                if (!cpu[cpu_id])
                        continue;

                found++;

                if (cpu[cpu_id]->cpu_thread != cpu[cpu_id]->cpu_idle_thread) {
                        if (CPU->cpu_id == cpu_id &&
                            CPU->cpu_disp->disp_nrunnable == 0)
                                continue;
                        return (0);
                }

                if (found == ncpus)
                        break;
        }
        return (1);
}

/*
 * add a span to the memscrub list
 */
static int
memscrub_add_span(uint64_t start, uint64_t bytes)
{
        struct memlist *dst;
        struct memlist *prev, *next;
        uint64_t end = start + bytes - 1;
        int retval = 0;

        mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
        memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
        cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
        cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
            " size: 0x%llx\n", start, bytes);
#endif /* MEMSCRUB_DEBUG */

        /*
         * Scan through the list to find the proper place to install it.
         */
        prev = NULL;
        next = memscrub_memlist;
        while (next) {
                uint64_t ns = next->ml_address;
                uint64_t ne = next->ml_address + next->ml_size - 1;

                /*
                 * If this span overlaps with an existing span, then
                 * something has gone horribly wrong with the phys_install
                 * list.  In fact, I'm surprised we made it this far.
                 */
                if ((start >= ns && start <= ne) || (end >= ns && end <= ne) ||
                    (start < ns && end > ne))
                        panic("memscrub found overlapping memory ranges "
                            "(0x%p-0x%p) and (0x%p-0x%p)",
                            (void *)(uintptr_t)start, (void *)(uintptr_t)end,
                            (void *)(uintptr_t)ns, (void *)(uintptr_t)ne);

                /*
                 * New span can be appended to an existing one.
                 */
                if (start == ne + 1) {
                        next->ml_size += bytes;
                        goto add_done;
                }

                /*
                 * New span can be prepended to an existing one.
                 */
                if (end + 1 == ns) {
                        next->ml_size += bytes;
                        next->ml_address = start;
                        goto add_done;
                }

                /*
                 * If the next span has a higher start address than the new
                 * one, then we have found the right spot for our
                 * insertion.
                 */
                if (ns > start)
                        break;

                prev = next;
                next = next->ml_next;
        }

        /*
         * allocate a new struct memlist
         */
        dst = kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
        if (dst == NULL) {
                retval = -1;
                goto add_done;
        }
        dst->ml_address = start;
        dst->ml_size = bytes;
        dst->ml_prev = prev;
        dst->ml_next = next;

        if (prev)
                prev->ml_next = dst;
        else
                memscrub_memlist = dst;

        if (next)
                next->ml_prev = dst;

add_done:

        if (retval != -1)
                memscrub_phys_pages += mmu_btop(bytes);

#ifdef MEMSCRUB_DEBUG
        memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
        cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

        mutex_exit(&memscrub_lock);
        return (retval);
}