root/usr/src/uts/sun4v/os/mach_startup.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/machsystm.h>
#include <sys/archsystm.h>
#include <sys/prom_plat.h>
#include <sys/promif.h>
#include <sys/vm.h>
#include <sys/cpu.h>
#include <sys/bitset.h>
#include <sys/cpupart.h>
#include <sys/disp.h>
#include <sys/hypervisor_api.h>
#include <sys/traptrace.h>
#include <sys/modctl.h>
#include <sys/ldoms.h>
#include <sys/cpu_module.h>
#include <sys/mutex_impl.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/cmt.h>
#include <vm/vm_dep.h>

#ifdef TRAPTRACE
int mach_htraptrace_enable = 1;
#else
int mach_htraptrace_enable = 0;
#endif
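/* set when htrap_tr0 is registered as the boot cpu's HV traptrace buffer */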
int htrap_tr0_inuse = 0;
extern char htrap_tr0[];        /* prealloc buf for boot cpu */

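/*
 * Base of the MMU fault status areas; each CPU uses an MMFSA_SIZE
 * slice of this allocation (see ndata_alloc_mmfsa() below).
 */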
caddr_t mmu_fault_status_area;

extern void sfmmu_set_tsbs(void);
/*
 * CPU IDLE optimization variables/routines
 */
static int enable_halt_idle_cpus = 1;

/*
 * Defines for the idle_state_transition DTrace probe
 *
 * The probe fires when the CPU undergoes an idle state change (e.g. hv yield)
 * The argument passed is the state to which the CPU is transitioning.
 *
 * The states are defined here.
 */
#define IDLE_STATE_NORMAL 0
#define IDLE_STATE_YIELDED 1

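/*
 * Platform defaults for the clock_tick_threshold and clock_tick_ncpus
 * tunables, applied in startup_platform() when they are unset.
 */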
#define SUN4V_CLOCK_TICK_THRESHOLD      64
#define SUN4V_CLOCK_TICK_NCPUS          64

extern int      clock_tick_threshold;
extern int      clock_tick_ncpus;

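/* fanout for the per-partition halted CPU bitset (cp_haltset) */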
uint_t cp_haltset_fanout = 3;

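/*
 * Register this CPU's MMU fault status area and the trap table with
 * the firmware/hypervisor, and initialize per-CPU trap handling state.
 */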
void
setup_trap_table(void)
{
        caddr_t mmfsa_va;

        /* locate this CPU's MMU fault status area */
        mmfsa_va = mmu_fault_status_area + (MMFSA_SIZE * CPU->cpu_id);

        intr_init(CPU);         /* init interrupt request free list */
        setwstate(WSTATE_KERN);
        set_mmfsa_scratchpad(mmfsa_va);
        prom_set_mmfsa_traptable(&trap_table, va_to_pa(mmfsa_va));
        sfmmu_set_tsbs();
}

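/*
 * Hook called when the physical memory install list changes;
 * nothing to do here on sun4v.
 */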
void
phys_install_has_changed(void)
{

}

/*
 * Halt the present CPU until awoken via an interrupt
 */
static void
cpu_halt(void)
{
        cpu_t *cpup = CPU;
        processorid_t cpu_sid = cpup->cpu_seqid;
        cpupart_t *cp = cpup->cpu_part;
        int hset_update = 1;
        volatile int *p = &cpup->cpu_disp->disp_nrunnable;
        uint_t s;

        /*
         * If this CPU is online, then we should record that we are halting
         * by adding ourselves to the partition's halted CPU
         * bitset. This allows other CPUs to find/awaken us when
         * work becomes available.
         */
        if (CPU->cpu_flags & CPU_OFFLINE)
                hset_update = 0;

        /*
         * Add ourselves to the partition's halted CPUs bitset
         * and set our HALTED flag, if necessary.
         *
         * When a thread becomes runnable, it is placed on the queue
         * and then the halted cpu bitset is checked to determine who
         * (if anyone) should be awoken. We therefore need to first
         * add ourselves to the halted bitset, and then check if there
         * is any work available.  The order is important to prevent a race
         * that can lead to work languishing on a run queue somewhere while
         * this CPU remains halted.
         *
         * Either the producing CPU will see we're halted and will awaken us,
         * or this CPU will see the work available in disp_anywork()
         */
        if (hset_update) {
                cpup->cpu_disp_flags |= CPU_DISP_HALTED;
                membar_producer();
                bitset_atomic_add(&cp->cp_haltset, cpu_sid);
        }

        /*
         * Check to make sure there's really nothing to do.
         * Work destined for this CPU may become available after
         * this check. We'll be notified through the clearing of our
         * bit in the halted CPU bitset, and a poke.
         */
        if (disp_anywork()) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * We're on our way to being halted.  Wait until something becomes
         * runnable locally or we are awakened (i.e. removed from the halt set).
         * Note that the call to hv_cpu_yield() can return even if we have
         * nothing to do.
         *
         * Disable interrupts now, so that we'll awaken immediately
         * after halting if someone tries to poke us between now and
         * the time we actually halt.
         *
         * We check for the presence of our bit after disabling interrupts.
         * If it's cleared, we'll return. If the bit is cleared after
         * we check, then the poke will pop us out of the halted state.
         * Also, if the offlined CPU has been brought back on-line, then
         * we return as well.
         *
         * The ordering of the poke and the clearing of the bit by cpu_wakeup
         * is important.
         * cpu_wakeup() must clear, then poke.
         * cpu_halt() must disable interrupts, then check for the bit.
         *
         * The check for anything locally runnable is here for performance
         * and isn't needed for correctness. disp_nrunnable ought to be
         * in our cache still, so it's inexpensive to check, and if there
         * is anything runnable we won't have to wait for the poke.
         *
         * Any interrupt will awaken the cpu from halt. Looping here
         * will filter spurious interrupts that wake us up, but don't
         * represent a need for us to head back out to idle().  This
         * will enable the idle loop to be more efficient and sleep in
         * the processor pipeline for a larger percent of the time,
         * which returns useful cycles to the peer hardware strand
         * that shares the pipeline.
         */
        s = disable_vec_intr();
        while (*p == 0 &&
            ((hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid)) ||
            (!hset_update && (CPU->cpu_flags & CPU_OFFLINE)))) {

                DTRACE_PROBE1(idle__state__transition,
                    uint_t, IDLE_STATE_YIELDED);
                (void) hv_cpu_yield();
                DTRACE_PROBE1(idle__state__transition,
                    uint_t, IDLE_STATE_NORMAL);

                enable_vec_intr(s);
                s = disable_vec_intr();
        }

        /*
         * We're no longer halted
         */
        enable_vec_intr(s);
        if (hset_update) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
        }
}

/*
 * If "cpu" is halted, then wake it up clearing its halted bit in advance.
 * Otherwise, see if other CPUs in the cpu partition are halted and need to
 * be woken up so that they can steal the thread we placed on this CPU.
 * This function is only used on MP systems.
 */
static void
cpu_wakeup(cpu_t *cpu, int bound)
{
        uint_t          cpu_found;
        processorid_t   cpu_sid;
        cpupart_t       *cp;

        cp = cpu->cpu_part;
        cpu_sid = cpu->cpu_seqid;
        if (bitset_in_set(&cp->cp_haltset, cpu_sid)) {
                /*
                 * Clear the halted bit for that CPU since it will be
                 * poked in a moment.
                 */
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                /*
                 * We may find the current CPU present in the halted cpu bitset
                 * if we're in the context of an interrupt that occurred
                 * before we had a chance to clear our bit in cpu_halt().
                 * Poking ourselves is obviously unnecessary, since if
                 * we're here, we're not halted.
                 */
                if (cpu != CPU)
                        poke_cpu(cpu->cpu_id);
                return;
        } else {
                /*
                 * This cpu isn't halted, but it's idle or undergoing a
                 * context switch. No need to awaken anyone else.
                 */
                if (cpu->cpu_thread == cpu->cpu_idle_thread ||
                    cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL)
                        return;
        }

        /*
         * No need to wake up other CPUs if this is for a bound thread.
         */
        if (bound)
                return;

        /*
         * The CPU specified for wakeup isn't currently halted, so check
         * to see if there are any other halted CPUs in the partition,
         * and if there are then awaken one.
         */
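        /*
         * Retry if another CPU beat us to the bit we found;
         * bitset_atomic_test_and_del() returns a negative value
         * if the bit was already clear.
         */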
        do {
                cpu_found = bitset_find(&cp->cp_haltset);
                if (cpu_found == (uint_t)-1)
                        return;
        } while (bitset_atomic_test_and_del(&cp->cp_haltset, cpu_found) < 0);

        if (cpu_found != CPU->cpu_seqid)
                poke_cpu(cpu_seq[cpu_found]->cpu_id);
}

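/*
 * Install cpu_halt() and cpu_wakeup() as the idle and thread-enqueue
 * handlers, so idle CPUs yield to the hypervisor until work arrives.
 */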
void
mach_cpu_halt_idle(void)
{
        if (enable_halt_idle_cpus) {
                idle_cpu = cpu_halt;
                disp_enq_thread = cpu_wakeup;
        }
}

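/*
 * Carve the MMU fault status area out of nucleus data space:
 * MMFSA_SIZE bytes for each of up to max_ncpus CPUs.
 * Returns 0 on success, -1 on allocation failure.
 */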
int
ndata_alloc_mmfsa(struct memlist *ndata)
{
        size_t  size;

        size = MMFSA_SIZE * max_ncpus;
        mmu_fault_status_area = ndata_alloc(ndata, size, ecache_alignsize);
        if (mmu_fault_status_area == NULL)
                return (-1);
        return (0);
}

void
mach_memscrub(void)
{
        /* no memscrub support for sun4v for now */
}

void
mach_fpras(void)
{
        /* no fpras support for sun4v for now */
}

void
mach_hw_copy_limit(void)
{
        /* HW copy limits set by individual CPU module */
}

/*
 * We need to enable soft ring functionality on Niagara platforms since
 * one strand can't handle interrupts for a 1Gb NIC. So set the tunable
 * mac_soft_ring_enable by default on this platform.
 * mac_soft_ring_enable variable is defined in space.c and used by MAC
 * module. This tunable in concert with mac_soft_ring_count (declared
 * in mac.h) will configure the number of fanout soft rings for a link.
 */
extern boolean_t mac_soft_ring_enable;
void
startup_platform(void)
{
        mac_soft_ring_enable = B_TRUE;
        if (clock_tick_threshold == 0)
                clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD;
        if (clock_tick_ncpus == 0)
                clock_tick_ncpus = SUN4V_CLOCK_TICK_NCPUS;
        /* set per-platform constants for mutex_backoff */
        mutex_backoff_base = 1;
        mutex_cap_factor = 4;
        if (l2_cache_node_count() > 1) {
                /* multi-node platform, e.g. Victoria Falls */
                mutex_backoff_base = 2;
                mutex_cap_factor = 64;
        }
        rw_lock_backoff = default_lock_backoff;
        rw_lock_delay = default_lock_delay;
}

/*
 * This function sets up the hypervisor traptrace buffer.
 * It is called by the boot cpu only.
 */
void
mach_htraptrace_setup(int cpuid)
{
        TRAP_TRACE_CTL  *ctlp;
        int bootcpuid = getprocessorid(); /* invoked on boot cpu only */

        if (mach_htraptrace_enable && ((cpuid != bootcpuid) ||
            !htrap_tr0_inuse)) {
                ctlp = &trap_trace_ctl[cpuid];
                ctlp->d.hvaddr_base = (cpuid == bootcpuid) ? htrap_tr0 :
                    contig_mem_alloc_align(HTRAP_TSIZE, HTRAP_TSIZE);
                if (ctlp->d.hvaddr_base == NULL) {
                        ctlp->d.hlimit = 0;
                        ctlp->d.hpaddr_base = 0;
                        cmn_err(CE_WARN, "!cpu%d: failed to allocate HV "
                            "traptrace buffer", cpuid);
                } else {
                        ctlp->d.hlimit = HTRAP_TSIZE;
                        ctlp->d.hpaddr_base = va_to_pa(ctlp->d.hvaddr_base);
                }
        }
}

/*
 * This function enables or disables hypervisor traptracing.
 */
void
mach_htraptrace_configure(int cpuid)
{
        uint64_t ret;
        uint64_t prev_buf, prev_bufsize;
        uint64_t prev_enable;
        uint64_t size;
        TRAP_TRACE_CTL  *ctlp;

        ctlp = &trap_trace_ctl[cpuid];
        if (mach_htraptrace_enable) {
                if ((ctlp->d.hvaddr_base != NULL) &&
                    ((ctlp->d.hvaddr_base != htrap_tr0) ||
                    (!htrap_tr0_inuse))) {
                        ret = hv_ttrace_buf_info(&prev_buf, &prev_bufsize);
                        if ((ret == H_EOK) && (prev_bufsize != 0)) {
                                cmn_err(CE_CONT,
                                    "!cpu%d: previous HV traptrace buffer of "
                                    "size 0x%lx at address 0x%lx", cpuid,
                                    prev_bufsize, prev_buf);
                        }

                        ret = hv_ttrace_buf_conf(ctlp->d.hpaddr_base,
                            ctlp->d.hlimit /
                            (sizeof (struct htrap_trace_record)), &size);
                        if (ret == H_EOK) {
                                ret = hv_ttrace_enable(
                                    (uint64_t)TRAP_TENABLE_ALL, &prev_enable);
                                if (ret != H_EOK) {
                                        cmn_err(CE_WARN,
                                            "!cpu%d: HV traptracing not "
                                            "enabled, ta: 0x%x returned error: "
                                            "%ld", cpuid, TTRACE_ENABLE, ret);
                                } else {
                                        if (ctlp->d.hvaddr_base == htrap_tr0)
                                                htrap_tr0_inuse = 1;
                                }
                        } else {
                                cmn_err(CE_WARN,
                                    "!cpu%d: HV traptrace buffer not "
                                    "configured, ta: 0x%x returned error: %ld",
                                    cpuid, TTRACE_BUF_CONF, ret);
                        }
                        /*
                         * set hvaddr_base to NULL when traptrace buffer
                         * registration fails
                         */
                        if (ret != H_EOK) {
                                ctlp->d.hvaddr_base = NULL;
                                ctlp->d.hlimit = 0;
                                ctlp->d.hpaddr_base = 0;
                        }
                }
        } else {
                ret = hv_ttrace_buf_info(&prev_buf, &prev_bufsize);
                if ((ret == H_EOK) && (prev_bufsize != 0)) {
                        ret = hv_ttrace_enable((uint64_t)TRAP_TDISABLE_ALL,
                            &prev_enable);
                        if (ret == H_EOK) {
                                if (ctlp->d.hvaddr_base == htrap_tr0)
                                        htrap_tr0_inuse = 0;
                                ctlp->d.hvaddr_base = NULL;
                                ctlp->d.hlimit = 0;
                                ctlp->d.hpaddr_base = 0;
                        } else
                                cmn_err(CE_WARN,
                                    "!cpu%d: HV traptracing is not disabled, "
                                    "ta: 0x%x returned error: %ld",
                                    cpuid, TTRACE_ENABLE, ret);
                }
        }
}

/*
 * This function cleans up the hypervisor traptrace buffer.
 */
void
mach_htraptrace_cleanup(int cpuid)
{
        if (mach_htraptrace_enable) {
                TRAP_TRACE_CTL *ctlp;
                caddr_t httrace_buf_va;

                ASSERT(cpuid < max_ncpus);
                ctlp = &trap_trace_ctl[cpuid];
                httrace_buf_va = ctlp->d.hvaddr_base;
                if (httrace_buf_va == htrap_tr0) {
                        bzero(httrace_buf_va, HTRAP_TSIZE);
                } else if (httrace_buf_va != NULL) {
                        contig_mem_free(httrace_buf_va, HTRAP_TSIZE);
                }
                ctlp->d.hvaddr_base = NULL;
                ctlp->d.hlimit = 0;
                ctlp->d.hpaddr_base = 0;
        }
}

/*
 * Load any required machine class (sun4v) specific drivers.
 */
void
load_mach_drivers(void)
{
        /*
         * We don't want to load these LDOMs-specific
         * modules if domaining is not supported.  Also,
         * we must be able to run on non-LDOMs firmware.
         */
        if (!domaining_supported())
                return;

        /*
         * Load the core domain services module
         */
        if (modload("misc", "ds") == -1)
                cmn_err(CE_NOTE, "!'ds' module failed to load");

        /*
         * Load the rest of the domain services
         */
        if (modload("misc", "fault_iso") == -1)
                cmn_err(CE_NOTE, "!'fault_iso' module failed to load");

        if (modload("misc", "platsvc") == -1)
                cmn_err(CE_NOTE, "!'platsvc' module failed to load");

        if (domaining_enabled() && modload("misc", "dr_cpu") == -1)
                cmn_err(CE_NOTE, "!'dr_cpu' module failed to load");

        if (modload("misc", "dr_io") == -1)
                cmn_err(CE_NOTE, "!'dr_io' module failed to load");

        if (modload("misc", "dr_mem") == -1)
                cmn_err(CE_NOTE, "!'dr_mem' module failed to load");

        /*
         * Attempt to attach any virtual device servers. These
         * drivers must be loaded at start of day so that they
         * can respond to any updates to the machine description.
         *
         * Since it is quite likely that a domain will not support
         * one or more of these servers, failures are ignored.
         */

        /* virtual disk server */
        (void) i_ddi_attach_hw_nodes("vds");

        /* virtual network switch */
        (void) i_ddi_attach_hw_nodes("vsw");

        /* virtual console concentrator */
        (void) i_ddi_attach_hw_nodes("vcc");
}

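/*
 * Set sun4v defaults for platform tunables that are still unset.
 */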
void
set_platform_defaults(void)
{
        /*
         * Allow at most one context domain per 8 CPUs, which is ample for
         * good performance.  Do not make this too large, because it
         * increases the space consumed in the per-process sfmmu structure.
         */
        if (max_mmu_ctxdoms == 0)
                max_mmu_ctxdoms = (NCPU + 7) / 8;
}