root/usr/src/uts/intel/io/mc-amd/mcamd_drv.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2022 Oxide Computer Co.
 */

#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/ddifm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/types.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/ksynch.h>
#include <sys/rwlock.h>
#include <sys/pghw.h>
#include <sys/open.h>
#include <sys/policy.h>
#include <sys/x86_archext.h>
#include <sys/cpu_module.h>
#include <qsort.h>
#include <sys/pci_cfgspace.h>
#include <sys/mc.h>
#include <sys/mc_amd.h>
#include <sys/smbios.h>
#include <sys/pci.h>
#include <mcamd.h>
#include <mcamd_dimmcfg.h>
#include <mcamd_pcicfg.h>
#include <mcamd_api.h>
#include <sys/fm/cpu/AMD.h>
#include <sys/fm/smb/fmsmb.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>

/*
 * Set to prevent mc-amd from attaching.
 */
int mc_no_attach = 0;

/*
 * Of the 754/939/940 packages, only socket 940 supports quadrank registered
 * dimms.  Unfortunately, no memory-controller register indicates quadrank
 * dimm support or presence (i.e., in terms of number of slots per cpu, and
 * chip-select lines per slot).  The following may be set in /etc/system to
 * indicate the presence of quadrank support on a motherboard.
 *
 * There is no need to set this for F(1207) and S1g1.
 */
int mc_quadranksupport = 0;

/*
 * mc_list is the head of the list of mc_t structures, linked through
 * mc_next; the code in this file that walks it asserts that mc_lock is held.
 *
 * NOTE(review): mc_last is presumably the tail of that list, and
 * mc_hold_attached is not referenced by the code visible here - confirm
 * both against the attach/detach code.
 */
mc_t *mc_list, *mc_last;
krwlock_t mc_lock;
int mc_hold_attached = 1;

/*
 * Simple max/min helpers.  Each argument may be evaluated more than once,
 * so do not pass expressions that have side effects.
 */
#define MAX(m, n) ((m) >= (n) ? (m) : (n))
#define MIN(m, n) ((m) <= (n) ? (m) : (n))

/*
 * The following tuneable is used to determine the DRAM scrubbing rate.
 * The values range from 0x00-0x16 as described in the BKDG.  Zero
 * disables DRAM scrubbing.  Values above zero indicate rates in descending
 * order.
 *
 * The default value below is used on several Sun systems.  In the future
 * this code should assign values dynamically based on memory sizing.
 */
uint32_t mc_scrub_rate_dram = 0xd;      /* 64B every 163.8 us; 1GB per 45 min */

/*
 * Scrub-rate selection policy; the per-value comments describe each choice.
 * The default is MC_SCRUB_MAX.
 */
enum {
        MC_SCRUB_BIOSDEFAULT,   /* retain system default value */
        MC_SCRUB_FIXED,         /* assign mc_scrub_rate_* values */
        MC_SCRUB_MAX            /* assign max of system and tunables */
} mc_scrub_policy = MC_SCRUB_MAX;

/*
 * Throw away the packed snapshot cached on mc, if there is one, and bump
 * the snapshot generation number.  Nothing changes (including the
 * generation) when no snapshot is cached.  The caller must hold mc_lock.
 */
static void
mc_snapshot_destroy(mc_t *mc)
{
        ASSERT(RW_LOCK_HELD(&mc_lock));

        if (mc->mc_snapshot != NULL) {
                kmem_free(mc->mc_snapshot, mc->mc_snapshotsz);
                mc->mc_snapshotsz = 0;
                mc->mc_snapshot = NULL;
                mc->mc_snapshotgen++;
        }
}

/*
 * Make sure mc has a packed (XDR-encoded) snapshot of mc_nvl.  An existing
 * snapshot is left untouched.  Returns 0 on success and -1 if the nvlist
 * could not be packed.  The caller must hold mc_lock.
 */
static int
mc_snapshot_update(mc_t *mc)
{
        int err;

        ASSERT(RW_LOCK_HELD(&mc_lock));

        /* A cached snapshot is reused as-is. */
        if (mc->mc_snapshot != NULL)
                return (0);

        err = nvlist_pack(mc->mc_nvl, &mc->mc_snapshot, &mc->mc_snapshotsz,
            NV_ENCODE_XDR, KM_SLEEP);

        return (err != 0 ? -1 : 0);
}

/*
 * Walk mc_list looking for the memory controller whose mcp_num property
 * equals chipid.  Returns the matching mc_t, or NULL if there is none.
 * The caller must hold mc_lock.
 */
static mc_t *
mc_lookup_by_chipid(int chipid)
{
        mc_t *cur;

        ASSERT(RW_LOCK_HELD(&mc_lock));

        cur = mc_list;
        while (cur != NULL && cur->mc_props.mcp_num != chipid)
                cur = cur->mc_next;

        return (cur);
}

/*
 * Read two parallel runs of 32-bit config registers through cfghdl.
 * Element k of each destination array is read from the corresponding
 * start address plus k strides:
 *
 *      Index   r1[] read from                  r2[] read from
 *      0       r1addr                          r2addr
 *      1       r1addr + incr                   r2addr + incr
 *      ...
 *      n - 1   r1addr + (n - 1) * incr         r2addr + (n - 1) * incr
 *
 * r1n registers are stored in r1 and r2n in r2; the two counts need not
 * be equal.  For each index the r1 register is read before the r2 one.
 */
static void
mc_prop_read_pair(mc_pcicfg_hdl_t cfghdl, uint32_t *r1, off_t r1addr,
    int r1n, uint32_t *r2, off_t r2addr, int r2n, off_t incr)
{
        int total = MAX(r1n, r2n);
        int idx = 0;

        while (idx < total) {
                if (idx < r1n)
                        r1[idx] = mc_pcicfg_get32(cfghdl, r1addr);
                if (idx < r2n)
                        r2[idx] = mc_pcicfg_get32(cfghdl, r2addr);

                r1addr += incr;
                r2addr += incr;
                idx++;
        }
}

/*
 * cmi_hdl_walk() callback for mc_nvl_add_socket().  arg1 points to the
 * socket type being sought and arg2 to the cmi_hdl_t in which to return a
 * match.  The first handle whose socket type matches is held, stored
 * through arg2, and the walk is terminated; otherwise the walk continues.
 */
/*ARGSUSED*/
static int
mc_nvl_add_socket_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
{
        uint32_t want = *((uint32_t *)arg1);
        cmi_hdl_t *resultp = (cmi_hdl_t *)arg2;

        if (cmi_hdl_getsockettype(whdl) != want)
                return (CMI_HDL_WALK_NEXT);

        cmi_hdl_hold(whdl);     /* short-term hold, dropped by the caller */
        *resultp = whdl;
        return (CMI_HDL_WALK_DONE);
}

/*
 * Add a "socket" string to nvl naming the socket type of mc.  The name is
 * taken from the first cpu handle whose socket type equals mc->mc_socket;
 * if the walk finds no such handle the string "Unknown" is used.
 */
static void
mc_nvl_add_socket(nvlist_t *nvl, mc_t *mc)
{
        cmi_hdl_t match = NULL;

        cmi_hdl_walk(mc_nvl_add_socket_cb, (void *)&mc->mc_socket,
            (void *)&match, NULL);

        if (match != NULL) {
                (void) nvlist_add_string(nvl, "socket",
                    cmi_hdl_getsocketstr(match));
                /* Drop the hold taken by the walk callback. */
                cmi_hdl_rele(match);
        } else {
                /* No handle with a matching socket type was found. */
                (void) nvlist_add_string(nvl, "socket", "Unknown");
        }
}

/*
 * Return the EccEn field of the saved NB configuration register of mc,
 * using the pre-F register layout for revisions matching MC_F_REVS_BCDE
 * and the rev F/G layout otherwise.
 */
static uint32_t
mc_ecc_enabled(mc_t *mc)
{
        x86_chiprev_t chiprev = mc->mc_props.mcp_rev;
        union mcreg_nbcfg cfg;

        MCREG_VAL32(&cfg) = mc->mc_cfgregs.mcr_nbcfg;

        if (MC_REV_MATCH(chiprev, MC_F_REVS_BCDE))
                return (MCREG_FIELD_F_preF(&cfg, EccEn));

        return (MCREG_FIELD_F_revFG(&cfg, EccEn));
}

/*
 * Return the ChipKillEccEn field of the saved NB configuration register
 * of mc, using the pre-F register layout for revisions matching
 * MC_F_REVS_BCDE and the rev F/G layout otherwise.
 */
static uint32_t
mc_ck_enabled(mc_t *mc)
{
        x86_chiprev_t chiprev = mc->mc_props.mcp_rev;
        union mcreg_nbcfg cfg;

        MCREG_VAL32(&cfg) = mc->mc_cfgregs.mcr_nbcfg;

        if (MC_REV_MATCH(chiprev, MC_F_REVS_BCDE))
                return (MCREG_FIELD_F_preF(&cfg, ChipKillEccEn));

        return (MCREG_FIELD_F_revFG(&cfg, ChipKillEccEn));
}

/*
 * Add an "ecc-type" string to nvl: "None" when ECC is disabled, otherwise
 * "ChipKill 128/16" or "Normal 64/8" depending on whether ChipKill is
 * enabled.
 */
static void
mc_nvl_add_ecctype(nvlist_t *nvl, mc_t *mc)
{
        const char *desc;

        if (!mc_ecc_enabled(mc))
                desc = "None";
        else if (mc_ck_enabled(mc))
                desc = "ChipKill 128/16";
        else
                desc = "Normal 64/8";

        (void) nvlist_add_string(nvl, "ecc-type", desc);
}

/*
 * Look up numeric property 'code' of 'node' and add it to nvl as a uint64
 * under the property's name.  Nothing is added if the name or the value
 * cannot be obtained (this also trips an ASSERT), or if reqval is nonzero
 * and the value is MC_INVALNUM.
 */
static void
mc_nvl_add_prop(nvlist_t *nvl, void *node, mcamd_propcode_t code, int reqval)
{
        const char *propname = mcamd_get_propname(code);
        uint64_t propval;
        int ok;

        ok = mcamd_get_numprop(NULL, (mcamd_node_t *)node, code, &propval);

        ASSERT(propname != NULL && ok);
        if (propname == NULL || !ok)
                return;

        /* The caller asked for valid values only. */
        if (reqval && propval == MC_INVALNUM)
                return;

        (void) nvlist_add_uint64(nvl, propname, propval);
}

/*
 * Add a "cslist" nvlist array to mcnvl with one member for each chip-select
 * on mc->mc_cslist.  Each member holds the chip-select's number, base, mask
 * and size and, for each chip-select line it has, the dimm number (when
 * valid) and the chip-select line name.
 */
static void
mc_nvl_add_cslist(nvlist_t *mcnvl, mc_t *mc)
{
        const mcamd_propcode_t dimmprop[] = {
                MCAMD_PROP_CSDIMM1, MCAMD_PROP_CSDIMM2
        };
        const char *const csnamekey[] = { "dimm1-csname", "dimm2-csname" };
        nvlist_t *csnvls[MC_CHIP_NCS];
        mc_cs_t *cs;
        int count = 0;
        int k;

        for (cs = mc->mc_cslist; cs != NULL; cs = cs->mccs_next) {
                char csname[MCDCFG_CSNAMELEN];
                nvlist_t *csnvl;

                (void) nvlist_alloc(&csnvls[count], NV_UNIQUE_NAME, KM_SLEEP);
                csnvl = csnvls[count];
                count++;

                mc_nvl_add_prop(csnvl, cs, MCAMD_PROP_NUM, 0);
                mc_nvl_add_prop(csnvl, cs, MCAMD_PROP_BASE_ADDR, 0);
                mc_nvl_add_prop(csnvl, cs, MCAMD_PROP_MASK, 0);
                mc_nvl_add_prop(csnvl, cs, MCAMD_PROP_SIZE, 0);

                /*
                 * A chip-select has no DIMM info (both chip-select line
                 * pointers NULL) if mcdcfg_lookup failed for it.
                 */
                for (k = 0; k < 2; k++) {
                        if (cs->mccs_csl[k] == NULL)
                                continue;

                        mc_nvl_add_prop(csnvl, cs, dimmprop[k], 1);
                        mcdcfg_csname(mc->mc_socket, cs->mccs_csl[k], csname,
                            sizeof (csname));
                        (void) nvlist_add_string(csnvl, csnamekey[k], csname);
                }
        }

        /* The array is added even when it has no members. */
        (void) nvlist_add_nvlist_array(mcnvl, "cslist", csnvls, count);

        for (k = 0; k < count; k++)
                nvlist_free(csnvls[k]);
}

/*
 * Add a "dimmlist" nvlist array to mcnvl with one member for each dimm on
 * mc->mc_dimmlist.  Each member holds the dimm's number and size (each
 * only when not MC_INVALNUM) together with two parallel arrays, "csnums"
 * and "csnames", giving the chip-select number and chip-select line name
 * of every populated rank slot of that dimm.
 */
static void
mc_nvl_add_dimmlist(nvlist_t *mcnvl, mc_t *mc)
{
        nvlist_t *dimmlist[MC_CHIP_NDIMM];
        mc_dimm_t *mcd;
        int nelem, i;

        for (nelem = 0, mcd = mc->mc_dimmlist; mcd != NULL;
            mcd = mcd->mcd_next, nelem++) {
                nvlist_t **dimmp = &dimmlist[nelem];
                /*
                 * All three per-rank arrays are filled by the same loop
                 * below, so all are sized by the loop bound
                 * MC_CHIP_DIMMRANKMAX rather than by a literal.
                 */
                uint64_t csnums[MC_CHIP_DIMMRANKMAX];
                char csname[MC_CHIP_DIMMRANKMAX][MCDCFG_CSNAMELEN];
                char *csnamep[MC_CHIP_DIMMRANKMAX];
                int ncs = 0;

                (void) nvlist_alloc(dimmp, NV_UNIQUE_NAME, KM_SLEEP);

                mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_NUM, 1);
                mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_SIZE, 1);

                /* Pack the populated rank slots to the front of the arrays */
                for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
                        if (mcd->mcd_cs[i] != NULL) {
                                csnums[ncs] =
                                    mcd->mcd_cs[i]->mccs_props.csp_num;
                                mcdcfg_csname(mc->mc_socket, mcd->mcd_csl[i],
                                    csname[ncs], MCDCFG_CSNAMELEN);
                                csnamep[ncs] = csname[ncs];
                                ncs++;
                        }
                }

                (void) nvlist_add_uint64_array(*dimmp, "csnums", csnums, ncs);
                (void) nvlist_add_string_array(*dimmp, "csnames", csnamep, ncs);
        }

        /* Add dimmlist nvlist array even if zero members */
        (void) nvlist_add_nvlist_array(mcnvl, "dimmlist", dimmlist, nelem);
        for (i = 0; i < nelem; i++)
                nvlist_free(dimmlist[i]);
}

/*
 * Add an "htconfig" nvlist to mcnvl built from the saved HyperTransport
 * node id, unit id and routing table registers of mc.  The three route
 * arrays are included only when the coherent node count does not exceed
 * MC_CHIP_MAXNODES.
 */
static void
mc_nvl_add_htconfig(nvlist_t *mcnvl, mc_t *mc)
{
        mc_cfgregs_t *regs = &mc->mc_cfgregs;
        union mcreg_htroute *route =
            (union mcreg_htroute *)&regs->mcr_htroute[0];
        union mcreg_nodeid *nodeid =
            (union mcreg_nodeid *)&regs->mcr_htnodeid;
        union mcreg_unitid *unitid =
            (union mcreg_unitid *)&regs->mcr_htunitid;
        int nnodes = HT_COHERENTNODES(nodeid);
        uint32_t bcast[MC_CHIP_MAXNODES];
        uint32_t resp[MC_CHIP_MAXNODES];
        uint32_t req[MC_CHIP_MAXNODES];
        nvlist_t *htnvl;
        int n;

        (void) nvlist_alloc(&htnvl, NV_UNIQUE_NAME, KM_SLEEP);

        /* Fields of the node id register */
        (void) nvlist_add_uint32(htnvl, "NodeId",
            MCREG_FIELD_CMN(nodeid, NodeId));
        (void) nvlist_add_uint32(htnvl, "CoherentNodes",
            HT_COHERENTNODES(nodeid));
        (void) nvlist_add_uint32(htnvl, "SbNode",
            MCREG_FIELD_CMN(nodeid, SbNode));
        (void) nvlist_add_uint32(htnvl, "LkNode",
            MCREG_FIELD_CMN(nodeid, LkNode));
        (void) nvlist_add_uint32(htnvl, "SystemCoreCount",
            HT_SYSTEMCORECOUNT(nodeid));

        /* Fields of the unit id register */
        (void) nvlist_add_uint32(htnvl, "C0Unit",
            MCREG_FIELD_CMN(unitid, C0Unit));
        (void) nvlist_add_uint32(htnvl, "C1Unit",
            MCREG_FIELD_CMN(unitid, C1Unit));
        (void) nvlist_add_uint32(htnvl, "McUnit",
            MCREG_FIELD_CMN(unitid, McUnit));
        (void) nvlist_add_uint32(htnvl, "HbUnit",
            MCREG_FIELD_CMN(unitid, HbUnit));
        (void) nvlist_add_uint32(htnvl, "SbLink",
            MCREG_FIELD_CMN(unitid, SbLink));

        /* One routing table entry per coherent node */
        if (nnodes <= MC_CHIP_MAXNODES) {
                for (n = 0; n < nnodes; n++) {
                        bcast[n] = MCREG_FIELD_CMN(&route[n], BCRte);
                        resp[n] = MCREG_FIELD_CMN(&route[n], RPRte);
                        req[n] = MCREG_FIELD_CMN(&route[n], RQRte);
                }

                (void) nvlist_add_uint32_array(htnvl, "BroadcastRoutes",
                    bcast, nnodes);
                (void) nvlist_add_uint32_array(htnvl, "ResponseRoutes",
                    resp, nnodes);
                (void) nvlist_add_uint32_array(htnvl, "RequestRoutes",
                    req, nnodes);
        }

        (void) nvlist_add_nvlist(mcnvl, "htconfig", htnvl);
        nvlist_free(htnvl);
}

/*
 * Allocate and return a new nvlist describing mc: the nvlist version, the
 * controller's number, revision, revision name, socket and ecc type, its
 * numeric properties, and the chip-select, dimm and HyperTransport
 * configuration sub-lists.
 */
static nvlist_t *
mc_nvl_create(mc_t *mc)
{
        /* Numeric properties added after the identification members. */
        static const struct {
                mcamd_propcode_t mp_code;
                int mp_reqval;
        } mcprops[] = {
                { MCAMD_PROP_BASE_ADDR, 0 },
                { MCAMD_PROP_LIM_ADDR, 0 },
                { MCAMD_PROP_ILEN, 0 },
                { MCAMD_PROP_ILSEL, 0 },
                { MCAMD_PROP_CSINTLVFCTR, 0 },
                { MCAMD_PROP_DRAMHOLE_SIZE, 0 },
                { MCAMD_PROP_ACCESS_WIDTH, 0 },
                { MCAMD_PROP_CSBANKMAPREG, 0 },
                { MCAMD_PROP_BANKSWZL, 0 },
                { MCAMD_PROP_MOD64MUX, 0 },
                { MCAMD_PROP_SPARECS, 1 },
                { MCAMD_PROP_BADCS, 1 }
        };
        nvlist_t *nvl;
        size_t n;

        (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);

        /*
         * This nvlist is used to populate the topo tree, so changes made
         * here may propagate into changed property names etc. in that
         * tree.  Some topo tree properties will be contracted via ARC,
         * so take care over what is changed here.
         */
        (void) nvlist_add_uint8(nvl, MC_NVLIST_VERSTR, MC_NVLIST_VERS1);

        mc_nvl_add_prop(nvl, mc, MCAMD_PROP_NUM, 0);
        mc_nvl_add_prop(nvl, mc, MCAMD_PROP_REV, 0);
        (void) nvlist_add_string(nvl, "revname", mc->mc_revname);
        mc_nvl_add_socket(nvl, mc);
        mc_nvl_add_ecctype(nvl, mc);

        for (n = 0; n < sizeof (mcprops) / sizeof (mcprops[0]); n++) {
                mc_nvl_add_prop(nvl, mc, mcprops[n].mp_code,
                    mcprops[n].mp_reqval);
        }

        mc_nvl_add_cslist(nvl, mc);
        mc_nvl_add_dimmlist(nvl, mc);
        mc_nvl_add_htconfig(nvl, mc);

        return (nvl);
}

/*
 * Record chip-select mccs and chip-select line csl in the first unused
 * rank slot of dimm mcd, then set the dimm's size to the sum of the sizes
 * of all of its ranks so far.  Each chip-select size is halved when the
 * controller access width is 128 bits.
 */
static void
mc_dimm_csadd(mc_t *mc, mc_dimm_t *mcd, mc_cs_t *mccs, const mcdcfg_csl_t *csl)
{
        int divisor = (mc->mc_props.mcp_accwidth == 128) ? 2 : 1;
        uint64_t total = 0;
        int slot = 0;

        /* Sum the ranks already present, stopping at the first free slot */
        while (slot < MC_CHIP_DIMMRANKMAX && mcd->mcd_cs[slot] != NULL) {
                total += mcd->mcd_cs[slot]->mccs_props.csp_size / divisor;
                slot++;
        }

        if (slot < MC_CHIP_DIMMRANKMAX) {
                mcd->mcd_cs[slot] = mccs;
                mcd->mcd_csl[slot] = csl;
                total += mccs->mccs_props.csp_size / divisor;
        }

        /* There must have been a free slot for the new rank. */
        ASSERT(slot != MC_CHIP_DIMMRANKMAX);

        mcd->mcd_size = total;
}

/*
 * Allocate a zeroed dimm structure for memory controller mc and give it
 * dimm number num.  Linking the dimm to its chip-selects is left to the
 * caller.
 */
static mc_dimm_t *
mc_dimm_create(mc_t *mc, uint_t num)
{
        mc_dimm_t *dimm;

        dimm = kmem_zalloc(sizeof (mc_dimm_t), KM_SLEEP);
        dimm->mcd_num = num;
        dimm->mcd_mc = mc;
        dimm->mcd_hdr.mch_type = MC_NT_DIMM;

        return (dimm);
}

/*
 * The chip-select structure includes an array of dimms associated with
 * that chip-select.  This function fills that array, and also builds
 * the list of all dimms on this memory controller mc_dimmlist.  The
 * caller has filled a structure with all there is to know about the
 * associated dimm(s).
 *
 * For each dimm i described by rsltp the chip-select's mccs_dimm[i],
 * mccs_csl[i] and csp_dimmnums[i] are set, and the chip-select is added
 * to the dimm as a further rank via mc_dimm_csadd().
 */
static void
mc_csdimms_create(mc_t *mc, mc_cs_t *mccs, mcdcfg_rslt_t *rsltp)
{
        mc_dimm_t *found[MC_CHIP_DIMMPERCS];
        mc_dimm_t *mcd;
        int nfound = 0;
        int i;

        /*
         * Has some other chip-select already created this dimm or dimms?
         * Look each one up by topo number.  The result is stored at the
         * index of the rsltp entry it was looked up for, so that found[i]
         * always corresponds to rsltp->dimm[i] whatever the order of
         * mc_dimmlist, and entries with no match are NULL rather than
         * uninitialized.
         */
        for (i = 0; i < rsltp->ndimm; i++) {
                found[i] = NULL;
                for (mcd = mc->mc_dimmlist; mcd != NULL; mcd = mcd->mcd_next) {
                        if (mcd->mcd_num == rsltp->dimm[i].toponum) {
                                found[i] = mcd;
                                nfound++;
                                break;
                        }
                }
        }
        ASSERT(nfound == 0 || nfound == rsltp->ndimm);

        for (i = 0; i < rsltp->ndimm; i++) {
                mcd = found[i];
                if (mcd == NULL) {
                        /* First sighting: create and append to the list */
                        mcd = mc_dimm_create(mc, rsltp->dimm[i].toponum);
                        if (mc->mc_dimmlist == NULL)
                                mc->mc_dimmlist = mcd;
                        else
                                mc->mc_dimmlast->mcd_next = mcd;
                        mc->mc_dimmlast = mcd;
                }

                /*
                 * Link the dimm from the chip-select, record its topo
                 * number, and link the chip-select to the dimm.
                 */
                mccs->mccs_dimm[i] = mcd;
                mccs->mccs_csl[i] = rsltp->dimm[i].cslp;
                mccs->mccs_props.csp_dimmnums[i] = mcd->mcd_num;
                mc_dimm_csadd(mc, mcd, mccs, rsltp->dimm[i].cslp);
        }

        /*
         * The rank number is constant across all constituent dimm(s).
         * This reads rsltp->dimm[0], so rsltp is assumed to describe at
         * least one dimm.
         */
        mccs->mccs_props.csp_dimmrank = rsltp->dimm[0].cslp->csl_rank;
}

/*
 * mc_dimmlist_create runs once all enabled chip-selects (and, on revs F
 * and G, spare or testfailed ones) of the given memory controller have
 * been discovered.  It derives the dimm(s) behind each chip-select.  Note
 * that one csbase/csmask pair may drive up to 2 chip-select lines (in
 * 128 bit mode), and that a single dimm may sit behind 1, 2 or 4
 * chip-selects depending on whether it is single, dual or quadrank.
 */
static void
mc_dimmlist_create(mc_t *mc)
{
        mc_props_t *props = &mc->mc_props;
        x86_chiprev_t chiprev = props->mcp_rev;
        union mcreg_dramcfg_hi *cfghi =
            (union mcreg_dramcfg_hi *)(&mc->mc_cfgregs.mcr_dramcfghi);
        int quad_rdimm = 0;
        int quad_sodimm = 0;
        mc_cs_t *cs;

        /*
         * Quadrank registered dimms:
         *
         * Socket F(1207) reports them through a bit in the dram config
         * high register.
         *
         * Socket 940 gives us no way to tell, so we assume there are none
         * unless the admin has set mc_quadranksupport nonzero in
         * /etc/system.  On systems that export an SMBIOS table a possible
         * optimisation would be to count the dimm slots per cpu - more
         * than 4 would indicate no quadrank support, while 4 or fewer
         * would mean that any enabled upper chip-select implies a
         * quadrank dimm.
         *
         * Other socket types do not support registered dimms.
         */
        if (mc->mc_socket == X86_SOCKET_F1207)
                quad_rdimm = MCREG_FIELD_F_revFG(cfghi, FourRankRDimm);
        else if (mc->mc_socket == X86_SOCKET_940)
                quad_rdimm = (mc_quadranksupport != 0);

        /*
         * Quadrank SO-DIMMs are supported in AM2 and S1g1 packages only,
         * but for every rev F/G part their presence shows in a bit of the
         * dram config high register.
         */
        if (MC_REV_MATCH(chiprev, MC_F_REVS_FG))
                quad_sodimm = MCREG_FIELD_F_revFG(cfghi, FourRankSODimm);

        for (cs = mc->mc_cslist; cs != NULL; cs = cs->mccs_next) {
                mcdcfg_rslt_t rslt;

                /*
                 * When the lookup fails no dimm structures are created
                 * for this chip-select.  Both of its csp_dimmnum members
                 * stay MC_INVALNUM, from which the patounum code can tell
                 * that there is no dimm info for it.
                 */
                if (mcdcfg_lookup(chiprev, props->mcp_mod64mux,
                    props->mcp_accwidth, cs->mccs_props.csp_num,
                    mc->mc_socket, quad_rdimm, quad_sodimm, &rslt) >= 0)
                        mc_csdimms_create(mc, cs, &rslt);
        }
}

/*
 * Allocate a chip-select structure for memory controller mc and fill in
 * its properties from the arguments.  All of its dimm numbers start out
 * as MC_INVALNUM.  If the chip-select is a spare, its number is also
 * recorded as the controller's mcp_sparecs.
 */
static mc_cs_t *
mc_cs_create(mc_t *mc, uint_t num, uint64_t base, uint64_t mask, size_t sz,
    int csbe, int spare, int testfail)
{
        mc_cs_t *cs = kmem_zalloc(sizeof (mc_cs_t), KM_SLEEP);
        mccs_props_t *props = &cs->mccs_props;
        int d;

        cs->mccs_mc = mc;
        cs->mccs_hdr.mch_type = MC_NT_CS;

        props->csp_num = num;
        props->csp_base = base;
        props->csp_mask = mask;
        props->csp_size = sz;
        props->csp_csbe = csbe;
        props->csp_spare = spare;
        props->csp_testfail = testfail;

        /* No dimms are associated with the chip-select yet. */
        for (d = 0; d < MC_CHIP_DIMMPERCS; d++)
                props->csp_dimmnums[d] = MC_INVALNUM;

        if (spare)
                mc->mc_props.mcp_sparecs = num;

        return (cs);
}

/*
 * Post a FM_EREPORT_CPU_AMD_MC_TESTFAIL ereport for every chip-select of
 * this mc that is marked TestFail.  The unum passed with the ereport
 * names the chip, chip-select and rank; its channel, offset and dimm
 * numbers are set to their invalid values.
 */
static void
mc_report_testfails(mc_t *mc)
{
        mc_unum_t unum;
        mc_cs_t *cs;
        int d;

        for (cs = mc->mc_cslist; cs != NULL; cs = cs->mccs_next) {
                if (!cs->mccs_props.csp_testfail)
                        continue;

                unum.unum_board = 0;
                unum.unum_chip = mc->mc_props.mcp_num;
                unum.unum_mc = 0;
                unum.unum_chan = MC_INVALNUM;
                unum.unum_cs = cs->mccs_props.csp_num;
                unum.unum_rank = cs->mccs_props.csp_dimmrank;
                unum.unum_offset = MCAMD_RC_INVALID_OFFSET;
                for (d = 0; d < MC_CHIP_DIMMPERCS; d++)
                        unum.unum_dimms[d] = MC_INVALNUM;

                mcamd_ereport_post(mc, FM_EREPORT_CPU_AMD_MC_TESTFAIL, &unum,
                    FM_EREPORT_PAYLOAD_FLAGS_CPU_AMD_MC_TESTFAIL);
        }
}

/*
 * Function 0 - HyperTransport Technology Configuration
 *
 * Save the node id and unit id registers in mc->mc_cfgregs, followed by
 * one routing table register for each coherent node reported by the
 * node id register.
 */
static void
mc_mkprops_htcfg(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
{
        mc_cfgregs_t *regs = &mc->mc_cfgregs;
        union mcreg_nodeid nid;
        off_t regoff = MC_HT_REG_RTBL_NODE_0;
        int nnodes;
        int n;

        MCREG_VAL32(&nid) = mc_pcicfg_get32(cfghdl, MC_HT_REG_NODEID);
        regs->mcr_htnodeid = MCREG_VAL32(&nid);

        regs->mcr_htunitid = mc_pcicfg_get32(cfghdl, MC_HT_REG_UNITID);

        nnodes = HT_COHERENTNODES(&nid);
        for (n = 0; n < nnodes; n++) {
                regs->mcr_htroute[n] = mc_pcicfg_get32(cfghdl, regoff);
                regoff += MC_HT_REG_RTBL_INCR;
        }
}

/*
 * Function 1 Configuration - Address Map (see BKDG 3.4.4 DRAM Address Map)
 *
 * Read the Function 1 DRAM base/limit register pair for the node being
 * attached.  The base gives the first system address mapped at the node
 * and the limit the last; regions of different nodes should not overlap
 * unless node-interleaving is enabled.  The base register also carries the
 * node-interleave enable (IntlvEn).  The limit register carries IntlvSel,
 * which selects the 4K blocks routed to this node, and the destination
 * node ID for addresses within [base, limit] - this must match the pair
 * number.
 */
static void
mc_mkprops_addrmap(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
{
        mc_cfgregs_t *regs = &mc->mc_cfgregs;
        mc_props_t *props = &mc->mc_props;
        int node = mc->mc_props.mcp_num;
        union mcreg_drambase dbase;
        union mcreg_dramlimit dlim;
        union mcreg_dramhole dhole;

        MCREG_VAL32(&dbase) = mc_pcicfg_get32(cfghdl,
            MC_AM_REG_DRAMBASE_0 + node * MC_AM_REG_DRAM_INCR);
        regs->mcr_drambase = MCREG_VAL32(&dbase);

        MCREG_VAL32(&dlim) = mc_pcicfg_get32(cfghdl,
            MC_AM_REG_DRAMLIM_0 + node * MC_AM_REG_DRAM_INCR);
        regs->mcr_dramlimit = MCREG_VAL32(&dlim);

        /*
         * The "cooked" properties are derived only for a node with a
         * nonzero limit whose DstNode is the node we are attaching and
         * whose address range is read or write enabled.
         */
        if (MCREG_FIELD_CMN(&dlim, DRAMLimiti) != 0 &&
            MCREG_FIELD_CMN(&dlim, DstNode) == node &&
            (MCREG_FIELD_CMN(&dbase, WE) || MCREG_FIELD_CMN(&dbase, RE))) {
                props->mcp_base = MC_DRAMBASE(&dbase);
                props->mcp_lim = MC_DRAMLIM(&dlim);
                props->mcp_ilen = MCREG_FIELD_CMN(&dbase, IntlvEn);
                props->mcp_ilsel = MCREG_FIELD_CMN(&dlim, IntlvSel);
        }

        /*
         * The Function 1 DRAM Hole Address Register, introduced in
         * revision E, says which node(s) own the DRAM space hoisted above
         * 4GB and gives the hole base and offset for this node.
         */
        if (!MC_REV_ATLEAST(props->mcp_rev, MC_F_REV_E))
                return;

        MCREG_VAL32(&dhole) = mc_pcicfg_get32(cfghdl, MC_AM_REG_HOLEADDR);
        regs->mcr_dramhole = MCREG_VAL32(&dhole);

        if (MCREG_FIELD_CMN(&dhole, DramHoleValid))
                props->mcp_dramhole_size = MC_DRAMHOLE_SIZE(&dhole);
}

/*
 * Read some function 3 parameters via PCI Mechanism 1 accesses (which
 * will serialize any NB accesses).  The NB configuration register is
 * always saved.  On revs F and G the spare control register is saved as
 * well and, if it shows a completed swap, the bad chip-select number is
 * recorded in the controller properties.
 */
static void
mc_getmiscctl(mc_t *mc)
{
        mc_cfgregs_t *regs = &mc->mc_cfgregs;
        x86_chiprev_t chiprev = mc->mc_props.mcp_rev;
        union mcreg_nbcfg cfg;
        union mcreg_sparectl spare;

        MCREG_VAL32(&cfg) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
            MC_CTL_REG_NBCFG);
        regs->mcr_nbcfg = MCREG_VAL32(&cfg);

        if (!MC_REV_MATCH(chiprev, MC_F_REVS_FG))
                return;

        MCREG_VAL32(&spare) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
            MC_CTL_REG_SPARECTL);
        regs->mcr_sparectl = MCREG_VAL32(&spare);

        if (MCREG_FIELD_F_revFG(&spare, SwapDone))
                mc->mc_props.mcp_badcs = MCREG_FIELD_F_revFG(&spare, BadDramCs);
}

/*
 * Compare the chip-selects that csapp and csbpp point to by base address.
 * Returns -1, 0 or 1 as the first base is below, equal to or above the
 * second.
 */
static int
csbasecmp(mc_cs_t **csapp, mc_cs_t **csbpp)
{
        uint64_t lhs = (*csapp)->mccs_props.csp_base;
        uint64_t rhs = (*csbpp)->mccs_props.csp_base;

        return ((lhs > rhs) - (lhs < rhs));
}

/*
 * The following are for use in simulating TestFail for a chip-select
 * without poking at the hardware (which tends to get upset if you do
 * since the BIOS needs to restart to map a failed cs out).  For internal
 * testing only!  Note that setting these does not give the full experience -
 * the selected chip-select *is* enabled and can give errors etc and the
 * patounum logic will get confused.
 *
 * Both default to -1.
 */
int testfail_mcnum = -1;
int testfail_csnum = -1;

/*
 * Function 2 configuration - DRAM Controller
 *
 * Reads the function 2 registers through cfghdl and records in mc:
 *   - DRAM configuration low/high and the resulting access width,
 *   - the Mod64Mux and bank swizzle indications (revision dependent),
 *   - the bank address mapping register,
 *   - a chip-select node for every enabled, spare or TestFail chip-select,
 *   - the chip-select interleave factor and whether the enabled
 *     chip-selects leave a hole between them.
 * It then reads the function 3 parameters via mc_getmiscctl() and builds
 * the DIMM list with mc_dimmlist_create().
 */
static void
mc_mkprops_dramctl(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
{
        union mcreg_csbase base[MC_CHIP_NCS];
        union mcreg_csmask mask[MC_CHIP_NCS];
        union mcreg_dramcfg_lo drcfg_lo;
        union mcreg_dramcfg_hi drcfg_hi;
        union mcreg_drammisc drmisc;
        union mcreg_bankaddrmap baddrmap;
        mc_props_t *mcp = &mc->mc_props;
        mc_cfgregs_t *mcr = &mc->mc_cfgregs;
        int maskdivisor;
        int wide = 0;
        x86_chiprev_t rev = mc->mc_props.mcp_rev;
        int i;
        mcamd_hdl_t hdl;

        mcamd_mkhdl(&hdl);      /* to call into common code */

        /*
         * Read Function 2 DRAM Configuration High and Low registers.  The High
         * part is mostly concerned with memory clocks etc and we'll not have
         * any use for that.  The Low component tells us if ECC is enabled,
         * if we're in 64- or 128-bit MC mode, how the upper chip-selects
         * are mapped, which chip-select pairs are using x4 parts, etc.
         */
        MCREG_VAL32(&drcfg_lo) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGLO);
        MCREG_VAL32(&drcfg_hi) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGHI);
        mcr->mcr_dramcfglo = MCREG_VAL32(&drcfg_lo);
        mcr->mcr_dramcfghi = MCREG_VAL32(&drcfg_hi);

        /*
         * Note the DRAM controller width.  The 64/128 bit is in a different
         * bit position for revision F and G.
         */
        if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
                wide = MCREG_FIELD_F_revFG(&drcfg_lo, Width128);
        } else {
                wide = MCREG_FIELD_F_preF(&drcfg_lo, Width128);
        }
        mcp->mcp_accwidth = wide ? 128 : 64;

        /*
         * Read Function 2 DRAM Controller Miscellaneous Register for those
         * revs that support it.  This includes the Mod64Mux indication on
         * these revs - for rev E it is in DRAM config low.
         */
        if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
                mcr->mcr_drammisc = MCREG_VAL32(&drmisc) =
                    mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMMISC);
                mcp->mcp_mod64mux = MCREG_FIELD_F_revFG(&drmisc, Mod64Mux);
        } else if (MC_REV_MATCH(rev, MC_F_REV_E)) {
                mcp->mcp_mod64mux = MCREG_FIELD_F_preF(&drcfg_lo, Mod64BitMux);
        }

        /*
         * Read Function 2 DRAM Bank Address Mapping.  This encodes the
         * type of DIMM module in use for each chip-select pair.
         * Prior to revision F it also tells us whether BankSwizzle mode
         * is enabled - in rev F that has moved to dram config hi register.
         */
        mcp->mcp_csbankmapreg = MCREG_VAL32(&baddrmap) =
            mc_pcicfg_get32(cfghdl, MC_DC_REG_BANKADDRMAP);

        /*
         * Determine whether bank swizzle mode is active.  Bank swizzling was
         * introduced as an option in rev E,  but the bit that indicates it
         * is enabled has moved in revs F/G.
         */
        if (MC_REV_MATCH(rev, MC_F_REV_E)) {
                mcp->mcp_bnkswzl =
                    MCREG_FIELD_F_preF(&baddrmap, BankSwizzleMode);
        } else if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
                mcp->mcp_bnkswzl = MCREG_FIELD_F_revFG(&drcfg_hi,
                    BankSwizzleMode);
        }

        /*
         * Read the DRAM CS Base and DRAM CS Mask registers.  Revisions prior
         * to F have an equal number of base and mask registers; revision F
         * has twice as many base registers as masks.  Consequently the mask
         * for chip-select i is always found at mask[i / maskdivisor].
         */
        maskdivisor = MC_REV_MATCH(rev, MC_F_REVS_FG) ? 2 : 1;

        mc_prop_read_pair(cfghdl,
            (uint32_t *)base, MC_DC_REG_CSBASE_0, MC_CHIP_NCS,
            (uint32_t *)mask, MC_DC_REG_CSMASK_0, MC_CHIP_NCS / maskdivisor,
            MC_DC_REG_CS_INCR);

        /*
         * Create a cs node for each enabled chip-select as well as
         * any appointed online spare chip-selects and for any that have
         * failed test.
         */
        for (i = 0; i < MC_CHIP_NCS; i++) {
                mc_cs_t *mccs;
                uint64_t csbase, csmask;
                size_t sz;
                int csbe, spare, testfail;

                /* Spare and TestFail are only decoded on revisions F/G */
                if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
                        csbe = MCREG_FIELD_F_revFG(&base[i], CSEnable);
                        spare = MCREG_FIELD_F_revFG(&base[i], Spare);
                        testfail = MCREG_FIELD_F_revFG(&base[i], TestFail);
                } else {
                        csbe = MCREG_FIELD_F_preF(&base[i], CSEnable);
                        spare = 0;
                        testfail = 0;
                }

                /* Testing hook */
                if (testfail_mcnum != -1 && testfail_csnum != -1 &&
                    mcp->mcp_num == testfail_mcnum && i == testfail_csnum) {
                        csbe = spare = 0;
                        testfail = 1;
                        cmn_err(CE_NOTE, "Pretending MC %d CS %d failed test",
                            testfail_mcnum, testfail_csnum);
                }

                /*
                 * If the chip-select is not enabled then skip it unless
                 * it is a designated online spare or is marked with TestFail.
                 */
                if (!csbe && !(spare || testfail))
                        continue;

                /*
                 * For an enabled or spare chip-select the Bank Address Mapping
                 * register will be valid as will the chip-select mask.  The
                 * base will not be valid but we'll read and store it anyway.
                 * We will not know whether the spare is already swapped in
                 * until MC function 3 attaches.
                 *
                 * A chip-select whose size cannot be determined is skipped
                 * entirely; a TestFail-only chip-select gets zero size, base
                 * and mask.
                 */
                if (csbe || spare) {
                        if (mcamd_cs_size(&hdl, (mcamd_node_t *)mc, i, &sz) < 0)
                                continue;
                        csbase = MC_CSBASE(&base[i], rev);
                        csmask = MC_CSMASK(&mask[i / maskdivisor], rev);
                } else {
                        sz = 0;
                        csbase = csmask = 0;
                }

                mccs = mc_cs_create(mc, i, csbase, csmask, sz,
                    csbe, spare, testfail);

                /* Append the new node to the tail of this MC's cs list */
                if (mc->mc_cslist == NULL)
                        mc->mc_cslist = mccs;
                else
                        mc->mc_cslast->mccs_next = mccs;
                mc->mc_cslast = mccs;

                /* Keep the raw register values alongside the decoded ones */
                mccs->mccs_cfgregs.csr_csbase = MCREG_VAL32(&base[i]);
                mccs->mccs_cfgregs.csr_csmask =
                    MCREG_VAL32(&mask[i / maskdivisor]);

                /*
                 * Check for cs bank interleaving - some bits clear in the
                 * lower mask.  All banks must/will have the same lomask bits
                 * if cs interleaving is active.  This is evaluated only for
                 * the first enabled chip-select encountered, since the factor
                 * is non-zero thereafter.
                 */
                if (csbe && !mcp->mcp_csintlvfctr) {
                        int bitno, ibits = 0;
                        for (bitno = MC_CSMASKLO_LOBIT(rev);
                            bitno <= MC_CSMASKLO_HIBIT(rev); bitno++) {
                                if (!(csmask & (1 << bitno)))
                                        ibits++;
                        }
                        mcp->mcp_csintlvfctr = 1 << ibits;
                }
        }

        /*
         * If there is no chip-select interleave on this node determine
         * whether the chip-select ranks are contiguous or if there
         * is a hole.
         */
        if (mcp->mcp_csintlvfctr == 1) {
                mc_cs_t *csp[MC_CHIP_NCS];
                mc_cs_t *mccs;
                int ncsbe = 0;

                /* Collect just the enabled chip-selects */
                for (mccs = mc->mc_cslist; mccs != NULL;
                    mccs = mccs->mccs_next) {
                        if (mccs->mccs_props.csp_csbe)
                                csp[ncsbe++] = mccs;
                }

                if (ncsbe != 0) {
                        /* Sort by base so that neighbours can be compared */
                        qsort((void *)csp, ncsbe, sizeof (mc_cs_t *),
                            (int (*)(const void *, const void *))csbasecmp);

                        /*
                         * Each chip-select should begin exactly where the
                         * previous one ends; anything else is a hole.
                         */
                        for (i = 1; i < ncsbe; i++) {
                                if (csp[i]->mccs_props.csp_base !=
                                    csp[i - 1]->mccs_props.csp_base +
                                    csp[i - 1]->mccs_props.csp_size)
                                        mc->mc_csdiscontig = 1;
                        }
                }
        }


        /*
         * Since we do not attach to MC function 3 go ahead and read some
         * config parameters from it now.
         */
        mc_getmiscctl(mc);

        /*
         * Now that we have discovered all enabled/spare/testfail chip-selects
         * we divine the associated DIMM configuration.
         */
        mc_dimmlist_create(mc);
}

/*
 * Describes one memory-controller PCI function that the driver attaches to:
 * the binding name it is recognised by, its function number, the "model"
 * property string to publish, and the routine that reads its config space.
 */
typedef struct mc_bind_map {
        const char *bm_bindnm;   /* attachment binding name */
        enum mc_funcnum bm_func; /* PCI config space function number for bind */
        const char *bm_model;    /* value for device node model property */
        /* property reader for this function; mc_attach() skips it if NULL */
        void (*bm_mkprops)(mc_pcicfg_hdl_t, mc_t *);
} mc_bind_map_t;

/*
 * Do not attach to MC function 3 - agpgart already attaches to that.
 * Function 3 may be a good candidate for a nexus driver to fan it out
 * into virtual devices by functionality.  We will use pci_mech1_getl
 * to retrieve the function 3 parameters we require.
 *
 * The table is terminated by an entry whose bm_bindnm is NULL; mc_attach()
 * walks it until it reaches that entry.
 */

static const mc_bind_map_t mc_bind_map[] = {
        { MC_FUNC_HTCONFIG_BINDNM, MC_FUNC_HTCONFIG,
            "AMD Memory Controller (HT Configuration)", mc_mkprops_htcfg },
        { MC_FUNC_ADDRMAP_BINDNM, MC_FUNC_ADDRMAP,
            "AMD Memory Controller (Address Map)", mc_mkprops_addrmap },
        { MC_FUNC_DRAMCTL_BINDNM, MC_FUNC_DRAMCTL,
            "AMD Memory Controller (DRAM Controller & HT Trace)",
            mc_mkprops_dramctl },
        NULL
};

/*
 * open(9E) entry point.  Only character opens are accepted, and the minor
 * number must name a chip for which a memory controller is known.
 */
/*ARGSUSED*/
static int
mc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
        int err = 0;

        if (otyp != OTYP_CHR)
                return (EINVAL);

        /* The lookup walks the MC list, so hold the lock as reader */
        rw_enter(&mc_lock, RW_READER);
        if (mc_lookup_by_chipid(getminor(*devp)) == NULL)
                err = EINVAL;
        rw_exit(&mc_lock);

        return (err);
}

/*
 * close(9E) entry point.  mc_open() keeps no per-open state, so there is
 * nothing to undo here and the close always succeeds.
 */
/*ARGSUSED*/
static int
mc_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
        return (0);
}

/*
 * Enable swap from chip-select csnum to the spare chip-select on this
 * memory controller (if any).
 *
 * mc_swapdonetime bounds how long mc_onlinespare() polls for completion;
 * values above 120 are treated as 120 by that function.
 */

int mc_swapdonetime = 30;       /* max number of seconds to wait for SwapDone */

/*
 * Swap chip-select csnum out in favour of the online spare on memory
 * controller mc.  The caller must hold mc_lock as writer.
 *
 * Returns 0 on success, otherwise:
 *   ENOTSUP  the MC revision is not F/G and so has no online spare
 *   ENODEV   no spare is configured, or the DRAM scrubber is not enabled
 *   EBUSY    a spare has already been swapped in
 *   EINVAL   csnum is the spare itself or is not a known chip-select
 *   ETIME    SwapDone was not observed within the polling period
 *
 * On success the bad chip-select number, the final spare control register
 * value and the completion time are recorded in mc.
 */
static int
mc_onlinespare(mc_t *mc, int csnum)
{
        mc_props_t *mcp = &mc->mc_props;
        union mcreg_sparectl sparectl;
        union mcreg_scrubctl scrubctl;
        mc_cs_t *mccs;
        hrtime_t tmax;
        int i = 0;

        ASSERT(RW_WRITE_HELD(&mc_lock));

        if (!MC_REV_MATCH(mcp->mcp_rev, MC_F_REVS_FG))
                return (ENOTSUP);       /* MC rev does not offer online spare */
        else if (mcp->mcp_sparecs == MC_INVALNUM)
                return (ENODEV);        /* Supported, but no spare configured */
        else if (mcp->mcp_badcs != MC_INVALNUM)
                return (EBUSY);         /* Spare already swapped in */
        else if (csnum == mcp->mcp_sparecs)
                return (EINVAL);        /* Can't spare the spare! */

        /* The nominated chip-select must be one we created a node for */
        for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
                if (mccs->mccs_props.csp_num == csnum)
                        break;
        }
        if (mccs == NULL)
                return (EINVAL);        /* nominated bad CS does not exist */

        /*
         * If the DRAM Scrubber is not enabled then the swap cannot succeed.
         */
        MCREG_VAL32(&scrubctl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
            MC_CTL_REG_SCRUBCTL);
        if (MCREG_FIELD_CMN(&scrubctl, DramScrub) == 0)
                return (ENODEV);        /* DRAM scrubber not enabled */

        /*
         * Read Online Spare Control Register again, just in case our
         * state does not reflect reality.
         */
        MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
            MC_CTL_REG_SPARECTL);

        if (MCREG_FIELD_F_revFG(&sparectl, SwapDone))
                return (EBUSY);

        /* Write to the BadDramCs field */
        MCREG_FIELD_F_revFG(&sparectl, BadDramCs) = csnum;
        mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
            MCREG_VAL32(&sparectl));

        /* And request that the swap to the spare start */
        MCREG_FIELD_F_revFG(&sparectl, SwapEn) = 1;
        mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
            MCREG_VAL32(&sparectl));

        /*
         * Poll for SwapDone - we have disabled notification by interrupt.
         * Swap takes "several CPU cycles, depending on the DRAM speed, but
         * is performed in the background" (Family 0Fh Bios Porting Guide).
         * We're in a slow ioctl path so there is no harm in waiting around
         * a bit - consumers of the ioctl must be aware that it may take
         * a moment.  We will poll for up to mc_swapdonetime seconds,
         * limiting that to 120s.
         *
         * The swap is performed by the DRAM scrubber (which must be enabled)
         * whose scrub rate is accelerated for the duration of the swap.
         * The maximum swap rate is 40.0ns per 64 bytes, so the maximum
         * supported cs size of 16GB would take 10.7s at that max rate
         * of 25000000 scrubs/second.
         *
         * The loop always delays and re-reads at least once before the
         * deadline is consulted.
         */
        tmax = gethrtime() + MIN(mc_swapdonetime, 120) * 1000000000ULL;
        do {
                if (i++ < 20)
                        delay(drv_usectohz(100000));    /* 0.1s for up to 2s */
                else
                        delay(drv_usectohz(500000));    /* 0.5s */

                MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc,
                    MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
        } while (!MCREG_FIELD_F_revFG(&sparectl, SwapDone) &&
            gethrtime() < tmax);

        if (!MCREG_FIELD_F_revFG(&sparectl, SwapDone))
                return (ETIME);         /* Operation timed out */

        /* Swap complete: bring our cached state in line with the hardware */
        mcp->mcp_badcs = csnum;
        mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl);
        mc->mc_spareswaptime = gethrtime();

        return (0);
}

/*
 * ioctl(9E) entry point.  Three commands are understood:
 *
 *   MC_IOC_SNAPSHOT_INFO   copy out the size and generation of the snapshot
 *   MC_IOC_SNAPSHOT        copy out the snapshot itself
 *   MC_IOC_ONLINESPARE_EN  swap chip-select 'arg' for the online spare;
 *                          requires privilege and the lock as writer
 *
 * Returns 0 on success or an errno value.  mc_lock is taken as reader on
 * entry and is released on every path out of the function.
 */
/*ARGSUSED*/
static int
mc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
        mc_t *mc;
        int err = 0;

        switch (cmd) {
        case MC_IOC_SNAPSHOT_INFO:
        case MC_IOC_SNAPSHOT:
        case MC_IOC_ONLINESPARE_EN:
                break;
        default:
                return (EINVAL);
        }

        rw_enter(&mc_lock, RW_READER);

        mc = mc_lookup_by_chipid(getminor(dev));
        if (mc == NULL) {
                rw_exit(&mc_lock);
                return (EINVAL);
        }

        if (cmd == MC_IOC_SNAPSHOT_INFO) {
                mc_snapshot_info_t info;

                if (mc_snapshot_update(mc) < 0) {
                        err = EIO;
                } else {
                        info.mcs_size = mc->mc_snapshotsz;
                        info.mcs_gen = mc->mc_snapshotgen;

                        if (ddi_copyout(&info, (void *)arg,
                            sizeof (mc_snapshot_info_t), mode) < 0)
                                err = EFAULT;
                }
        } else if (cmd == MC_IOC_SNAPSHOT) {
                if (mc_snapshot_update(mc) < 0) {
                        err = EIO;
                } else if (ddi_copyout(mc->mc_snapshot, (void *)arg,
                    mc->mc_snapshotsz, mode) < 0) {
                        err = EFAULT;
                }
        } else {
                /* MC_IOC_ONLINESPARE_EN */
                if (drv_priv(credp) != 0) {
                        err = EPERM;
                } else if (!rw_tryupgrade(&mc_lock)) {
                        /* Could not become writer without dropping the lock */
                        err = EAGAIN;
                } else if ((err = mc_onlinespare(mc, (int)arg)) == 0) {
                        /* Configuration changed: rebuild the cached views */
                        mc_snapshot_destroy(mc);
                        nvlist_free(mc->mc_nvl);
                        mc->mc_nvl = mc_nvl_create(mc);
                }
        }

        rw_exit(&mc_lock);

        return (err);
}

/*
 * Character device entry points.  Only open, close and ioctl are
 * implemented; everything else is stubbed out.
 */
static struct cb_ops mc_cb_ops = {
        mc_open,
        mc_close,
        nodev,          /* not a block driver */
        nodev,          /* no print routine */
        nodev,          /* no dump routine */
        nodev,          /* no read routine */
        nodev,          /* no write routine */
        mc_ioctl,
        nodev,          /* no devmap routine */
        nodev,          /* no mmap routine */
        nodev,          /* no segmap routine */
        nochpoll,       /* no chpoll routine */
        ddi_prop_op,
        0,              /* not a STREAMS driver */
        D_NEW | D_MP,   /* safe for multi-thread/multi-processor */
};

/*
 * getinfo(9E) entry point.  Translates the dev_t passed in 'arg' into either
 * the dev_info pointer or the instance number of the MC_FUNC_DEVIMAP function
 * of the matching memory controller.
 *
 * Unsupported commands set *result to NULL and fail.  A failed lookup also
 * fails, but leaves *result untouched.
 */
/*ARGSUSED*/
static int
mc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int ret = DDI_FAILURE;
        mc_t *mc;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
        case DDI_INFO_DEVT2INSTANCE:
                break;
        default:
                *result = NULL;
                return (DDI_FAILURE);
        }

        rw_enter(&mc_lock, RW_READER);

        mc = mc_lookup_by_chipid(getminor((dev_t)arg));
        if (mc != NULL && mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi != NULL) {
                if (infocmd == DDI_INFO_DEVT2DEVINFO) {
                        *result = mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi;
                } else {
                        *result = (void *)(uintptr_t)
                            mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_instance;
                }
                ret = DDI_SUCCESS;
        }

        rw_exit(&mc_lock);

        return (ret);
}

/*
 * FMA error callback registered by mc_fm_init().  Posts any PCI ereports
 * for the device and then hands back the status carried in fmerr.
 */
/*ARGSUSED2*/
static int
mc_fm_handle(dev_info_t *dip, ddi_fm_error_t *fmerr, const void *arg)
{
        int status;

        pci_ereport_post(dip, fmerr, NULL);
        status = fmerr->fme_status;

        return (status);
}

/*
 * Set up fault management for a device node: declare ereport and
 * error-callback capability, prepare PCI ereport generation and register
 * mc_fm_handle() as the error handler.
 */
static void
mc_fm_init(dev_info_t *dip)
{
        int caps = DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE;

        ddi_fm_init(dip, &caps, NULL);
        pci_ereport_setup(dip);
        ddi_fm_handler_register(dip, mc_fm_handle, NULL);
}

/*
 * Look up the SMBIOS chip instance and base board information for the
 * device node 'dip' and store them in mc.
 *
 * The lookup key is derived from the first entry of the node's "reg"
 * property.  If that property cannot be read, or the chip instance lookup
 * fails, mc is left unmodified.  Failures are only logged on DEBUG kernels.
 */
static void
mc_read_smbios(mc_t *mc, dev_info_t *dip)
{
        pci_regspec_t *regp = NULL;
        int reglen = 0;
        uint32_t phys_hi;
        uint16_t bdf;
        uint_t chip_inst;

        if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
            (caddr_t)&regp, &reglen) != DDI_SUCCESS) {
                if (regp != NULL)
                        kmem_free(regp, reglen);
                return;
        }

        /* Extract what we need, then release the property buffer at once */
        phys_hi = regp->pci_phys_hi;
        bdf = (uint16_t)(PCI_REG_BDFR_G(phys_hi) >> PCI_REG_FUNC_SHIFT);
        kmem_free(regp, reglen);
        regp = NULL;

        if (fm_smb_mc_chipinst(bdf, &chip_inst) != 0) {
#ifdef DEBUG
                cmn_err(CE_NOTE, "!mc read smbios chip info failed");
#endif /* DEBUG */
                return;
        }
        mc->smb_chipid = chip_inst;

        mc->smb_bboard = fm_smb_mc_bboards(bdf);
#ifdef DEBUG
        if (mc->smb_bboard == NULL)
                cmn_err(CE_NOTE,
                    "!mc read smbios base boards info failed");
#endif /* DEBUG */
}

/*
 * cmi_hdl_walk() callback used by mc_create().  arg1 points at the chipid
 * being sought and arg2 at the cmi_hdl_t to fill in.  The first handle on
 * that chip is held, stored through arg2, and ends the walk.
 */
/*ARGSUSED*/
static int
mc_create_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
{
        chipid_t chipid = *((chipid_t *)arg1);
        cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;

        if (cmi_hdl_chipid(whdl) != chipid)
                return (CMI_HDL_WALK_NEXT);

        cmi_hdl_hold(whdl);     /* short-term hold */
        *hdlp = whdl;

        return (CMI_HDL_WALK_DONE);
}

/*
 * Allocate and initialize the mc_t for chip 'chipid' and append it to the
 * global list of memory controllers.  The caller must hold mc_lock as
 * writer.
 *
 * Returns the new mc_t, or NULL if no cpu handle exists for this chip.
 */
static mc_t *
mc_create(chipid_t chipid, dev_info_t *dip)
{
        cmi_hdl_t cpuhdl = NULL;
        mc_t *newmc;

        ASSERT(RW_WRITE_HELD(&mc_lock));

        /*
         * Any one cpu of the chip will do as a source of revision and
         * socket information, since all cores of a chip share the same
         * revision and socket type.  The callback returns with the handle
         * held.
         */
        cmi_hdl_walk(mc_create_cb, (void *)&chipid, (void *)&cpuhdl, NULL);
        if (cpuhdl == NULL)
                return (NULL);  /* no cpu for this chipid found! */

        newmc = kmem_zalloc(sizeof (mc_t), KM_SLEEP);

        newmc->mc_hdr.mch_type = MC_NT_MC;
        newmc->mc_props.mcp_num = chipid;
        newmc->mc_props.mcp_sparecs = MC_INVALNUM;
        newmc->mc_props.mcp_badcs = MC_INVALNUM;

        newmc->mc_props.mcp_rev = cmi_hdl_chiprev(cpuhdl);
        newmc->mc_revname = cmi_hdl_chiprevstr(cpuhdl);
        newmc->mc_socket = cmi_hdl_getsockettype(cpuhdl);

        mc_read_smbios(newmc, dip);

        /* Link the new controller onto the tail of the global list */
        if (mc_list == NULL)
                mc_list = newmc;
        if (mc_last != NULL)
                mc_last->mc_next = newmc;
        newmc->mc_next = NULL;
        mc_last = newmc;

        /* Drop the short-term hold taken in mc_create_cb() */
        cmi_hdl_rele(cpuhdl);

        return (newmc);
}

/*
 * Choose the faster of two scrub rates.  r1 is given directly; r2 is the
 * field extracted from the register value 'cfg' as (cfg & mask) >> shift.
 *
 * A rate of zero means scrubbing is off, so when either rate is zero the
 * other one is returned (which may itself be zero).  When both are non-zero
 * the numerically smaller value - the maximum rate - is returned.
 */
static uint32_t
mc_scrubber_max(uint32_t r1, uint32_t cfg, uint32_t mask, uint32_t shift)
{
        uint32_t r2 = (cfg & mask) >> shift;

        if (r1 == 0)
                return (r2);

        if (r2 == 0)
                return (r1);

        return (r1 < r2 ? r1 : r2);
}


/*
 * Enable the memory scrubber.  We must use the mc_pcicfg_{get32,put32}_nohdl
 * interfaces since we do not bind to function 3.
 *
 * The scrub control and scrub address registers are first captured in
 * mc->mc_cfgregs.  Under MC_SCRUB_BIOSDEFAULT nothing is written and the
 * result simply reflects whether the BIOS left DRAM scrubbing on.  Otherwise
 * scrubbing is stopped, the scrub address is pointed at this node's base
 * with redirect enabled, a rate is chosen according to mc_scrub_policy, and
 * scrubbing is restarted unless one of the errata below rules it out.
 *
 * Returns CMI_SUCCESS if DRAM scrubbing is active on return, otherwise
 * CMIERR_MC_NOMEMSCRUB.
 *
 * NOTE(review): mc_scrub_rate_dram is not a local of this function, yet it
 * is overwritten here (clamp, MC_SCRUB_MAX result, errata disable), so the
 * value chosen for one memory controller is the starting value for the next
 * call - confirm that this carry-over between nodes is intended.
 */
cmi_errno_t
mc_scrubber_enable(mc_t *mc)
{
        mc_props_t *mcp = &mc->mc_props;
        chipid_t chipid = (chipid_t)mcp->mcp_num;
        x86_chiprev_t rev = (x86_chiprev_t)mcp->mcp_rev;
        mc_cfgregs_t *mcr = &mc->mc_cfgregs;
        union mcreg_scrubctl scrubctl;
        union mcreg_dramscrublo dalo;
        union mcreg_dramscrubhi dahi;

        mcr->mcr_scrubctl = MCREG_VAL32(&scrubctl) =
            mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL);

        mcr->mcr_scrubaddrlo = MCREG_VAL32(&dalo) =
            mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO);

        mcr->mcr_scrubaddrhi = MCREG_VAL32(&dahi) =
            mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI);

        /* Leave the BIOS settings alone and just report what they are */
        if (mc_scrub_policy == MC_SCRUB_BIOSDEFAULT)
                return (MCREG_FIELD_CMN(&scrubctl, DramScrub) !=
                    AMD_NB_SCRUBCTL_RATE_NONE ?
                    CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);

        /*
         * Disable DRAM scrubbing while we fiddle.
         */
        MCREG_FIELD_CMN(&scrubctl, DramScrub) = AMD_NB_SCRUBCTL_RATE_NONE;
        mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
            MCREG_VAL32(&scrubctl));

        /*
         * Setup DRAM Scrub Address Low and High registers for the
         * base address of this node, and to select scrubber redirect.
         */
        MCREG_FIELD_CMN(&dalo, ScrubReDirEn) = 1;
        MCREG_FIELD_CMN(&dalo, ScrubAddrLo) =
            AMD_NB_SCRUBADDR_MKLO(mcp->mcp_base);

        MCREG_FIELD_CMN(&dahi, ScrubAddrHi) =
            AMD_NB_SCRUBADDR_MKHI(mcp->mcp_base);

        mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO,
            MCREG_VAL32(&dalo));
        mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI,
            MCREG_VAL32(&dahi));

        /* Clamp an out-of-range requested rate before it is used */
        if (mc_scrub_rate_dram > AMD_NB_SCRUBCTL_RATE_MAX) {
                cmn_err(CE_WARN, "mc_scrub_rate_dram is too large; "
                    "resetting to 0x%x\n", AMD_NB_SCRUBCTL_RATE_MAX);
                mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_MAX;
        }

        switch (mc_scrub_policy) {
        case MC_SCRUB_FIXED:
                /* Use the system value checked above */
                break;

        default:
                cmn_err(CE_WARN, "Unknown mc_scrub_policy value %d - "
                    "using default policy of MC_SCRUB_MAX", mc_scrub_policy);
                /*FALLTHRU*/

        case MC_SCRUB_MAX:
                /*
                 * Combine the requested rate with the rate that was in the
                 * scrub control register when we read it on entry.
                 */
                mc_scrub_rate_dram = mc_scrubber_max(mc_scrub_rate_dram,
                    mcr->mcr_scrubctl, AMD_NB_SCRUBCTL_DRAM_MASK,
                    AMD_NB_SCRUBCTL_DRAM_SHIFT);
                break;
        }

        /*
         * OPTERON_ERRATUM_99:
         * This erratum applies on revisions D and earlier.
         * This erratum also applies on revisions E and later,
         * if BIOS uses chip-select hoisting instead of DRAM hole
         * mapping.
         *
         * Do not enable the dram scrubber if the chip-select ranges
         * for the node are not contiguous.
         */
        if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
            mc->mc_csdiscontig) {
                cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
                    "%s chip %d because DRAM hole is present on this node",
                    mc->mc_revname, chipid);
                mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
        }

        /*
         * OPTERON_ERRATUM_101:
         * This erratum applies on revisions D and earlier.
         *
         * If the DRAM Base Address register's IntlvEn field indicates that
         * node interleaving is enabled, we must disable the DRAM scrubber
         * and return zero to indicate that Solaris should use s/w instead.
         */
        if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
            mcp->mcp_ilen != 0 &&
            !chiprev_at_least(rev, X86_CHIPREV_AMD_LEGACY_F_REV_E)) {
                cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
                    "%s chip %d because DRAM memory is node-interleaved",
                    mc->mc_revname, chipid);
                mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
        }

        /* Scrubbing was turned off above; only turn it back on if wanted */
        if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE) {
                MCREG_FIELD_CMN(&scrubctl, DramScrub) = mc_scrub_rate_dram;
                mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
                    MCREG_VAL32(&scrubctl));
        }

        return (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE ?
            CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);
}

/*
 * cmi_hdl_walk() callback used by mc_attach().  arg1 is the mc_t and arg2
 * points at its chip number.  Every cpu handle on that chip has the memory
 * controller registered with it; the walk always continues so that all
 * handles are visited.
 */
/*ARGSUSED*/
static int
mc_attach_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
{
        mcamd_prop_t chipid = *((mcamd_prop_t *)arg2);
        mc_t *mc = (mc_t *)arg1;

        if (cmi_hdl_chipid(whdl) != chipid)
                return (CMI_HDL_WALK_NEXT);

        mcamd_mc_register(whdl, mc);

        return (CMI_HDL_WALK_NEXT);
}

/*
 * Count of the times mc_attach() has decided that the software memory
 * scrubber should be off; the disable call is only made when this is zero.
 */
static int mc_sw_scrub_disabled = 0;

/*
 * attach(9E) entry point.  One call is made per memory-controller PCI
 * function listed in mc_bind_map.  Each call finds or creates the mc_t for
 * the chip, records the function's dev_info and instance, publishes the
 * "model" and "chip-id" properties, and runs that function's property
 * reader.  When the MC_FUNC_DEVIMAP function attaches, the minor node is
 * created, the controller is registered with the chip's cpu handles and the
 * scrubbers are configured.
 *
 * DDI_RESUME succeeds without doing anything.  Any other command than
 * DDI_ATTACH, or a non-zero mc_no_attach, fails.
 */
static int
mc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        mc_pcicfg_hdl_t cfghdl;
        const mc_bind_map_t *bm;
        const char *bindnm;
        char *unitstr = NULL;
        enum mc_funcnum func;
        long unitaddr;
        int chipid, rc;
        mc_t *mc;

        /*
         * This driver has no hardware state, but does
         * claim to have a reg property, so it will be
         * called on suspend.  It is probably better to
         * make sure it doesn't get called on suspend,
         * but it is just as easy to make sure we just
         * return DDI_SUCCESS if called.
         */
        if (cmd == DDI_RESUME)
                return (DDI_SUCCESS);

        if (cmd != DDI_ATTACH || mc_no_attach != 0)
                return (DDI_FAILURE);

        /* Identify which MC function this node is from its binding name */
        bindnm = ddi_binding_name(dip);
        for (bm = mc_bind_map; bm->bm_bindnm != NULL; bm++) {
                if (strcmp(bindnm, bm->bm_bindnm) == 0) {
                        func = bm->bm_func;
                        break;
                }
        }

        /* Reached the terminator: not a function we know how to drive */
        if (bm->bm_bindnm == NULL)
                return (DDI_FAILURE);

        /*
         * We need the device number, which corresponds to the processor node
         * number plus 24.  The node number can then be used to associate this
         * memory controller device with a given processor chip.
         */
        if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
            DDI_PROP_DONTPASS, "unit-address", &unitstr) != DDI_PROP_SUCCESS) {
                cmn_err(CE_WARN, "failed to find unit-address for %s", bindnm);
                return (DDI_FAILURE);
        }

        rc = ddi_strtol(unitstr, NULL, 16, &unitaddr);
        ASSERT(rc == 0 && unitaddr >= MC_AMD_DEV_OFFSET);

        if (rc != 0 || unitaddr < MC_AMD_DEV_OFFSET) {
                cmn_err(CE_WARN, "failed to parse unit address %s for %s\n",
                    unitstr, bindnm);
                ddi_prop_free(unitstr);
                return (DDI_FAILURE);
        }
        ddi_prop_free(unitstr);

        chipid = unitaddr - MC_AMD_DEV_OFFSET;

        rw_enter(&mc_lock, RW_WRITER);

        /* Has another function of this chip's MC already attached? */
        for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
                if (mc->mc_props.mcp_num == chipid)
                        break;
        }

        /* Integrate this memory controller device into existing set */
        if (mc == NULL) {
                mc = mc_create(chipid, dip);

                if (mc == NULL) {
                        /*
                         * We don't complain here because this is a legitimate
                         * path for MP systems.  On those machines, we'll attach
                         * before all CPUs have been initialized, and thus the
                         * chip verification in mc_create will fail.  We'll be
                         * reattached later for those CPUs.
                         */
                        rw_exit(&mc_lock);
                        return (DDI_FAILURE);
                }
        } else {
                /* The state is about to change, so drop any old snapshot */
                mc_snapshot_destroy(mc);
        }

        /* Beyond this point, we're committed to creating this node */

        mc_fm_init(dip);

        ASSERT(mc->mc_funcs[func].mcf_devi == NULL);
        mc->mc_funcs[func].mcf_devi = dip;
        mc->mc_funcs[func].mcf_instance = ddi_get_instance(dip);

        mc->mc_ref++;

        /*
         * Add the common properties to this node, and then add any properties
         * that are specific to this node based upon its configuration space.
         */
        (void) ddi_prop_update_string(DDI_DEV_T_NONE,
            dip, "model", (char *)bm->bm_model);

        (void) ddi_prop_update_int(DDI_DEV_T_NONE,
            dip, "chip-id", mc->mc_props.mcp_num);

        if (bm->bm_mkprops != NULL &&
            mc_pcicfg_setup(mc, bm->bm_func, &cfghdl) == DDI_SUCCESS) {
                bm->bm_mkprops(cfghdl, mc);
                mc_pcicfg_teardown(cfghdl);
        }

        /*
         * If this is the last node to be attached for this memory controller,
         * then create the minor node, enable scrubbers, and register with
         * cpu module(s) for this chip.
         */
        if (func == MC_FUNC_DEVIMAP) {
                mc_props_t *mcp = &mc->mc_props;
                int dram_present = 0;

                /* A failure here is reported but does not fail the attach */
                if (ddi_create_minor_node(dip, "mc-amd", S_IFCHR,
                    mcp->mcp_num, "ddi_mem_ctrl",
                    0) != DDI_SUCCESS) {
                        cmn_err(CE_WARN, "failed to create minor node for chip "
                            "%d memory controller\n",
                            (chipid_t)mcp->mcp_num);
                }

                /*
                 * Register the memory controller for every CPU of this chip.
                 *
                 * If there is memory present on this node and ECC is enabled
                 * attempt to enable h/w memory scrubbers for this node.
                 * If we are successful in enabling *any* hardware scrubbers,
                 * disable the software memory scrubber.
                 */
                cmi_hdl_walk(mc_attach_cb, (void *)mc, (void *)&mcp->mcp_num,
                    NULL);

                if (mcp->mcp_lim != mcp->mcp_base) {
                        /*
                         * This node may map non-dram memory alone, so we
                         * must check for an enabled chip-select to be
                         * sure there is dram present.
                         */
                        mc_cs_t *mccs;

                        for (mccs = mc->mc_cslist; mccs != NULL;
                            mccs = mccs->mccs_next) {
                                if (mccs->mccs_props.csp_csbe) {
                                        dram_present = 1;
                                        break;
                                }
                        }
                }

                /*
                 * NOTE(review): rc is compared against CMI_SUCCESS below.
                 * If CMI_SUCCESS is zero, the "rc = 1" case does NOT disable
                 * the software memscrub thread (contrary to the comment in
                 * that branch) while the "rc = 0" case does (contrary to
                 * "others decide memscrub").  Confirm the intended values.
                 */
                if (dram_present && !mc_ecc_enabled(mc)) {
                        /*
                         * On a single chip system there is no point in
                         * scrubbing if there is no ECC on the single node.
                         * On a multichip system, necessarily Opteron using
                         * registered ECC-capable DIMMs, if there is memory
                         * present on a node but no ECC there then we'll assume
                         * ECC is disabled for all nodes and we will not enable
                         * the scrubber and will also disable the software
                         * memscrub thread.
                         */
                        rc = 1;
                } else if (!dram_present) {
                        /* No memory on this node - others decide memscrub */
                        rc = 0;
                } else {
                        /*
                         * There is memory on this node and ECC is enabled.
                         * Call via the cpu module to enable memory scrubbing
                         * on this node - we could call directly but then
                         * we may overlap with a request to enable chip-cache
                         * scrubbing.
                         */
                        rc = mc_scrubber_enable(mc);
                }

                /* Only the first qualifying node makes the disable call */
                if (rc == CMI_SUCCESS && !mc_sw_scrub_disabled++)
                        cmi_mc_sw_memscrub_disable();

                mc_report_testfails(mc);
        }

        /*
         * Update nvlist for as far as we have gotten in attach/init.
         */
        nvlist_free(mc->mc_nvl);
        mc->mc_nvl = mc_nvl_create(mc);

        rw_exit(&mc_lock);
        return (DDI_SUCCESS);
}

/*
 * detach(9E) entry point.  Suspend is accepted as a no-op (see the comment
 * about suspend in mc_attach()); every other command, including a real
 * detach, is refused.
 */
/*ARGSUSED*/
static int
mc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        return (cmd == DDI_SUSPEND ? DDI_SUCCESS : DDI_FAILURE);
}


/*
 * Device operations for the driver, tying together the getinfo, attach and
 * detach entry points above with the character device operations.
 */
static struct dev_ops mc_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* devo_refcnt */
        mc_getinfo,             /* devo_getinfo */
        nulldev,                /* devo_identify */
        nulldev,                /* devo_probe */
        mc_attach,              /* devo_attach */
        mc_detach,              /* devo_detach */
        nodev,                  /* devo_reset */
        &mc_cb_ops,             /* devo_cb_ops */
        NULL,                   /* devo_bus_ops */
        NULL,                   /* devo_power */
        ddi_quiesce_not_needed,         /* devo_quiesce */
};

/*
 * Module linkage element describing this module as a device driver.
 */
static struct modldrv modldrv = {
        &mod_driverops,
        "Memory Controller for AMD processors",
        &mc_ops
};

/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove().
 */
static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};

/*
 * Module load entry point.
 *
 * Returns ENOTSUP if there is no PCI config space support, otherwise the
 * result of mod_install().  mc_lock is initialized before the module is
 * installed and is destroyed again if installation fails, so that a failed
 * load does not leave an initialized lock behind.
 */
int
_init(void)
{
        int rc;

        /*
         * Refuse to load if there is no PCI config space support.
         */
        if (pci_getl_func == NULL)
                return (ENOTSUP);

        rw_init(&mc_lock, NULL, RW_DRIVER, NULL);

        if ((rc = mod_install(&modlinkage)) != 0)
                rw_destroy(&mc_lock);

        return (rc);
}

/*
 * Module information entry point; simply reports on our module linkage.
 */
int
_info(struct modinfo *modinfop)
{
        int rc = mod_info(&modlinkage, modinfop);

        return (rc);
}

/*
 * Module unload entry point.  The lock is only torn down once the module
 * has actually been removed; if removal is refused its error is returned
 * and nothing else is touched.
 */
int
_fini(void)
{
        int rc = mod_remove(&modlinkage);

        if (rc == 0)
                rw_destroy(&mc_lock);

        return (rc);
}