root/usr/src/uts/sun4/io/px/px_fm.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * PX Fault Management Architecture
 */
#include <sys/types.h>
#include <sys/sunndi.h>
#include <sys/sunddi.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/fm/io/pci.h>
#include <sys/membar.h>
#include "px_obj.h"

extern uint_t px_ranges_phi_mask;

#define PX_PCIE_PANIC_BITS \
        (PCIE_AER_UCE_DLP | PCIE_AER_UCE_FCP | PCIE_AER_UCE_TO | \
        PCIE_AER_UCE_RO | PCIE_AER_UCE_MTLP | PCIE_AER_UCE_ECRC)
#define PX_PCIE_NO_PANIC_BITS \
        (PCIE_AER_UCE_TRAINING | PCIE_AER_UCE_SD | PCIE_AER_UCE_CA | \
        PCIE_AER_UCE_UC | PCIE_AER_UCE_UR)

/*
 * Global panicking state variable used to control whether further error
 * handling should occur.  If the system is already panicking, or if PX itself
 * has recommended panicking the system, no further error handling should
 * occur, to prevent the system from hanging.
 */
boolean_t px_panicing = B_FALSE;

static int px_pcie_ptlp(dev_info_t *dip, ddi_fm_error_t *derr,
    px_err_pcie_t *regs);

#if defined(DEBUG)
static void px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs);
#else   /* DEBUG */
#define px_pcie_log 0 &&
#endif  /* DEBUG */

/*
 * Initialize px FMA support.
 *
 * Declares the capabilities this nexus wants, lets ddi_fm_init() negotiate
 * them against the parent, creates the mutex that serializes fabric error
 * handling, and prepares the five root complex pf_data_t entries used to
 * queue errors for the fabric scan.
 *
 * Always returns DDI_SUCCESS.
 */
int
px_fm_attach(px_t *px_p)
{
        int             i;
        dev_info_t      *dip = px_p->px_dip;
        pcie_bus_t      *bus_p;

        px_p->px_fm_cap = DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE |
            DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE;

        /*
         * check parents' capability
         */
        ddi_fm_init(dip, &px_p->px_fm_cap, &px_p->px_fm_ibc);

        /*
         * parents need to be ereport and error handling capable.  Each
         * capability bit is tested individually; a logical AND against the
         * (always non-zero) mask would pass for any non-zero capability.
         */
        ASSERT((px_p->px_fm_cap & DDI_FM_ERRCB_CAPABLE) &&
            (px_p->px_fm_cap & DDI_FM_EREPORT_CAPABLE));

        /*
         * Initialize lock to synchronize fabric error handling
         */
        mutex_init(&px_p->px_fm_mutex, NULL, MUTEX_DRIVER,
            (void *)px_p->px_fm_ibc);

        /* Start with an empty error queue and initialize every entry. */
        px_p->px_pfd_idx = 0;
        for (i = 0; i < 5; i++)
                pcie_rc_init_pfd(dip, &px_p->px_pfd_arr[i]);
        PCIE_DIP2PFD(dip) = px_p->px_pfd_arr;

        /* Record this nexus as the root port of its own bus structure. */
        bus_p = PCIE_DIP2BUS(dip);
        bus_p->bus_rp_bdf = px_p->px_bdf;
        bus_p->bus_rp_dip = dip;

        return (DDI_SUCCESS);
}

/*
 * Deregister FMA: destroy the fabric error handling lock, detach from the
 * FM framework and tear down each of the five RC pf_data_t entries.
 */
void
px_fm_detach(px_t *px_p)
{
        int     idx = 0;

        mutex_destroy(&px_p->px_fm_mutex);
        ddi_fm_fini(px_p->px_dip);

        while (idx < 5) {
                pcie_rc_fini_pfd(&px_p->px_pfd_arr[idx]);
                idx++;
        }
}

/*
 * Register px_fm_callback() as the FM error handler for this nexus,
 * passing the px soft state as the handler's private data.
 */
void
px_fm_cb_enable(px_t *px_p)
{
        dev_info_t      *dip = px_p->px_dip;

        ddi_fm_handler_register(dip, px_fm_callback, px_p);
}

/*
 * Unregister the FM error handler previously registered for this nexus.
 */
void
px_fm_cb_disable(px_t *px_p)
{
        dev_info_t      *dip = px_p->px_dip;

        ddi_fm_handler_unregister(dip);
}

/*
 * Set up the access functions of a mapping according to the level of
 * protection requested in its device access attributes.
 *
 * On DDI_MO_MAP_LOCKED the handle is inserted into rdip's access handle
 * cache; flag-error and cautious handles then get the matching protected
 * accessor set and an error check callback (config space vs. PIO, chosen
 * from the register spec).  Any other access type is removed from the
 * cache again.  On DDI_MO_UNMAP the handle is removed from the cache.
 * All other map operations are ignored.
 */
void
px_fm_acc_setup(ddi_map_req_t *mp, dev_info_t *rdip, pci_regspec_t *rp)
{
        ddi_acc_hdl_t   *hdl_p = mp->map_handlep;
        ddi_acc_impl_t  *impl_p =
            (ddi_acc_impl_t *)hdl_p->ah_platform_private;
        uchar_t         acc_flag =
            impl_p->ahi_common.ah_acc.devacc_attr_access;
        ndi_err_t       *err_p;

        if (mp->map_op == DDI_MO_UNMAP) {
                ndi_fmc_remove(rdip, ACC_HANDLE, (void *)hdl_p);
                return;
        }

        if (mp->map_op != DDI_MO_MAP_LOCKED)
                return;

        ndi_fmc_insert(rdip, ACC_HANDLE, (void *)hdl_p, NULL);

        if (acc_flag == DDI_FLAGERR_ACC) {
                /* Single and repeated get routines */
                impl_p->ahi_get8 = i_ddi_prot_get8;
                impl_p->ahi_get16 = i_ddi_prot_get16;
                impl_p->ahi_get32 = i_ddi_prot_get32;
                impl_p->ahi_get64 = i_ddi_prot_get64;
                /* Single put routines */
                impl_p->ahi_put8 = i_ddi_prot_put8;
                impl_p->ahi_put16 = i_ddi_prot_put16;
                impl_p->ahi_put32 = i_ddi_prot_put32;
                impl_p->ahi_put64 = i_ddi_prot_put64;
                /* Repeated get routines */
                impl_p->ahi_rep_get8 = i_ddi_prot_rep_get8;
                impl_p->ahi_rep_get16 = i_ddi_prot_rep_get16;
                impl_p->ahi_rep_get32 = i_ddi_prot_rep_get32;
                impl_p->ahi_rep_get64 = i_ddi_prot_rep_get64;
                /* Repeated put routines */
                impl_p->ahi_rep_put8 = i_ddi_prot_rep_put8;
                impl_p->ahi_rep_put16 = i_ddi_prot_rep_put16;
                impl_p->ahi_rep_put32 = i_ddi_prot_rep_put32;
                impl_p->ahi_rep_put64 = i_ddi_prot_rep_put64;
        } else if (acc_flag == DDI_CAUTIOUS_ACC) {
                /* Single get routines */
                impl_p->ahi_get8 = i_ddi_caut_get8;
                impl_p->ahi_get16 = i_ddi_caut_get16;
                impl_p->ahi_get32 = i_ddi_caut_get32;
                impl_p->ahi_get64 = i_ddi_caut_get64;
                /* Single put routines */
                impl_p->ahi_put8 = i_ddi_caut_put8;
                impl_p->ahi_put16 = i_ddi_caut_put16;
                impl_p->ahi_put32 = i_ddi_caut_put32;
                impl_p->ahi_put64 = i_ddi_caut_put64;
                /* Repeated get routines */
                impl_p->ahi_rep_get8 = i_ddi_caut_rep_get8;
                impl_p->ahi_rep_get16 = i_ddi_caut_rep_get16;
                impl_p->ahi_rep_get32 = i_ddi_caut_rep_get32;
                impl_p->ahi_rep_get64 = i_ddi_caut_rep_get64;
                /* Repeated put routines */
                impl_p->ahi_rep_put8 = i_ddi_caut_rep_put8;
                impl_p->ahi_rep_put16 = i_ddi_caut_rep_put16;
                impl_p->ahi_rep_put32 = i_ddi_caut_rep_put32;
                impl_p->ahi_rep_put64 = i_ddi_caut_rep_put64;
        } else {
                /* Illegal state, remove the handle from cache */
                ndi_fmc_remove(rdip, ACC_HANDLE, (void *)hdl_p);
                return;
        }

        /*
         * Common tail for both protected flavours: initialize the handle's
         * error state and pick the checker matching the address space.
         */
        impl_acc_err_init(hdl_p);
        err_p = ((ddi_acc_impl_t *)hdl_p)->ahi_err;
        err_p->err_cf =
            ((rp->pci_phys_hi & PCI_REG_ADDR_M) == PCI_ADDR_CONFIG) ?
            px_err_cfg_hdl_check : px_err_pio_hdl_check;
}

/*
 * Initialize FMA for a child node.  Called through the pci busops when a
 * child calls ddi_fm_init.  Hands the child this nexus' interrupt block
 * cookie via ibc_p and returns this nexus' FM capability mask.
 */
/*ARGSUSED*/
int
px_fm_init_child(dev_info_t *dip, dev_info_t *cdip, int cap,
    ddi_iblock_cookie_t *ibc_p)
{
        px_t    *state_p = DIP_TO_STATE(dip);
        int     nexus_cap;

        ASSERT(ibc_p != NULL);

        *ibc_p = state_p->px_fm_ibc;
        nexus_cap = state_p->px_fm_cap;

        return (nexus_cap);
}

/*
 * Lock the leaf for exclusive PCIe access and record the access handle
 * that the exclusive access is being performed on.
 */
void
px_bus_enter(dev_info_t *dip, ddi_acc_handle_t handle)
{
        px_t            *px_p = DIP_TO_STATE(dip);
        px_pec_t        *pec_p = px_p->px_pec_p;

        /*
         * Exclusive access is used for cautious put/get.  Both go through
         * i_ddi_ontrap which, on sparcv9, provides protection similar to
         * on_trap() and issues membar #Sync to flush out all cpu deferred
         * errors prior to the get/put operation.  That is why no
         * membar #Sync is issued here - a difference from
         * pci_bus_enter().
         */
        mutex_enter(&pec_p->pec_pokefault_mutex);
        pec_p->pec_acc_hdl = handle;
}

/*
 * Release exclusive PCIe access: forget the recorded access handle and
 * drop the pokefault mutex taken by px_bus_enter().
 */
/* ARGSUSED */
void
px_bus_exit(dev_info_t *dip, ddi_acc_handle_t handle)
{
        px_pec_t        *pec_p = ((px_t *)DIP_TO_STATE(dip))->px_pec_p;

        pec_p->pec_acc_hdl = NULL;
        mutex_exit(&pec_p->pec_pokefault_mutex);
}

/*
 * Test whether addr lies inside the parent address window described by
 * one "ranges" entry.  The window starts at the entry's parent address
 * (with parent_high masked by px_ranges_phi_mask) and extends for the
 * entry's size.
 *
 * Returns the base address of the window when addr is inside it, and 0
 * otherwise.
 */
static uint64_t
px_in_addr_range(dev_info_t *dip, pci_ranges_t *ranges_p, uint64_t addr)
{
        uint64_t        base, span, limit;

        base = ((uint64_t)(ranges_p->parent_high & px_ranges_phi_mask) << 32) |
            (uint64_t)ranges_p->parent_low;
        span = ((uint64_t)ranges_p->size_high << 32) +
            (uint64_t)ranges_p->size_low;
        limit = base + span;

        DBG(DBG_ERR_INTR, dip, "Addr: 0x%llx high: 0x%llx low: 0x%llx\n",
            addr, limit, base);

        if ((addr < base) || (addr >= limit))
                return (0);

        return (base);
}

/*
 * PCI error callback which is registered with our parent to call
 * for PCIe logging when the CPU traps due to PCIe Uncorrectable Errors
 * and PCI BERR/TO/UE on IO Loads.
 *
 * dip          this nexus
 * derr         error description; fme_bus_specific carries the fault address
 * impl_data    the px_t registered with the handler
 *
 * Returns DDI_FM_FATAL when re-entered by the thread that already owns the
 * FM mutex or when px_fm_enter() fails; DDI_FM_OK when the fault address
 * does not belong to this leaf or when px_die is clear; otherwise the
 * severity derived from the RC errors, the fabric scan and the handle
 * lookup.
 */
/*ARGSUSED*/
int
px_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data)
{
        dev_info_t      *pdip = ddi_get_parent(dip);
        px_t            *px_p = (px_t *)impl_data;
        int             i, acc_type = 0;
        int             lookup, rc_err, fab_err;
        uint64_t        addr = 0, base_addr;
        uint64_t        fault_addr = (uint64_t)derr->fme_bus_specific;
        pcie_req_id_t   bdf = PCIE_INVALID_BDF;
        pci_ranges_t    *ranges_p;
        int             range_len;
        pf_data_t       *pfd_p;

        /*
         * If the current thread already owns the px_fm_mutex, then we
         * have encountered an error while processing a previous
         * error.  Attempting to take the mutex again will cause the
         * system to deadlock.
         */
        if (px_p->px_fm_mutex_owner == curthread)
                return (DDI_FM_FATAL);

        /*
         * The parent's handler lock is dropped while we do our own error
         * handling and re-taken on every return path below.
         */
        i_ddi_fm_handler_exit(pdip);

        if (px_fm_enter(px_p) != DDI_SUCCESS) {
                i_ddi_fm_handler_enter(pdip);
                return (DDI_FM_FATAL);
        }

        /*
         * Make sure this failed load came from this PCIe port.  Check by
         * matching the upper 32 bits of the address with the ranges property.
         */
        range_len = px_p->px_ranges_length / sizeof (pci_ranges_t);
        i = 0;
        for (ranges_p = px_p->px_ranges_p; i < range_len; i++, ranges_p++) {
                base_addr = px_in_addr_range(dip, ranges_p, fault_addr);
                if (base_addr) {
                        switch (ranges_p->child_high & PCI_ADDR_MASK) {
                        case PCI_ADDR_CONFIG:
                                /* The bdf is encoded in the cfg address. */
                                acc_type = PF_ADDR_CFG;
                                addr = 0;
                                bdf = (pcie_req_id_t)((fault_addr >> 12) &
                                    0xFFFF);
                                break;
                        case PCI_ADDR_IO:
                        case PCI_ADDR_MEM64:
                        case PCI_ADDR_MEM32:
                                /* PIO: keep the offset into the range. */
                                acc_type = PF_ADDR_PIO;
                                addr = fault_addr - base_addr;
                                bdf = PCIE_INVALID_BDF;
                                break;
                        }
                        break;
                }
        }

        /* This address doesn't belong to this leaf, just return with OK */
        if (!acc_type) {
                px_fm_exit(px_p);
                i_ddi_fm_handler_enter(pdip);
                return (DDI_FM_OK);
        }

        rc_err = px_err_cmn_intr(px_p, derr, PX_TRAP_CALL, PX_FM_BLOCK_ALL);
        lookup = pf_hdl_lookup(dip, derr->fme_ena, acc_type, (uint64_t)addr,
            bdf);

        pfd_p = px_rp_en_q(px_p, bdf, addr,
            (PCI_STAT_R_MAST_AB | PCI_STAT_R_TARG_AB));

        /*
         * px_rp_en_q() queues nothing and returns NULL when the bdf is
         * invalid and the (32-bit) scan address is zero, e.g. a PIO fault
         * at offset 0 of a range.  Only touch the entry if one was queued.
         */
        if (pfd_p != NULL) {
                PCIE_ROOT_EH_SRC(pfd_p)->intr_type = PF_INTR_TYPE_DATA;

                /* Update affected info, either addr or bdf is not NULL */
                if (addr) {
                        PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags =
                            PF_AFFECTED_ADDR;
                } else if (PCIE_CHECK_VALID_BDF(bdf)) {
                        PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags =
                            PF_AFFECTED_BDF;
                        PFD_AFFECTED_DEV(pfd_p)->pe_affected_bdf = bdf;
                }
        }

        fab_err = px_scan_fabric(px_p, dip, derr);

        px_fm_exit(px_p);
        i_ddi_fm_handler_enter(pdip);

        if (!px_die)
                return (DDI_FM_OK);

        if ((rc_err & (PX_PANIC | PX_PROTECTED)) ||
            (fab_err & PF_ERR_FATAL_FLAGS) ||
            (lookup == PF_HDL_NOTFOUND))
                return (DDI_FM_FATAL);
        else if ((rc_err == PX_NO_ERROR) && (fab_err == PF_ERR_NO_ERROR))
                return (DDI_FM_OK);

        return (DDI_FM_NONFATAL);
}

/*
 * px_err_fabric_intr:
 * Interrupt handler for PCIE fabric block.
 * o lock
 * o create derr
 * o px_err_cmn_intr(leaf, with jbc)
 * o send ereport(fire fmri, derr, payload = BDF)
 * o dispatch (leaf)
 * o unlock
 * o handle error: fatal? fm_panic() : return INTR_CLAIMED)
 *
 * px_p         leaf that received the fabric error message
 * msg_code     PCIe error message code (correctable / non-fatal / fatal)
 * rid          requester id carried by the message
 *
 * Always returns DDI_INTR_CLAIMED, including when px_fm_enter() refuses
 * entry and no handling is done.
 */
/* ARGSUSED */
uint_t
px_err_fabric_intr(px_t *px_p, msgcode_t msg_code, pcie_req_id_t rid)
{
        dev_info_t      *rpdip = px_p->px_dip;
        int             rc_err, fab_err;
        ddi_fm_error_t  derr;
        uint32_t        rp_status;
        uint16_t        ce_source, ue_source;
        pf_data_t       *pfd_p;

        if (px_fm_enter(px_p) != DDI_SUCCESS)
                goto done;

        /* Create the derr */
        bzero(&derr, sizeof (ddi_fm_error_t));
        derr.fme_version = DDI_FME_VERSION;
        derr.fme_ena = fm_ena_generate(0, FM_ENA_FMT1);
        derr.fme_flag = DDI_FM_ERR_UNEXPECTED;

        /* May downgrade the error to an expected/peek/poke one. */
        px_err_safeacc_check(px_p, &derr);

        /*
         * Synthesize the root port status and source ids that are reported
         * in the ereport from the message code and requester id.
         */
        if (msg_code == PCIE_MSG_CODE_ERR_COR) {
                rp_status = PCIE_AER_RE_STS_CE_RCVD;
                ce_source = rid;
                ue_source = 0;
        } else {
                rp_status = PCIE_AER_RE_STS_FE_NFE_RCVD;
                ce_source = 0;
                ue_source = rid;
                if (msg_code == PCIE_MSG_CODE_ERR_NONFATAL)
                        rp_status |= PCIE_AER_RE_STS_NFE_MSGS_RCVD;
                else {
                        rp_status |= PCIE_AER_RE_STS_FE_MSGS_RCVD;
                        rp_status |= PCIE_AER_RE_STS_FIRST_UC_FATAL;
                }
        }

        if (derr.fme_flag == DDI_FM_ERR_UNEXPECTED) {
                ddi_fm_ereport_post(rpdip, PCI_ERROR_SUBCLASS "." PCIEX_FABRIC,
                    derr.fme_ena,
                    DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0,
                    FIRE_PRIMARY, DATA_TYPE_BOOLEAN_VALUE, B_TRUE,
                    "pcie_adv_rp_status", DATA_TYPE_UINT32, rp_status,
                    "pcie_adv_rp_command", DATA_TYPE_UINT32, 0,
                    "pcie_adv_rp_ce_src_id", DATA_TYPE_UINT16, ce_source,
                    "pcie_adv_rp_ue_src_id", DATA_TYPE_UINT16, ue_source,
                    NULL);
        }

        /*
         * Ensure that the rid of the fabric message will get scanned.
         * px_rp_en_q() returns NULL without queueing anything when the rid
         * is not a valid bdf (the scan address passed here is 0), so the
         * entry must be checked before it is used.
         */
        pfd_p = px_rp_en_q(px_p, rid, 0, 0);
        if (pfd_p != NULL)
                PCIE_ROOT_EH_SRC(pfd_p)->intr_type = PF_INTR_TYPE_FABRIC;

        rc_err = px_err_cmn_intr(px_p, &derr, PX_INTR_CALL, PX_FM_BLOCK_PCIE);

        /* call rootport dispatch */
        fab_err = px_scan_fabric(px_p, rpdip, &derr);

        /*
         * px_err_panic() is called once in test mode while the FM state is
         * still held and once for real after it has been released.
         */
        px_err_panic(rc_err, PX_RC, fab_err, B_TRUE);
        px_fm_exit(px_p);
        px_err_panic(rc_err, PX_RC, fab_err, B_FALSE);

done:
        return (DDI_INTR_CLAIMED);
}

/*
 * px_scan_fabric:
 *
 * Check for drain state and if there is anything to scan.  The fabric is
 * only scanned when the leaf is not draining and at least one pfd has been
 * queued; otherwise 0 is returned.  Must be called with px_fm_mutex held.
 *
 * Note on pfd: Different interrupts will populate the pfd's differently.  The
 * px driver can have a total of 5 different error sources, so it has a queue of
 * 5 pfds.  Each valid pfd is linked together and passed to pf_scan_fabric.
 *
 * Each error handling will populate the following info in the pfd
 *
 *                      Root Fault       Intr Src        Affected BDF
 *                      ----------------+---------------+------------
 * Callback/CPU Trap    Address/BDF     |DATA           |Lookup Addr
 * Mondo 62/63 (sun4u)  decode error    |N/A            |N/A
 * EPKT (sun4v)         decode epkt     |INTERNAL       |decode epkt
 * Fabric Message       fabric payload  |FABRIC         |NULL
 * Peek/Poke            Address/BDF     |NULL           |NULL
 *                      ----------------+---------------+------------
 */
int
px_scan_fabric(px_t *px_p, dev_info_t *rpdip, ddi_fm_error_t *derr)
{
        ASSERT(MUTEX_HELD(&px_p->px_fm_mutex));

        /* Nothing to do while draining or when no error was queued. */
        if (px_lib_is_in_drain_state(px_p) || !px_p->px_pfd_idx)
                return (0);

        return (pf_scan_fabric(rpdip, derr, px_p->px_pfd_arr));
}

/*
 * px_err_safeacc_check:
 * Check to see if a peek/poke or cautious access is currently being
 * done on a particular leaf, and if so reclassify an "UNEXPECTED" error
 * accordingly.
 *
 * Errors induced by safe access reads are handled by the cpu trap handler,
 * in which case the derr fields are already set by the trap handler with
 * the correct values and this function leaves derr alone.
 *
 * Errors induced by safe access writes are handled by the px interrupt
 * handlers; this function fills in the derr fields for them.
 *
 * If a cpu trap does occur, it will quiesce all other interrupts allowing
 * the cpu trap error handling to finish before Fire receives an interrupt.
 *
 * If fire does indeed have an error when a cpu trap occurs as a result of
 * a safe access, a trap followed by a Mondo/Fabric interrupt will occur.
 * In which case derr will be initialized as "UNEXPECTED" by the interrupt
 * handler and this function will need to find out if this error occurred
 * in the middle of a safe access operation.
 *
 * Must be called with px_fm_mutex held.
 *
 * @param px_p          leaf in which to check access
 * @param derr          fm err data structure to be updated
 */
void
px_err_safeacc_check(px_t *px_p, ddi_fm_error_t *derr)
{
        px_pec_t        *pec_p = px_p->px_pec_p;
        int             safeacc = pec_p->pec_safeacc_type;

        ASSERT(MUTEX_HELD(&px_p->px_fm_mutex));

        /* Only errors still flagged as unexpected are reclassified. */
        if (derr->fme_flag != DDI_FM_ERR_UNEXPECTED)
                return;

        /* safe access checking */
        if (safeacc == DDI_FM_ERR_EXPECTED) {
                /*
                 * cautious access protection, protected from all err.
                 */
                ddi_fm_acc_err_get(pec_p->pec_acc_hdl, derr,
                    DDI_FME_VERSION);
                derr->fme_flag = safeacc;
                derr->fme_acc_handle = pec_p->pec_acc_hdl;
        } else if (safeacc == DDI_FM_ERR_POKE) {
                /*
                 * ddi_poke protection, check nexus and children for
                 * expected errors.
                 */
                membar_sync();
                derr->fme_flag = safeacc;
        } else if (safeacc == DDI_FM_ERR_PEEK) {
                derr->fme_flag = safeacc;
        }
}

/*
 * Suggest panic if any EQ (except the CE queue) has overflowed.
 *
 * Walks every MSI event queue of the leaf, skipping the one used for
 * correctable error messages.  Returns PX_PANIC as soon as a queue's state
 * cannot be read or is PCI_MSIQ_STATE_ERROR, PX_NO_PANIC otherwise.
 */
int
px_err_check_eq(dev_info_t *dip)
{
        px_t                    *px_p = DIP_TO_STATE(dip);
        px_msiq_state_t         *msiq_state_p = &px_p->px_ib_p->ib_msiq_state;
        px_pec_t                *pec_p = px_p->px_pec_p;
        msiqid_t                eq_no = msiq_state_p->msiq_1st_msiq_id;
        pci_msiq_state_t        msiq_state;
        int                     i = 0;

        while (i < msiq_state_p->msiq_cnt) {
                /* skip CE q */
                if (i + eq_no != pec_p->pec_corr_msg_msiq_id) {
                        if (px_lib_msiq_getstate(dip, i + eq_no,
                            &msiq_state) != DDI_SUCCESS)
                                return (PX_PANIC);
                        if (msiq_state == PCI_MSIQ_STATE_ERROR)
                                return (PX_PANIC);
                }
                i++;
        }

        return (PX_NO_PANIC);
}

/*
 * Queue a root complex pfd for a PCIe error captured in regs and fill it
 * in so the common PCIe fabric code can analyse it.
 *
 * dip          this nexus
 * derr         error description (unused here)
 * regs         captured CE/UE status, primary UE and header log registers
 * intr_type    interrupt source recorded in the pfd
 *
 * Always returns PX_NO_ERROR; the severity is decided later by the pcie
 * misc module during the fabric scan.
 */
/* ARGSUSED */
int
px_err_check_pcie(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs,
    pf_intr_type_t intr_type)
{
        px_t            *px_p = DIP_TO_STATE(dip);
        pf_data_t       *pfd_p = px_get_pfd(px_p);
        uint32_t        ue_bits;
        pf_pcie_adv_err_regs_t *adv_reg = PCIE_ADV_REG(pfd_p);

        PCIE_ROOT_EH_SRC(pfd_p)->intr_type = intr_type;

        /*
         * set RC s_status in PCI term to coordinate with downstream fabric
         * errors analysis.
         */
        if (regs->primary_ue & PCIE_AER_UCE_UR)
                PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = PCI_STAT_R_MAST_AB;
        if (regs->primary_ue & PCIE_AER_UCE_CA)
                PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = PCI_STAT_R_TARG_AB;
        if (regs->primary_ue & (PCIE_AER_UCE_PTLP | PCIE_AER_UCE_ECRC))
                PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = PCI_STAT_PERROR;

        if (!regs->primary_ue)
                goto done;

        adv_reg->pcie_ce_status = regs->ce_reg;
        adv_reg->pcie_ue_status = regs->ue_reg | regs->primary_ue;
        PCIE_ADV_HDR(pfd_p, 0) = regs->rx_hdr1;
        PCIE_ADV_HDR(pfd_p, 1) = regs->rx_hdr2;
        PCIE_ADV_HDR(pfd_p, 2) = regs->rx_hdr3;
        PCIE_ADV_HDR(pfd_p, 3) = regs->rx_hdr4;

        /*
         * Add the bit position of the highest bit set in primary_ue to
         * pcie_adv_ctl (px_get_pfd() cleared it), presumably to emulate the
         * AER first error pointer - TODO confirm.  The counter is unsigned:
         * with a signed int, a value with bit 31 set would shift
         * arithmetically towards -1 and the loop would never terminate.
         */
        for (ue_bits = regs->primary_ue; ue_bits > 1; ue_bits >>= 1)
                adv_reg->pcie_adv_ctl++;

        if (regs->primary_ue & (PCIE_AER_UCE_UR | PCIE_AER_UCE_CA)) {
                /* Scan towards the target bdf decoded from the RX header. */
                if (pf_tlp_decode(PCIE_DIP2BUS(dip), adv_reg) == DDI_SUCCESS)
                        PCIE_ROOT_FAULT(pfd_p)->scan_bdf =
                            adv_reg->pcie_ue_tgt_bdf;
        } else if (regs->primary_ue & PCIE_AER_UCE_PTLP) {
                if (pf_tlp_decode(PCIE_DIP2BUS(dip), adv_reg) == DDI_SUCCESS) {
                        PCIE_ROOT_FAULT(pfd_p)->scan_bdf =
                            adv_reg->pcie_ue_tgt_bdf;
                        if (adv_reg->pcie_ue_tgt_trans ==
                            PF_ADDR_PIO)
                                PCIE_ROOT_FAULT(pfd_p)->scan_addr =
                                    adv_reg->pcie_ue_tgt_addr;
                }

                /*
                 * Normally for Poisoned Completion TLPs we can look at the
                 * transmit log header for the original request and the original
                 * address, however this doesn't seem to be working.  HW BUG.
                 */
        }

done:
        px_pcie_log(dip, regs);

        /* Return No Error here and let the pcie misc module analyse it */
        return (PX_NO_ERROR);
}

#if defined(DEBUG)
/*
 * Debug-only dump of the captured PCIe root complex error registers: the
 * CE and UE status, the primary UE and the transmit and receive header
 * logs.
 */
static void
px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs)
{
        DBG(DBG_ERR_INTR, dip,
            "A PCIe RC error has occured\n"
            "\tCE: 0x%x UE: 0x%x Primary UE: 0x%x\n"
            "\tTX Hdr: 0x%x 0x%x 0x%x 0x%x\n\tRX Hdr: 0x%x 0x%x 0x%x 0x%x\n",
            regs->ce_reg,
            regs->ue_reg,
            regs->primary_ue,
            regs->tx_hdr1,
            regs->tx_hdr2,
            regs->tx_hdr3,
            regs->tx_hdr4,
            regs->rx_hdr1,
            regs->rx_hdr2,
            regs->rx_hdr3,
            regs->rx_hdr4);
}
#endif  /* DEBUG */

/*
 * Look through the poisoned TLP cases and suggest panic/no panic depending
 * on the access/DMA handle lookup.
 *
 * Returns PX_NO_PANIC only when the primary UE is exactly a poisoned TLP,
 * the logged header(s) decode, and a handle covering the decoded
 * address/bdf is found; PX_PANIC in every other case.
 */
static int
px_pcie_ptlp(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs)
{
        pf_pcie_adv_err_regs_t  tlp_regs;
        pcie_req_id_t           tgt_bdf;
        uint64_t                tgt_addr;
        uint32_t                tgt_trans;
        int                     rx_sts, tlp_type;
        int                     is_cpl, is_req;
        int                     lookup = PF_HDL_NOTFOUND;

        if (regs->primary_ue != PCIE_AER_UCE_PTLP)
                return (PX_PANIC);

        /* Without a logged RX header there is nothing to decode. */
        if (!regs->rx_hdr1)
                goto out;

        tlp_regs.pcie_ue_hdr[0] = regs->rx_hdr1;
        tlp_regs.pcie_ue_hdr[1] = regs->rx_hdr2;
        tlp_regs.pcie_ue_hdr[2] = regs->rx_hdr3;
        tlp_regs.pcie_ue_hdr[3] = regs->rx_hdr4;

        rx_sts = pf_tlp_decode(PCIE_DIP2BUS(dip), &tlp_regs);
        tlp_type = ((pcie_tlp_hdr_t *)(tlp_regs.pcie_ue_hdr))->type;

        if (rx_sts == DDI_FAILURE)
                goto out;

        tgt_bdf = tlp_regs.pcie_ue_tgt_bdf;
        tgt_addr = tlp_regs.pcie_ue_tgt_addr;
        tgt_trans = tlp_regs.pcie_ue_tgt_trans;

        is_cpl = (tlp_type == PCIE_TLP_TYPE_CPL ||
            tlp_type == PCIE_TLP_TYPE_CPLLK);
        is_req = (tlp_type == PCIE_TLP_TYPE_IO ||
            tlp_type == PCIE_TLP_TYPE_MEM ||
            tlp_type == PCIE_TLP_TYPE_MEMLK);

        /*
         * Usually a PTLP is a CPL with data.  Keep the completer BDF
         * from the RX TLP, and take the original address from the TX TLP
         * when one was logged.
         */
        if (is_cpl && regs->tx_hdr1) {
                tlp_regs.pcie_ue_hdr[0] = regs->tx_hdr1;
                tlp_regs.pcie_ue_hdr[1] = regs->tx_hdr2;
                tlp_regs.pcie_ue_hdr[2] = regs->tx_hdr3;
                tlp_regs.pcie_ue_hdr[3] = regs->tx_hdr4;

                /* A failed decode leaves its status in lookup. */
                lookup = pf_tlp_decode(PCIE_DIP2BUS(dip), &tlp_regs);
                if (lookup != DDI_SUCCESS)
                        goto out;

                tgt_addr = tlp_regs.pcie_ue_tgt_addr;
                tgt_trans = tlp_regs.pcie_ue_tgt_trans;
        }

        /* Completions and IO/memory requests get a handle lookup. */
        if (is_cpl || is_req) {
                lookup = pf_hdl_lookup(dip, derr->fme_ena, tgt_trans,
                    tgt_addr, tgt_bdf);
        } else {
                lookup = PF_HDL_NOTFOUND;
        }

out:
        return (lookup == PF_HDL_FOUND ? PX_NO_PANIC : PX_PANIC);
}

/*
 * px_get_pfd hands out the next free RC pf_data_t from the per-leaf queue
 * and returns a pointer to it.  This function should be used when an error
 * requires a fabric scan.
 *
 * The entry is reset to a clean state, marked valid, and linked after the
 * previously handed out entry so the queue forms a list starting at
 * px_pfd_arr[0].  The queue index is reset by px_fm_exit().
 */
pf_data_t *
px_get_pfd(px_t *px_p)
{
        int             idx = px_p->px_pfd_idx++;
        pf_data_t       *pfd_p;

        /*
         * The queue holds the 5 entries initialized by px_fm_attach();
         * handing out more than that would write past the end of it.
         */
        ASSERT(idx >= 0 && idx < 5);

        pfd_p = &px_p->px_pfd_arr[idx];

        /* Clear Old Data */
        PCIE_ROOT_FAULT(pfd_p)->scan_bdf = PCIE_INVALID_BDF;
        PCIE_ROOT_FAULT(pfd_p)->scan_addr = 0;
        PCIE_ROOT_EH_SRC(pfd_p)->intr_type = PF_INTR_TYPE_NONE;
        PCIE_ROOT_EH_SRC(pfd_p)->intr_data = NULL;
        PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = 0;
        PFD_AFFECTED_DEV(pfd_p)->pe_affected_bdf = PCIE_INVALID_BDF;
        PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = 0;
        PCIE_ADV_REG(pfd_p)->pcie_ce_status = 0;
        PCIE_ADV_REG(pfd_p)->pcie_ue_status = 0;
        PCIE_ADV_REG(pfd_p)->pcie_adv_ctl = 0;

        /* The new entry is always the tail of the list. */
        pfd_p->pe_next = NULL;

        if (idx > 0) {
                px_p->px_pfd_arr[idx - 1].pe_next = pfd_p;
                pfd_p->pe_prev = &px_p->px_pfd_arr[idx - 1];
        } else {
                pfd_p->pe_prev = NULL;
        }

        pfd_p->pe_severity_flags = 0;
        pfd_p->pe_severity_mask = 0;
        pfd_p->pe_orig_severity_flags = 0;
        pfd_p->pe_valid = B_TRUE;

        return (pfd_p);
}

/*
 * Append a pf_data structure to the error queue which is used later
 * during the PCIe fabric scan.  It signifies:
 * o errs rcvd in RC, that may have been propagated to/from the fabric
 * o the fabric scan code should scan the device path of fault bdf/addr
 *
 * scan_bdf: The bdf that caused the fault, which may have error bits set.
 * scan_addr: The PIO addr that caused the fault, such as failed PIO, but not
 *             failed DMAs.
 * s_status: Secondary Status equivalent to why the fault occurred.
 *           (ie S-TA/MA, R-TA)
 *
 * Either the scan bdf or addr may be NULL, but not both: when the bdf is
 * invalid and the address is zero nothing is queued and NULL is returned.
 * Otherwise the newly queued entry is returned.
 */
pf_data_t *
px_rp_en_q(px_t *px_p, pcie_req_id_t scan_bdf, uint32_t scan_addr,
    uint16_t s_status)
{
        pf_data_t       *entry_p = NULL;

        if (PCIE_CHECK_VALID_BDF(scan_bdf) || scan_addr) {
                entry_p = px_get_pfd(px_p);

                PCIE_ROOT_FAULT(entry_p)->scan_bdf = scan_bdf;
                PCIE_ROOT_FAULT(entry_p)->scan_addr = (uint64_t)scan_addr;
                PCI_BDG_ERR_REG(entry_p)->pci_bdg_sec_stat = s_status;
        }

        return (entry_p);
}


/*
 * Find and Mark CFG Handles as failed associated with the given BDF. We should
 * always know the BDF for CFG accesses, since it is encoded in the address of
 * the TLP.  Since there can be multiple cfg handles, mark them all as failed.
 *
 * arg1 points to the fault address and arg2 to the fault bdf.  Returns
 * DDI_FM_NONFATAL when the handle is considered affected, DDI_FM_FATAL
 * otherwise.
 *
 * NOTE(review): this checker reads arg1 as a uint32_t while
 * px_err_pio_hdl_check() reads it as a uint64_t; confirm against the
 * caller which width is actually passed.
 */
/* ARGSUSED */
int
px_err_cfg_hdl_check(dev_info_t *dip, const void *handle, const void *arg1,
    const void *arg2)
{
        uint32_t        addr = *(uint32_t *)arg1;
        uint16_t        bdf = *(uint16_t *)arg2;
        pcie_bus_t      *bus_p;

        DBG(DBG_ERR_INTR, dip, "Check CFG Hdl: dip 0x%p addr 0x%x bdf=0x%x\n",
            dip, addr, bdf);

        bus_p = PCIE_DIP2BUS(dip);

        /*
         * CFG and IO access handles live on the same cache list, so both
         * kinds of checker get called for both kinds of error.  This
         * checker only reports "Non-Fatal" when there is no address, the
         * bdf is valid, and the bdf is this device's own.
         */
        if (addr)
                return (DDI_FM_FATAL);
        if (!PCIE_CHECK_VALID_BDF(bdf))
                return (DDI_FM_FATAL);
        if (bus_p->bus_bdf != bdf)
                return (DDI_FM_FATAL);

        return (DDI_FM_NONFATAL);
}

/*
 * Find and Mark all ACC Handles associated with a given address and BDF as
 * failed.  If the BDF != NULL, then check to see if the device has an ACC
 * Handle associated with ADDR.  If the handle is not found, mark all the
 * handles as failed.  If the BDF == NULL, mark the handle as failed if it is
 * associated with ADDR.
 *
 * dip          device owning the handle
 * handle       access handle being checked
 * arg1         pointer to the fault address (read as uint64_t)
 * arg2         pointer to the fault bdf (read as uint16_t)
 *
 * Returns DDI_FM_NONFATAL when the handle is considered affected and
 * DDI_FM_FATAL otherwise.
 */
int
px_err_pio_hdl_check(dev_info_t *dip, const void *handle, const void *arg1,
    const void *arg2)
{
        dev_info_t              *px_dip;
        px_t                    *px_p;
        pci_ranges_t            *ranges_p;
        int                     range_len;
        ddi_acc_handle_t        ap = (ddi_acc_handle_t)handle;
        ddi_acc_hdl_t           *hp = impl_acc_hdl_get(ap);
        int                     i, status = DDI_FM_FATAL;
        uint64_t                fault_addr = *(uint64_t *)arg1;
        uint16_t                bdf = *(uint16_t *)arg2;
        uint64_t                base_addr, range_addr;
        uint_t                  size;

        /*
         * Find the correct px dip.  On system with a real Root Port, it's the
         * node above the root port.  On systems without a real Root Port the px
         * dip is the bus_rp_dip.
         */
        px_dip = PCIE_DIP2BUS(dip)->bus_rp_dip;

        if (!PCIE_IS_RC(PCIE_DIP2BUS(px_dip)))
                px_dip = ddi_get_parent(px_dip);

        ASSERT(PCIE_IS_RC(PCIE_DIP2BUS(px_dip)));
        px_p = INST_TO_STATE(ddi_get_instance(px_dip));

        /*
         * dip is a pointer and fault_addr is 64 bits wide; print them with
         * matching conversions so neither is truncated or misread.
         */
        DBG(DBG_ERR_INTR, dip,
            "Check PIO Hdl: dip 0x%p addr 0x%llx bdf=0x%x\n",
            dip, fault_addr, bdf);

        /* Normalize the base addr to the addr and strip off the HB info. */
        base_addr = (hp->ah_pfn << MMU_PAGESHIFT) + hp->ah_offset;
        range_len = px_p->px_ranges_length / sizeof (pci_ranges_t);
        i = 0;
        for (ranges_p = px_p->px_ranges_p; i < range_len; i++, ranges_p++) {
                range_addr = px_in_addr_range(dip, ranges_p, base_addr);
                if (range_addr) {
                        /* Only IO/memory ranges are made range-relative. */
                        switch (ranges_p->child_high & PCI_ADDR_MASK) {
                        case PCI_ADDR_IO:
                        case PCI_ADDR_MEM64:
                        case PCI_ADDR_MEM32:
                                base_addr = base_addr - range_addr;
                                break;
                        }
                        break;
                }
        }

        /*
         * Mark the handle as failed if the ADDR is mapped, or if we
         * know the BDF and ADDR == 0.
         */
        size = hp->ah_len;
        if (((fault_addr >= base_addr) && (fault_addr < (base_addr + size))) ||
            ((fault_addr == 0) && (PCIE_CHECK_VALID_BDF(bdf) &&
            (bdf == PCIE_DIP2BUS(dip)->bus_bdf))))
                status = DDI_FM_NONFATAL;

        return (status);
}

/*
 * Find and Mark all DMA Handles associated with a given address and BDF as
 * failed.  If the BDF != NULL, then check to see if the device has a DMA
 * Handle associated with ADDR.  If the handle is not found, mark all the
 * handles as failed.  If the BDF == NULL, mark the handle as failed if it is
 * associated with ADDR.
 *
 * dip          device owning the handle
 * handle       DMA handle (ddi_dma_impl_t) being checked
 * arg1         pointer to the fault address (read as uint32_t)
 * arg2         pointer to the fault bdf (read as uint16_t)
 *
 * Returns DDI_FM_NONFATAL when the handle is considered affected and
 * DDI_FM_FATAL otherwise.
 *
 * NOTE(review): arg1 is read as a uint32_t here but as a uint64_t in
 * px_err_pio_hdl_check(); confirm against the caller which width is
 * actually passed.
 */
int
px_err_dma_hdl_check(dev_info_t *dip, const void *handle, const void *arg1,
    const void *arg2)
{
        ddi_dma_impl_t          *pcie_dp;
        int                     status = DDI_FM_FATAL;
        uint32_t                addr = *(uint32_t *)arg1;
        uint16_t                bdf = *(uint16_t *)arg2;
        uint32_t                base_addr;
        uint64_t                end_addr;
        uint_t                  size;

        DBG(DBG_ERR_INTR, dip, "Check DMA Hdl: dip 0x%p addr 0x%x bdf=0x%x\n",
            dip, addr, bdf);

        pcie_dp = (ddi_dma_impl_t *)handle;
        base_addr = (uint32_t)pcie_dp->dmai_mapping;
        size = pcie_dp->dmai_size;

        /*
         * Compute the exclusive upper bound in 64 bits.  A 32-bit sum
         * wraps for a mapping that ends at the 4GB boundary, which would
         * make the range test below fail for every address in it.
         */
        end_addr = (uint64_t)base_addr + size;

        /*
         * Mark the handle as failed if the ADDR is mapped, or if we
         * know the BDF and ADDR == 0.
         */
        if (((addr >= base_addr) && ((uint64_t)addr < end_addr)) ||
            ((addr == 0) && PCIE_CHECK_VALID_BDF(bdf)))
                status = DDI_FM_NONFATAL;

        return (status);
}

/*
 * Enter px error handling for a leaf.
 *
 * Takes px_fm_mutex, records the current thread as its owner and signals
 * the PCIe error handling module that error handling is starting.
 *
 * Returns DDI_SUCCESS when error handling may proceed.  Returns DDI_FAILURE
 * when px_panicing is set, when the current thread already owns the FM
 * mutex, or when an owner is unexpectedly recorded after the mutex was
 * obtained; in those cases the caller must not do any error handling and
 * must not call px_fm_exit().
 */
int
px_fm_enter(px_t *px_p) {
        /* Refuse re-entry by the owner and any entry while panicking. */
        if (px_panicing || (px_p->px_fm_mutex_owner == curthread))
                return (DDI_FAILURE);

        mutex_enter(&px_p->px_fm_mutex);
        /*
         * In rare cases a trap occurs in the middle of scanning the fabric
         * because a PIO fails during the scan.  The CPU error handling
         * code will correctly panic the system, while a mondo for the failed
         * PIO may also show up.  Normally the mondo will try to grab the mutex
         * and wait until the callback finishes.  But in this rare case,
         * mutex_enter actually succeeds and would continue to scan the fabric.
         *
         * This code below is designed specifically to check for this case.  If
         * we successfully grab the px_fm_mutex, the px_fm_mutex_owner better be
         * NULL.  If it isn't that means we are in the rare corner case.  Return
         * DDI_FAILURE, this should prevent PX from doing anymore error
         * handling.
         *
         * NOTE(review): this path returns without calling mutex_exit();
         * presumably intentional since another thread is recorded as the
         * owner - confirm.
         */
        if (px_p->px_fm_mutex_owner) {
                return (DDI_FAILURE);
        }

        px_p->px_fm_mutex_owner = curthread;

        /* Re-check now that the mutex is held; back out if panicking. */
        if (px_panicing) {
                px_fm_exit(px_p);
                return (DDI_FAILURE);
        }

        /* Signal the PCIe error handling module error handling is starting */
        pf_eh_enter(PCIE_DIP2BUS(px_p->px_dip));

        return (DDI_SUCCESS);
}

/*
 * Walk the pf_data_t list that starts at the root device of px_p and call
 * px_panic_domain() for every entry whose domain has nfma_panic set.  For a
 * bridge, each BDF on its request-ID list is passed; for any other device
 * its own bus_bdf is passed.  The nfma_panic flag is cleared once the entry
 * has been processed.
 */
static void
px_guest_panic(px_t *px_p)
{
        pf_data_t *head_pfd = PCIE_DIP2PFD(px_p->px_dip);
        pf_data_t *cur_pfd;
        pcie_bus_t *head_bus, *cur_bus;
        pcie_req_id_list_t *req;

        /*
         * Nothing to do when every device below the root is unassigned;
         * this is the quick exit taken in a non-IOV environment.
         */
        head_bus = PCIE_PFD2BUS(head_pfd);
        if (PCIE_BDG_IS_UNASSIGNED(head_bus))
                return;

        for (cur_pfd = head_pfd; cur_pfd != NULL; cur_pfd = cur_pfd->pe_next) {
                cur_bus = PCIE_PFD2BUS(cur_pfd);

                /* all affected devices are assumed to be in the error Q */
                if (PCIE_BUS2DOM(cur_bus)->nfma_panic) {
                        if (PCIE_IS_BDG(cur_bus)) {
                                /* panic each BDF listed for the bridge */
                                for (req = PCIE_BDF_LIST_GET(cur_bus);
                                    req != NULL; req = req->next)
                                        px_panic_domain(px_p, req->bdf);
                        } else {
                                px_panic_domain(px_p, cur_bus->bus_bdf);
                        }

                        /* this entry is handled; reset its panic flag */
                        PCIE_BUS2DOM(cur_bus)->nfma_panic = B_FALSE;
                }
        }
}

/*
 * End a PX error handling session for px_p: clear px_fm_mutex_owner and
 * release px_fm_mutex.  When px_pfd_idx is non-zero, the non-fma-capable
 * domains are panicked, pf_eh_exit() is called and px_pfd_idx is reset to
 * zero before the mutex is dropped.
 */
void
px_fm_exit(px_t *px_p)
{
        px_p->px_fm_mutex_owner = NULL;

        if (px_p->px_pfd_idx != 0) {
                /* panic the affected domains that are non-fma-capable */
                px_guest_panic(px_p);

                /* tell the PCIe error handling module that we are done */
                pf_eh_exit(PCIE_DIP2BUS(px_p->px_dip));
                px_p->px_pfd_idx = 0;
        }

        mutex_exit(&px_p->px_fm_mutex);
}

/*
 * Decide whether the reported error severities warrant a panic, and panic
 * unless the caller only asked for a test.
 *
 * If the system is already panicking (panicstr is set), px_panicing is set
 * and nothing else is done.  Otherwise a message naming the error sources is
 * assembled; if it ends up non-empty, px_panicing is set and, when isTest is
 * false, fm_panic() is called.
 *
 * rc_err = Error severity of PX specific errors, checked against px_die
 * msg = Where the error was detected (PX_RC, PX_RP and/or PX_HB)
 * fabric_err = Error severity of PCIe Fabric errors
 * isTest = Test if error severity causes panic
 */
#define MSZ (sizeof (fm_msg) -strlen(fm_msg) - 1)
void
px_err_panic(int rc_err, int msg, int fabric_err, boolean_t isTest)
{
        char fm_msg[96] = "";
        int fab_sev;

        if (panicstr) {
                px_panicing = B_TRUE;
                return;
        }

        /* Name the PX-specific sources only if rc_err is a panic severity. */
        if ((rc_err & px_die) != 0) {
                if (msg & PX_RC)
                        (void) strncat(fm_msg, px_panic_rc_msg, MSZ);
                if (msg & PX_RP)
                        (void) strncat(fm_msg, px_panic_rp_msg, MSZ);
                if (msg & PX_HB)
                        (void) strncat(fm_msg, px_panic_hb_msg, MSZ);
        }

        /* Translate the fabric scan result into a PX severity. */
        if ((fabric_err & PF_ERR_FATAL_FLAGS) != 0)
                fab_sev = PX_PANIC;
        else if ((fabric_err & ~(PF_ERR_FATAL_FLAGS | PF_ERR_NO_ERROR)) != 0)
                fab_sev = PX_NO_PANIC;
        else
                fab_sev = PX_NO_ERROR;

        if ((fab_sev & px_die) != 0) {
                /* join with the PX-specific part when one is present */
                if (fm_msg[0] != '\0')
                        (void) strncat(fm_msg, " and", MSZ);
                (void) strncat(fm_msg, px_panic_fab_msg, MSZ);
        }

        /* An empty message means nothing met the panic criteria. */
        if (fm_msg[0] == '\0')
                return;

        px_panicing = B_TRUE;
        if (!isTest)
                fm_panic("Fatal error has occured in:%s.(0x%x)(0x%x)",
                    fm_msg, rc_err, fabric_err);
}