root/usr/src/uts/common/io/scsi/adapters/lmrc/lmrc.c
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Racktop Systems, Inc.
 */

/*
 * This file implements the interfaces for communicating with the MegaRAID HBA.
 * There are three basic interfaces:
 * - the device registers, which provide basic information about the controller
 *   hardware and the features it supports, as well as control registers used
 *   during sending and reception of I/O frames
 * - Fusion-MPT v2.5, perhaps later, which defines the format of the I/O frames
 *   used for communicating with the HBA and virtual and physical devices that
 *   are attached to it
 * - MFI, the MegaRAID Firmware Interface, which are sent and received as MPT
 *   payloads to control and communicate with the RAID controller.
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/scsi/scsi.h>

#include <sys/scsi/adapters/mfi/mfi.h>
#include <sys/scsi/adapters/mfi/mfi_evt.h>
#include <sys/scsi/adapters/mfi/mfi_pd.h>

#include <sys/cpuvar.h>

#include "lmrc.h"
#include "lmrc_reg.h"
#include "lmrc_raid.h"
#include "lmrc_phys.h"

/*
 * Forward declarations of file-local (static) functions.
 */
static uint32_t lmrc_read_reg(lmrc_t *, uint32_t);
static void lmrc_write_reg(lmrc_t *, uint32_t, uint32_t);
static int lmrc_transition_to_ready(lmrc_t *);
static void lmrc_process_mptmfi_passthru(lmrc_t *, lmrc_mpt_cmd_t *);
static int lmrc_poll_mfi(lmrc_t *, lmrc_mfi_cmd_t *, uint8_t);
static boolean_t lmrc_check_fw_fault(lmrc_t *);
static int lmrc_get_event_log_info(lmrc_t *, mfi_evt_log_info_t *);
static void lmrc_aen_handler(void *);
static void lmrc_complete_aen(lmrc_t *, lmrc_mfi_cmd_t *);
static int lmrc_register_aen(lmrc_t *, uint32_t);

/*
 * Device register access functions.
 *
 * Due to the way ddi_get* and ddi_put* work, we'll need to calculate the
 * absolute virtual address of the registers ourselves.
 *
 * For read accesses, employ an erratum workaround for Aero controllers. In some
 * cases, reads of certain registers will intermittently return all zeros. As a
 * workaround, retry the read up to three times until a non-zero value is read.
 * Supposedly this is enough, every other driver I looked at does this.
 */
/*
 * lmrc_read_reg_1
 *
 * Perform one raw 32bit read of the device register located 'reg' bytes into
 * the register mapping. No erratum handling is done here.
 */
static uint32_t
lmrc_read_reg_1(lmrc_t *lmrc, uint32_t reg)
{
        uintptr_t vaddr = (uintptr_t)lmrc->l_regmap + reg;

        return (ddi_get32(lmrc->l_reghandle, (uint32_t *)vaddr));
}

/*
 * lmrc_read_reg
 *
 * Read a 32bit device register. For all controller classes but Aero this is
 * a single read. On Aero, a read returning zero is repeated, for a total of at
 * most three reads; the first non-zero value is returned, or zero if every
 * read returned zero.
 */
static uint32_t
lmrc_read_reg(lmrc_t *lmrc, uint32_t reg)
{
        uint_t attempts_left = 3;
        uint32_t val;

        if (lmrc->l_class != LMRC_ACLASS_AERO)
                return (lmrc_read_reg_1(lmrc, reg));

        /* Workaround for the hardware erratum in Aero controllers */
        do {
                val = lmrc_read_reg_1(lmrc, reg);
        } while (val == 0 && --attempts_left > 0);

        return (val);
}

/*
 * lmrc_write_reg
 *
 * Write the 32bit value 'val' to the device register located 'reg' bytes into
 * the register mapping.
 */
static void
lmrc_write_reg(lmrc_t *lmrc, uint32_t reg, uint32_t val)
{
        uintptr_t vaddr = (uintptr_t)lmrc->l_regmap + reg;

        ddi_put32(lmrc->l_reghandle, (uint32_t *)vaddr, val);
}

/*
 * lmrc_write_reg64
 *
 * Write the 64bit value 'val' with a single ddi_put64() to the device register
 * located 'reg' bytes into the register mapping.
 */
static void
lmrc_write_reg64(lmrc_t *lmrc, uint32_t reg, uint64_t val)
{
        uintptr_t vaddr = (uintptr_t)lmrc->l_regmap + reg;

        ddi_put64(lmrc->l_reghandle, (uint64_t *)vaddr, val);
}

/*
 * Interrupt control
 *
 * There are two interrupt registers for host driver use, HostInterruptStatus
 * and HostInterruptMask. Most of the bits in each register are reserved and
 * must be masked and/or preserved when used.
 */
/*
 * lmrc_disable_intr
 *
 * Mask the reset, reply, and doorbell interrupts by setting their bits in the
 * HostInterruptMask register. All other bits are written back as they were
 * read.
 */
void
lmrc_disable_intr(lmrc_t *lmrc)
{
        uint32_t him = lmrc_read_reg(lmrc, MPI2_HOST_INTERRUPT_MASK_OFFSET);

        /* Disable all known interrupts: reset, reply, and doorbell. */
        him |= MPI2_HIM_RESET_IRQ_MASK | MPI2_HIM_REPLY_INT_MASK |
            MPI2_HIM_IOC2SYS_DB_MASK;
        lmrc_write_reg(lmrc, MPI2_HOST_INTERRUPT_MASK_OFFSET, him);

        /* Dummy read to force pci flush. Probably bogus but harmless. */
        (void) lmrc_read_reg(lmrc, MPI2_HOST_INTERRUPT_MASK_OFFSET);
}

/*
 * lmrc_enable_intr
 *
 * Unmask the reply and doorbell interrupts. The current mask is read first,
 * then the HostInterruptStatus register is written with zero to clear anything
 * outstanding, and only then is the updated mask written. Each write is
 * followed by a read of the same register.
 */
void
lmrc_enable_intr(lmrc_t *lmrc)
{
        uint32_t him = lmrc_read_reg(lmrc, MPI2_HOST_INTERRUPT_MASK_OFFSET);

        /* Enable the reply interrupts and the doorbell interrupts. */
        him &= ~(MPI2_HIM_REPLY_INT_MASK | MPI2_HIM_IOC2SYS_DB_MASK);

        /*
         * Clear outstanding interrupts before enabling any. The read back is
         * a dummy read to force pci flush. Probably bogus but harmless.
         */
        lmrc_write_reg(lmrc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0);
        (void) lmrc_read_reg(lmrc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);

        /* Install the new mask, again followed by a dummy read. */
        lmrc_write_reg(lmrc, MPI2_HOST_INTERRUPT_MASK_OFFSET, him);
        (void) lmrc_read_reg(lmrc, MPI2_HOST_INTERRUPT_MASK_OFFSET);
}

/*
 * lmrc_intr_ack
 *
 * Check whether the controller has a reply descriptor or doorbell interrupt
 * pending. Returns DDI_INTR_CLAIMED if so and the register access handle
 * checks out. Returns DDI_INTR_UNCLAIMED if neither bit is set in the
 * HostInterruptStatus register, or if the access handle check fails; in the
 * latter case a DDI_SERVICE_LOST impact is reported as well.
 *
 * The status register is only read here, nothing is written to it.
 */
uint_t
lmrc_intr_ack(lmrc_t *lmrc)
{
        uint32_t pending;

        pending = lmrc_read_reg(lmrc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
        pending &=
            MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT | MPI2_HIS_IOC2SYS_DB_STATUS;

        if (pending == 0)
                return (DDI_INTR_UNCLAIMED);

        if (lmrc_check_acc_handle(lmrc->l_reghandle) == DDI_SUCCESS)
                return (DDI_INTR_CLAIMED);

        ddi_fm_service_impact(lmrc->l_dip, DDI_SERVICE_LOST);
        return (DDI_INTR_UNCLAIMED);
}

/*
 * Fusion-MPT requests
 *
 * The controller expects to have access to a large chunk of DMA memory, into
 * which the driver writes fixed-size I/O requests for the controller to
 * process. To notify the hardware about a new request, a request descriptor is
 * written to the queue port registers which includes the SMID of the request.
 * This memory isn't really a queue, though, as it seems there are no
 * constraints about ordering of the requests. All that matters is that there
 * is a valid request at the address that corresponds with the SMID in the
 * descriptor.
 *
 * If the hardware supports MPI 2.6 atomic request descriptors, which are a
 * 32bit subset of the 64bit MPI 2.0/2.5 request descriptors, the descriptor is
 * sent to the controller in a single 32bit write into a device register.
 *
 * For all other descriptor types, we'll employ a 64bit write to the queue
 * registers, assuming that provides the required atomicity.
 */
/*
 * lmrc_send_atomic_request
 *
 * Post an atomic request descriptor. If the controller supports atomic
 * descriptors, it is written as-is with one 32bit register write. Otherwise
 * it is embedded into a zeroed full-size request descriptor which is then
 * posted through lmrc_send_request().
 */
void
lmrc_send_atomic_request(lmrc_t *lmrc, lmrc_atomic_req_desc_t req_desc)
{
        lmrc_req_desc_t rd;

        if (lmrc->l_atomic_desc_support) {
                lmrc_write_reg(lmrc,
                    MPI26_ATOMIC_REQUEST_DESCRIPTOR_POST_OFFSET,
                    req_desc.rd_reg);
                return;
        }

        /* No atomic descriptor support, fall back to the 64bit descriptor. */
        bzero(&rd, sizeof (rd));
        rd.rd_atomic = req_desc;

        lmrc_send_request(lmrc, rd);
}

/*
 * lmrc_send_request
 *
 * Post a full-size request descriptor by writing its register image with a
 * single 64bit write to the request descriptor post register.
 */
void
lmrc_send_request(lmrc_t *lmrc, lmrc_req_desc_t req_desc)
{
        const uint32_t post_reg = MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET;

        lmrc_write_reg64(lmrc, post_reg, req_desc.rd_reg);
}

/*
 * lmrc_build_atomic_request
 *
 * Build and return an atomic request descriptor for a MPT command. The
 * command must have a non-zero SMID. As a side effect, the reply queue for
 * the command is chosen and recorded in mpt->mpt_queue.
 */
lmrc_atomic_req_desc_t
lmrc_build_atomic_request(lmrc_t *lmrc, lmrc_mpt_cmd_t *mpt, uint8_t flags)
{
        lmrc_atomic_req_desc_t rd;

        VERIFY3U(mpt->mpt_smid, !=, 0);

        bzero(&rd, sizeof (rd));

        /*
         * Spread the reply load across the reply queues by picking the queue
         * from the id of the CPU we're currently running on.
         */
        mpt->mpt_queue = CPU->cpu_id % lmrc->l_max_reply_queues;

        rd.rd_atomic.SMID = mpt->mpt_smid;
        rd.rd_atomic.MSIxIndex = mpt->mpt_queue;
        rd.rd_atomic.RequestFlags = flags;

        return (rd);
}

/*
 * Reply Processing
 *
 * The controller will post replies to completed requests in the DMA memory
 * provided for that purpose. This memory is divided in equally-sized chunks,
 * each being a separate reply queue that is also associated with an interrupt
 * vector. The replies are fixed size structures and will be written by the
 * hardware in order of completion into the queue. For each queue, there is a
 * register to tell the hardware which replies have been consumed by the driver.
 *
 * In response to an interrupt, the driver will walk the reply queue associated
 * with the interrupt vector at the last known position and process all
 * completed replies. After a number of replies have been processed, or if no
 * more replies are ready to be processed, the controller will be notified about
 * the last reply index to be processed by writing the appropriate register.
 */

/*
 * lmrc_get_next_reply_desc
 *
 * Return the reply descriptor at the current position of the given reply
 * queue and advance the position by one, wrapping around at the queue depth.
 * If the descriptor at the current position is unused, return NULL and leave
 * the position alone.
 */
static Mpi2ReplyDescriptorsUnion_t *
lmrc_get_next_reply_desc(lmrc_t *lmrc, int queue)
{
        Mpi2ReplyDescriptorsUnion_t *desc = lmrc->l_reply_dma.ld_buf;

        /* Skip to the start of this queue, then to the current position. */
        desc = &desc[(queue * lmrc->l_reply_alloc_sz) / sizeof (*desc)];
        desc = &desc[lmrc->l_last_reply_idx[queue]];

        /* Sync just this one descriptor before looking at it. */
        VERIFY3S(ddi_dma_sync(lmrc->l_reply_dma.ld_hdl,
            (void *)desc - lmrc->l_reply_dma.ld_buf, sizeof (*desc),
            DDI_DMA_SYNC_FORKERNEL), ==, DDI_SUCCESS);

        /*
         * An unused descriptor means there are no more replies in this
         * queue. A descriptor with only one of its two words still marked
         * unused can't be used either.
         */
        if (desc->Words.Low == MPI2_RPY_DESCRIPT_UNUSED_WORD0_MARK)
                return (NULL);

        if (desc->Words.High == MPI2_RPY_DESCRIPT_UNUSED_WORD1_MARK)
                return (NULL);

        /* Consume the descriptor: move on by one, wrapping if necessary. */
        if (++lmrc->l_last_reply_idx[queue] >= lmrc->l_reply_q_depth)
                lmrc->l_last_reply_idx[queue] = 0;

        return (desc);
}

/*
 * lmrc_write_rphi
 *
 * Write the Reply Post Host Index register for a queue, telling the
 * controller the current reply index of that queue. The queue number goes
 * into the bits from bit 24 up, the reply index into the bits below.
 *
 * With combined MSI-X, eight queues share one register: the register is
 * selected by queue / 8 and the value is cut down to its low 27 bits, leaving
 * only the low three bits of the queue number in it. Unless MSI-X interrupts
 * are in use, queue 0 is the only valid queue.
 */
static void
lmrc_write_rphi(lmrc_t *lmrc, uint32_t queue)
{
        uint32_t val = (queue << 24) | lmrc->l_last_reply_idx[queue];
        int reg;

        if (lmrc->l_intr_type != DDI_INTR_TYPE_MSIX)
                VERIFY3U(queue, ==, 0);

        if (lmrc->l_msix_combined) {
                val &= 0x07ffffff;
                reg = queue / 8;
        } else {
                reg = 0;
        }

        lmrc_write_reg(lmrc, lmrc->l_rphi[reg], val);
}

/*
 * lmrc_process_mpt_pkt
 *
 * Translate the completion of a MPT I/O request into the scsi_pkt. The packet
 * state and statistics are reset, the residual is computed from the DMA length
 * of the packet and data_len, and pkt_reason is set depending on status:
 *
 * - OK and "LD operation in progress" statuses complete with STATUS_GOOD
 * - SCSI errors and out-of-range LBAs complete with a check condition and the
 *   ARQ status filled in; the sense data itself is not touched here
 * - offline LDs and missing devices are reported as CMD_DEV_GONE
 * - anything else is logged, including ex_status, and reported as CMD_TRAN_ERR
 */
static void
lmrc_process_mpt_pkt(lmrc_t *lmrc, struct scsi_pkt *pkt, uint8_t status,
    uint8_t ex_status, uint32_t data_len)
{
        struct scsi_arq_status *arq;

        pkt->pkt_resid = pkt->pkt_dma_len - data_len;
        pkt->pkt_statistics = 0;
        pkt->pkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD |
            STATE_XFERRED_DATA | STATE_GOT_STATUS;

        switch (status) {
        case MFI_STAT_OK:
        case MFI_STAT_LD_CC_IN_PROGRESS:
        case MFI_STAT_LD_RECON_IN_PROGRESS:
                pkt->pkt_scbp[0] = STATUS_GOOD;
                pkt->pkt_reason = CMD_CMPLT;
                break;

        case MFI_STAT_SCSI_DONE_WITH_ERROR:
        case MFI_STAT_LD_LBA_OUT_OF_RANGE:
                arq = (struct scsi_arq_status *)pkt->pkt_scbp;

                /* The command itself completed, with a check condition. */
                pkt->pkt_reason = CMD_CMPLT;
                pkt->pkt_state |= STATE_ARQ_DONE;
                arq->sts_status.sts_chk = 1;

                /* The automatic request sense is reported as successful. */
                arq->sts_rqpkt_reason = CMD_CMPLT;
                arq->sts_rqpkt_resid = 0;
                arq->sts_rqpkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET |
                    STATE_SENT_CMD | STATE_XFERRED_DATA;
                *(uint8_t *)&arq->sts_rqpkt_status = STATUS_GOOD;
                break;

        case MFI_STAT_LD_OFFLINE:
        case MFI_STAT_DEVICE_NOT_FOUND:
                pkt->pkt_statistics = STAT_DISCON;
                pkt->pkt_reason = CMD_DEV_GONE;
                break;

        default:
                dev_err(lmrc->l_dip, CE_WARN, "!command failed, status = %x, "
                    "ex_status = %x, cdb[0] = %x", status, ex_status,
                    pkt->pkt_cdbp[0]);
                pkt->pkt_reason = CMD_TRAN_ERR;
                break;
        }
}

/*
 * lmrc_poll_for_reply
 *
 * Polled completion of a single MPT command, for use during panic only, when
 * core dumps have to be written without interrupts.
 *
 * The reply queue of the command is consumed entry by entry. Replies for other
 * commands are discarded. Whenever the queue is empty we wait for 100us before
 * looking again, giving up with TRAN_FATAL_ERROR once the budget of
 * LMRC_IO_TIMEOUT * MILLISEC * 10 such waits is used up. When the reply for
 * our command is found, the scsi_pkt is completed if the request was an I/O
 * request, and TRAN_ACCEPT is returned.
 */
int
lmrc_poll_for_reply(lmrc_t *lmrc, lmrc_mpt_cmd_t *mpt)
{
        clock_t max_wait = LMRC_IO_TIMEOUT * MILLISEC * 10;
        Mpi2ReplyDescriptorsUnion_t *desc;
        Mpi25SCSIIORequest_t *io_req;
        uint16_t desc_smid;

        VERIFY(ddi_in_panic());

        io_req = mpt->mpt_io_frame;

        for (;;) {
                desc = lmrc_get_next_reply_desc(lmrc, mpt->mpt_queue);
                if (desc == NULL) {
                        /* Nothing there yet. Wait a bit, unless out of time. */
                        if (max_wait == 0)
                                return (TRAN_FATAL_ERROR);

                        drv_usecwait(100);
                        max_wait--;
                        continue;
                }

                desc_smid = desc->SCSIIOSuccess.SMID;

                /* Hand the descriptor back, whether it is ours or not. */
                desc->Words.Low = MPI2_RPY_DESCRIPT_UNUSED_WORD0_MARK;
                desc->Words.High = MPI2_RPY_DESCRIPT_UNUSED_WORD1_MARK;

                lmrc_write_rphi(lmrc, mpt->mpt_queue);

                if (desc_smid == mpt->mpt_smid)
                        break;
        }

        VERIFY3S(ddi_dma_sync(lmrc->l_ioreq_dma.ld_hdl,
            (void *)io_req - lmrc->l_ioreq_dma.ld_buf,
            LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE, DDI_DMA_SYNC_FORKERNEL),
            ==, DDI_SUCCESS);

        /* Only I/O requests have a scsi_pkt that needs to be updated. */
        switch (io_req->Function) {
        case LMRC_MPI2_FUNCTION_LD_IO_REQUEST:
        case MPI2_FUNCTION_SCSI_IO_REQUEST:
                lmrc_process_mpt_pkt(lmrc, mpt->mpt_pkt,
                    io_req->VendorRegion.rc_status,
                    io_req->VendorRegion.rc_exstatus, io_req->DataLength);
                break;

        default:
                break;
        }

        return (TRAN_ACCEPT);
}

/*
 * lmrc_process_replies
 *
 * Process all new reply entries in a queue in response to an interrupt.
 *
 * For each reply, the MPT command is looked up by the SMID found in the reply
 * descriptor, marked complete, and handled according to the Function of its
 * I/O request frame:
 * - task management: the waiter is woken up through the command's CV
 * - SCSI and LD I/O: the scsi_pkt is updated and, after the command mutex has
 *   been dropped, removed from the target's active list and completed
 * - MFI passthrough: handed to lmrc_process_mptmfi_passthru()
 * Any other Function is a fatal error.
 *
 * The controller is told about consumed descriptors every
 * LMRC_THRESHOLD_REPLY_COUNT replies and once more at the end if needed.
 *
 * Always returns DDI_INTR_CLAIMED.
 */
int
lmrc_process_replies(lmrc_t *lmrc, uint8_t queue)
{
        int nprocessed = 0;
        Mpi2ReplyDescriptorsUnion_t *desc;

        for (desc = lmrc_get_next_reply_desc(lmrc, queue);
            desc != NULL;
            desc = lmrc_get_next_reply_desc(lmrc, queue)) {
                Mpi2SCSIIOSuccessReplyDescriptor_t *reply =
                    &desc->SCSIIOSuccess;
                uint16_t smid = reply->SMID;
                lmrc_mpt_cmd_t *mpt;
                lmrc_tgt_t *tgt = NULL;
                Mpi25SCSIIORequest_t *io_req;
                struct scsi_pkt *pkt;
                struct scsi_device *sd;

                /*
                 * The SMID comes from memory written by the hardware and is
                 * used as an index below, so validate it before touching
                 * l_mpt_cmds[]. Valid SMIDs start at 1. Check the copy we
                 * took rather than reading the descriptor a second time.
                 */
                VERIFY3U(smid, !=, 0);
                VERIFY3U(smid, <=, lmrc->l_max_fw_cmds);
                mpt = lmrc->l_mpt_cmds[smid - 1];

                mutex_enter(&mpt->mpt_lock);
                mpt->mpt_complete = B_TRUE;
                pkt = mpt->mpt_pkt;
                io_req = mpt->mpt_io_frame;

                VERIFY3S(ddi_dma_sync(lmrc->l_ioreq_dma.ld_hdl,
                    (void *)io_req - lmrc->l_ioreq_dma.ld_buf,
                    LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE,
                    DDI_DMA_SYNC_FORKERNEL), ==, DDI_SUCCESS);


                switch (io_req->Function) {
                case MPI2_FUNCTION_SCSI_TASK_MGMT:
                        VERIFY0(pkt);
                        VERIFY0(list_link_active(&mpt->mpt_node));
                        cv_signal(&mpt->mpt_cv);
                        break;

                case MPI2_FUNCTION_SCSI_IO_REQUEST:
                case LMRC_MPI2_FUNCTION_LD_IO_REQUEST:
                        VERIFY(pkt != NULL);

                        sd = scsi_address_device(&pkt->pkt_address);
                        VERIFY(sd != NULL);

                        tgt = scsi_device_hba_private_get(sd);
                        VERIFY(tgt != NULL);

                        lmrc_process_mpt_pkt(lmrc, pkt,
                            io_req->VendorRegion.rc_status,
                            io_req->VendorRegion.rc_exstatus,
                            io_req->DataLength);

                        break;

                case LMRC_MPI2_FUNCTION_PASSTHRU_IO_REQUEST:
                        VERIFY0(pkt);
                        VERIFY0(list_link_active(&mpt->mpt_node));
                        lmrc_process_mptmfi_passthru(lmrc, mpt);
                        break;

                default:
                        /* Drop the mutex first, we're about to panic. */
                        mutex_exit(&mpt->mpt_lock);
                        dev_err(lmrc->l_dip, CE_PANIC,
                            "reply received for unknown Function %x",
                            io_req->Function);
                }

                mutex_exit(&mpt->mpt_lock);

                /* Only I/O requests carry a pkt; tgt was set up for those. */
                if (pkt != NULL) {
                        lmrc_tgt_rem_active_mpt(tgt, mpt);
                        atomic_dec_uint(&lmrc->l_fw_outstanding_cmds);
                        scsi_hba_pkt_comp(pkt);
                }

                /* reset descriptor */
                desc->Words.Low = MPI2_RPY_DESCRIPT_UNUSED_WORD0_MARK;
                desc->Words.High = MPI2_RPY_DESCRIPT_UNUSED_WORD1_MARK;

                nprocessed++;

                if (nprocessed % LMRC_THRESHOLD_REPLY_COUNT == 0)
                        lmrc_write_rphi(lmrc, queue);
        }

        /* Report whatever hasn't been reported by the threshold logic yet. */
        if (nprocessed != 0 && nprocessed % LMRC_THRESHOLD_REPLY_COUNT != 0)
                lmrc_write_rphi(lmrc, queue);

        return (DDI_INTR_CLAIMED);
}


/*
 * MFI - MegaRAID Firmware Interface
 */

/*
 * lmrc_build_mptmfi_passthru
 *
 * MFI commands are sent as MPT MFI passthrough I/O requests. To be able to
 * send a MFI frame to the RAID controller, we need to have a MPT command set
 * up as MPT I/O request and a one-entry SGL pointing to the MFI command.
 *
 * As there's only a small number of MFI commands compared to the amount of MPT
 * commands, the MPT command for each MFI is pre-allocated at attach time and
 * initialized here.
 *
 * Returns DDI_FAILURE if no MPT command could be obtained, DDI_SUCCESS
 * otherwise. On success the MPT and MFI commands point at each other and the
 * MPT command's mutex has been released.
 */
int
lmrc_build_mptmfi_passthru(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi)
{
        lmrc_mpt_cmd_t *mpt = lmrc_get_mpt(lmrc);
        const ddi_dma_cookie_t *cookie;
        Mpi25SCSIIORequest_t *io_req;

        if (mpt == NULL)
                return (DDI_FAILURE);

        /* lmrc_get_mpt() should return the mpt locked */
        ASSERT(mutex_owned(&mpt->mpt_lock));

        /* Tie the two commands together. */
        mpt->mpt_mfi = mfi;
        mfi->mfi_mpt = mpt;

        /* Set up the request frame as a MFI passthrough request. */
        io_req = mpt->mpt_io_frame;
        io_req->ChainOffset = lmrc->l_chain_offset_mfi_pthru;
        io_req->Function = LMRC_MPI2_FUNCTION_PASSTHRU_IO_REQUEST;

        /* The SGL has a single entry, covering the MFI frame. */
        cookie = ddi_dma_cookie_one(mfi->mfi_frame_dma.ld_hdl);
        lmrc_dma_build_sgl(lmrc, mpt, cookie, 1);

        VERIFY3S(ddi_dma_sync(lmrc->l_ioreq_dma.ld_hdl,
            (void *)io_req - lmrc->l_ioreq_dma.ld_buf,
            LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE, DDI_DMA_SYNC_FORDEV),
            ==, DDI_SUCCESS);

        /*
         * The command isn't going to the hardware any time soon, so there is
         * no point in holding on to the mutex.
         */
        mutex_exit(&mpt->mpt_lock);

        return (DDI_SUCCESS);
}

/*
 * lmrc_process_mptmfi_passthru
 *
 * Completion handling for a MPT MFI passthrough command. If the MFI frame is
 * flagged as a read, the data buffer is synced for the kernel first. For the
 * known MFI commands (DCMD, LD/PD SCSI I/O, ABORT) the MFI command's callback,
 * if there is one, is then called with the MFI command's mutex held. The
 * completion of any other MFI command should never happen and is fatal.
 */
static void
lmrc_process_mptmfi_passthru(lmrc_t *lmrc, lmrc_mpt_cmd_t *mpt)
{
        lmrc_mfi_cmd_t *mfi;
        mfi_header_t *hdr;

        VERIFY3P(mpt->mpt_mfi, !=, NULL);
        mfi = mpt->mpt_mfi;
        VERIFY0(list_link_active(&mfi->mfi_node));

        hdr = &mfi->mfi_frame->mf_hdr;

        if ((hdr->mh_flags & MFI_FRAME_DIR_READ) != 0) {
                (void) ddi_dma_sync(mfi->mfi_data_dma.ld_hdl, 0,
                    mfi->mfi_data_dma.ld_len, DDI_DMA_SYNC_FORKERNEL);
        }

        /* Weed out anything that isn't a command we know how to complete. */
        switch (hdr->mh_cmd) {
        case MFI_CMD_DCMD:
        case MFI_CMD_LD_SCSI_IO:
        case MFI_CMD_PD_SCSI_IO:
        case MFI_CMD_ABORT:
                break;

        case MFI_CMD_INVALID:
        default:
                dev_err(lmrc->l_dip, CE_PANIC,
                    "invalid MFI cmd completion received, cmd = %x",
                    hdr->mh_cmd);
                return;
        }

        mutex_enter(&mfi->mfi_lock);
        if (mfi->mfi_callback != NULL)
                mfi->mfi_callback(lmrc, mfi);
        mutex_exit(&mfi->mfi_lock);
}

/*
 * lmrc_issue_mfi
 *
 * Post a MFI command to the firmware. The MFI command's mutex must be held.
 *
 * A callback is mandatory, and is installed, unless the frame is flagged
 * MFI_FRAME_DONT_POST_IN_REPLY_QUEUE, in which case there must be none. The
 * command status is reset to invalid and an atomic request descriptor is
 * built for the associated MPT command. The MFI frame, and for writes also the
 * data buffer, is synced for the device before the descriptor is posted.
 */
void
lmrc_issue_mfi(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi, lmrc_mfi_cmd_cb_t *cb)
{
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;
        lmrc_atomic_req_desc_t req_desc;

        ASSERT(mutex_owned(&mfi->mfi_lock));

        if ((hdr->mh_flags & MFI_FRAME_DONT_POST_IN_REPLY_QUEUE) != 0) {
                /* No reply will be posted, so a callback makes no sense. */
                VERIFY3U(cb, ==, NULL);
        } else {
                VERIFY3U(cb, !=, NULL);
                mfi->mfi_callback = cb;
        }

        /* Anything but this status means the command has completed. */
        hdr->mh_cmd_status = MFI_STAT_INVALID_STATUS;

        req_desc = lmrc_build_atomic_request(lmrc, mfi->mfi_mpt,
            MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO);

        (void) ddi_dma_sync(mfi->mfi_frame_dma.ld_hdl, 0,
            mfi->mfi_frame_dma.ld_len, DDI_DMA_SYNC_FORDEV);

        if ((hdr->mh_flags & MFI_FRAME_DIR_WRITE) != 0) {
                (void) ddi_dma_sync(mfi->mfi_data_dma.ld_hdl, 0,
                    mfi->mfi_data_dma.ld_len, DDI_DMA_SYNC_FORDEV);
        }

        lmrc_send_atomic_request(lmrc, req_desc);
}

/*
 * lmrc_poll_mfi
 *
 * Poll a MFI command for completion, waiting up to max_wait secs. The MFI
 * command's mutex must be held. The MFI frame is synced and the command status
 * checked repeatedly, with a short timed wait on the command's CV in between,
 * until the status is no longer invalid, a firmware fault is flagged, or time
 * runs out.
 *
 * Returns DDI_SUCCESS if the command completed, DDI_FAILURE otherwise. On
 * timeout, a warning is logged and the housekeeping thread is signalled so
 * that it can check for faults and reset the controller if necessary.
 */
static int
lmrc_poll_mfi(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi, uint8_t max_wait)
{
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;
        lmrc_dma_t *dma = &mfi->mfi_frame_dma;
        clock_t deadline =
            ddi_get_lbolt() + drv_usectohz(max_wait * MICROSEC);
        clock_t now;

        ASSERT(mutex_owned(&mfi->mfi_lock));

        for (;;) {
                (void) ddi_dma_sync(dma->ld_hdl, 0, dma->ld_len,
                    DDI_DMA_SYNC_FORKERNEL);
                if (hdr->mh_cmd_status != MFI_STAT_INVALID_STATUS)
                        return (DDI_SUCCESS);

                (void) cv_reltimedwait(&mfi->mfi_cv, &mfi->mfi_lock,
                    drv_usectohz(MILLISEC), TR_MILLISEC);
                now = ddi_get_lbolt();

                if (lmrc->l_fw_fault || now > deadline)
                        break;
        }

        /* Have one last look at the status before giving up. */
        if (hdr->mh_cmd_status != MFI_STAT_INVALID_STATUS)
                return (DDI_SUCCESS);

        if (now > deadline) {
                dev_err(lmrc->l_dip, CE_WARN,
                    "!%s: command timeout after %ds", __func__, max_wait);

                /*
                 * Signal the housekeeping thread to check for FW/HW faults,
                 * performing a reset if necessary.
                 */
                cv_signal(&lmrc->l_thread_cv);
        }

        return (DDI_FAILURE);
}

/*
 * lmrc_wait_mfi
 *
 * Wait for up to max_wait secs for a MFI command to complete. The cmd mutex
 * must be held. We sleep on the command's CV and sync the MFI frame after
 * each wakeup, until the command status is no longer invalid, a firmware
 * fault is flagged, or the wait times out.
 *
 * Returns DDI_SUCCESS unless a firmware fault was flagged or the wait timed
 * out, in which case DDI_FAILURE is returned. On timeout, a warning is logged
 * and the housekeeping thread is signalled so that it can check for faults and
 * reset the controller if necessary.
 */
int
lmrc_wait_mfi(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi, uint8_t max_wait)
{
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;
        lmrc_dma_t *dma = &mfi->mfi_frame_dma;
        clock_t deadline =
            ddi_get_lbolt() + drv_usectohz(max_wait * MICROSEC);
        int ret;

        ASSERT(mutex_owned(&mfi->mfi_lock));

        for (;;) {
                ret = cv_timedwait(&mfi->mfi_cv, &mfi->mfi_lock, deadline);

                (void) ddi_dma_sync(dma->ld_hdl, 0, dma->ld_len,
                    DDI_DMA_SYNC_FORKERNEL);

                if (lmrc->l_fw_fault)
                        break;

                if (hdr->mh_cmd_status != MFI_STAT_INVALID_STATUS)
                        break;

                /* cv_timedwait() returning -1 means we ran out of time. */
                if (ret == -1)
                        break;
        }

        if (!lmrc->l_fw_fault && ret != -1)
                return (DDI_SUCCESS);

        if (ret == -1) {
                dev_err(lmrc->l_dip, CE_WARN, "!%s: blocked command timeout "
                    "after %ds, cmd = %d, status = %d", __func__, max_wait,
                    hdr->mh_cmd, hdr->mh_cmd_status);

                /*
                 * Signal the housekeeping thread to check for FW/HW faults,
                 * performing a reset if necessary.
                 */
                cv_signal(&lmrc->l_thread_cv);
        }

        return (DDI_FAILURE);
}

/*
 * lmrc_wakeup_mfi
 *
 * Signal the CV associated with a MFI command to wake up the thread waiting
 * for its completion. The MFI command's mutex must be held by the caller.
 *
 * Within this file it is passed to lmrc_issue_mfi() as the completion
 * callback by lmrc_issue_blocked_mfi() and lmrc_abort_cmd(). The lmrc
 * argument is not used.
 */
void
lmrc_wakeup_mfi(lmrc_t *lmrc, lmrc_mfi_cmd_t *cmd)
{
        ASSERT(mutex_owned(&cmd->mfi_lock));
        cv_signal(&cmd->mfi_cv);
}

/*
 * lmrc_issue_blocked_mfi
 *
 * Post a MFI command to the firmware and wait for up to
 * LMRC_INTERNAL_CMD_WAIT_TIME for it to complete.
 *
 * The return value is that of lmrc_wait_mfi(). A warning is logged if the wait
 * failed or if the command completed with a status other than MFI_STAT_OK.
 * Note that in the latter case DDI_SUCCESS is still returned, so callers that
 * care about the command status need to look at the MFI frame header.
 */
int
lmrc_issue_blocked_mfi(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi)
{
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;
        int ret;

        mutex_enter(&mfi->mfi_lock);
        lmrc_issue_mfi(lmrc, mfi, lmrc_wakeup_mfi);
        ret = lmrc_wait_mfi(lmrc, mfi, LMRC_INTERNAL_CMD_WAIT_TIME);
        mutex_exit(&mfi->mfi_lock);

        if (ret != DDI_SUCCESS || hdr->mh_cmd_status != MFI_STAT_OK) {
                dev_err(lmrc->l_dip, CE_WARN,
                    "!%s: blocked command failure, cmd = %d, status = %d",
                    __func__, hdr->mh_cmd, hdr->mh_cmd_status);
        }

        return (ret);
}

/*
 * lmrc_abort_cb
 *
 * Completion callback installed on a command while it is being aborted.
 *
 * Should the command complete with MFI_STAT_OK before the abort got to it,
 * the status is changed to MFI_STAT_NOT_FOUND to reflect that it was meant to
 * be aborted. Any other status is left alone.
 */
static void
lmrc_abort_cb(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi)
{
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;

        if (hdr->mh_cmd_status != MFI_STAT_OK)
                return;

        hdr->mh_cmd_status = MFI_STAT_NOT_FOUND;
}

/*
 * lmrc_abort_cmd
 *
 * Abort a MFI command. This is a bit tricky as the hardware may still complete
 * it at any time.
 *
 * The mutex of the command to be aborted must be held to prevent it from
 * completing behind our back. We'll replace its callback with our own, issue an
 * ABORT command, and drop the mutex before we wait for the ABORT command to
 * complete.
 *
 * The mutex of the command to be aborted is held again when we return. The
 * original callback is restored on all paths, and it is called here if the
 * aborted command was seen to complete.
 *
 * Returns DDI_SUCCESS if the aborted command was seen to complete, and the
 * failing result of waiting for either command otherwise.
 */
static int
lmrc_abort_cmd(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi_to_abort)
{
        lmrc_mfi_cmd_t *mfi = lmrc_get_mfi(lmrc);
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;
        mfi_abort_payload_t *abort = &mfi->mfi_frame->mf_abort;
        lmrc_mfi_cmd_cb_t *orig_cb = mfi_to_abort->mfi_callback;
        int ret;

        ASSERT(mutex_owned(&mfi_to_abort->mfi_lock));

        /* Replace the command's callback with our own. */
        mfi_to_abort->mfi_callback = lmrc_abort_cb;

        /*
         * Build the ABORT frame. The command to abort is identified by its
         * index and by the address of its MFI frame.
         */
        hdr->mh_cmd = MFI_CMD_ABORT;
        abort->ma_abort_context = mfi_to_abort->mfi_idx;
        lmrc_dma_set_addr64(&mfi_to_abort->mfi_frame_dma,
            &abort->ma_abort_mfi_phys_addr);

        /* Send the ABORT. */
        mutex_enter(&mfi->mfi_lock);
        lmrc_issue_mfi(lmrc, mfi, lmrc_wakeup_mfi);

        /*
         * Drop the mutex of the command to be aborted, allowing it to proceed
         * while we wait for the ABORT command to complete.
         */
        mutex_exit(&mfi_to_abort->mfi_lock);
        ret = lmrc_wait_mfi(lmrc, mfi, LMRC_INTERNAL_CMD_WAIT_TIME);
        mutex_exit(&mfi->mfi_lock);

        /*
         * The ABORT command may fail if mfi_to_abort has completed already.
         * Treat any other failure as fatal, restore the callback and fail.
         */
        if (ret != DDI_SUCCESS && hdr->mh_cmd_status != MFI_STAT_NOT_FOUND) {
                mutex_enter(&mfi_to_abort->mfi_lock);
                mfi_to_abort->mfi_callback = orig_cb;
                goto out;
        }

        /*
         * Wait for the aborted command to complete. If we time out on this
         * there's little we can do here, so we restore the callback and fail.
         */
        mutex_enter(&mfi_to_abort->mfi_lock);
        ret = lmrc_poll_mfi(lmrc, mfi_to_abort, LMRC_INTERNAL_CMD_WAIT_TIME);
        mfi_to_abort->mfi_callback = orig_cb;

        if (ret != DDI_SUCCESS)
                goto out;

        /* Wake up anyone waiting on the aborted command. */
        if (mfi_to_abort->mfi_callback != NULL)
                mfi_to_abort->mfi_callback(lmrc, mfi_to_abort);

out:
        /* Release the ABORT command. The caller's mutex is held again here. */
        lmrc_put_mfi(mfi);
        ASSERT(mutex_owned(&mfi_to_abort->mfi_lock));
        return (ret);
}


/*
 * Controller Initialization and Housekeeping
 */

/*
 * lmrc_check_fw_fault
 *
 * Read the firmware state from the scratchpad 0 register. Return B_TRUE if
 * the firmware is in the fault state, B_FALSE otherwise.
 */
static boolean_t
lmrc_check_fw_fault(lmrc_t *lmrc)
{
        uint32_t fw_state;

        fw_state = LMRC_FW_STATE(lmrc_read_reg(lmrc, MPI26_SCRATCHPAD0_OFFSET));

        return (fw_state == LMRC_FW_STATE_FAULT ? B_TRUE : B_FALSE);
}

/*
 * lmrc_wait_for_reg
 *
 * Poll a register until the bits selected by 'bits' are equal to 'exp'.
 * The register is read after each delay of 100 * MILLISEC microseconds, and
 * MILLISEC / 100 such polls are made per unit of max_wait. There is always a
 * delay before the first read.
 *
 * Returns B_TRUE as soon as the register matches, B_FALSE if it never did.
 */
static boolean_t
lmrc_wait_for_reg(lmrc_t *lmrc, uint32_t reg, uint32_t bits, uint32_t exp,
    uint64_t max_wait)
{
        uint64_t polls_left = max_wait * (MILLISEC / 100);

        while (polls_left-- != 0) {
                delay(drv_usectohz(100 * MILLISEC));

                if ((lmrc_read_reg(lmrc, reg) & bits) == exp)
                        return (B_TRUE);
        }

        return (B_FALSE);
}

/*
 * lmrc_hard_reset
 *
 * Reset the adapter through the host diagnostic register.
 *
 * The key sequence is written to the write sequence register, and we wait for
 * the diag write enable bit to show up in the host diagnostic register. If it
 * doesn't, we fail right away. Otherwise the reset adapter bit is set, and the
 * register is checked after a first and, if the bit is still set, after a
 * second delay. The reset is successful if the reset adapter bit was found
 * cleared by either check. In both cases the flush key is written to the
 * write sequence register before returning.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
lmrc_hard_reset(lmrc_t *lmrc)
{
        const uint32_t keys[] = {
                MPI2_WRSEQ_FLUSH_KEY_VALUE,
                MPI2_WRSEQ_1ST_KEY_VALUE,
                MPI2_WRSEQ_2ND_KEY_VALUE,
                MPI2_WRSEQ_3RD_KEY_VALUE,
                MPI2_WRSEQ_4TH_KEY_VALUE,
                MPI2_WRSEQ_5TH_KEY_VALUE,
                MPI2_WRSEQ_6TH_KEY_VALUE
        };
        uint32_t diag;
        unsigned int k;
        int ret = DDI_FAILURE;

        /* Write the reset key sequence. */
        for (k = 0; k < sizeof (keys) / sizeof (keys[0]); k++)
                lmrc_write_reg(lmrc, MPI2_WRITE_SEQUENCE_OFFSET, keys[k]);

        /* Check diag write enable. */
        if (!lmrc_wait_for_reg(lmrc, MPI2_HOST_DIAGNOSTIC_OFFSET,
            MPI2_DIAG_DIAG_WRITE_ENABLE, MPI2_DIAG_DIAG_WRITE_ENABLE,
            LMRC_RESET_TIMEOUT)) {
                dev_err(lmrc->l_dip, CE_WARN, "diag unlock failed");
                return (DDI_FAILURE);
        }

        /* Reset IOC. */
        diag = lmrc_read_reg(lmrc, MPI2_HOST_DIAGNOSTIC_OFFSET);
        lmrc_write_reg(lmrc, MPI2_HOST_DIAGNOSTIC_OFFSET,
            diag | MPI2_DIAG_RESET_ADAPTER);
        delay(drv_usectohz(MPI2_HARD_RESET_PCIE_FIRST_READ_DELAY_MICRO_SEC));

        /* Check the reset adapter bit, a second time if it is still set. */
        diag = lmrc_read_reg(lmrc, MPI2_HOST_DIAGNOSTIC_OFFSET);
        if ((diag & MPI2_DIAG_RESET_ADAPTER) != 0) {
                delay(drv_usectohz(
                    MPI2_HARD_RESET_PCIE_SECOND_READ_DELAY_MICRO_SEC));
                diag = lmrc_read_reg(lmrc, MPI2_HOST_DIAGNOSTIC_OFFSET);
        }

        if ((diag & MPI2_DIAG_RESET_ADAPTER) == 0)
                ret = DDI_SUCCESS;

        lmrc_write_reg(lmrc, MPI2_WRITE_SEQUENCE_OFFSET,
            MPI2_WRSEQ_FLUSH_KEY_VALUE);
        return (ret);
}

/*
 * lmrc_reset_ctrl
 *
 * Attempt to reset the controller, if the hardware supports it.
 * If reset is unsupported or the reset fails repeatedly, we shut the
 * controller down.
 *
 * Up to LMRC_MAX_RESET_TRIES hard resets are attempted. After each successful
 * hard reset we poll the firmware state until it has moved beyond FW_INIT,
 * giving up on this attempt after LMRC_IO_TIMEOUT * MILLISEC / 100 polls that
 * are 100 * MILLISEC microseconds apart.
 *
 * Returns DDI_SUCCESS if the firmware came back, DDI_FAILURE otherwise. If
 * online controller reset has been disabled, DDI_FAILURE is returned right
 * away without touching the controller.
 */
static int
lmrc_reset_ctrl(lmrc_t *lmrc)
{
        uint32_t status, fw_state, reset_adapter;
        int max_wait, i;

        if (lmrc->l_disable_online_ctrl_reset)
                return (DDI_FAILURE);

        status = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD0_OFFSET);
        fw_state = LMRC_FW_STATE(status);
        reset_adapter = LMRC_FW_RESET_ADAPTER(status);

        if (fw_state == LMRC_FW_STATE_FAULT && reset_adapter == 0) {
                dev_err(lmrc->l_dip, CE_WARN,
                    "FW in fault state, but reset not supported");
                goto out;
        }

        for (i = 0; i < LMRC_MAX_RESET_TRIES; i++) {
                dev_err(lmrc->l_dip, CE_WARN, "resetting...");

                if (lmrc_hard_reset(lmrc) != DDI_SUCCESS)
                        continue;

                /*
                 * Wait for the FW state to move beyond INIT. The remaining
                 * number of polls must be counted down after each delay, or
                 * we'd be stuck in here forever with a firmware that never
                 * leaves INIT.
                 */
                max_wait = LMRC_IO_TIMEOUT * MILLISEC / 100;
                do {
                        status = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD0_OFFSET);
                        fw_state = LMRC_FW_STATE(status);

                        if (fw_state <= LMRC_FW_STATE_FW_INIT) {
                                delay(drv_usectohz(100 * MILLISEC));
                                max_wait--;
                        }
                } while (fw_state <= LMRC_FW_STATE_FW_INIT && max_wait > 0);

                if (fw_state <= LMRC_FW_STATE_FW_INIT) {
                        dev_err(lmrc->l_dip, CE_WARN,
                            "fw state <= LMRC_FW_STATE_FW_INIT, state = %x",
                            fw_state);
                        continue;
                }

                return (DDI_SUCCESS);
        }

        dev_err(lmrc->l_dip, CE_WARN, "reset failed");
out:
        /* Stop the controller. */
        lmrc_write_reg(lmrc, MPI2_DOORBELL_OFFSET, MFI_STOP_ADP);
        (void) lmrc_read_reg(lmrc, MPI2_DOORBELL_OFFSET);

        return (DDI_FAILURE);
}

/*
 * lmrc_tgt_complete_cmd
 *
 * In case of a controller reset, complete the cmd and clean up. This is done
 * in a taskq to avoid locking and list manipulation headaches.
 */
static void
lmrc_tgt_complete_cmd(void *arg)
{
        lmrc_scsa_cmd_t *cmd = arg;
        struct scsi_pkt *pkt;
        lmrc_t *lmrc;

        mutex_enter(&cmd->sc_mpt->mpt_lock);

        /* Just in case the command completed before the taskq was run... */
        if (cmd->sc_mpt->mpt_complete) {
                mutex_exit(&cmd->sc_mpt->mpt_lock);
                return;
        }

        lmrc = cmd->sc_mpt->mpt_lmrc;
        pkt = cmd->sc_mpt->mpt_pkt;

        pkt->pkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD;
        pkt->pkt_reason = CMD_RESET;
        pkt->pkt_statistics = STAT_BUS_RESET;
        mutex_exit(&cmd->sc_mpt->mpt_lock);

        lmrc_tgt_rem_active_mpt(cmd->sc_tgt, cmd->sc_mpt);
        atomic_dec_uint(&lmrc->l_fw_outstanding_cmds);

        scsi_hba_pkt_comp(pkt);
}

/*
 * lmrc_tgt_complete_cmds
 *
 * Walk the list of active commands of a target. For each command which has
 * not completed yet, dispatch lmrc_tgt_complete_cmd() on the controller's
 * taskq to complete the command and clean up.
 *
 * The target's tgt_mpt_active_lock is held while walking the list.
 */
static void
lmrc_tgt_complete_cmds(lmrc_t *lmrc, lmrc_tgt_t *tgt)
{
        lmrc_mpt_cmd_t *mpt;

        mutex_enter(&tgt->tgt_mpt_active_lock);
        if (list_is_empty(&tgt->tgt_mpt_active)) {
                mutex_exit(&tgt->tgt_mpt_active_lock);
                return;
        }

        for (mpt = lmrc_tgt_first_active_mpt(tgt);
            mpt != NULL;
            mpt = lmrc_tgt_next_active_mpt(tgt, mpt)) {
                lmrc_scsa_cmd_t *cmd;

                /* The list iterators hand us each command locked. */
                ASSERT(mutex_owned(&mpt->mpt_lock));

                /*
                 * Verify the packet pointer before dereferencing it to get
                 * at the SCSA command.
                 */
                VERIFY(mpt->mpt_pkt != NULL);
                cmd = mpt->mpt_pkt->pkt_ha_private;
                VERIFY(cmd != NULL);

                if (mpt->mpt_complete)
                        continue;

                taskq_dispatch_ent(lmrc->l_taskq, lmrc_tgt_complete_cmd, cmd,
                    TQ_NOSLEEP, &mpt->mpt_tqent);
        }
        mutex_exit(&tgt->tgt_mpt_active_lock);
}

/*
 * lmrc_tgt_timeout_cmds
 *
 * Walk the list of active commands of a target. Try to abort commands which are
 * overdue.
 *
 * Returns DDI_SUCCESS if the list is empty or no abort attempt failed, and
 * DDI_FAILURE as soon as lmrc_abort_mpt() returns anything other than 1 for
 * an overdue command. The target's tgt_mpt_active_lock is held for the
 * duration of the walk.
 */
static int
lmrc_tgt_timeout_cmds(lmrc_t *lmrc, lmrc_tgt_t *tgt)
{
        lmrc_mpt_cmd_t *mpt;
        int ret = DDI_SUCCESS;

        mutex_enter(&tgt->tgt_mpt_active_lock);
        if (list_is_empty(&tgt->tgt_mpt_active))
                goto out;

        for (mpt = lmrc_tgt_first_active_mpt(tgt);
            mpt != NULL;
            mpt = lmrc_tgt_next_active_mpt(tgt, mpt)) {
                hrtime_t now;

                /* The list iterators are expected to return mpt locked. */
                ASSERT(mutex_owned(&mpt->mpt_lock));
                VERIFY(mpt->mpt_pkt != NULL);

                /* Just in case the command completed by now... */
                if (mpt->mpt_complete)
                        continue;

                now = gethrtime();

                if (now > mpt->mpt_timeout) {
                        /*
                         * Give the packet a bit more time for the abort to
                         * complete.
                         */
                        mpt->mpt_timeout = now + LMRC_IO_TIMEOUT * NANOSEC;

                        /*
                         * If the abort failed for whatever reason,
                         * we can stop here as only a controller reset
                         * can get us back into a sane state.
                         */
                        if (lmrc_abort_mpt(lmrc, tgt, mpt) != 1) {
                                /*
                                 * We leave the loop without advancing the
                                 * iterator, so the command lock is dropped
                                 * here (presumably the iterator would
                                 * otherwise release it -- TODO confirm).
                                 */
                                mutex_exit(&mpt->mpt_lock);
                                ret = DDI_FAILURE;
                                goto out;
                        }
                }
        }

out:
        mutex_exit(&tgt->tgt_mpt_active_lock);
        return (ret);
}

/*
 * lmrc_thread
 *
 * Check whether the controller is in FW fault state. Check all targets for
 * commands which have timed out.
 *
 * arg is the lmrc_t soft state. The thread loops until l_thread_stop is set.
 * Each iteration waits on l_thread_cv for up to 60 seconds, then checks the
 * firmware state and the active commands of every target. If the firmware has
 * faulted, or if a timed-out command could not be aborted, the controller is
 * reset and re-initialized.
 */
void
lmrc_thread(void *arg)
{
        lmrc_t *lmrc = arg;

        do {
                int i;

                /* Wake up at least once a minute. */
                mutex_enter(&lmrc->l_thread_lock);
                (void) cv_reltimedwait(&lmrc->l_thread_cv, &lmrc->l_thread_lock,
                    drv_usectohz(60 * MICROSEC), TR_SEC);
                mutex_exit(&lmrc->l_thread_lock);

                /*
                 * In a do-while loop, continue jumps to the loop condition,
                 * so this ends the loop when we've been asked to stop.
                 */
                if (lmrc->l_thread_stop)
                        continue;

                lmrc->l_fw_fault = lmrc_check_fw_fault(lmrc);

                /*
                 * Check all targets for timed-out commands. If we find any
                 * and fail to abort them, we pretend the FW has faulted to
                 * trigger a reset.
                 */
                if (!lmrc->l_fw_fault) {
                        for (i = 0; i < ARRAY_SIZE(lmrc->l_targets); i++) {
                                if (lmrc_tgt_timeout_cmds(lmrc,
                                    &lmrc->l_targets[i]) != DDI_SUCCESS) {
                                        lmrc->l_fw_fault = B_TRUE;
                                        break;
                                }
                        }
                }

                /*
                 * If the FW is faulted, try to recover by performing a reset.
                 */
                if (lmrc->l_fw_fault) {
                        int ret;

                        lmrc_disable_intr(lmrc);

                        /*
                         * Even if the reset failed, it will have stopped the
                         * controller and we can complete all outstanding
                         * commands.
                         */
                        ret = lmrc_reset_ctrl(lmrc);

                        (void) lmrc_abort_outstanding_mfi(lmrc,
                            LMRC_MAX_MFI_CMDS);

                        for (i = 0; i < ARRAY_SIZE(lmrc->l_targets); i++)
                                lmrc_tgt_complete_cmds(lmrc,
                                    &lmrc->l_targets[i]);

                        /*
                         * Any failure below abandons recovery for this
                         * iteration. The fault state is re-evaluated after
                         * the next wakeup.
                         */
                        if (ret != DDI_SUCCESS) {
                                dev_err(lmrc->l_dip, CE_WARN, "reset failed");
                                continue;
                        }

                        if (lmrc_transition_to_ready(lmrc) != DDI_SUCCESS)
                                continue;

                        if (lmrc_ioc_init(lmrc) != DDI_SUCCESS)
                                continue;

                        lmrc_enable_intr(lmrc);

                        if (lmrc_start_aen(lmrc) != DDI_SUCCESS) {
                                dev_err(lmrc->l_dip, CE_WARN,
                                    "failed to re-initiate AEN");
                                continue;
                        }

                        lmrc->l_fw_fault = lmrc_check_fw_fault(lmrc);
                }
        } while (!lmrc->l_thread_stop);

        thread_exit();
}

/*
 * lmrc_transition_to_ready
 *
 * Move firmware to ready state. At attach time, the FW can potentially be in
 * any one of several possible states. If the FW is in operational, waiting-for-
 * handshake states, take steps to bring it to ready state. Otherwise, wait for
 * the FW to reach ready state.
 *
 * Returns DDI_SUCCESS once the FW reports LMRC_FW_STATE_READY and the register
 * access handle checks out OK. Returns DDI_FAILURE if a reset of a faulted FW
 * fails, if the FW reports an unknown state, if the scratch pad register
 * doesn't change within LMRC_RESET_TIMEOUT seconds, or if the access handle
 * check fails.
 */
static int
lmrc_transition_to_ready(lmrc_t *lmrc)
{
        uint32_t status, new_status;
        uint32_t fw_state;
        /* NOTE(review): assumes LMRC_RESET_TIMEOUT fits in 8 bits -- confirm */
        uint8_t max_wait;
        uint_t i;

        status = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD0_OFFSET);
        fw_state = LMRC_FW_STATE(status);
        max_wait = LMRC_RESET_TIMEOUT;

        while (fw_state != LMRC_FW_STATE_READY) {
                /* First, nudge the FW as appropriate for its current state. */
                switch (fw_state) {
                case LMRC_FW_STATE_FAULT:
                        dev_err(lmrc->l_dip, CE_NOTE, "FW is in fault state!");
                        if (lmrc_reset_ctrl(lmrc) != DDI_SUCCESS)
                                return (DDI_FAILURE);
                        break;

                case LMRC_FW_STATE_WAIT_HANDSHAKE:
                        /* Set the CLR bit in inbound doorbell */
                        lmrc_write_reg(lmrc, MPI2_DOORBELL_OFFSET,
                            MFI_INIT_CLEAR_HANDSHAKE | MFI_INIT_HOTPLUG);
                        break;

                case LMRC_FW_STATE_BOOT_MSG_PENDING:
                        lmrc_write_reg(lmrc, MPI2_DOORBELL_OFFSET,
                            MFI_INIT_HOTPLUG);
                        break;

                case LMRC_FW_STATE_OPERATIONAL:
                        /* Bring it to READY state, wait up to 10s */
                        lmrc_disable_intr(lmrc);
                        lmrc_write_reg(lmrc, MPI2_DOORBELL_OFFSET,
                            MFI_RESET_FLAGS);
                        (void) lmrc_wait_for_reg(lmrc, MPI2_DOORBELL_OFFSET, 1,
                            0, 10);
                        break;

                case LMRC_FW_STATE_UNDEFINED:
                        /* This state should not last for more than 2 sec */
                case LMRC_FW_STATE_BB_INIT:
                case LMRC_FW_STATE_FW_INIT:
                case LMRC_FW_STATE_FW_INIT_2:
                case LMRC_FW_STATE_DEVICE_SCAN:
                case LMRC_FW_STATE_FLUSH_CACHE:
                        /* Transient states: nothing to do but wait. */
                        break;
                default:
                        dev_err(lmrc->l_dip, CE_WARN, "Unknown FW state %x",
                            fw_state);
                        return (DDI_FAILURE);
                }

                /*
                 * The current state should not last for more than max_wait
                 * seconds.
                 *
                 * We compare the whole scratch pad register, not just the FW
                 * state bits, and request a delay of 1000 microseconds between
                 * reads.
                 *
                 * NOTE(review): new_status is only assigned inside this loop;
                 * if max_wait were 0 it would be read uninitialized below.
                 */
                for (i = 0; i < max_wait * 1000; i++) {
                        new_status = lmrc_read_reg(lmrc,
                            MPI26_SCRATCHPAD0_OFFSET);

                        if (status != new_status)
                                break;

                        delay(drv_usectohz(MILLISEC));
                }

                if (new_status == status) {
                        dev_err(lmrc->l_dip, CE_WARN,
                            "FW state (%x) hasn't changed in %d seconds",
                            fw_state, max_wait);
                        return (DDI_FAILURE);
                }

                /* Something changed: re-evaluate the FW state. */
                status = new_status;
                fw_state = LMRC_FW_STATE(status);
        }

        if (lmrc_check_acc_handle(lmrc->l_reghandle) != DDI_FM_OK)
                return (DDI_FAILURE);

        return (DDI_SUCCESS);
}

/*
 * lmrc_adapter_init
 *
 * Get the hardware and firmware into a usable state, and fetch some basic
 * information from the registers to calculate sizes of basic data structures.
 *
 * Returns DDI_SUCCESS on success. If lmrc_transition_to_ready() fails, its
 * return value is passed on. DDI_FAILURE is returned if the number of commands
 * supported by the firmware, minus one, is less than LMRC_MAX_MFI_CMDS.
 */
int
lmrc_adapter_init(lmrc_t *lmrc)
{
        uint32_t reg;
        int ret;
        int i;

        ret = lmrc_transition_to_ready(lmrc);
        if (ret != DDI_SUCCESS)
                return (ret);

        /*
         * Get maximum RAID map size.
         */
        reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD2_OFFSET);
        lmrc->l_max_raid_map_sz = LMRC_MAX_RAID_MAP_SZ(reg);

        /* Defaults, used as is when the FW doesn't have MSI-X enabled. */
        lmrc->l_max_reply_queues = 1;
        lmrc->l_rphi[0] = MPI2_REPLY_POST_HOST_INDEX_OFFSET;

        /*
         * Apparently, bit 27 of the scratch pad register indicates whether
         * MSI-X is supported by the firmware.
         */
        reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD0_OFFSET);

        if (LMRC_FW_MSIX_ENABLED(reg)) {
                lmrc->l_fw_msix_enabled = B_TRUE;

                reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD1_OFFSET);
                lmrc->l_max_reply_queues = LMRC_MAX_REPLY_QUEUES_EXT(reg);

                if (lmrc->l_max_reply_queues > LMRC_MAX_REPLY_POST_HOST_INDEX) {
                        lmrc->l_msix_combined = B_TRUE;
                        lmrc->l_rphi[0] =
                            MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET;
                }

                /*
                 * Compute reply post index register addresses 1-15.
                 */
                for (i = 1; i < LMRC_MAX_REPLY_POST_HOST_INDEX; i++) {
                        lmrc->l_rphi[i] = i * 0x10 +
                            MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET;
                }
        }

        /*
         * Get the number of commands the firmware supports. Use one less,
         * because reply_q_depth is based on one more than this. XXX: Why?
         */
        reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD0_OFFSET);
        lmrc->l_max_fw_cmds = LMRC_FW_MAX_CMD(reg) - 1;

        if (lmrc->l_max_fw_cmds < LMRC_MAX_MFI_CMDS) {
                dev_err(lmrc->l_dip, CE_WARN, "!max_fw_cmds too low: %d",
                    lmrc->l_max_fw_cmds);
                return (DDI_FAILURE);
        }

        /*
         * Reserve some commands for MFI, the remainder is for SCSI commands.
         */
        lmrc->l_max_scsi_cmds = lmrc->l_max_fw_cmds - LMRC_MAX_MFI_CMDS;

        /*
         * XXX: This magic calculation isn't explained anywhere. Let's see...
         * lmrc_max_fw_cmds + 1 gives us what was reported in the register,
         * That + 15 is for rounding it up the next multiple of 16, which
         * / 16 * 16 does.
         * And apparently we want twice that much for queue depth. Why?
         *
         * So in reality, the queue depth is based on at least one more than
         * lmrc_max_fw_cmds, but it could be even more. That makes the above
         * statement about lmrc_max_fw_cmds questionable.
         */
        lmrc->l_reply_q_depth = (lmrc->l_max_fw_cmds + 1 + 15) / 16 * 16 * 2;

        /* Allocation size of one reply queue, based on depth. */
        lmrc->l_reply_alloc_sz =
            sizeof (Mpi2ReplyDescriptorsUnion_t) * lmrc->l_reply_q_depth;

        /* Allocation size of the DMA memory used for all MPI I/O frames. */
        lmrc->l_io_frames_alloc_sz = LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE *
            (lmrc->l_max_fw_cmds + 2);

        /*
         * If LMRC_EXT_CHAIN_SIZE_SUPPORT is set in scratch pad 1, firmware
         * supports an extended IO chain frame which is 4 times the size of a
         * legacy firmware frame.
         * Legacy Firmware frame size is (8 * 128) = 1K
         * 1M IO Firmware frame size is (8 * 128 * 4) = 4K
         */
        reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD1_OFFSET);
        lmrc->l_max_chain_frame_sz = LMRC_MAX_CHAIN_SIZE(reg) *
            (LMRC_EXT_CHAIN_SIZE_SUPPORT(reg) ? LMRC_1MB_IO : LMRC_256K_IO);

        /*
         * Check whether the controller supports DMA to the full 64bit address
         * space.
         */
        lmrc->l_64bit_dma_support = LMRC_64BIT_DMA_SUPPORT(reg);

        /*
         * We use a I/O frame size of 256 bytes, that is what
         * LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE is set to.
         *
         * The offset of the SGL in the I/O frame is 128, so
         * there are 128 bytes left for 8 SGEs of 16 bytes each.
         */
        lmrc->l_max_sge_in_main_msg =
            (LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE -
            offsetof(Mpi25SCSIIORequest_t, SGL)) / sizeof (Mpi25SGEIOUnion_t);

        /*
         * Similarly, number of SGE in a SGE chain frame.
         */
        lmrc->l_max_sge_in_chain =
            lmrc->l_max_chain_frame_sz / sizeof (Mpi25SGEIOUnion_t);

        /*
         * The total number of SGE we support in a transfer is sum of
         * the above two, minus one for the link (last SGE in main msg).
         *
         * XXX: So why -2?
         */
        lmrc->l_max_num_sge =
            lmrc->l_max_sge_in_main_msg + lmrc->l_max_sge_in_chain - 2;

        /*
         * The offset of the last SGE in the I/O request, used for linking
         * the SGE chain frame if necessary.
         */
        lmrc->l_chain_offset_io_request =
            (LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE -
            sizeof (Mpi25SGEIOUnion_t)) / sizeof (Mpi25SGEIOUnion_t);

        /*
         * For MFI passthru, the link to the SGE chain frame is always
         * the first SGE in the I/O frame, the other SGEs in the I/O frame
         * will not be used.
         */
        lmrc->l_chain_offset_mfi_pthru =
            offsetof(Mpi25SCSIIORequest_t, SGL) / sizeof (Mpi25SGEIOUnion_t);


        /*
         * Scratch pad 3 carries the NVMe page shift. The NVMe page size is
         * only set here if the reported shift is larger than the default.
         */
        reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD3_OFFSET);
        if (LMRC_NVME_PAGE_SHIFT(reg) > LMRC_DEFAULT_NVME_PAGE_SHIFT) {
                lmrc->l_nvme_page_sz = 1 << LMRC_NVME_PAGE_SHIFT(reg);
                dev_err(lmrc->l_dip, CE_NOTE, "!NVME page size: %ld",
                    lmrc->l_nvme_page_sz);
        }

        /* Remaining feature flags are taken from scratch pad 1. */
        reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD1_OFFSET);
        lmrc->l_fw_sync_cache_support = LMRC_SYNC_CACHE_SUPPORT(reg);

        /* Atomic descriptor support is only checked for AERO controllers. */
        if (lmrc->l_class == LMRC_ACLASS_AERO) {
                reg = lmrc_read_reg(lmrc, MPI26_SCRATCHPAD1_OFFSET);
                lmrc->l_atomic_desc_support =
                    LMRC_ATOMIC_DESCRIPTOR_SUPPORT(reg);
        }

        return (DDI_SUCCESS);
}

/*
 * lmrc_ioc_init
 *
 * Manually build a MFI IOC INIT command to setup basic operating parameters
 * such as the DMA parameters for the I/O request frames and the reply post
 * queues. Send the IOC INIT command using a special request descriptor which
 * directly includes the physical address of the MFI command frame.
 *
 * After this command completes, the controller is ready to accept MPT commands
 * using the normal method of placing it in the I/O request DMA memory and
 * writing a MPT request descriptor to the appropriate registers.
 *
 * Interrupts are disabled by this function and are not re-enabled here.
 *
 * Returns DDI_SUCCESS if the command was sent and polled successfully, and
 * DDI_FAILURE if the DMA allocation failed or the doorbell didn't become
 * ready. Otherwise the result of lmrc_poll_mfi() is returned. The temporary
 * DMA buffer and the MFI command are released on all paths.
 */
int
lmrc_ioc_init(lmrc_t *lmrc)
{
        lmrc_mfi_cmd_t *mfi = lmrc_get_mfi(lmrc);
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;
        mfi_init_payload_t *init = &mfi->mfi_frame->mf_init;
        lmrc_req_desc_t req_desc;
        Mpi2IOCInitRequest_t *IOCInitMsg;
        lmrc_dma_t dma;
        int ret;

        ret = lmrc_dma_alloc(lmrc, lmrc->l_dma_attr, &dma,
            sizeof (Mpi2IOCInitRequest_t), 256, DDI_DMA_CONSISTENT);
        if (ret != DDI_SUCCESS) {
                lmrc_put_mfi(mfi);
                dev_err(lmrc->l_dip, CE_WARN,
                    "!%s: failed to allocate IOC command", __func__);
                return (DDI_FAILURE);
        }

        /* Build the MPI2 IOC INIT message in the DMA buffer. */
        IOCInitMsg = dma.ld_buf;
        IOCInitMsg->Function = MPI2_FUNCTION_IOC_INIT;
        IOCInitMsg->WhoInit = MPI2_WHOINIT_HOST_DRIVER;
        IOCInitMsg->MsgVersion = MPI2_VERSION;
        IOCInitMsg->HeaderVersion = MPI2_HEADER_VERSION;
        IOCInitMsg->SystemRequestFrameSize =
            LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE / 4;
        IOCInitMsg->ReplyDescriptorPostQueueDepth = lmrc->l_reply_q_depth;
        lmrc_dma_set_addr64(&lmrc->l_reply_dma,
            (uint64_t *)&IOCInitMsg->ReplyDescriptorPostQueueAddress);
        lmrc_dma_set_addr64(&lmrc->l_ioreq_dma,
            (uint64_t *)&IOCInitMsg->SystemRequestFrameBaseAddress);
        IOCInitMsg->HostMSIxVectors = lmrc->l_max_reply_queues;
        /* XXX: Why NVMe? */
        IOCInitMsg->HostPageSize = LMRC_DEFAULT_NVME_PAGE_SHIFT;

        /* Build the MFI INIT frame pointing at the IOC INIT message. */
        hdr->mh_cmd = MFI_CMD_INIT;
        hdr->mh_cmd_status = MFI_STAT_INVALID_STATUS;
        hdr->mh_flags = MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;

        hdr->mh_drv_opts.mc_support_additional_msix = 1;
        hdr->mh_drv_opts.mc_support_max_255lds = 1;
        hdr->mh_drv_opts.mc_support_ndrive_r1_lb = 1;
        hdr->mh_drv_opts.mc_support_security_protocol_cmds_fw = 1;
        hdr->mh_drv_opts.mc_support_ext_io_size = 1;

        hdr->mh_data_xfer_len = lmrc_dma_get_size(&dma);

        lmrc_dma_set_addr64(&dma, &init->mi_queue_info_new_phys_addr);

        /*
         * The request descriptor carries the address of the MFI frame. The
         * request flags share space with that address, so they must be
         * clear before we set them.
         */
        lmrc_dma_set_addr64(&mfi->mfi_frame_dma, &req_desc.rd_reg);
        VERIFY0(req_desc.rd_mfa_io.RequestFlags);
        req_desc.rd_mfa_io.RequestFlags = LMRC_REQ_DESCRIPT_FLAGS_MFA;

        lmrc_disable_intr(lmrc);
        if (!lmrc_wait_for_reg(lmrc, MPI2_DOORBELL_OFFSET, 1, 0, 10)) {
                /* Don't leak the DMA buffer and the MFI command. */
                ret = DDI_FAILURE;
                goto out;
        }

        (void) ddi_dma_sync(dma.ld_hdl, 0, dma.ld_len, DDI_DMA_SYNC_FORDEV);
        (void) ddi_dma_sync(mfi->mfi_frame_dma.ld_hdl, 0,
            mfi->mfi_frame_dma.ld_len, DDI_DMA_SYNC_FORDEV);

        lmrc_send_request(lmrc, req_desc);

        mutex_enter(&mfi->mfi_lock);
        ret = lmrc_poll_mfi(lmrc, mfi, LMRC_INTERNAL_CMD_WAIT_TIME);
        mutex_exit(&mfi->mfi_lock);

        if (ret != DDI_SUCCESS) {
                /* Only report a status if one was actually written. */
                if (hdr->mh_cmd_status != MFI_STAT_INVALID_STATUS)
                        dev_err(lmrc->l_dip, CE_WARN,
                            "!IOC Init failed, status = 0x%x",
                            hdr->mh_cmd_status);
        }

out:
        lmrc_dma_free(&dma);
        lmrc_put_mfi(mfi);

        return (ret);
}

/*
 * lmrc_get_ctrl_info
 *
 * Ask the FW for the controller information using a MFI DCMD and, if that
 * worked, copy the result into the soft state's l_ctrl_info.
 *
 * Returns DDI_FAILURE if no DCMD could be set up, otherwise the result of
 * lmrc_issue_blocked_mfi().
 */
static int
lmrc_get_ctrl_info(lmrc_t *lmrc)
{
        mfi_ctrl_info_t *dst = lmrc->l_ctrl_info;
        lmrc_mfi_cmd_t *dcmd;
        int rv;

        dcmd = lmrc_get_dcmd(lmrc, MFI_FRAME_DIR_READ, MFI_DCMD_CTRL_GET_INFO,
            sizeof (mfi_ctrl_info_t), 1);
        if (dcmd == NULL)
                return (DDI_FAILURE);

        rv = lmrc_issue_blocked_mfi(lmrc, dcmd);
        if (rv == DDI_SUCCESS) {
                /* Sync the data buffer for the CPU before copying it out. */
                (void) ddi_dma_sync(dcmd->mfi_data_dma.ld_hdl, 0,
                    dcmd->mfi_data_dma.ld_len, DDI_DMA_SYNC_FORKERNEL);
                bcopy(dcmd->mfi_data_dma.ld_buf, dst,
                    sizeof (mfi_ctrl_info_t));
        }

        lmrc_put_dcmd(lmrc, dcmd);
        return (rv);
}

/*
 * lmrc_fw_init
 *
 * Complete firmware initialization. At this point, we can already send MFI
 * commands, so we start by getting the controller information from the
 * firmware and use it to set up the limits and feature flags kept in our
 * soft state.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE if the controller information could
 * not be retrieved.
 */
int
lmrc_fw_init(lmrc_t *lmrc)
{
        mfi_ctrl_info_t *info = lmrc->l_ctrl_info;
        int max_lds = MFI_MAX_LOGICAL_DRIVES;

        if (lmrc_get_ctrl_info(lmrc) != DDI_SUCCESS) {
                dev_err(lmrc->l_dip, CE_WARN, "!Unable to get FW ctrl info.");
                return (DDI_FAILURE);
        }

        /* Feature flags taken directly from the controller info. */
        lmrc->l_disable_online_ctrl_reset =
            info->ci_prop.cp_disable_online_ctrl_reset == 1;

        /*
         * More than 64 LDs implies extended LD support, regardless of what
         * the adapter options say.
         */
        if (info->ci_max_lds > 64) {
                lmrc->l_max_256_vd_support = B_TRUE;
                max_lds = LMRC_MAX_LOGICAL_DRIVES_EXT;
        } else {
                lmrc->l_max_256_vd_support =
                    info->ci_adapter_opts3.ao3_support_max_ext_lds == 1;
        }

        /* Clamp the FW limits to what the driver can handle. */
        lmrc->l_fw_supported_vd_count = min(info->ci_max_lds, max_lds);

        lmrc->l_fw_supported_pd_count = min(info->ci_max_pds,
            MFI_MAX_PHYSICAL_DRIVES);

        lmrc->l_current_map_sz = lmrc->l_max_raid_map_sz * LMRC_MIN_MAP_SIZE;
        lmrc->l_max_map_sz = lmrc->l_current_map_sz;

        lmrc->l_use_seqnum_jbod_fp =
            info->ci_adapter_opts3.ao3_use_seq_num_jbod_FP != 0;

        lmrc->l_pdmap_tgtid_support =
            info->ci_adapter_opts4.ao4_support_pd_map_target_id != 0;

        return (DDI_SUCCESS);
}


/*
 * lmrc_ctrl_shutdown
 *
 * Send a shutdown DCMD to the controller on behalf of lmrc_quiesce().
 * Locks cannot be used here, so lmrc_get_dcmd() and lmrc_get_mfi() are off
 * limits. Instead a MFI command is taken directly off the free list.
 *
 * Returns DDI_FAILURE if no MFI command is available, DDI_SUCCESS after the
 * command has been issued.
 */
int
lmrc_ctrl_shutdown(lmrc_t *lmrc)
{
        lmrc_mfi_cmd_t *mfi;

        mfi = list_remove_head(&lmrc->l_mfi_cmd_list);
        if (mfi == NULL)
                return (DDI_FAILURE);

        /* Set up the frame as a DCMD which won't post a reply. */
        mfi->mfi_frame->mf_hdr.mh_cmd = MFI_CMD_DCMD;
        mfi->mfi_frame->mf_hdr.mh_flags = MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
        mfi->mfi_frame->mf_dcmd.md_opcode = MFI_DCMD_CTRL_SHUTDOWN;

        lmrc_disable_intr(lmrc);
        lmrc_issue_mfi(lmrc, mfi, NULL);

        return (DDI_SUCCESS);
}

/*
 * driver target state management
 *
 * The soft state of the controller instance keeps a pre-allocated array of
 * target structures for all possible targets, even though only a small number
 * of them are likely to be used. Each target structure contains a back link to
 * the soft state and a mutex, which are never cleared or changed when a target
 * is added or removed.
 */

/*
 * lmrc_tgt_init
 *
 * Set up the tgt structure for a newly discovered target. PDs and LDs share
 * the same structure; a PD has tgt_pd_info set while a LD doesn't. LDs are
 * always of type disk, PDs take their type from pd_info. A device without a
 * SAS WWN gets a faked PARALLEL interconnect type so that devfsadm doesn't
 * mistake the device address for a WWN.
 *
 * Everything from tgt_dev_id to the end of the structure is reset first.
 * The target is write-locked while it is being modified.
 */
void
lmrc_tgt_init(lmrc_tgt_t *tgt, uint16_t dev_id, char *addr,
    mfi_pd_info_t *pd_info)
{
        const size_t reset_len =
            sizeof (lmrc_tgt_t) - offsetof(lmrc_tgt_t, tgt_dev_id);

        rw_enter(&tgt->tgt_lock, RW_WRITER);

        bzero(&tgt->tgt_dev_id, reset_len);

        tgt->tgt_dev_id = dev_id;
        tgt->tgt_pd_info = pd_info;

        if (pd_info != NULL)
                tgt->tgt_type = pd_info->pd_scsi_dev_type;
        else
                tgt->tgt_type = DTYPE_DIRECT;

        /* If the address parses as a WWN, the target is treated as SAS. */
        (void) strlcpy(tgt->tgt_wwnstr, addr, sizeof (tgt->tgt_wwnstr));
        if (scsi_wwnstr_to_wwn(tgt->tgt_wwnstr, &tgt->tgt_wwn) == DDI_SUCCESS) {
                tgt->tgt_interconnect_type = INTERCONNECT_SAS;
        } else {
                tgt->tgt_interconnect_type = INTERCONNECT_PARALLEL;
                tgt->tgt_wwn = dev_id;
        }

        rw_exit(&tgt->tgt_lock);
}

/*
 * lmrc_tgt_clear
 *
 * Reset the tgt structure of a target which has gone away: free its PD info,
 * if any, zero everything from tgt_dev_id to the end of the structure, and
 * mark the device handle invalid. The target is write-locked while doing so.
 */
void
lmrc_tgt_clear(lmrc_tgt_t *tgt)
{
        const size_t reset_len =
            sizeof (lmrc_tgt_t) - offsetof(lmrc_tgt_t, tgt_dev_id);
        mfi_pd_info_t *pd_info;

        rw_enter(&tgt->tgt_lock, RW_WRITER);

        pd_info = tgt->tgt_pd_info;
        if (pd_info != NULL)
                kmem_free(pd_info, sizeof (mfi_pd_info_t));

        bzero(&tgt->tgt_dev_id, reset_len);
        tgt->tgt_dev_id = LMRC_DEVHDL_INVALID;

        rw_exit(&tgt->tgt_lock);
}

/*
 * lmrc_tgt_find
 *
 * Look up the tgt matching the unit address of the given scsi_device.
 * Three kinds of matches are attempted, in this order:
 * - the unit address starts with a decimal target ID in VD range, and the
 *   target at that index has both its device ID and WWN equal to that ID
 * - the unit address (without ",lun") parses as a WWN equal to a target's WWN
 * - the unit address (without ",lun") equals a target's WWN string
 *
 * On success the tgt is returned read-locked, otherwise NULL is returned.
 * The targets_lock mutex must be held the whole time.
 */
lmrc_tgt_t *
lmrc_tgt_find(lmrc_t *lmrc, struct scsi_device *sd)
{
        const char *ua = scsi_device_unit_address(sd);
        char addr[SCSI_WWN_BUFLEN];
        char *sep;
        unsigned long id;
        uint64_t wwn;
        boolean_t have_wwn;
        lmrc_tgt_t *tgt;
        size_t idx;

        VERIFY(ua != NULL);

        (void) strlcpy(addr, ua, sizeof (addr));

        /* Try a direct lookup by target ID first. */
        if (ddi_strtoul(addr, &sep, 10, &id) == 0 && *sep == ',' &&
            id <= lmrc->l_fw_supported_vd_count) {
                tgt = &lmrc->l_targets[id];

                rw_enter(&tgt->tgt_lock, RW_READER);
                if (tgt->tgt_dev_id == id && tgt->tgt_wwn == id)
                        return (tgt);
                rw_exit(&tgt->tgt_lock);
        }

        /* scsi_wwnstr_to_wwn() can't handle ",lun", so cut it off. */
        sep = strchr(addr, ',');
        if (sep != NULL)
                *sep = '\0';

        /*
         * Scan all targets. Compare WWNs if the unit address is a valid WWN,
         * else do it the hard way and compare the WWN strings.
         */
        have_wwn = (scsi_wwnstr_to_wwn(addr, &wwn) == DDI_SUCCESS);

        for (idx = 0; idx < ARRAY_SIZE(lmrc->l_targets); idx++) {
                boolean_t match;

                tgt = &lmrc->l_targets[idx];

                rw_enter(&tgt->tgt_lock, RW_READER);
                if (have_wwn)
                        match = (tgt->tgt_wwn == wwn);
                else
                        match = (strcmp(tgt->tgt_wwnstr, addr) == 0);

                if (match)
                        return (tgt);
                rw_exit(&tgt->tgt_lock);
        }

        return (NULL);
}

/*
 * MPT/MFI command management
 *
 * For each kind of command, MFI and MPT, the driver keeps an array of pre-
 * allocated and pre-initialized commands. Additionally, it keeps two lists of
 * currently unused commands. A set of functions is provided for each list to
 * get and put commands from/to the list. Commands are initialized during get(),
 * because having completed commands on the list can help in certain cases
 * during debugging.
 *
 * MPT commands in use for I/O are kept on an active command list of the target
 * they are addressing. All other types of commands are not kept on any list
 * while they are being processed by the hardware. When walking the command
 * arrays, busy commands not associated with a target can be distinguished by
 * not being linked on any list.
 */

/*
 * lmrc_get_mpt
 *
 * Take a MPT command off the free list and initialize it: the I/O frame,
 * chain and sense buffers are zeroed, the links to a MFI command and a SCSI
 * packet are cleared, and the CV is initialized.
 *
 * Returns the command with its mpt_lock held, or NULL if the free list is
 * empty.
 */
lmrc_mpt_cmd_t *
lmrc_get_mpt(lmrc_t *lmrc)
{
        lmrc_mpt_cmd_t *cmd;
        Mpi25SCSIIORequest_t *frame;

        mutex_enter(&lmrc->l_mpt_cmd_lock);
        cmd = list_remove_head(&lmrc->l_mpt_cmd_list);
        mutex_exit(&lmrc->l_mpt_cmd_lock);

        if (cmd != NULL) {
                mutex_enter(&cmd->mpt_lock);

                /* Wipe whatever the previous user left behind. */
                bzero(cmd->mpt_io_frame, LMRC_MPI2_RAID_DEFAULT_IO_FRAME_SIZE);
                bzero(cmd->mpt_chain_dma.ld_buf, cmd->mpt_chain_dma.ld_len);
                bzero(cmd->mpt_sense_dma.ld_buf, cmd->mpt_sense_dma.ld_len);

                cmd->mpt_mfi = NULL;
                cmd->mpt_pkt = NULL;

                /* SGLOffset0 is the SGL offset in units of 4 bytes. */
                frame = cmd->mpt_io_frame;
                frame->SGLOffset0 = offsetof(Mpi25SCSIIORequest_t, SGL) / 4;

                cmd->mpt_complete = B_FALSE;
                cv_init(&cmd->mpt_cv, NULL, CV_DRIVER, NULL);
        }

        return (cmd);
}

/*
 * lmrc_put_mpt
 *
 * Put a MPT command back on the list. The command lock must be held when this
 * function is called, being unlocked only after the command has been put on
 * the free list. The command CV is destroyed, thereby asserting that no one is
 * still waiting on it.
 *
 * The command must not be linked on any list when this is called.
 */
void
lmrc_put_mpt(lmrc_mpt_cmd_t *mpt)
{
        lmrc_t *lmrc = mpt->mpt_lmrc;

        VERIFY(lmrc != NULL);

        /* The caller must have unlinked the command and hold its lock. */
        ASSERT0(list_link_active(&mpt->mpt_node));
        ASSERT(mutex_owned(&mpt->mpt_lock));
        cv_destroy(&mpt->mpt_cv);

        /* Return the command to the free list before dropping its lock. */
        mutex_enter(&lmrc->l_mpt_cmd_lock);
        list_insert_tail(&lmrc->l_mpt_cmd_list, mpt);
        mutex_exit(&lmrc->l_mpt_cmd_lock);
        mutex_exit(&mpt->mpt_lock);
}

/*
 * lmrc_get_mfi
 *
 * Take the first MFI command off the free list and initialize it: the frame
 * is zeroed, the command index is stored as the frame context, the completion
 * callback is cleared and the CV is initialized.
 *
 * Running out of MFI commands is treated as fatal, so this function never
 * returns NULL. The command is returned unlocked.
 */
lmrc_mfi_cmd_t *
lmrc_get_mfi(lmrc_t *lmrc)
{
        lmrc_mfi_cmd_t *mfi;

        mutex_enter(&lmrc->l_mfi_cmd_lock);
        mfi = list_remove_head(&lmrc->l_mfi_cmd_list);
        mutex_exit(&lmrc->l_mfi_cmd_lock);

        /* An empty MFI command list is not recoverable here. */
        VERIFY(mfi != NULL);

        mutex_enter(&mfi->mfi_lock);

        mfi->mfi_callback = NULL;
        cv_init(&mfi->mfi_cv, NULL, CV_DRIVER, NULL);

        /* Start from a clean frame, identified by the command index. */
        bzero(mfi->mfi_frame, sizeof (mfi_frame_t));
        mfi->mfi_frame->mf_hdr.mh_context = mfi->mfi_idx;

        mutex_exit(&mfi->mfi_lock);

        return (mfi);
}

/*
 * lmrc_put_mfi
 *
 * Return a MFI command to the free list of its controller. The command must
 * not be linked on any list and the caller must not hold the command lock, as
 * it is taken here. Destroying the CV asserts that no one is waiting on it.
 *
 * Any DMA memory attached to the command is left untouched.
 */
void
lmrc_put_mfi(lmrc_mfi_cmd_t *mfi)
{
        lmrc_t *lmrc;

        lmrc = mfi->mfi_lmrc;
        VERIFY(lmrc != NULL);

        ASSERT0(list_link_active(&mfi->mfi_node));

        mutex_enter(&mfi->mfi_lock);
        cv_destroy(&mfi->mfi_cv);

        /* Append to the free list while holding the command lock. */
        mutex_enter(&lmrc->l_mfi_cmd_lock);
        list_insert_tail(&lmrc->l_mfi_cmd_list, mfi);
        mutex_exit(&lmrc->l_mfi_cmd_lock);

        mutex_exit(&mfi->mfi_lock);
}

/*
 * lmrc_abort_outstanding_mfi
 *
 * Walk the first ncmd entries of the MFI command array and abort each command
 * which is still outstanding, which is indicated by it not being linked on
 * l_mfi_cmd_list. Successfully aborted commands have their data DMA memory
 * freed and are put back on the free list.
 *
 * As a special case, if the FW is in fault state, no abort is attempted.
 * Instead, each outstanding command's completion callback, if there is one,
 * is called with the command lock held.
 *
 * Returns DDI_SUCCESS if all outstanding commands were handled. If aborting
 * a command fails, the error from lmrc_abort_cmd() is returned immediately
 * and the remaining commands are not examined.
 */
int
lmrc_abort_outstanding_mfi(lmrc_t *lmrc, const size_t ncmd)
{
        /* Same type as ncmd, avoiding a signed/unsigned comparison. */
        size_t i;

        for (i = 0; i < ncmd; i++) {
                lmrc_mfi_cmd_t *mfi = lmrc->l_mfi_cmds[i];
                int ret;

                mutex_enter(&mfi->mfi_lock);

                /* Commands on the free list are not outstanding. */
                if (list_link_active(&mfi->mfi_node)) {
                        mutex_exit(&mfi->mfi_lock);
                        continue;
                }

                /*
                 * If the FW is faulted, wake up anyone waiting on the command
                 * to clean it up.
                 */
                if (lmrc->l_fw_fault) {
                        if (mfi->mfi_callback != NULL)
                                mfi->mfi_callback(lmrc, mfi);
                        mutex_exit(&mfi->mfi_lock);
                        continue;
                }

                ret = lmrc_abort_cmd(lmrc, mfi);
                mutex_exit(&mfi->mfi_lock);
                if (ret != DDI_SUCCESS)
                        return (ret);

                /*
                 * The command lock must be dropped before lmrc_put_mfi(),
                 * which takes it again.
                 */
                lmrc_dma_free(&mfi->mfi_data_dma);
                lmrc_put_mfi(mfi);
        }

        return (DDI_SUCCESS);
}

/*
 * lmrc_get_dcmd
 *
 * Get a MFI command and set it up as a DCMD with the given frame flags and
 * opcode. If the flags request a data transfer in either direction, DMA memory
 * of xferlen bytes with the given alignment is allocated and described by a
 * single 64bit SGL entry in the frame.
 *
 * Returns the command, or NULL if the DMA memory could not be allocated, in
 * which case the command has already been put back on the free list.
 */
lmrc_mfi_cmd_t *
lmrc_get_dcmd(lmrc_t *lmrc, uint16_t flags, uint32_t opcode, uint32_t xferlen,
    uint_t align)
{
        lmrc_mfi_cmd_t *mfi;
        mfi_header_t *hdr;
        mfi_dcmd_payload_t *dcmd;
        lmrc_dma_t *dma;

        /* lmrc_get_mfi() never returns NULL. */
        mfi = lmrc_get_mfi(lmrc);
        hdr = &mfi->mfi_frame->mf_hdr;
        dcmd = &mfi->mfi_frame->mf_dcmd;
        dma = &mfi->mfi_data_dma;

        hdr->mh_cmd = MFI_CMD_DCMD;
        hdr->mh_flags = flags;
        dcmd->md_opcode = opcode;

        /* Without a data direction there is nothing more to set up. */
        if ((flags & (MFI_FRAME_DIR_READ | MFI_FRAME_DIR_WRITE)) == 0)
                return (mfi);

        if (lmrc_dma_alloc(lmrc, lmrc->l_dma_attr, dma, xferlen, align,
            DDI_DMA_CONSISTENT) != DDI_SUCCESS) {
                lmrc_put_mfi(mfi);
                return (NULL);
        }

        /* Describe the data buffer with exactly one 64bit SGL entry. */
        hdr->mh_flags |= MFI_FRAME_SGL64;
        hdr->mh_sge_count = 1;
        hdr->mh_data_xfer_len = lmrc_dma_get_size(dma);

        dcmd->md_sgl.ms64_length = lmrc_dma_get_size(dma);
        lmrc_dma_set_addr64(dma, &dcmd->md_sgl.ms64_phys_addr);

        return (mfi);
}

/*
 * lmrc_put_dcmd
 *
 * Free the DMA memory of a MFI DCMD and return the command back on the list.
 * This is the counterpart of lmrc_get_dcmd(). The lmrc argument is not used
 * by this function, lmrc_put_mfi() finds the controller through the command.
 */
void
lmrc_put_dcmd(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi)
{
        /* Release the data buffer before the command can be reused. */
        lmrc_dma_free(&mfi->mfi_data_dma);
        lmrc_put_mfi(mfi);
}


/*
 * Asynchronous Event Notifications
 */
/*
 * lmrc_get_event_log_info
 *
 * Fetch the Event Log Info from the firmware using a blocking
 * MFI_DCMD_CTRL_EVENT_GET_INFO DCMD and copy it to the caller's buffer.
 *
 * Returns DDI_FAILURE if no DCMD could be set up, otherwise the result of
 * issuing the command. The buffer pointed to by eli is written only on
 * success.
 */
static int
lmrc_get_event_log_info(lmrc_t *lmrc, mfi_evt_log_info_t *eli)
{
        lmrc_mfi_cmd_t *cmd;
        int rv;

        cmd = lmrc_get_dcmd(lmrc, MFI_FRAME_DIR_READ,
            MFI_DCMD_CTRL_EVENT_GET_INFO, sizeof (mfi_evt_log_info_t), 1);
        if (cmd == NULL)
                return (DDI_FAILURE);

        rv = lmrc_issue_blocked_mfi(lmrc, cmd);
        if (rv == DDI_SUCCESS) {
                /* Hand the data read by the firmware to the caller. */
                bcopy(cmd->mfi_data_dma.ld_buf, eli,
                    sizeof (mfi_evt_log_info_t));
        }

        /* The command and its DMA memory are released in any case. */
        lmrc_put_dcmd(lmrc, cmd);
        return (rv);
}

/*
 * lmrc_aen_handler
 *
 * Taskq function dispatched by lmrc_complete_aen() for each successfully
 * received AEN. The argument is the MFI command whose data buffer holds the
 * event detail.
 *
 * The event code is checked and handled as needed. Events which are neither
 * handled here nor explicitly ignored are passed on to the LD or PD specific
 * handlers, depending on the event locale. The event description is logged
 * and the same command is then re-issued to wait for the next event.
 */
static void
lmrc_aen_handler(void *arg)
{
        lmrc_mfi_cmd_t *mfi = arg;
        lmrc_t *lmrc = mfi->mfi_lmrc;
        mfi_dcmd_payload_t *dcmd = &mfi->mfi_frame->mf_dcmd;
        mfi_evt_detail_t *evt = mfi->mfi_data_dma.ld_buf;
        int ret = DDI_FAILURE;

        switch (evt->evt_code) {
        /* Configuration changes: refresh both the PD and the LD list. */
        case MFI_EVT_CFG_CLEARED:
        case MFI_EVT_CTRL_HOST_BUS_SCAN_REQD:
        case MFI_EVT_FOREIGN_CFG_IMPORTED:
                ret = lmrc_get_pd_list(lmrc);
                if (ret == DDI_SUCCESS)
                        ret = lmrc_get_ld_list(lmrc);
                break;

        /* Controller properties changed: re-read the controller info. */
        case MFI_EVT_CTRL_PROP_CHANGED:
                ret = lmrc_get_ctrl_info(lmrc);
                break;

        /* Controller events which need no action beyond being logged. */
        case MFI_EVT_CTRL_PATROL_READ_START:
        case MFI_EVT_CTRL_PATROL_READ_RESUMED:
        case MFI_EVT_CTRL_PATROL_READ_COMPLETE:
        case MFI_EVT_CTRL_PATROL_READ_CANT_START:
        case MFI_EVT_CTRL_PERF_COLLECTION:
        case MFI_EVT_CTRL_BOOTDEV_SET:
        case MFI_EVT_CTRL_BOOTDEV_RESET:
        case MFI_EVT_CTRL_PERSONALITY_CHANGE:
        case MFI_EVT_CTRL_PERSONALITY_CHANGE_PEND:
        case MFI_EVT_CTRL_NR_OF_VALID_SNAPDUMP:
                break;

        default:
                /*
                 * Everything else is handed to the LD or PD handler based on
                 * the locale, with LD taking precedence. If neither locale
                 * matches, ret is still DDI_FAILURE.
                 */
                if ((evt->evt_cl.evt_locale & MFI_EVT_LOCALE_LD) != 0) {
                        ret = lmrc_raid_aen_handler(lmrc, evt);
                } else if ((evt->evt_cl.evt_locale & MFI_EVT_LOCALE_PD) != 0) {
                        ret = lmrc_phys_aen_handler(lmrc, evt);
                }

                if (ret == DDI_SUCCESS)
                        break;

                dev_err(lmrc->l_dip, CE_NOTE, "!unknown AEN received, "
                    "seqnum = %d, timestamp = %d, code = %x, "
                    "locale = %x, class = %d, argtype = %d",
                    evt->evt_seqnum, evt->evt_timestamp, evt->evt_code,
                    evt->evt_cl.evt_locale, evt->evt_cl.evt_class,
                    evt->evt_argtype);
                break;
        }

        /* The event description is logged for every event. */
        dev_err(lmrc->l_dip, CE_NOTE, "!%s", evt->evt_descr);

        /*
         * Re-arm the AEN by re-issuing the very same command, asking for the
         * event following the one just processed.
         */
        dcmd->md_mbox_32[0] = evt->evt_seqnum + 1;

        mutex_enter(&mfi->mfi_lock);
        lmrc_issue_mfi(lmrc, mfi, lmrc_complete_aen);
        mutex_exit(&mfi->mfi_lock);
}

/*
 * lmrc_complete_aen
 *
 * An AEN was received, so schedule a taskq to process it.
 */
static void
lmrc_complete_aen(lmrc_t *lmrc, lmrc_mfi_cmd_t *mfi)
{
        mfi_header_t *hdr = &mfi->mfi_frame->mf_hdr;

        ASSERT(mutex_owned(&mfi->mfi_lock));

        if (hdr->mh_cmd_status != MFI_STAT_OK) {
                /* Was the command aborted? */
                if (hdr->mh_cmd_status == MFI_STAT_NOT_FOUND)
                        return;

                dev_err(lmrc->l_dip, CE_WARN,
                    "!AEN failed, status = %d",
                    hdr->mh_cmd_status);
                taskq_dispatch_ent(lmrc->l_taskq, (task_func_t *)lmrc_put_mfi,
                    mfi, TQ_NOSLEEP, &mfi->mfi_tqent);
                return;
        }

        taskq_dispatch_ent(lmrc->l_taskq, lmrc_aen_handler, mfi, TQ_NOSLEEP,
            &mfi->mfi_tqent);
}

/*
 * lmrc_register_aen
 *
 * Issue a MFI_DCMD_CTRL_EVENT_WAIT DCMD asking the firmware for events
 * starting at the given sequence number, for all locales and class debug.
 * The command completes through lmrc_complete_aen().
 *
 * In FreeBSD, this function checks for an existing AEN. If its class and
 * locale already include what is requested here they just return. Otherwise
 * the existing AEN is aborted and a new one is created, which includes the
 * previous locale and class and the new ones.
 *
 * Given that the driver (same as in FreeBSD) calls this function during attach
 * to create an AEN with LOCALE_ALL and CLASS_DEBUG, all of this would be dead
 * code anyway.
 *
 * Returns DDI_FAILURE if no DCMD could be set up, DDI_SUCCESS otherwise.
 */
static int
lmrc_register_aen(lmrc_t *lmrc, uint32_t seqnum)
{
        mfi_dcmd_payload_t *payload;
        lmrc_mfi_cmd_t *cmd;

        cmd = lmrc_get_dcmd(lmrc, MFI_FRAME_DIR_READ, MFI_DCMD_CTRL_EVENT_WAIT,
            sizeof (mfi_evt_detail_t), 1);
        if (cmd == NULL)
                return (DDI_FAILURE);

        /* The mailbox carries the sequence number, locale and class. */
        payload = &cmd->mfi_frame->mf_dcmd;
        payload->md_mbox_32[0] = seqnum;
        payload->md_mbox_16[2] = MFI_EVT_LOCALE_ALL;
        payload->md_mbox_8[7] = MFI_EVT_CLASS_DEBUG;

        /* The command is issued with its lock held. */
        mutex_enter(&cmd->mfi_lock);
        lmrc_issue_mfi(lmrc, cmd, lmrc_complete_aen);
        mutex_exit(&cmd->mfi_lock);

        return (DDI_SUCCESS);
}

/*
 * lmrc_start_aen
 *
 * Set up and enable AEN processing: read the Event Log Info from the firmware
 * and register for events following the newest one currently in the log.
 *
 * Returns DDI_SUCCESS, or the error from reading the Event Log Info or from
 * registering the AEN.
 */
int
lmrc_start_aen(lmrc_t *lmrc)
{
        mfi_evt_log_info_t loginfo;
        int rv;

        bzero(&loginfo, sizeof (loginfo));

        /* The Event Log Info tells us the latest sequence number. */
        rv = lmrc_get_event_log_info(lmrc, &loginfo);
        if (rv != DDI_SUCCESS)
                return (rv);

        /* Ask the FW for events starting at the one after the newest. */
        return (lmrc_register_aen(lmrc, loginfo.eli_newest_seqnum + 1));
}