root/usr/src/uts/common/io/scsi/targets/sd_xbuf.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/scsi/scsi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/var.h>

#include "sd_xbuf.h"

/*
 * sd_xbuf.c: buf(9S) extension facility.
 *
 * The buf(9S) extension facility is intended to allow block drivers to
 * allocate additional memory that is associated with a particular buf(9S)
 * struct.  It is further intended to help in addressing the usual set of
 * problems associated with such allocations, in particular those involving
 * recovery from allocation failures, especially in code paths that the
 * system relies on to free memory.
 *
 * CAVEAT: Currently this code is completely private to the sd driver and in
 * NO WAY constitutes a public or supported interface of any kind. It is
 * envisioned that this may one day migrate into the Solaris DDI, but until
 * that time this ought to be considered completely unstable and is subject
 * to change without notice. This code may NOT in any way be utilized by
 * ANY code outside the sd driver.
 */
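
/*
 * Typical usage, as a minimal sketch only (the xdrv_* names, the 'un' and
 * 'dip' arguments, and the limit values below are hypothetical, not taken
 * from sd): a driver creates one attribute handle, funnels each buf
 * through ddi_xbuf_qstrategy() in its strategy(9E) routine, and releases
 * the xbuf via ddi_xbuf_done() in its completion path, just before
 * biodone(9F):
 *
 *	static void
 *	xdrv_iostart(struct buf *bp, ddi_xbuf_t xp, void *arg)
 *	{
 *		// xp points to xa_allocsize bytes of per-buf extension
 *		// memory; build and issue the command from here.
 *	}
 *
 *	// attach(9E): one xbuf extension per buf, no active or
 *	// reserve limits
 *	xap = ddi_xbuf_attr_create(sizeof (struct xdrv_xbuf),
 *	    xdrv_iostart, un, 0, 0, ddi_driver_major(dip), 0);
 *
 *	// strategy(9E):
 *	return (ddi_xbuf_qstrategy(bp, xap));
 *
 *	// completion: per-request bookkeeping only when the original
 *	// buf (not an intermediate breakup clone) is complete
 *	if (ddi_xbuf_done(bp, xap)) {
 *		// account for the completed original request
 *	}
 *	biodone(bp);
 */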


static int xbuf_iostart(ddi_xbuf_attr_t xap);
static void xbuf_dispatch(ddi_xbuf_attr_t xap);
static void xbuf_restart_callback(void *arg);
static int xbuf_brk_done(struct buf *bp);


/*
 * Note: Should this be exposed to the caller, i.e., do we want to give
 * the caller the flexibility of specifying the parameters for the
 * thread pool?
 * Note: these values are just estimates at this time, based upon what
 * seems reasonable for the sd driver. It may be preferable to make these
 * parameters self-scaling in a real (future) implementation.
 */
#define XBUF_TQ_MINALLOC        64
#define XBUF_TQ_MAXALLOC        512
#define XBUF_DISPATCH_DELAY     (drv_usectohz(50000))   /* 50 msec */

static taskq_t *xbuf_tq = NULL;
static int xbuf_attr_tq_minalloc = XBUF_TQ_MINALLOC;
static int xbuf_attr_tq_maxalloc = XBUF_TQ_MAXALLOC;

static kmutex_t xbuf_mutex = { 0 };
static uint32_t xbuf_refcount = 0;

/*
 * Private wrapper for buf cloned via ddi_xbuf_qstrategy()
 */
struct xbuf_brk {
        kmutex_t mutex;
        struct buf *bp0;
        uint8_t nbufs;  /* number of bufs allocated */
        uint8_t active; /* number of active xfers */

        size_t brksize; /* break size used for this buf */
        int brkblk;

        /* xfer position */
        off_t off;
        off_t noff;
        daddr_t blkno;
};

_NOTE(DATA_READABLE_WITHOUT_LOCK(xbuf_brk::off))

/*
 * Hack needed in the prototype so buf breakup will work.
 * Here we can rely on the sd code not changing the value in
 * b_forw.
 */
#define b_clone_private b_forw
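
/*
 * Illustrative breakup arithmetic (the numbers are examples only): with
 * xa_brksize set to 256KB, a 1MB buf bp0 is issued as four 256KB clones.
 * brkblk is the break size in DEV_BSIZE blocks (512 with 512-byte
 * blocks), so the clones cover byte offsets 0, 256KB, 512KB and 768KB at
 * device blocks b_blkno, b_blkno + 512, b_blkno + 1024 and
 * b_blkno + 1536.  off/noff/blkno advance as each clone is cut from bp0
 * in xbuf_iostart(); each clone completes through xbuf_brk_done(), and
 * bp0 is biodone(9F)'d once the last clone has been freed and off has
 * been set to -1 in ddi_xbuf_done().
 */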


/* ARGSUSED */
DDII ddi_xbuf_attr_t
ddi_xbuf_attr_create(size_t xsize,
    void (*xa_strategy)(struct buf *bp, ddi_xbuf_t xp, void *attr_arg),
    void *attr_arg, uint32_t active_limit, uint32_t reserve_limit,
    major_t major, int flags)
{
        ddi_xbuf_attr_t xap;

        xap = kmem_zalloc(sizeof (struct __ddi_xbuf_attr), KM_SLEEP);

        mutex_init(&xap->xa_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&xap->xa_reserve_mutex, NULL, MUTEX_DRIVER, NULL);

        /* Future: Allow the caller to specify alignment requirements? */
        xap->xa_allocsize       = max(xsize, sizeof (void *));
        xap->xa_active_limit    = active_limit;
        xap->xa_active_lowater  = xap->xa_active_limit / 2;
        xap->xa_reserve_limit   = reserve_limit;
        xap->xa_strategy        = xa_strategy;
        xap->xa_attr_arg        = attr_arg;

        mutex_enter(&xbuf_mutex);
        if (xbuf_refcount == 0) {
                ASSERT(xbuf_tq == NULL);
                /*
                 * Note: Would be nice if: (1) #threads in the taskq pool (set
                 * to the value of 'ncpus' at the time the taskq is created)
                 * could adjust automatically with DR; (2) the taskq
                 * minalloc/maxalloc counts could be grown/shrunk on the fly.
                 */
                xbuf_tq = taskq_create("xbuf_taskq", ncpus,
                    (v.v_maxsyspri - 2), xbuf_attr_tq_minalloc,
                    xbuf_attr_tq_maxalloc, TASKQ_PREPOPULATE);
        }
        xbuf_refcount++;
        mutex_exit(&xbuf_mutex);

        /* In this prototype we just always use the global system pool. */
        xap->xa_tq = xbuf_tq;

        return (xap);
}


DDII void
ddi_xbuf_attr_destroy(ddi_xbuf_attr_t xap)
{
        ddi_xbuf_t      xp;

        mutex_destroy(&xap->xa_mutex);
        mutex_destroy(&xap->xa_reserve_mutex);

        /*
         * Free any xbufs on the reserve list (a singly-linked list
         * threaded through the first word of each xbuf).
         */
        while (xap->xa_reserve_count != 0) {
                xp = xap->xa_reserve_headp;
                xap->xa_reserve_headp = *((void **)xp);
                xap->xa_reserve_count--;
                kmem_free(xp, xap->xa_allocsize);
        }
        ASSERT(xap->xa_reserve_headp == NULL);

        mutex_enter(&xbuf_mutex);
        ASSERT((xbuf_refcount != 0) && (xbuf_tq != NULL));
        xbuf_refcount--;
        if (xbuf_refcount == 0) {
                taskq_destroy(xbuf_tq);
                xbuf_tq = NULL;
        }
        mutex_exit(&xbuf_mutex);

        kmem_free(xap, sizeof (struct __ddi_xbuf_attr));
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_register_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
        /* Currently a no-op in this prototype */
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_unregister_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
        /* Currently a no-op in this prototype */
}

DDII int
ddi_xbuf_attr_setup_brk(ddi_xbuf_attr_t xap, size_t size)
{
        if (size < DEV_BSIZE)
                return (0);

        mutex_enter(&xap->xa_mutex);
        xap->xa_brksize = size & ~(DEV_BSIZE - 1);
        mutex_exit(&xap->xa_mutex);
        return (1);
}
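
/*
 * E.g. (hypothetical size): ddi_xbuf_attr_setup_brk(xap, 100000) rounds
 * the break size down to a DEV_BSIZE multiple (99840 with 512-byte
 * blocks) and returns 1; a size below DEV_BSIZE returns 0 and leaves
 * xa_brksize unchanged.
 */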



/*
 * Enqueue the given buf and attempt to initiate IO.
 * Called from the driver strategy(9E) routine.
 */

DDII int
ddi_xbuf_qstrategy(struct buf *bp, ddi_xbuf_attr_t xap)
{
        ASSERT(xap != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));
        ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

        mutex_enter(&xap->xa_mutex);

        ASSERT((bp->b_bcount & (DEV_BSIZE - 1)) == 0);

        /*
         * Break up the buf if necessary.  bp->b_private is temporarily
         * used to save the xbuf_brk pointer.
         */
        if (xap->xa_brksize && bp->b_bcount > xap->xa_brksize) {
                struct xbuf_brk *brkp;

                brkp = kmem_zalloc(sizeof (struct xbuf_brk), KM_SLEEP);
                _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*brkp))
                mutex_init(&brkp->mutex, NULL, MUTEX_DRIVER, NULL);
                brkp->bp0 = bp;
                brkp->brksize = xap->xa_brksize;
                brkp->brkblk = btodt(xap->xa_brksize);
                brkp->noff = xap->xa_brksize;
                brkp->blkno = bp->b_blkno;
                _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*brkp))
                bp->b_private = brkp;
        } else {
                bp->b_private = NULL;
        }

        /* Enqueue buf */
        if (xap->xa_headp == NULL) {
                xap->xa_headp = xap->xa_tailp = bp;
        } else {
                xap->xa_tailp->av_forw = bp;
                xap->xa_tailp = bp;
        }
        bp->av_forw = NULL;

        xap->xa_pending++;
        mutex_exit(&xap->xa_mutex);
        return (xbuf_iostart(xap));
}


/*
 * Drivers call this immediately before calling biodone(9F), to notify the
 * framework that the indicated xbuf is no longer being used by the driver.
 * May be called under interrupt context.  Returns nonzero when this
 * completion finishes the original request (always the case for bufs that
 * were not broken up); returns zero for a breakup clone while other clones
 * remain active or un-issued.
 */

DDII int
ddi_xbuf_done(struct buf *bp, ddi_xbuf_attr_t xap)
{
        ddi_xbuf_t xp;
        int done;

        ASSERT(bp != NULL);
        ASSERT(xap != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));
        ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

        xp = ddi_xbuf_get(bp, xap);

        mutex_enter(&xap->xa_mutex);

#ifdef  SDDEBUG
        if (xap->xa_active_limit != 0) {
                ASSERT(xap->xa_active_count > 0);
        }
#endif
        xap->xa_active_count--;

        if (xap->xa_reserve_limit != 0) {
                mutex_enter(&xap->xa_reserve_mutex);
                if (xap->xa_reserve_count < xap->xa_reserve_limit) {
                        /* Put this xbuf onto the reserve list & exit */
                        *((void **)xp) = xap->xa_reserve_headp;
                        xap->xa_reserve_headp = xp;
                        xap->xa_reserve_count++;
                        mutex_exit(&xap->xa_reserve_mutex);
                        goto done;
                }
                mutex_exit(&xap->xa_reserve_mutex);
        }

        kmem_free(xp, xap->xa_allocsize);       /* return it to the system */

done:
        if (bp->b_iodone == xbuf_brk_done) {
                struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private;

                brkp->active--;
                if (brkp->active || xap->xa_headp == brkp->bp0) {
                        done = 0;
                } else {
                        brkp->off = -1; /* mark bp0 as completed */
                        done = 1;
                }
        } else {
                done = 1;
        }

        if ((xap->xa_active_limit == 0) ||
            (xap->xa_active_count <= xap->xa_active_lowater)) {
                xbuf_dispatch(xap);
        }

        mutex_exit(&xap->xa_mutex);
        return (done);
}

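/*
 * b_iodone handler for the clones created by the breakup path.  Runs from
 * biodone(9F) on each clone: records a clone error in the original buf
 * bp0 (any clone residual marks the whole original transfer as residual),
 * frees the clone, and biodone(9F)'s bp0 once the last clone has been
 * freed and ddi_xbuf_done() has marked the transfer complete
 * (brkp->off == -1).
 */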
static int
xbuf_brk_done(struct buf *bp)
{
        struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private;
        struct buf *bp0 = brkp->bp0;
        int done;

        mutex_enter(&brkp->mutex);
        if ((bp->b_flags & B_ERROR) && !(bp0->b_flags & B_ERROR)) {
                bp0->b_flags |= B_ERROR;
                bp0->b_error = bp->b_error;
        }
        if (bp->b_resid)
                bp0->b_resid = bp0->b_bcount;

        freerbuf(bp);
        brkp->nbufs--;

        done = (brkp->off == -1 && brkp->nbufs == 0);
        mutex_exit(&brkp->mutex);

        /* All buf segments done */
        if (done) {
                mutex_destroy(&brkp->mutex);
                kmem_free(brkp, sizeof (struct xbuf_brk));
                biodone(bp0);
        }
        return (0);
}

DDII void
ddi_xbuf_dispatch(ddi_xbuf_attr_t xap)
{
        mutex_enter(&xap->xa_mutex);
        if ((xap->xa_active_limit == 0) ||
            (xap->xa_active_count <= xap->xa_active_lowater)) {
                xbuf_dispatch(xap);
        }
        mutex_exit(&xap->xa_mutex);
}


/*
 * ISSUE: in this prototype we cannot really implement ddi_xbuf_get()
 * unless we explicitly hide the xbuf pointer somewhere in the buf
 * during allocation, and then rely on the driver never changing it.
 * We can probably get away with using b_private for this for now,
 * tho it really is kinda gnarly.....
 */

/* ARGSUSED */
DDII ddi_xbuf_t
ddi_xbuf_get(struct buf *bp, ddi_xbuf_attr_t xap)
{
        return (bp->b_private);
}


/*
 * Initiate IOs for bufs on the queue.  Called from kernel thread or taskq
 * thread context. May execute concurrently for the same ddi_xbuf_attr_t.
 */

static int
xbuf_iostart(ddi_xbuf_attr_t xap)
{
        struct buf *bp;
        ddi_xbuf_t xp;

        ASSERT(xap != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));
        ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

        /*
         * For each request on the queue, attempt to allocate the specified
         * xbuf extension area, and call the driver's iostart() routine.
         * We process as many requests on the queue as we can, until either
         * (1) we run out of requests; or
         * (2) we run out of resources; or
         * (3) we reach the maximum limit for the given ddi_xbuf_attr_t.
         */
        for (;;) {
                mutex_enter(&xap->xa_mutex);

                if ((bp = xap->xa_headp) == NULL) {
                        break;  /* queue empty */
                }

                if ((xap->xa_active_limit != 0) &&
                    (xap->xa_active_count >= xap->xa_active_limit)) {
                        break;  /* allocation limit reached */
                }

                /*
                 * If the reserve_limit is non-zero then work with the
                 * reserve else always allocate a new struct.
                 */
                if (xap->xa_reserve_limit != 0) {
                        /*
                         * Don't penalize EVERY I/O by always allocating a new
                         * struct just to keep an untouched reserve around for
                         * a pathological condition that may never happen.
                         * Use the reserve entries first, treating the reserve
                         * as a local pool rather than an emergency stash, and
                         * re-populate it whenever it gets fully depleted just
                         * in case it really is needed.  This is safe because
                         * under the pathological condition, when the system
                         * runs out of memory such that the allocations below
                         * fail, the reserve entries remain available whether
                         * they sit unused on the queue or are in transport
                         * somewhere; progress can still continue, however
                         * slowly.
                         */
                        mutex_enter(&xap->xa_reserve_mutex);
                        if (xap->xa_reserve_count != 0) {
                                ASSERT(xap->xa_reserve_headp != NULL);
                                /* Grab an xbuf from the reserve */
                                xp = xap->xa_reserve_headp;
                                xap->xa_reserve_headp = *((void **)xp);
                                ASSERT(xap->xa_reserve_count > 0);
                                xap->xa_reserve_count--;
                        } else {
                                /*
                                 * Either this is the first time through,
                                 * or the reserve has been totally depleted.
                                 * Re-populate the reserve (pool); excess
                                 * structs are released in the done path.
                                 */
                                while (xap->xa_reserve_count <
                                    xap->xa_reserve_limit) {
                                        xp = kmem_alloc(xap->xa_allocsize,
                                            KM_NOSLEEP);
                                        if (xp == NULL) {
                                                break;
                                        }
                                        *((void **)xp) = xap->xa_reserve_headp;
                                        xap->xa_reserve_headp = xp;
                                        xap->xa_reserve_count++;
                                }
                                /* And one more to use right now. */
                                xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
                        }
                        mutex_exit(&xap->xa_reserve_mutex);
                } else {
                        /*
                         * Try to allocate a new xbuf struct.  If this fails,
                         * just exit for now; we'll get back here again either
                         * upon cmd completion or via the timer handler.
                         * Question: what if the allocation attempt for the
                         * very first cmd fails?  There are no outstanding
                         * cmds, so how do we get back here?
                         * We should look at un_ncmds_in_transport; if it is
                         * zero, schedule xbuf_restart_callback via the timer.
                         * Although that breaks the architecture by bringing
                         * softstate data into this code.
                         */
                        xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
                }
                if (xp == NULL) {
                        break; /* Can't process a cmd. right now. */
                }

                /*
                 * Always run the counter.  It is used/needed when
                 * xa_active_limit is non-zero, which is the typical (and
                 * currently the only) case.
                 */
                xap->xa_active_count++;

                if (bp->b_private) {
                        struct xbuf_brk *brkp = bp->b_private;
                        struct buf *bp0 = bp;

                        brkp->active++;

                        mutex_enter(&brkp->mutex);
                        brkp->nbufs++;
                        mutex_exit(&brkp->mutex);

                        if (brkp->noff < bp0->b_bcount) {
                                bp = bioclone(bp0, brkp->off, brkp->brksize,
                                    bp0->b_edev, brkp->blkno, xbuf_brk_done,
                                    NULL, KM_SLEEP);

                                /* update xfer position */
                                brkp->off = brkp->noff;
                                brkp->noff += brkp->brksize;
                                brkp->blkno += brkp->brkblk;
                        } else {
                                bp = bioclone(bp0, brkp->off,
                                    bp0->b_bcount - brkp->off, bp0->b_edev,
                                    brkp->blkno, xbuf_brk_done, NULL, KM_SLEEP);

                                /* unlink the buf from the list */
                                xap->xa_headp = bp0->av_forw;
                                bp0->av_forw = NULL;
                        }
                        bp->b_clone_private = (struct buf *)brkp;
                } else {
                        /* unlink the buf from the list */
                        xap->xa_headp = bp->av_forw;
                        bp->av_forw = NULL;
                }

                /*
                 * Hack needed in the prototype so ddi_xbuf_get() will work.
                 * Here we can rely on the sd code not changing the value in
                 * b_private (in fact it wants it there).  See ddi_xbuf_get().
                 */
                bp->b_private = xp;

                /* call the driver's iostart routine */
                mutex_exit(&xap->xa_mutex);
                (*(xap->xa_strategy))(bp, xp, xap->xa_attr_arg);
        }

        ASSERT(xap->xa_pending > 0);
        xap->xa_pending--;
        mutex_exit(&xap->xa_mutex);
        return (0);
}

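/* taskq callback: runs xbuf_iostart() in taskq thread context. */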
static void
xbuf_taskq_cb(void *arg)
{
        (void) xbuf_iostart(arg);
}

/*
 * Re-start IO processing if there is anything on the queue, AND if the
 * restart function is not already running/pending for this ddi_xbuf_attr_t.
 */
static void
xbuf_dispatch(ddi_xbuf_attr_t xap)
{
        ASSERT(xap != NULL);
        ASSERT(xap->xa_tq != NULL);
        ASSERT(mutex_owned(&xap->xa_mutex));

        if ((xap->xa_headp != NULL) && (xap->xa_timeid == NULL) &&
            (xap->xa_pending == 0)) {
                /*
                 * First try to see if we can dispatch the restart function
                 * immediately, in a taskq thread.  If this fails, then
                 * schedule a timeout(9F) callback to try again later.
                 */
                if (taskq_dispatch(xap->xa_tq,
                    xbuf_taskq_cb, xap, TQ_NOSLEEP) == TASKQID_INVALID) {
                        /*
                         * Unable to enqueue the request for the taskq thread,
                         * try again later.  Note that this will keep re-trying
                         * until taskq_dispatch() succeeds.
                         */
                        xap->xa_timeid = timeout(xbuf_restart_callback, xap,
                            XBUF_DISPATCH_DELAY);
                } else {
                        /*
                         * This indicates that xbuf_iostart() will soon be
                         * run for this ddi_xbuf_attr_t, and we do not need
                         * to schedule another invocation via timeout/taskq.
                         */
                        xap->xa_pending++;
                }
        }
}

/* timeout(9F) callback routine for xbuf restart mechanism. */
static void
xbuf_restart_callback(void *arg)
{
        ddi_xbuf_attr_t xap = arg;

        ASSERT(xap != NULL);
        ASSERT(xap->xa_tq != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));

        mutex_enter(&xap->xa_mutex);
        xap->xa_timeid = NULL;
        xbuf_dispatch(xap);
        mutex_exit(&xap->xa_mutex);
}


DDII void
ddi_xbuf_flushq(ddi_xbuf_attr_t xap, int (*funcp)(struct buf *))
{
        struct buf *bp;
        struct buf *next_bp;
        struct buf *prev_bp = NULL;

        ASSERT(xap != NULL);
        ASSERT(xap->xa_tq != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));

        mutex_enter(&xap->xa_mutex);

        for (bp = xap->xa_headp; bp != NULL; bp = next_bp) {

                next_bp = bp->av_forw;  /* Save for next iteration */

                /*
                 * If the user-supplied function is non-NULL and returns
                 * FALSE, then just leave the current bp on the queue.
                 */
                if ((funcp != NULL) && (!(*funcp)(bp))) {
                        prev_bp = bp;
                        continue;
                }

                /* de-queue the bp */
                if (bp == xap->xa_headp) {
                        xap->xa_headp = next_bp;
                        if (xap->xa_headp == NULL) {
                                xap->xa_tailp = NULL;
                        }
                } else {
                        ASSERT(xap->xa_headp != NULL);
                        ASSERT(prev_bp != NULL);
                        if (bp == xap->xa_tailp) {
                                ASSERT(next_bp == NULL);
                                xap->xa_tailp = prev_bp;
                        }
                        prev_bp->av_forw = next_bp;
                }
                bp->av_forw = NULL;

                /* Add the bp to the flush queue */
                if (xap->xa_flush_headp == NULL) {
                        ASSERT(xap->xa_flush_tailp == NULL);
                        xap->xa_flush_headp = xap->xa_flush_tailp = bp;
                } else {
                        ASSERT(xap->xa_flush_tailp != NULL);
                        xap->xa_flush_tailp->av_forw = bp;
                        xap->xa_flush_tailp = bp;
                }
        }

        while ((bp = xap->xa_flush_headp) != NULL) {
                xap->xa_flush_headp = bp->av_forw;
                if (xap->xa_flush_headp == NULL) {
                        xap->xa_flush_tailp = NULL;
                }
                mutex_exit(&xap->xa_mutex);
                bioerror(bp, EIO);
                bp->b_resid = bp->b_bcount;
                biodone(bp);
                mutex_enter(&xap->xa_mutex);
        }

        mutex_exit(&xap->xa_mutex);
}