root/usr/src/uts/common/fs/ufs/lufs_log.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>

extern int              maxphys;
extern uint_t           bypass_snapshot_throttle_key;

extern struct kmem_cache        *lufs_sv;
extern struct kmem_cache        *lufs_bp;

/*
 * Take ownership of bp by acquiring its b_sem semaphore.  If the buf
 * turns out to be flagged B_ERROR once it is held, put the log unit
 * into its error state; the message depends on whether the buf is
 * flagged B_READ.  The buf is still held on return in every case.
 */
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
        sema_p(&bp->b_sem);

        if (bp->b_flags & B_ERROR) {
                if (bp->b_flags & B_READ)
                        ldl_seterror(ul, "Error reading ufs log");
                else
                        ldl_seterror(ul, "Error writing ufs log");
        }
}

/*
 * iodone routine installed on log bufs by writelog() and readlog().
 * Marks the buf B_DONE and then posts a semaphore: b_sem when B_WRITE
 * is set in the flags (releasing the buf itself), otherwise b_io (to
 * wake the thread waiting on this buf).  Always returns 0.
 */
static int
logdone(buf_t *bp)
{
        bp->b_flags |= B_DONE;

        sema_v((bp->b_flags & B_WRITE) ? &bp->b_sem : &bp->b_io);

        return (0);
}

/*
 * iodone routine for the child bufs cloned by ldl_strategy().
 *
 * cb is embedded at the start of a lufs_buf_t whose lb_ptr refers to
 * the lufs_save_t shared by every child of one parent request.  Each
 * completion subtracts its byte count from sv_nb_left; the completion
 * that brings the count to zero propagates any recorded error to the
 * parent buf, frees the shared state and biodone()s the parent.
 *
 * Returns 1 while other children are still outstanding and 0 once the
 * parent has been completed.
 */
static int
ldl_strategy_done(buf_t *cb)
{
        lufs_buf_t      *child  = (lufs_buf_t *)cb;
        lufs_save_t     *save;
        buf_t           *parent;

        ASSERT(SEMA_HELD(&cb->b_sem));
        ASSERT((cb->b_flags & B_DONE) == 0);

        /* the shared ``save'' struct hangs off the child wrapper */
        save = (lufs_save_t *)child->lb_ptr;

        /* remember a failure; it is reported when the last child ends */
        if (cb->b_flags & B_ERROR)
                save->sv_error = 1;

        /*
         * Not the last request: release only this child's wrapper.
         */
        if (atomic_add_long_nv(&save->sv_nb_left, -cb->b_bcount) != 0) {
                kmem_cache_free(lufs_bp, child);
                return (1);
        }

        /*
         * Last request: fetch the parent before the save struct is
         * freed, pass any error back to it, release the resources and
         * ``done'' the original buffer header.
         */
        parent = save->sv_bp;
        if (save->sv_error)
                parent->b_flags |= B_ERROR;
        kmem_cache_free(lufs_bp, child);
        kmem_cache_free(lufs_sv, save);

        biodone(parent);
        return (0);
}

/*
 * Map the log logical block number to a physical disk block number.
 *
 * Performs a bisection over the unit's extent array (presumably kept
 * sorted by ic_lbno -- confirm against the code that builds it).
 *
 * On success returns 0 with *pblkno set to the physical block and
 * *pbcount set to the smaller of bcount and the bytes remaining in the
 * matching extent.  When no extent contains lblkno, returns ENOENT with
 * *pbcount set to bcount; *pblkno is not written in that case.
 */
static int
map_frag(
        ml_unit_t       *ul,
        daddr_t         lblkno,
        size_t          bcount,
        daddr_t         *pblkno,
        size_t          *pbcount)
{
        ic_extent_t     *extents = ul->un_ebp->ic_extents;
        uint32_t        hi = ul->un_ebp->ic_nextents;
        uint32_t        lo = 0;
        uint32_t        mid = hi >> 1;
        uint32_t        prev = mid;
        uint32_t        bno_off;

        for (;;) {
                ic_extent_t     *cur = &extents[mid];

                if (cur->ic_lbno > lblkno) {
                        /* target lies below this extent */
                        hi = mid;
                } else if ((cur->ic_lbno + cur->ic_nbno) > lblkno) {
                        /* target lies inside this extent */
                        bno_off = lblkno - (uint32_t)cur->ic_lbno;
                        *pbcount = MIN(bcount, dbtob(cur->ic_nbno - bno_off));
                        *pblkno = cur->ic_pbno + bno_off;
                        return (0);
                } else {
                        /* target lies above this extent */
                        lo = mid;
                }

                mid = lo + ((hi - lo) >> 1);

                /* the probe stopped moving: nothing maps lblkno */
                if (mid == prev) {
                        *pbcount = bcount;
                        return (ENOENT);
                }
                prev = mid;
        }
}

/*
 * The log is a set of extents (which typically will be only one, but
 * may be more if the disk was close to full when the log was created)
 * and hence the logical offsets into the log
 * have to be translated into their real device locations before
 * calling the device's strategy routine. The translation may result
 * in several IO requests if this request spans extents.
 *
 *      ul - log unit supplying the extent map, device and ufsvfs
 *      pb - parent buf for the whole request; b_blkno holds the logical
 *           log block and b_bcount the number of bytes to transfer
 *
 * One child buf is cloned from pb for each fragment map_frag() returns.
 * Every child completes through ldl_strategy_done(), which biodone()s
 * pb once the last child has finished.  A fragment that cannot be
 * mapped is completed immediately with B_ERROR rather than being
 * issued; because map_frag() then reports the whole remaining count,
 * that child also ends the loop.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *cb;
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        daddr_t         lblkno;
        /*
         * map_frag() does not write *pblkno when it fails.  Start from a
         * known value so that the clone built for an unmappable first
         * fragment never carries an indeterminate block number.
         */
        daddr_t         pblkno = 0;
        size_t          nb_left, pbcount;
        off_t           offset;
        dev_t           dev     = ul->un_dev;
        int             error;
        int             read = pb->b_flags & B_READ;

        /*
         * Allocate and initialise the save structure shared by all
         * children of this request.
         */
        sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
        sv->sv_error = 0;
        sv->sv_bp = pb;
        nb_left = pb->b_bcount;
        sv->sv_nb_left = nb_left;

        lblkno = pb->b_blkno;
        offset = 0;

        do {
                error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

                lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
                bioinit(&lbp->lb_buf);
                lbp->lb_ptr = sv;

                cb = bioclone(pb, offset, pbcount, dev,
                    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

                /* advance past this fragment before it can complete */
                offset += pbcount;
                lblkno += btodb(pbcount);
                nb_left -= pbcount;

                if (error) {
                        /* unmappable fragment: fail it without any IO */
                        cb->b_flags |= B_ERROR;
                        cb->b_resid = cb->b_bcount;
                        biodone(cb);
                } else {
                        if (read) {
                                logstats.ls_ldlreads.value.ui64++;
                                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                                lwp_stat_update(LWP_STAT_INBLK, 1);
                        } else {
                                logstats.ls_ldlwrites.value.ui64++;
                                lwp_stat_update(LWP_STAT_OUBLK, 1);
                        }

                        /*
                         * write through the snapshot driver if necessary
                         * We do not want this write to be throttled because
                         * we are holding the un_log mutex here. If we
                         * are throttled in fssnap_translate, the fssnap_taskq
                         * thread which can wake us up can get blocked on
                         * the un_log mutex resulting in a deadlock.
                         */
                        if (ufsvfsp->vfs_snapshot) {
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)1);
                                fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);

                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)0);
                        } else {
                                (void) bdev_strategy(cb);
                        }
                }

        } while (nb_left);
}

/*
 * Start an asynchronous write of bp to the log device.  The caller must
 * hold bp's b_sem; it is released by logdone() when the write
 * completes.
 */
static void
writelog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * This is really a B_ASYNC write, but B_ASYNC is deliberately
         * not set because we want Presto to cache this write.  The
         * iodone routine, logdone, processes the buf correctly.
         */
        bp->b_flags = B_WRITE;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        if ((ul->un_flags & LDL_ERROR) == 0) {
                ldl_strategy(ul, bp);
                return;
        }

        /*
         * The log is in its hard error state: every IO fails with EIO
         * without reaching the device.
         */
        bp->b_flags |= B_ERROR;
        bp->b_error = EIO;
        biodone(bp);
}

/*
 * Synchronously read b_bcount bytes of the log into bp.  The caller
 * must hold bp's b_sem and still holds it on return.  A failed read
 * puts the log unit into its error state.
 */
static void
readlog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        ASSERT(bp->b_bcount);

        bp->b_flags = B_READ;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        if (ul->un_flags & LDL_ERROR) {
                /*
                 * All IO returns errors when in the error state.  The
                 * buf is still completed and waited on so that it ends
                 * up in the same state as after a real IO.
                 */
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bp);
                (void) trans_wait(bp);
        } else {
                ldl_strategy(ul, bp);

                if (trans_wait(bp) != 0)
                        ldl_seterror(ul, "Error reading ufs log");
        }
}

/*
 * Start an asynchronous write of the buf currently being filled and,
 * if it did not use all of its buffer space, hand the unused remainder
 * to a new buf inserted right after it in the write ring.
 *
 *      ul - log unit owning the write buffer
 *      bp - the dirty buf; must be both cb_bp and cb_dirty of un_wrbuf
 *           and must hold a whole number of DEV_BSIZE sectors
 *
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
        buf_t           *newbp;
        cirbuf_t        *cb             = &ul->un_wrbuf;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
        ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

        /*
         * async write the buf; logdone() releases bp's b_sem when the
         * write completes
         */
        writelog(ul, bp);

        /*
         * no longer filling any buf
         */
        cb->cb_dirty = NULL;

        /*
         * no extra buffer space; all done
         */
        if (bp->b_bcount == bp->b_bufsize)
                return;

        /*
         * give extra buffer space to a new bp
         *      try to take buf off of free list; otherwise allocate one
         *      and initialise its semaphores the same way alloc_wrbuf()
         *      does
         */
        if ((newbp = cb->cb_free) != NULL) {
                cb->cb_free = newbp->b_forw;
        } else {
                newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
        }
        newbp->b_flags = 0;
        newbp->b_bcount = 0;
        newbp->b_file = NULL;
        newbp->b_offset = -1;
        /*
         * newbp takes over the part of bp's space that lies beyond the
         * data being written; bp shrinks to exactly what it holds
         */
        newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
        newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
        bp->b_bufsize = bp->b_bcount;

        /*
         * lock out readers and put new buf at LRU position, i.e.
         * directly after bp (== cb_bp), which is where get_write_bp()
         * looks for the next buf to steal
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        newbp->b_forw = bp->b_forw;
        newbp->b_back = bp;
        bp->b_forw->b_back = newbp;
        bp->b_forw = newbp;
        rw_exit(&cb->cb_rwlock);
}

/*
 * Invalidate every buf in the circular buffer cb whose contents overlap
 * the nb bytes of log starting at offset lof.  The buf currently being
 * filled (cb_dirty) and bufs that hold no data are left alone.  An
 * overlapping buf is acquired with makebusy(), emptied (flags and byte
 * count cleared) and released again.
 */
static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
        buf_t           *bp;
        off_t           range_end       = lof + nb;
        off_t           buf_start;
        off_t           buf_end;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp;
        do {
                if (bp != cb->cb_dirty && bp->b_bcount != 0) {
                        buf_start = dbtob(bp->b_blkno);
                        buf_end = buf_start + bp->b_bcount;

                        /*
                         * discard the buf unless it lies wholly before
                         * or wholly after the range
                         */
                        if (!((buf_start < lof && buf_end <= lof) ||
                            (buf_start >= range_end &&
                            buf_end > range_end))) {
                                makebusy(ul, bp);
                                bp->b_flags = 0;
                                bp->b_bcount = 0;
                                sema_v(&bp->b_sem);
                        }
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}

/*
 * Return, held via makebusy(), the buf that log data should be appended
 * to: the buf already being filled if there is one, otherwise the LRU
 * buf of the write ring, reset to start at the current log tail.
 *
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
        cirbuf_t        *cb = &ul->un_wrbuf;
        buf_t           *bp;

        /*
         * cb_dirty is the buffer we are currently filling; if any
         */
        if ((bp = cb->cb_dirty) != NULL) {
                makebusy(ul, bp);
                return (bp);
        }
        /*
         * discard any bp that overlaps the current tail since we are
         * about to overwrite it.  A one byte range at the tail is
         * enough to catch any buf that covers it.
         */
        inval_range(ul, cb, ul->un_tail_lof, 1);

        /*
         * steal LRU buf: the one following cb_bp.  It becomes both the
         * new cb_bp and the dirty buf.
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);

        cb->cb_dirty = bp;
        cb->cb_bp = bp;

        /*
         * empty the buf and position it at the tail; the ASSERT checks
         * that the tail offset converts to a block number and back
         * without loss
         */
        bp->b_flags = 0;
        bp->b_bcount = 0;
        bp->b_blkno = btodb(ul->un_tail_lof);
        ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
        rw_exit(&cb->cb_rwlock);

        /*
         * NOTE:
         *      1. un_tail_lof never addresses >= un_eol_lof
         *      2. b_blkno + btodb(b_bufsize) may > un_eol_lof
         *              this case is handled in storebuf
         */
        return (bp);
}

/*
 * (Re)initialise cb as a write buffer of bufsize bytes.  Any previous
 * allocation is released first.  On return the ring contains a single
 * buf that owns the entire buffer, and two spare buf headers wait on
 * the free list for push_dirty_bp() to use.
 */
void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
        int     nspare;
        buf_t   *bp;

        /* throw away whatever was allocated before */
        if (cb->cb_nb != 0)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * Build a free list of 3 zeroed buf headers; one of them is
         * taken back off the list below.
         */
        for (nspare = 3; nspare > 0; nspare--) {
                bp = kmem_zalloc(sizeof (*bp), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_offset = -1;
                bp->b_forw = cb->cb_free;
                cb->cb_free = bp;
        }

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * Pop the head of the free list and make it a one-element ring
         * that claims the whole write buffer.
         */
        bp = cb->cb_free;
        cb->cb_free = bp->b_forw;

        bp->b_back = bp;
        bp->b_forw = bp;
        bp->b_bufsize = cb->cb_nb;
        bp->b_un.b_addr = cb->cb_va;
        cb->cb_bp = bp;

        rw_exit(&cb->cb_rwlock);
}

/*
 * (Re)initialise cb as a read buffer of bufsize bytes carved into bufs
 * of at most blksize bytes each.  Any previous allocation is released
 * first.  blksize must be non-zero whenever bufsize is: the carving
 * loop advances by the chunk size and would otherwise never finish.
 */
void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
        caddr_t va;
        size_t  nb;
        buf_t   *bp;

        /* throw away whatever was allocated before */
        if (cb->cb_nb != 0)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * The read buffer pool is a ring of statically sized bufs: each
         * one is hard-sized to blksize, except that the last takes
         * whatever is left over.
         */
        for (va = cb->cb_va; bufsize != 0; bufsize -= nb, va += nb) {
                nb = (bufsize > blksize) ? blksize : bufsize;

                bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
                bzero(bp, sizeof (buf_t));
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_un.b_addr = va;
                bp->b_bufsize = nb;

                if (cb->cb_bp == NULL) {
                        /* first buf: a ring of one */
                        bp->b_forw = bp;
                        bp->b_back = bp;
                } else {
                        /* link in directly after the current cb_bp */
                        buf_t   *prev = cb->cb_bp;

                        bp->b_forw = prev->b_forw;
                        bp->b_back = prev;
                        prev->b_forw->b_back = bp;
                        prev->b_forw = bp;
                }
                cb->cb_bp = bp;
        }

        rw_exit(&cb->cb_rwlock);
}

/*
 * Release everything owned by the circular buffer cb: the buf headers
 * in the ring, the buf headers on the free list, the data area and
 * finally the rwlock itself.  Does nothing if cb holds no allocation
 * (cb_nb == 0).  No buf may be dirty when this is called.
 */
void
free_cirbuf(cirbuf_t *cb)
{
        buf_t   *bp;

        if (cb->cb_nb == 0)
                return;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        ASSERT(cb->cb_dirty == NULL);

        /*
         * Dismantle the ring one buf at a time.  cb_bp moves on to the
         * successor, or becomes NULL when the last buf is removed.
         */
        while ((bp = cb->cb_bp) != NULL) {
                cb->cb_bp = (bp->b_forw == bp) ? NULL : bp->b_forw;
                bp->b_back->b_forw = bp->b_forw;
                bp->b_forw->b_back = bp->b_back;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }

        /*
         * Drain the singly linked free list (chained through b_forw).
         */
        while ((bp = cb->cb_free) != NULL) {
                cb->cb_free = bp->b_forw;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }

        kmem_free(cb->cb_va, cb->cb_nb);
        cb->cb_va = NULL;
        cb->cb_nb = 0;
        rw_exit(&cb->cb_rwlock);
        rw_destroy(&cb->cb_rwlock);
}

/*
 * Return non-zero when the byte offset lof falls inside the bcount
 * bytes that begin at disk block blkno, and zero otherwise.
 */
static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
        off_t   start   = dbtob(blkno);

        if (lof < start)
                return (0);

        return (lof < (start + bcount));
}

/*
 * Search the circular buffer cb for a buf whose data covers the log
 * offset lof.  A matching buf is returned held (via makebusy()); NULL
 * is returned when no buf in the ring contains lof.
 */
static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp;
        buf_t   *hit = NULL;

        rw_enter(&cb->cb_rwlock, RW_READER);
        bp = cb->cb_bp;
        do {
                if (bp->b_bcount != 0 &&
                    within_range(lof, bp->b_blkno, bp->b_bcount)) {
                        /* take the buf before dropping the ring lock */
                        makebusy(ul, bp);
                        hit = bp;
                        break;
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);

        return (hit);
}

/*
 * Work out how far a read that starts at log offset lof may extend.
 *
 * we mustn't:
 *      o read past eol
 *      o read past the tail
 *      o read data that may be being written.
 *
 * The limit is the start of the first buf holding data in cb, scanning
 * from the buf after cb_bp, or the log tail when every buf is empty.
 * That limit is returned if lof does not lie beyond it; otherwise the
 * end-of-log offset is returned.
 */
static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp, *first;
        off_t   limit;

        rw_enter(&cb->cb_rwlock, RW_READER);
        first = bp = cb->cb_bp->b_forw;
        limit = ul->un_tail_lof;
        do {
                if (bp->b_bcount) {
                        limit = dbtob(bp->b_blkno);
                        break;
                }
                bp = bp->b_forw;
        } while (bp != first);
        rw_exit(&cb->cb_rwlock);

        /*
         * lof at or before the limit: it precedes the range represented
         * by the write buf.  Otherwise it follows that range, and the
         * read may run to the end of the log.
         */
        return ((lof <= limit) ? limit : (off_t)ul->un_eol_lof);
}

/*
 * Return a held buf containing the log data at offset lof.  The incore
 * write and read buffers are consulted first; on a miss the LRU read
 * buf is recycled and filled from the log device.  Callers must check
 * the returned buf for B_ERROR.
 */
static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
        cirbuf_t        *rdbuf  = &ul->un_rdbuf;
        buf_t           *bp;
        off_t           limit;

        /*
         * retrieve as much data as possible from the incore buffers,
         * write buffer first
         */
        bp = find_bp(ul, &ul->un_wrbuf, lof);
        if (bp == NULL)
                bp = find_bp(ul, rdbuf, lof);
        if (bp != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }

        /*
         * steal the LRU buf (the one after cb_bp) and make it current
         */
        rw_enter(&rdbuf->cb_rwlock, RW_WRITER);
        bp = rdbuf->cb_bp->b_forw;
        makebusy(ul, bp);
        bp->b_flags = 0;
        bp->b_bcount = 0;
        rdbuf->cb_bp = bp;
        rw_exit(&rdbuf->cb_rwlock);

        /*
         * Start the read on the block containing lof and size it so
         * that it reads neither past the tail nor past the end-of-log.
         */
        bp->b_blkno = btodb(lof);
        lof = dbtob(bp->b_blkno);
        limit = find_read_lof(ul, &ul->un_wrbuf, lof);
        bp->b_bcount = MIN(bp->b_bufsize, limit - lof);
        readlog(ul, bp);
        return (bp);
}

/*
 * Try to grow the dirty write buf bp by absorbing the buffer space of
 * the next buf in the ring.
 *
 *      ul - log unit (passed to makebusy() for error reporting)
 *      cb - the write circular buffer; bp must be its cb_bp and cb_dirty
 *      bp - the buf to extend; held by the caller
 *
 * Returns 1 if bp was extended and 0 if nothing was done.  bp is held
 * on return in both cases, although its b_sem is dropped and retaken
 * along the way when the extension happens.
 *
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
        buf_t   *bpforw = bp->b_forw;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

        /*
         * there is no `next' bp; do nothing
         */
        if (bpforw == bp)
                return (0);

        /*
         * buffer space is not adjacent; do nothing
         */
        if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
                return (0);

        /*
         * locking protocol requires giving up any bp locks before
         * acquiring cb_rwlock.  This is okay because we hold
         * un_log_mutex.
         */
        sema_v(&bp->b_sem);

        /*
         * lock out ldl_read
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * wait for current IO to finish w/next bp; if necessary
         */
        makebusy(ul, bpforw);

        /*
         * free the next bp and steal its space: unlink it from the
         * ring, add its size to bp, release it and push its header
         * onto the free list
         */
        bp->b_forw = bpforw->b_forw;
        bpforw->b_forw->b_back = bp;
        bp->b_bufsize += bpforw->b_bufsize;
        sema_v(&bpforw->b_sem);
        bpforw->b_forw = cb->cb_free;
        cb->cb_free = bpforw;
        /* retake bp so that the caller still holds it */
        makebusy(ul, bp);
        rw_exit(&cb->cb_rwlock);

        return (1);
}

/*
 * Append up to nb bytes from va to the held write buf bp, advancing the
 * log tail and writing a sector trailer each time a sector fills.
 *
 *      ul - log unit; un_tail_lof and un_tail_ident are updated
 *      bp - the dirty write buf, held by the caller
 *      va - source of the data
 *      nb - number of bytes the caller wants stored (must be non-zero)
 *
 * Returns the number of bytes actually stored.  bp is always given up
 * before returning: either it was handed to push_dirty_bp() (the log
 * wrapped or the buf filled up and could not be extended), in which
 * case fewer than nb bytes may have been stored, or everything was
 * stored and its b_sem is released here.
 */
static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
        size_t          copy_nb;
        size_t          nb_in_sec;
        sect_trailer_t  *st;
        size_t          nb_left = nb;
        cirbuf_t        *cb     = &ul->un_wrbuf;

again:
        /*
         * copy no more than fits in the current sector
         * (NB_LEFT_IN_SECTOR presumably excludes the space reserved for
         * the sector trailer -- confirm against its definition)
         */
        nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
        copy_nb = MIN(nb_left, nb_in_sec);

        ASSERT(copy_nb);

        bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
        bp->b_bcount += copy_nb;
        va += copy_nb;
        nb_left -= copy_nb;
        ul->un_tail_lof += copy_nb;

        /*
         * sector is full; stamp its trailer with the current
         * transaction id and the next tail ident, and step over it
         */
        if ((nb_in_sec -= copy_nb) == 0) {
                st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

                st->st_tid = ul->un_logmap->mtm_tid;
                st->st_ident = ul->un_tail_ident++;
                bp->b_bcount += sizeof (sect_trailer_t);
                ul->un_tail_lof += sizeof (sect_trailer_t);
                /*
                 * log wrapped; async write this bp
                 */
                if (ul->un_tail_lof == ul->un_eol_lof) {
                        ul->un_tail_lof = ul->un_bol_lof;
                        push_dirty_bp(ul, bp);
                        return (nb - nb_left);
                }
                /*
                 * out of bp space; get more or async write buf
                 */
                if (bp->b_bcount == bp->b_bufsize) {
                        if (!extend_write_bp(ul, cb, bp)) {
                                push_dirty_bp(ul, bp);
                                return (nb - nb_left);
                        }
                }
        }
        if (nb_left)
                goto again;

        /* everything stored and bp not pushed; release it */
        sema_v(&bp->b_sem);
        return (nb);
}

/*
 * Zero the part of the destination buffer that the map entry me
 * overlaps.  The destination covers dst_nb bytes starting at master
 * offset dst_mof and is addressed by dst_va; the entry covers me_nb
 * bytes starting at me_mof.  The two ranges are expected to overlap.
 */
static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
        offset_t        src_mof = me->me_mof;
        size_t          src_nb  = me->me_nb;

        if (src_mof <= dst_mof) {
                /*
                 * entry starts at or before the buffer: drop the part
                 * of the entry that precedes the buffer
                 */
                ASSERT(dst_mof < (src_mof + src_nb));
                src_nb -= (dst_mof - src_mof);
        } else {
                /*
                 * entry starts inside the buffer: skip the leading
                 * part of the buffer
                 */
                offset_t        lead = src_mof - dst_mof;

                ASSERT(src_mof < (dst_mof + dst_nb));
                dst_va += lead;
                dst_nb -= lead;
        }

        /* never zero beyond either the entry or the buffer */
        src_nb = MIN(src_nb, dst_nb);
        ASSERT(src_nb);
        bzero(dst_va, src_nb);
}

/*
 * Copy up to dst_nb bytes of log data out of the held buf bp, starting
 * at the log offset *dst_lofp and skipping the trailer at the end of
 * every sector.
 *
 *      ul       - log unit (for the begin/end-of-log offsets)
 *      bp       - held buf containing the data; released before return
 *      dst_va   - destination; dst_va == NULL means don't copy anything,
 *                 only advance the offset
 *      dst_nb   - number of data bytes wanted
 *      dst_lofp - in: log offset to start at
 *                 out: log offset at which the next read should start
 *
 * Returns the number of data bytes consumed, which is less than dst_nb
 * when the buf runs out first.
 */
static ulong_t
fetchbuf(
        ml_unit_t *ul,
        buf_t *bp,
        caddr_t dst_va,
        size_t dst_nb,
        off_t *dst_lofp)
{
        caddr_t copy_va;
        size_t  copy_nb;
        size_t  nb_sec;
        off_t   dst_lof         = *dst_lofp;
        ulong_t sav_dst_nb      = dst_nb;
        ulong_t src_nb          = bp->b_bcount;
        off_t   src_lof         = dbtob(bp->b_blkno);
        off_t   src_elof        = src_lof + src_nb;
        caddr_t src_va          = bp->b_un.b_addr;

        /*
         * copy from bp to dst_va
         */
        while (dst_nb) {
                /*
                 * compute address within bp
                 */
                copy_va = src_va + (dst_lof - src_lof);

                /*
                 * adjust copy size to amount of data in bp
                 */
                copy_nb = MIN(dst_nb, src_elof - dst_lof);

                /*
                 * adjust copy size to amount of data in sector
                 */
                nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
                copy_nb = MIN(copy_nb, nb_sec);

                /*
                 * dst_va == NULL means don't do copy (see logseek())
                 */
                if (dst_va) {
                        bcopy(copy_va, dst_va, copy_nb);
                        dst_va += copy_nb;
                }
                dst_lof += copy_nb;
                dst_nb -= copy_nb;
                nb_sec -= copy_nb;

                /*
                 * advance over sector trailer
                 */
                if (nb_sec == 0)
                        dst_lof += sizeof (sect_trailer_t);

                /*
                 * exhausted buffer
                 *      return current lof for next read, wrapping to
                 *      the beginning of the log if the end was reached
                 */
                if (dst_lof == src_elof) {
                        sema_v(&bp->b_sem);
                        if (dst_lof == ul->un_eol_lof)
                                dst_lof = ul->un_bol_lof;
                        *dst_lofp = dst_lof;
                        return (sav_dst_nb - dst_nb);
                }
        }

        /*
         * copy complete - return current lof
         */
        sema_v(&bp->b_sem);
        *dst_lofp = dst_lof;
        return (sav_dst_nb);
}

/*
 * Round the dirty write buf up to a whole sector so that the log tail
 * lands on a sector boundary, filling in the trailer of the sector that
 * was only partly used.  The buf is written asynchronously if the tail
 * wrapped or the buf is now full; otherwise it stays dirty.  Does
 * nothing when there is no dirty buf or it already ends on a sector
 * boundary.
 */
void
ldl_round_commit(ml_unit_t *ul)
{
        int             wrapped;
        buf_t           *bp;
        sect_trailer_t  *st;
        size_t          bcount;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);

        /*
         * round up to sector boundary and set new tail
         *      don't readjust st_ident if buf is already rounded
         */
        bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
        if (bcount == bp->b_bcount) {
                sema_v(&bp->b_sem);
                return;
        }
        bp->b_bcount = bcount;
        ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
        wrapped = 0;
        /* tail reached the end of the log; wrap to the beginning */
        if (ul->un_tail_lof == ul->un_eol_lof) {
                ul->un_tail_lof = ul->un_bol_lof;
                ++wrapped;
        }
        ASSERT(ul->un_tail_lof != ul->un_head_lof);

        /*
         * fix up the sector trailer, which occupies the last bytes of
         * the rounded-up buf
         */
        /* LINTED */
        st = (sect_trailer_t *)
            ((bp->b_un.b_addr + bcount) - sizeof (*st));
        st->st_tid = ul->un_logmap->mtm_tid;
        st->st_ident = ul->un_tail_ident++;

        /*
         * if tail wrapped or we have exhausted this buffer
         *      async write the buffer
         */
        if (wrapped || bcount == bp->b_bufsize)
                push_dirty_bp(ul, bp);
        else
                sema_v(&bp->b_sem);
}

/*
 * Start an asynchronous write of the buf currently being filled, if
 * there is one.  Does nothing when no buf is dirty.
 */
void
ldl_push_commit(ml_unit_t *ul)
{
        buf_t   *dirty = ul->un_wrbuf.cb_dirty;

        if (dirty != NULL) {
                makebusy(ul, dirty);
                push_dirty_bp(ul, dirty);
        }
}

/*
 * Return 1 when the space currently reserved in the log (un_resv)
 * exceeds three quarters of the maximum reservation (un_maxresv), and
 * 0 otherwise.
 */
int
ldl_need_commit(ml_unit_t *ul)
{
        if (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)))
                return (1);

        return (0);
}

/*
 * Decide whether the log has room for the list of deltas that starts
 * at me and is chained through me_hash.  The caller must hold
 * un_log_mutex.  Returns non-zero when the deltas fit.
 */
int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
        off_t   avail;
        off_t   need;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /*
         * Total up what the deltas need: a delta header for each entry
         * plus its data, except that cancel deltas contribute no data.
         * The count starts at one extra sector and is rounded up to a
         * sector length; w/o the extra sector we couldn't distinguish
         * a full log (head == tail) from an empty log (head == tail).
         */
        need = DEV_BSIZE;
        while (me != NULL) {
                need += sizeof (struct delta);
                if (me->me_dt != DT_CANCEL)
                        need += me->me_nb;
                me = me->me_hash;
        }
        need = P2ROUNDUP(need, DEV_BSIZE);

        /*
         * Free space lies between tail and head.  When the head is
         * ahead of the tail it is a single span; otherwise it is the
         * span from the tail to the end of the log plus the span from
         * the beginning of the log to the head.
         */
        if (ul->un_head_lof > ul->un_tail_lof)
                avail = ul->un_head_lof - ul->un_tail_lof;
        else
                avail = (ul->un_head_lof - ul->un_bol_lof) +
                    (ul->un_eol_lof - ul->un_tail_lof);

        return (need < avail);
}

/*
 * Append one map entry to the log: first its delta header and then,
 * unless the delta type carries no data, the data itself.
 *
 *      ul     - log unit; the caller must hold un_log_mutex
 *      bufp   - buffer holding the data being logged
 *      bufmof - master offset that bufp[0] corresponds to
 *      me     - map entry to write; me_lof is set to the log offset of
 *               its data (or, for deltas without data, to the offset
 *               where the next delta will go)
 *
 * The write is silently abandoned if the log is found to be in its
 * error state (LDL_ERROR); me_lof may then be left unset.
 */
void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
        buf_t           *bp;
        caddr_t         va;
        size_t          nb;
        size_t          actual;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /* Write the delta */

        nb = sizeof (struct delta);
        va = (caddr_t)&me->me_delta;
        bp = get_write_bp(ul);

        /*
         * storebuf() gives up bp each time it returns, so a fresh write
         * buf is fetched whenever bytes remain
         */
        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                ASSERT(actual);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }

        /* If a commit, cancel, or 0's; we're almost done */
        switch (me->me_dt) {
                case DT_COMMIT:
                case DT_CANCEL:
                case DT_ABZERO:
                        /* roll needs to know where the next delta will go */
                        me->me_lof = ul->un_tail_lof;
                        return;
                default:
                        break;
        }

        /* Now write the data */

        ASSERT(me->me_nb != 0);

        nb = me->me_nb;
        /* locate the entry's data within the caller's buffer */
        va = (me->me_mof - bufmof) + bufp;
        bp = get_write_bp(ul);

        /* Save where we will put the data */
        me->me_lof = ul->un_tail_lof;

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                ASSERT(actual);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }
}

/*
 * Wait for outstanding IO on the write buffer.  Every buf in the write
 * ring that is not yet flagged B_DONE is acquired with makebusy() and
 * released again straight away.
 */
void
ldl_waito(ml_unit_t *ul)
{
        cirbuf_t        *wrbuf  = &ul->un_wrbuf;
        buf_t           *cur;

        rw_enter(&wrbuf->cb_rwlock, RW_WRITER);

        cur = wrbuf->cb_bp;
        do {
                if (!(cur->b_flags & B_DONE)) {
                        /* wait on it */
                        makebusy(ul, cur);
                        sema_v(&cur->b_sem);
                }
                cur = cur->b_forw;
        } while (cur != wrbuf->cb_bp);

        rw_exit(&wrbuf->cb_rwlock);
}

/*
 * Seek nb data bytes forward from log offset lof, storing the resulting
 * offset in *lofp.  The log is read as needed, but nothing is copied
 * out.  Returns 0 on success, or EIO if a buf comes back flagged
 * B_ERROR, in which case *lofp is not written.
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
        buf_t   *bp;
        ulong_t skipped;

        while (nb != 0) {
                bp = get_read_bp(ul, lof);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }

                /* a NULL destination makes fetchbuf() only advance lof */
                skipped = fetchbuf(ul, bp, NULL, nb, &lof);
                ASSERT(skipped);
                nb -= skipped;
        }

        *lofp = lof;
        ASSERT(nb == 0);
        return (0);
}

/*
 * Fill the caller's buffer [va, va + nb), which corresponds to master
 * offset mof, from the deltas on the map entry list me (walked through
 * me_agenext).  Each entry is satisfied from one of three places:
 *      - the entry's cached roll buffer (me_crb), if it has one
 *      - fetchzeroes(), if the delta type is DT_ABZERO
 *      - the log itself, read through get_read_bp()/fetchbuf()
 *
 * Returns 0 on success, or EIO if seeking or reading the log failed.
 */
int
ldl_read(
        ml_unit_t *ul,          /* Log unit */
        caddr_t va,             /* address of buffer to read into */
        offset_t mof,           /* mof of buffer */
        off_t nb,               /* length of buffer */
        mapentry_t *me)         /* Map entry list */
{
        buf_t   *bp;
        crb_t   *crb;
        caddr_t rva;                    /* address to read into */
        size_t  rnb;                    /* # of bytes to read */
        off_t   lof;                    /* log device offset to read from */
        off_t   skip;
        ulong_t actual;
        int     error;
        caddr_t eva     = va + nb;      /* end of buffer */

        for (; me; me = me->me_agenext) {
                ASSERT(me->me_dt != DT_CANCEL);

                /*
                 * check for a cached roll buffer
                 */
                crb = me->me_crb;
                if (crb) {
                        if (mof > crb->c_mof) {
                                /*
                                 * This mapentry overlaps with the beginning of
                                 * the supplied buffer
                                 */
                                skip = mof - crb->c_mof;
                                bcopy(crb->c_buf + skip, va,
                                    MIN(nb, crb->c_nb - skip));
                        } else {
                                /*
                                 * This mapentry starts at or after
                                 * the supplied buffer.
                                 */
                                skip = crb->c_mof - mof;
                                bcopy(crb->c_buf, va + skip,
                                    MIN(crb->c_nb, nb - skip));
                        }
                        /* satisfied from memory; no log I/O needed */
                        logstats.ls_lreadsinmem.value.ui64++;
                        continue;
                }

                /*
                 * check for a delta full of zeroes - there's no log data
                 */
                if (me->me_dt == DT_ABZERO) {
                        fetchzeroes(va, mof, nb, me);
                        continue;
                }

                if (mof > me->me_mof) {
                        /*
                         * The delta begins before the buffer: seek past
                         * the leading (mof - me_mof) bytes in the log,
                         * then read what is left of the delta, clipped
                         * to the end of the buffer.
                         */
                        rnb = (size_t)(mof - me->me_mof);
                        error = logseek(ul, me->me_lof, rnb, &lof);
                        if (error)
                                return (EIO);
                        rva = va;
                        rnb = me->me_nb - rnb;
                        rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
                } else {
                        /*
                         * The delta begins at or after the start of the
                         * buffer: read from the delta's first byte in
                         * the log, clipped to the end of the buffer.
                         */
                        lof = me->me_lof;
                        rva = (me->me_mof - mof) + va;
                        rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
                }

                /*
                 * Copy the delta out of the log; each fetchbuf() call
                 * reports how many bytes it handled and advances lof.
                 */
                while (rnb) {
                        bp = get_read_bp(ul, lof);
                        if (bp->b_flags & B_ERROR) {
                                /*
                                 * NOTE(review): get_read_bp() presumably
                                 * returns the buf with b_sem held; it is
                                 * released here before bailing out.
                                 */
                                sema_v(&bp->b_sem);
                                return (EIO);
                        }
                        ASSERT(((me->me_flags & ME_ROLL) == 0) ||
                            (bp != ul->un_wrbuf.cb_dirty));
                        actual = fetchbuf(ul, bp, rva, rnb, &lof);
                        ASSERT(actual);
                        rva += actual;
                        rnb -= actual;
                }
        }
        return (0);
}

/*
 * Write the in-core log state (un_ondisk) out through the state buf
 * un_bp.  Two identical copies are placed in the buf, DEV_BSIZE apart,
 * each carrying a checksum formed from the head and tail idents.  If
 * the write comes back with B_ERROR the log is put into the error
 * state.
 */
void
ldl_savestate(ml_unit_t *ul)
{
        buf_t           *sbp    = ul->un_bp;
        ml_odunit_t     *first  = (void *)sbp->b_un.b_addr;
        ml_odunit_t     *second = (void *)(sbp->b_un.b_addr + DEV_BSIZE);
        int             wrerr;

#if     DEBUG
        /*
         * While the scan test runs, intermediate state must not reach
         * the disk.
         */
        if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
                return;
#endif  /* DEBUG */

        mutex_enter(&ul->un_state_mutex);

        /* build the first copy, checksum it, then duplicate it */
        bcopy(&ul->un_ondisk, first, sizeof (*first));
        first->od_chksum = first->od_head_ident + first->od_tail_ident;
        bcopy(first, second, sizeof (*second));

        /* With a snapshot enabled the write goes via the snapshot driver. */
        if (ul->un_ufsvfs->vfs_snapshot) {
                UFS_BWRITE2(ul->un_ufsvfs, sbp);
        } else {
                BWRITE2(sbp);
        }
        logstats.ls_ldlwrites.value.ui64++;

        /* sample the result before dropping the mutex */
        wrerr = sbp->b_flags & B_ERROR;
        mutex_exit(&ul->un_state_mutex);

        if (wrerr)
                ldl_seterror(ul, "Error writing ufs log state");
}

/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 *
 *      ul              log unit; the caller must hold un_log_mutex
 *      data_lof        log offset of the delta's data, or -1 when the
 *                      log is empty
 *      tid             stored in un_head_tid when data_lof is not -1
 *
 * If the computed head equals the current head the function returns
 * early (un_head_tid has already been updated by then).  Otherwise the
 * cached bufs covering the range between the old and new head are
 * invalidated.  If the log is in the error state (LDL_ERROR) nothing
 * further happens; if not, the new head and ident are recorded and
 * written out with ldl_savestate().
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
        off_t           nb;
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (data_lof == -1) {
                /* log is empty */
                new_ident = lufs_hd_genid(ul);
                new_lof = ul->un_tail_lof;

        } else {
                /* compute header's lof */
                new_ident = ul->un_head_ident;
                new_lof = data_lof - sizeof (struct delta);

                /* whoops, header spans sectors; subtract out sector trailer */
                if (btodb(new_lof) != btodb(data_lof))
                        new_lof -= sizeof (sect_trailer_t);

                /* whoops, header wrapped the log; go to last sector */
                if (new_lof < ul->un_bol_lof) {
                        /* sector offset */
                        new_lof -= dbtob(btodb(new_lof));
                        /* add to last sector's lof */
                        new_lof += (ul->un_eol_lof - DEV_BSIZE);
                }
                ul->un_head_tid = tid;
        }

        /*
         * check for nop
         */
        if (new_lof == ul->un_head_lof)
                return;

        /*
         * invalidate the affected bufs and calculate new ident
         *
         * The ident grows by the number of sectors between the old
         * head and the new head.
         */
        if (new_lof > ul->un_head_lof) {
                /* no wrap: one contiguous range, old head up to new head */
                nb = new_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);
        } else {
                /* wrapped: first the range from the old head to eol ... */
                nb = ul->un_eol_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);

                /* ... then the range from bol up to the new head */
                nb = new_lof - ul->un_bol_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }
        /*
         * don't update the head if there has been an error
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        /* Fix up the head and ident */
        ASSERT(new_lof >= ul->un_bol_lof);
        ul->un_head_lof = new_lof;
        ul->un_head_ident = new_ident;
        if (data_lof == -1) {
                /* empty log: head and tail share the freshly generated ident */
                ul->un_tail_ident = ul->un_head_ident;
        }


        /* Commit to the database */
        ldl_savestate(ul);

        ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
            ldl_sethead_debug(ul));
}

/*
 * The tail will be set to the sector following lof+nb
 *      lof + nb == size of the last delta + commit record
 *      this function is called once after the log scan has completed.
 *
 *      ul      log unit; the caller must hold un_log_mutex
 *      lof     log offset of the last delta, or -1 if there is none
 *      nb      number of bytes to seek past lof
 *
 * With lof == -1 the head is rounded down to its sector boundary, the
 * tail is set equal to it, a fresh ident is generated for both, and
 * the state is written out.
 *
 * Otherwise the new tail is found by seeking nb bytes from lof.  If
 * that seek fails the log is marked in error and the tail is left
 * untouched; no tail offset or ident derived from the failed seek is
 * recorded.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (lof == -1) {
                ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
                ul->un_head_lof = ul->un_tail_lof;
                ul->un_head_ident = lufs_hd_genid(ul);
                ul->un_tail_ident = ul->un_head_ident;

                /* Commit to the database */
                ldl_savestate(ul);

                return;
        }

        /*
         * new_lof is the offset of the sector following the last commit.
         *
         * logseek() stores through its last argument only when it
         * succeeds, so after a failure new_lof is indeterminate and must
         * not be used to compute the tail or be committed to disk.
         * ldl_seterror() does nothing if the log is already in error.
         */
        if (logseek(ul, lof, nb, &new_lof) != 0) {
                ldl_seterror(ul, "Error reading ufs log");
                return;
        }
        ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

        /*
         * calculate new ident: the head ident plus the number of sectors
         * between the head and the new tail
         */
        if (new_lof > ul->un_head_lof) {
                /* no wrap between head and tail */
                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
        } else {
                /* wrapped: sectors from head to eol ... */
                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

                /* ... plus sectors from bol to the new tail */
                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }

        /* Fix up the tail and ident */
        ul->un_tail_lof = new_lof;
        ul->un_tail_ident = new_ident;

        /* Commit to the database */
        ldl_savestate(ul);
}

/*
 * LOGSCAN STUFF
 */
/*
 * Validate the sector trailers in bp against the ident sequence that
 * starts at the log head.  b_bcount is cut back to cover only the
 * leading run of sectors whose idents match, and un_tid is left
 * holding the tid of the last matching sector.
 *
 * Returns the result of within_range() for lof against the (possibly
 * shortened) buffer.
 */
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
        sect_trailer_t  *trailer;
        ulong_t         expect;
        size_t          nsect;
        size_t          n;

        /*
         * Work out the ident the buffer's first sector should carry:
         * the head ident plus the sector distance from the head,
         * allowing for the buffer lying in the wrapped part of the log.
         */
        expect = ul->un_head_ident;
        if (bp->b_blkno < btodb(ul->un_head_lof)) {
                expect += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
                expect += (bp->b_blkno - btodb(ul->un_bol_lof));
        } else {
                expect += (bp->b_blkno - btodb(ul->un_head_lof));
        }

        /*
         * Rebuild b_bcount one sector at a time, stopping at the first
         * trailer whose ident is out of sequence.
         */
        nsect = btodb(bp->b_bcount);
        bp->b_bcount = 0;
        /* LINTED */
        trailer = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
        for (n = 0; n < nsect && trailer->st_ident == expect; n++) {
                /* remember last valid tid for ldl_logscan_error() */
                ul->un_tid = trailer->st_tid;

                bp->b_bcount += DEV_BSIZE;
                expect++;
                /* LINTED */
                trailer = (sect_trailer_t *)(((caddr_t)trailer) + DEV_BSIZE);
        }

        /*
         * lof may no longer fall inside the shortened buffer
         */
        return (within_range(lof, bp->b_blkno, bp->b_bcount));
}

/*
 * Return the number of bytes left in the sector containing lof, where
 * lof is the offset following a commit header.  If lof sits exactly on
 * a sector boundary the header ended the previous sector and lof has
 * already moved on to the next one, so the answer is 0.
 */
ulong_t
ldl_logscan_nbcommit(off_t lof)
{
        if (lof & (DEV_BSIZE - 1))
                return (NB_LEFT_IN_SECTOR(lof));

        /* sector aligned: nothing left over */
        return (0);
}

/*
 * Read nb bytes of log data starting at *lofp during a log scan,
 * advancing *lofp as data is consumed.  The bytes are handed to
 * fetchbuf() together with va; va is only advanced when it is non-NULL.
 *
 * Returns 0 on success.  Returns EIO if the request falls outside the
 * in-use part of the log, if a read buffer is flagged B_ERROR, or if
 * ldl_logscan_ident() rejects the buffer.
 */
int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
        buf_t   *rbp;
        ulong_t got;

        ASSERT(ul->un_head_lof != ul->un_tail_lof);

        /*
         * Bounds check.  When the head is not below the tail, the
         * request must stay clear of the gap running from tail to head;
         * otherwise it must lie wholly between head and tail.
         */
        if (ul->un_head_lof >= ul->un_tail_lof) {
                if (OVERLAP(*lofp, nb, ul->un_tail_lof,
                    (ul->un_head_lof - ul->un_tail_lof))) {
                        return (EIO);
                }
        } else if (!WITHIN(*lofp, nb, ul->un_head_lof,
            (ul->un_tail_lof - ul->un_head_lof))) {
                return (EIO);
        }

        for (; nb != 0; nb -= got) {
                rbp = get_read_bp(ul, *lofp);

                /*
                 * Give up on an I/O error, or on out-of-sequence idents
                 * (partial transaction: panic, non-corrupting
                 * powerfail, ...).  The ident check is only made when
                 * the buffer itself is good.
                 */
                if ((rbp->b_flags & B_ERROR) ||
                    !ldl_logscan_ident(ul, rbp, *lofp)) {
                        sema_v(&rbp->b_sem);
                        return (EIO);
                }

                /*
                 * move the data into the caller's buf
                 */
                got = fetchbuf(ul, rbp, va, nb, lofp);
                if (va)
                        va += got;
        }
        return (0);
}

/*
 * Set the log unit up for a scan: flag the scan as in progress, reset
 * both circular bufs using ldl_bufsize(), position the tail so that
 * the log appears full, and seed un_tid from the head's tid.
 */
void
ldl_logscan_begin(ml_unit_t *ul)
{
        size_t  scansize;

        /* there must be no dirty write buf when a scan starts */
        ASSERT(ul->un_wrbuf.cb_dirty == NULL);

        /* the scan is now under way */
        ul->un_flags |= LDL_SCAN;

        /* reset the read and write circular bufs */
        scansize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, scansize, scansize);
        alloc_wrbuf(&ul->un_wrbuf, scansize);

        /*
         * A full log has its tail in the sector just before the head's
         * sector; wrap if that falls off either end of the log.
         */
        ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

        if (ul->un_tail_lof < ul->un_bol_lof)
                ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
        if (ul->un_tail_lof >= ul->un_eol_lof)
                ul->un_tail_lof = ul->un_bol_lof;

        /*
         * Error processing relies on un_tid; it starts out as the tid
         * of the delta at un_head_lof.
         */
        ul->un_tid = ul->un_head_tid;
}

/*
 * Finish a log scan: reset the read circular buf with MAPBLOCKSIZE
 * and the write circular buf with ldl_bufsize(), then clear LDL_SCAN.
 */
void
ldl_logscan_end(ml_unit_t *ul)
{
        size_t  wrsize;

        /* reset the circular bufs */
        wrsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
        alloc_wrbuf(&ul->un_wrbuf, wrsize);

        /* the scan is over */
        ul->un_flags &= ~LDL_SCAN;
}

/*
 * Report whether the log is more than 75% full.  Returns non-zero if
 * the number of in-use bytes between head and tail exceeds three
 * quarters of un_logsize, 0 otherwise.
 */
int
ldl_need_roll(ml_unit_t *ul)
{
        /* take a snapshot of the log state */
        off_t   head    = ul->un_head_lof;
        off_t   tail    = ul->un_tail_lof;
        off_t   bol     = ul->un_bol_lof;
        off_t   eol     = ul->un_eol_lof;
        off_t   logsize = ul->un_logsize;
        off_t   inuse;
        off_t   limit;

        /*
         * In-use bytes run from head to tail, going round through
         * eol/bol when the head is above the tail.
         */
        if (head > tail)
                inuse = (eol - head) + (tail - bol);
        else
                inuse = tail - head;

        /* three quarters of the log size */
        limit = logsize - (logsize >> 2);

        return (inuse > limit);
}

/*
 * Put the log into the error state.  Only the first call has any
 * effect: it sets LDL_ERROR and un_badlog, timestamps and saves the
 * state, warns on the console with the caller's reason (why), and,
 * unless a scan is in progress, calls ufs_trans_onerror().
 */
void
ldl_seterror(ml_unit_t *ul, char *why)
{
        char    *fsmnt;

        /* once in error, always in error; nothing more to do */
        if (ul->un_flags & LDL_ERROR)
                return;

        ul->un_flags |= LDL_ERROR;      /* in-core marker */
        ul->un_badlog = 1;              /* on-disk marker (fsck clears it) */

        /* record when it happened and push the state sectors out */
        uniqtime(&ul->un_timestamp);
        ldl_savestate(ul);

        /* tell the operator what happened and what to do about it */
        cmn_err(CE_WARN, "%s", why);
        fsmnt = ul->un_ufsvfs->vfs_fs->fs_fsmnt;
        cmn_err(CE_WARN, "ufs log for %s changed state to Error", fsmnt);
        cmn_err(CE_WARN, "Please umount(8) %s and run fsck(8)", fsmnt);

        /*
         * Outside of a scan (aka snarf), have ufs hard lock itself.
         */
        if (!(ul->un_flags & LDL_SCAN))
                ufs_trans_onerror();
}

/*
 * Choose the buffer size to use for this log unit.  The starting point
 * is un_maxtransfer rounded to whole disk blocks; it is raised to
 * ldl_minbufsize if smaller, then capped first at maxphys and finally
 * at un_maxtransfer itself.
 */
size_t
ldl_bufsize(ml_unit_t *ul)
{
        extern uint32_t ldl_minbufsize;
        size_t          size;

        /* first guess: the device's max transfer, in whole blocks */
        size = dbtob(btod(ul->un_maxtransfer));

        /* too small? bring it up to the minimum */
        if (size < ldl_minbufsize)
                size = ldl_minbufsize;

        /* too large? apply the two ceilings in turn */
        if (size > maxphys)
                size = maxphys;
        if (size > ul->un_maxtransfer)
                size = ul->un_maxtransfer;

        return (size);
}