root/usr.sbin/camdd/camdd.c
/*-
 * Copyright (c) 1997-2007 Kenneth D. Merry
 * Copyright (c) 2013, 2014, 2015 Spectra Logic Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * Authors: Ken Merry           (Spectra Logic Corporation)
 */

/*
 * This is eventually intended to be:
 * - A basic data transfer/copy utility
 * - A simple benchmark utility
 * - An example of how to use the asynchronous pass(4) driver interface.
 */
#include <sys/cdefs.h>
#include <sys/ioctl.h>
#include <sys/stdint.h>
#include <sys/types.h>
#include <sys/endian.h>
#include <sys/param.h>
#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <sys/bus.h>
#include <sys/bus_dma.h>
#include <sys/mtio.h>
#include <sys/conf.h>
#include <sys/disk.h>

#include <stdio.h>
#include <stdlib.h>
#include <semaphore.h>
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
#include <limits.h>
#include <fcntl.h>
#include <ctype.h>
#include <err.h>
#include <libutil.h>
#include <pthread.h>
#include <assert.h>
#include <bsdxml.h>

#include <cam/cam.h>
#include <cam/cam_debug.h>
#include <cam/cam_ccb.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/scsi/scsi_pass.h>
#include <cam/scsi/scsi_message.h>
#include <cam/scsi/smp_all.h>
#include <cam/nvme/nvme_all.h>
#include <camlib.h>
#include <mtlib.h>
#include <zlib.h>

typedef enum {
        CAMDD_CMD_NONE          = 0x00000000,
        CAMDD_CMD_HELP          = 0x00000001,
        CAMDD_CMD_WRITE         = 0x00000002,
        CAMDD_CMD_READ          = 0x00000003
} camdd_cmdmask;

typedef enum {
        CAMDD_ARG_NONE          = 0x00000000,
        CAMDD_ARG_VERBOSE       = 0x00000001,
        CAMDD_ARG_ERR_RECOVER   = 0x00000080,
} camdd_argmask;

typedef enum {
        CAMDD_DEV_NONE          = 0x00,
        CAMDD_DEV_PASS          = 0x01,
        CAMDD_DEV_FILE          = 0x02
} camdd_dev_type;

struct camdd_io_opts {
        camdd_dev_type  dev_type;
        char            *dev_name;
        uint64_t        blocksize;
        uint64_t        queue_depth;
        uint64_t        offset;
        int             min_cmd_size;
        int             write_dev;
        uint64_t        debug;
};

typedef enum {
        CAMDD_BUF_NONE,
        CAMDD_BUF_DATA,
        CAMDD_BUF_INDIRECT
} camdd_buf_type;

struct camdd_buf_indirect {
        /*
         * Pointer to the source buffer.
         */
        struct camdd_buf *src_buf;

        /*
         * Offset into the source buffer, in bytes.
         */
        uint64_t          offset;
        /*
         * Pointer to the starting point in the source buffer.
         */
        uint8_t          *start_ptr;

        /*
         * Length of this chunk in bytes.
         */
        size_t            len;
};

struct camdd_buf_data {
        /*
         * Buffer allocated when we allocate this camdd_buf.  This should
         * be the size of the blocksize for this device.
         */
        uint8_t                 *buf;

        /*
         * The amount of backing store allocated in buf.  Generally this
         * will be the blocksize of the device.
         */
        uint32_t                 alloc_len;

        /*
         * The amount of data that was put into the buffer (on reads) or
         * the amount of data we have put onto the src_list so far (on
         * writes).
         */
        uint32_t                 fill_len;

        /*
         * The amount of data that was not transferred.
         */
        uint32_t                 resid;

        /*
         * Starting byte offset on the reader.
         */
        uint64_t                 src_start_offset;
        
        /*
         * CCB used for pass(4) device targets.
         */
        union ccb                ccb;

        /*
         * Number of scatter/gather segments.
         */
        int                      sg_count;

        /*
         * Set if we had to tack on an extra buffer to round the transfer
         * up to a sector size.
         */
        int                      extra_buf;

        /*
         * Scatter/gather list used generally when we're the writer for a
         * pass(4) device. 
         */
        bus_dma_segment_t       *segs;

        /*
         * Scatter/gather list used generally when we're the writer for a
         * file or block device;
         */
        struct iovec            *iovec;
};

union camdd_buf_types {
        struct camdd_buf_indirect       indirect;
        struct camdd_buf_data           data;
};

typedef enum {
        CAMDD_STATUS_NONE,
        CAMDD_STATUS_OK,
        CAMDD_STATUS_SHORT_IO,
        CAMDD_STATUS_EOF,
        CAMDD_STATUS_ERROR
} camdd_buf_status;

struct camdd_buf {
        camdd_buf_type           buf_type;
        union camdd_buf_types    buf_type_spec;

        camdd_buf_status         status;

        uint64_t                 lba;
        size_t                   len;

        /*
         * A reference count of how many indirect buffers point to this
         * buffer.
         */
        int                      refcount;

        /*
         * A link back to our parent device.
         */
        struct camdd_dev        *dev;
        STAILQ_ENTRY(camdd_buf)  links;
        STAILQ_ENTRY(camdd_buf)  work_links;

        /*
         * A count of the buffers on the src_list.
         */
        int                      src_count;

        /*
         * List of buffers from our partner thread that are the components
         * of this buffer for the I/O.  Uses src_links.
         */
        STAILQ_HEAD(,camdd_buf)  src_list;
        STAILQ_ENTRY(camdd_buf)  src_links;
};

#define NUM_DEV_TYPES   2

struct camdd_dev_pass {
        int                      scsi_dev_type;
        int                      protocol;
        struct cam_device       *dev;
        uint64_t                 max_sector;
        uint32_t                 block_len;
        uint32_t                 cpi_maxio;
};

typedef enum {
        CAMDD_FILE_NONE,
        CAMDD_FILE_REG,
        CAMDD_FILE_STD,
        CAMDD_FILE_PIPE,
        CAMDD_FILE_DISK,
        CAMDD_FILE_TAPE,
        CAMDD_FILE_TTY,
        CAMDD_FILE_MEM
} camdd_file_type;

typedef enum {
        CAMDD_FF_NONE           = 0x00,
        CAMDD_FF_CAN_SEEK       = 0x01
} camdd_file_flags;

struct camdd_dev_file {
        int                      fd;
        struct stat              sb;
        char                     filename[MAXPATHLEN + 1];
        camdd_file_type          file_type;
        camdd_file_flags         file_flags;
        uint8_t                 *tmp_buf;
};

struct camdd_dev_block {
        int                      fd;
        uint64_t                 size_bytes;
        uint32_t                 block_len;
};

union camdd_dev_spec {
        struct camdd_dev_pass   pass;
        struct camdd_dev_file   file;
        struct camdd_dev_block  block;
};

typedef enum {
        CAMDD_DEV_FLAG_NONE             = 0x00,
        CAMDD_DEV_FLAG_EOF              = 0x01,
        CAMDD_DEV_FLAG_PEER_EOF         = 0x02,
        CAMDD_DEV_FLAG_ACTIVE           = 0x04,
        CAMDD_DEV_FLAG_EOF_SENT         = 0x08,
        CAMDD_DEV_FLAG_EOF_QUEUED       = 0x10
} camdd_dev_flags;

struct camdd_dev {
        camdd_dev_type           dev_type;
        union camdd_dev_spec     dev_spec;
        camdd_dev_flags          flags;
        char                     device_name[MAXPATHLEN+1];
        uint32_t                 blocksize;
        uint32_t                 sector_size;
        uint64_t                 max_sector;
        uint64_t                 sector_io_limit;
        int                      min_cmd_size;
        int                      write_dev;
        int                      retry_count;
        int                      io_timeout;
        int                      debug;
        uint64_t                 start_offset_bytes;
        uint64_t                 next_io_pos_bytes;
        uint64_t                 next_peer_pos_bytes;
        uint64_t                 next_completion_pos_bytes;
        uint64_t                 peer_bytes_queued;
        uint64_t                 bytes_transferred;
        uint32_t                 target_queue_depth;
        uint32_t                 cur_active_io;
        uint8_t                 *extra_buf;
        uint32_t                 extra_buf_len;
        struct camdd_dev        *peer_dev;
        pthread_mutex_t          mutex;
        pthread_cond_t           cond;
        int                      kq;

        int                      (*run)(struct camdd_dev *dev);
        int                      (*fetch)(struct camdd_dev *dev);

        /*
         * Buffers that are available for I/O.  Uses links.
         */
        STAILQ_HEAD(,camdd_buf)  free_queue;

        /*
         * Free indirect buffers.  These are used for breaking a large
         * buffer into multiple pieces.
         */
        STAILQ_HEAD(,camdd_buf)  free_indirect_queue;

        /*
         * Buffers that have been queued to the kernel.  Uses links.
         */
        STAILQ_HEAD(,camdd_buf)  active_queue;

        /*
         * Will generally contain one of our buffers that is waiting for enough
         * I/O from our partner thread to be able to execute.  This will
         * generally happen when our per-I/O-size is larger than the
         * partner thread's per-I/O-size.  Uses links.
         */
        STAILQ_HEAD(,camdd_buf)  pending_queue;

        /*
         * Number of buffers on the pending queue
         */
        int                      num_pending_queue;

        /*
         * Buffers that are filled and ready to execute.  This is used when
         * our partner (reader) thread sends us blocks that are larger than
         * our blocksize, and so we have to split them into multiple pieces.
         */
        STAILQ_HEAD(,camdd_buf)  run_queue;

        /*
         * Number of buffers on the run queue.
         */
        int                      num_run_queue;

        STAILQ_HEAD(,camdd_buf)  reorder_queue;

        int                      num_reorder_queue;

        /*
         * Buffers that have been queued to us by our partner thread
         * (generally the reader thread) to be written out.  Uses
         * work_links.
         */
        STAILQ_HEAD(,camdd_buf)  work_queue;

        /*
         * Buffers that have been completed by our partner thread.  Uses
         * work_links.
         */
        STAILQ_HEAD(,camdd_buf)  peer_done_queue;

        /*
         * Number of buffers on the peer done queue.
         */
        uint32_t                 num_peer_done_queue;

        /*
         * A list of buffers that we have queued to our peer thread.  Uses
         * links.
         */
        STAILQ_HEAD(,camdd_buf)  peer_work_queue;

        /*
         * Number of buffers on the peer work queue.
         */
        uint32_t                 num_peer_work_queue;
};

static sem_t camdd_sem;
static sig_atomic_t need_exit = 0;
static sig_atomic_t error_exit = 0;
static sig_atomic_t need_status = 0;

#ifndef min
#define min(a, b) (a < b) ? a : b
#endif


/* Generically useful offsets into the peripheral private area */
#define ppriv_ptr0 periph_priv.entries[0].ptr
#define ppriv_ptr1 periph_priv.entries[1].ptr
#define ppriv_field0 periph_priv.entries[0].field
#define ppriv_field1 periph_priv.entries[1].field

#define ccb_buf ppriv_ptr0

#define CAMDD_FILE_DEFAULT_BLOCK        524288
#define CAMDD_FILE_DEFAULT_DEPTH        1
#define CAMDD_PASS_MAX_BLOCK            1048576
#define CAMDD_PASS_DEFAULT_DEPTH        6
#define CAMDD_PASS_RW_TIMEOUT           60 * 1000

static int parse_btl(char *tstr, int *bus, int *target, int *lun);
void camdd_free_dev(struct camdd_dev *dev);
struct camdd_dev *camdd_alloc_dev(camdd_dev_type dev_type,
                                  struct kevent *new_ke, int num_ke,
                                  int retry_count, int timeout);
static struct camdd_buf *camdd_alloc_buf(struct camdd_dev *dev,
                                         camdd_buf_type buf_type);
void camdd_release_buf(struct camdd_buf *buf);
struct camdd_buf *camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type);
int camdd_buf_sg_create(struct camdd_buf *buf, int iovec,
                        uint32_t sector_size, uint32_t *num_sectors_used,
                        int *double_buf_needed);
uint32_t camdd_buf_get_len(struct camdd_buf *buf);
void camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf);
int camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize,
                     uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran);
int camdd_probe_pass_scsi(struct cam_device *cam_dev, union ccb *ccb,
         camdd_argmask arglist, int probe_retry_count,
         int probe_timeout, uint64_t *maxsector, uint32_t *block_len);
int camdd_probe_pass_nvme(struct cam_device *cam_dev, union ccb *ccb,
         camdd_argmask arglist, int probe_retry_count,
         int probe_timeout, uint64_t *maxsector, uint32_t *block_len);
struct camdd_dev *camdd_probe_file(int fd, struct camdd_io_opts *io_opts,
                                   int retry_count, int timeout);
struct camdd_dev *camdd_probe_pass(struct cam_device *cam_dev,
                                   struct camdd_io_opts *io_opts,
                                   camdd_argmask arglist, int probe_retry_count,
                                   int probe_timeout, int io_retry_count,
                                   int io_timeout);
void nvme_read_write(struct ccb_nvmeio *nvmeio, uint32_t retries,
                void (*cbfcnp)(struct cam_periph *, union ccb *),
                uint32_t nsid, int readop, uint64_t lba,
                uint32_t block_count, uint8_t *data_ptr, uint32_t dxfer_len,
                uint32_t timeout);
void *camdd_file_worker(void *arg);
camdd_buf_status camdd_ccb_status(union ccb *ccb, int protocol);
int camdd_get_cgd(struct cam_device *device, struct ccb_getdev *cgd);
int camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf);
int camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf);
void camdd_peer_done(struct camdd_buf *buf);
void camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf,
                        int *error_count);
int camdd_pass_fetch(struct camdd_dev *dev);
int camdd_file_run(struct camdd_dev *dev);
int camdd_pass_run(struct camdd_dev *dev);
int camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len);
int camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf);
void camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth,
                     uint32_t *peer_depth, uint32_t *our_bytes,
                     uint32_t *peer_bytes);
void *camdd_worker(void *arg);
void camdd_sig_handler(int sig);
void camdd_print_status(struct camdd_dev *camdd_dev,
                        struct camdd_dev *other_dev,
                        struct timespec *start_time);
int camdd_rw(struct camdd_io_opts *io_opts, camdd_argmask arglist,
             int num_io_opts, uint64_t max_io, int retry_count, int timeout);
int camdd_parse_io_opts(char *args, int is_write,
                        struct camdd_io_opts *io_opts);
void usage(void);

/*
 * Parse out a bus, or a bus, target and lun in the following
 * format:
 * bus
 * bus:target
 * bus:target:lun
 *
 * Returns the number of parsed components, or 0.
 */
static int
parse_btl(char *tstr, int *bus, int *target, int *lun)
{
        char *tmpstr;
        int convs = 0;

        while (isspace(*tstr) && (*tstr != '\0'))
                tstr++;

        tmpstr = (char *)strtok(tstr, ":");
        if ((tmpstr != NULL) && (*tmpstr != '\0')) {
                *bus = strtol(tmpstr, NULL, 0);
                convs++;
                tmpstr = (char *)strtok(NULL, ":");
                if ((tmpstr != NULL) && (*tmpstr != '\0')) {
                        *target = strtol(tmpstr, NULL, 0);
                        convs++;
                        tmpstr = (char *)strtok(NULL, ":");
                        if ((tmpstr != NULL) && (*tmpstr != '\0')) {
                                *lun = strtol(tmpstr, NULL, 0);
                                convs++;
                        }
                }
        }

        return convs;
}

/*
 * XXX KDM clean up and free all of the buffers on the queue!
 */
void
camdd_free_dev(struct camdd_dev *dev)
{
        if (dev == NULL)
                return;

        switch (dev->dev_type) {
        case CAMDD_DEV_FILE: {
                struct camdd_dev_file *file_dev = &dev->dev_spec.file;

                if (file_dev->fd != -1)
                        close(file_dev->fd);
                free(file_dev->tmp_buf);
                break;
        }
        case CAMDD_DEV_PASS: {
                struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass;

                if (pass_dev->dev != NULL)
                        cam_close_device(pass_dev->dev);
                break;
        }
        default:
                break;
        }

        free(dev);
}

struct camdd_dev *
camdd_alloc_dev(camdd_dev_type dev_type, struct kevent *new_ke, int num_ke,
                int retry_count, int timeout)
{
        struct camdd_dev *dev = NULL;
        struct kevent *ke;
        size_t ke_size;
        int retval = 0;

        dev = calloc(1, sizeof(*dev));
        if (dev == NULL) {
                warn("%s: unable to malloc %zu bytes", __func__, sizeof(*dev));
                goto bailout;
        }

        dev->dev_type = dev_type;
        dev->io_timeout = timeout;
        dev->retry_count = retry_count;
        STAILQ_INIT(&dev->free_queue);
        STAILQ_INIT(&dev->free_indirect_queue);
        STAILQ_INIT(&dev->active_queue);
        STAILQ_INIT(&dev->pending_queue);
        STAILQ_INIT(&dev->run_queue);
        STAILQ_INIT(&dev->reorder_queue);
        STAILQ_INIT(&dev->work_queue);
        STAILQ_INIT(&dev->peer_done_queue);
        STAILQ_INIT(&dev->peer_work_queue);
        retval = pthread_mutex_init(&dev->mutex, NULL);
        if (retval != 0) {
                warnc(retval, "%s: failed to initialize mutex", __func__);
                goto bailout;
        }

        retval = pthread_cond_init(&dev->cond, NULL);
        if (retval != 0) {
                warnc(retval, "%s: failed to initialize condition variable",
                      __func__);
                goto bailout;
        }

        dev->kq = kqueue();
        if (dev->kq == -1) {
                warn("%s: Unable to create kqueue", __func__);
                goto bailout;
        }

        ke_size = sizeof(struct kevent) * (num_ke + 4);
        ke = calloc(1, ke_size);
        if (ke == NULL) {
                warn("%s: unable to malloc %zu bytes", __func__, ke_size);
                goto bailout;
        }
        if (num_ke > 0)
                bcopy(new_ke, ke, num_ke * sizeof(struct kevent));

        EV_SET(&ke[num_ke++], (uintptr_t)&dev->work_queue, EVFILT_USER,
               EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0);
        EV_SET(&ke[num_ke++], (uintptr_t)&dev->peer_done_queue, EVFILT_USER,
               EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0);
        EV_SET(&ke[num_ke++], SIGINFO, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0);
        EV_SET(&ke[num_ke++], SIGINT, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0);

        retval = kevent(dev->kq, ke, num_ke, NULL, 0, NULL);
        if (retval == -1) {
                warn("%s: Unable to register kevents", __func__);
                goto bailout;
        }


        return (dev);

bailout:
        free(dev);

        return (NULL);
}

static struct camdd_buf *
camdd_alloc_buf(struct camdd_dev *dev, camdd_buf_type buf_type)
{
        struct camdd_buf *buf = NULL;
        uint8_t *data_ptr = NULL;

        /*
         * We only need to allocate data space for data buffers.
         */
        switch (buf_type) {
        case CAMDD_BUF_DATA:
                data_ptr = malloc(dev->blocksize);
                if (data_ptr == NULL) {
                        warn("unable to allocate %u bytes", dev->blocksize);
                        goto bailout_error;
                }
                break;
        default:
                break;
        }
        
        buf = calloc(1, sizeof(*buf));
        if (buf == NULL) {
                warn("unable to allocate %zu bytes", sizeof(*buf));
                goto bailout_error;
        }

        buf->buf_type = buf_type;
        buf->dev = dev;
        switch (buf_type) {
        case CAMDD_BUF_DATA: {
                struct camdd_buf_data *data;

                data = &buf->buf_type_spec.data;

                data->alloc_len = dev->blocksize;
                data->buf = data_ptr;
                break;
        }
        case CAMDD_BUF_INDIRECT:
                break;
        default:
                break;
        }
        STAILQ_INIT(&buf->src_list);

        return (buf);

bailout_error:
        free(data_ptr);

        return (NULL);
}

void
camdd_release_buf(struct camdd_buf *buf)
{
        struct camdd_dev *dev;

        dev = buf->dev;

        switch (buf->buf_type) {
        case CAMDD_BUF_DATA: {
                struct camdd_buf_data *data;

                data = &buf->buf_type_spec.data;

                if (data->segs != NULL) {
                        if (data->extra_buf != 0) {
                                void *extra_buf;

                                extra_buf = (void *)
                                    data->segs[data->sg_count - 1].ds_addr;
                                free(extra_buf);
                                data->extra_buf = 0;
                        }
                        free(data->segs);
                        data->segs = NULL;
                        data->sg_count = 0;
                } else if (data->iovec != NULL) {
                        if (data->extra_buf != 0) {
                                free(data->iovec[data->sg_count - 1].iov_base);
                                data->extra_buf = 0;
                        }
                        free(data->iovec);
                        data->iovec = NULL;
                        data->sg_count = 0;
                }
                STAILQ_INSERT_TAIL(&dev->free_queue, buf, links);
                break;
        }
        case CAMDD_BUF_INDIRECT:
                STAILQ_INSERT_TAIL(&dev->free_indirect_queue, buf, links);
                break;
        default:
                err(1, "%s: Invalid buffer type %d for released buffer",
                    __func__, buf->buf_type);
                break;
        }
}

struct camdd_buf *
camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type)
{
        struct camdd_buf *buf = NULL;

        switch (buf_type) {
        case CAMDD_BUF_DATA:
                buf = STAILQ_FIRST(&dev->free_queue);
                if (buf != NULL) {
                        struct camdd_buf_data *data;
                        uint8_t *data_ptr;
                        uint32_t alloc_len;

                        STAILQ_REMOVE_HEAD(&dev->free_queue, links);
                        data = &buf->buf_type_spec.data;
                        data_ptr = data->buf;
                        alloc_len = data->alloc_len;
                        bzero(buf, sizeof(*buf));
                        data->buf = data_ptr;
                        data->alloc_len = alloc_len;
                }
                break;
        case CAMDD_BUF_INDIRECT:
                buf = STAILQ_FIRST(&dev->free_indirect_queue);
                if (buf != NULL) {
                        STAILQ_REMOVE_HEAD(&dev->free_indirect_queue, links);

                        bzero(buf, sizeof(*buf));
                }
                break;
        default:
                warnx("Unknown buffer type %d requested", buf_type);
                break;
        }


        if (buf == NULL)
                return (camdd_alloc_buf(dev, buf_type));
        else {
                STAILQ_INIT(&buf->src_list);
                buf->dev = dev;
                buf->buf_type = buf_type;

                return (buf);
        }
}

int
camdd_buf_sg_create(struct camdd_buf *buf, int iovec, uint32_t sector_size,
                    uint32_t *num_sectors_used, int *double_buf_needed)
{
        struct camdd_buf *tmp_buf;
        struct camdd_buf_data *data;
        uint8_t *extra_buf = NULL;
        size_t extra_buf_len = 0;
        int extra_buf_attached = 0;
        int i, retval = 0;

        data = &buf->buf_type_spec.data;

        data->sg_count = buf->src_count;
        /*
         * Compose a scatter/gather list from all of the buffers in the list.
         * If the length of the buffer isn't a multiple of the sector size,
         * we'll have to add an extra buffer.  This should only happen
         * at the end of a transfer.
         */
        if ((data->fill_len % sector_size) != 0) {
                extra_buf_len = sector_size - (data->fill_len % sector_size);
                extra_buf = calloc(extra_buf_len, 1);
                if (extra_buf == NULL) {
                        warn("%s: unable to allocate %zu bytes for extra "
                            "buffer space", __func__, extra_buf_len);
                        retval = 1;
                        goto bailout;
                }
                data->extra_buf = 1;
                data->sg_count++;
        }
        if (iovec == 0) {
                data->segs = calloc(data->sg_count, sizeof(bus_dma_segment_t));
                if (data->segs == NULL) {
                        warn("%s: unable to allocate %zu bytes for S/G list",
                            __func__, sizeof(bus_dma_segment_t) *
                            data->sg_count);
                        retval = 1;
                        goto bailout;
                }

        } else {
                data->iovec = calloc(data->sg_count, sizeof(struct iovec));
                if (data->iovec == NULL) {
                        warn("%s: unable to allocate %zu bytes for S/G list",
                            __func__, sizeof(struct iovec) * data->sg_count);
                        retval = 1;
                        goto bailout;
                }
        }

        for (i = 0, tmp_buf = STAILQ_FIRST(&buf->src_list);
             i < buf->src_count && tmp_buf != NULL; i++,
             tmp_buf = STAILQ_NEXT(tmp_buf, src_links)) {

                if (tmp_buf->buf_type == CAMDD_BUF_DATA) {
                        struct camdd_buf_data *tmp_data;

                        tmp_data = &tmp_buf->buf_type_spec.data;
                        if (iovec == 0) {
                                data->segs[i].ds_addr =
                                    (bus_addr_t) tmp_data->buf;
                                data->segs[i].ds_len = tmp_data->fill_len -
                                    tmp_data->resid;
                        } else {
                                data->iovec[i].iov_base = tmp_data->buf;
                                data->iovec[i].iov_len = tmp_data->fill_len -
                                    tmp_data->resid;
                        }
                        if (((tmp_data->fill_len - tmp_data->resid) %
                             sector_size) != 0)
                                *double_buf_needed = 1;
                } else {
                        struct camdd_buf_indirect *tmp_ind;

                        tmp_ind = &tmp_buf->buf_type_spec.indirect;
                        if (iovec == 0) {
                                data->segs[i].ds_addr =
                                    (bus_addr_t)tmp_ind->start_ptr;
                                data->segs[i].ds_len = tmp_ind->len;
                        } else {
                                data->iovec[i].iov_base = tmp_ind->start_ptr;
                                data->iovec[i].iov_len = tmp_ind->len;
                        }
                        if ((tmp_ind->len % sector_size) != 0)
                                *double_buf_needed = 1;
                }
        }

        if (extra_buf != NULL) {
                if (iovec == 0) {
                        data->segs[i].ds_addr = (bus_addr_t)extra_buf;
                        data->segs[i].ds_len = extra_buf_len;
                } else {
                        data->iovec[i].iov_base = extra_buf;
                        data->iovec[i].iov_len = extra_buf_len;
                }
                extra_buf_attached = 1;
                i++;
        }
        if ((tmp_buf != NULL) || (i != data->sg_count)) {
                warnx("buffer source count does not match "
                      "number of buffers in list!");
                retval = 1;
                goto bailout;
        }

bailout:
        if (retval == 0) {
                *num_sectors_used = (data->fill_len + extra_buf_len) /
                    sector_size;
        } else if (extra_buf_attached == 0) {
                /*
                 * If extra_buf isn't attached yet, we need to free it
                 * to avoid leaking.
                 */
                free(extra_buf);
                data->extra_buf = 0;
                data->sg_count--;
        }
        return (retval);
}

uint32_t
camdd_buf_get_len(struct camdd_buf *buf)
{
        uint32_t len = 0;

        if (buf->buf_type != CAMDD_BUF_DATA) {
                struct camdd_buf_indirect *indirect;

                indirect = &buf->buf_type_spec.indirect;
                len = indirect->len;
        } else {
                struct camdd_buf_data *data;

                data = &buf->buf_type_spec.data;
                len = data->fill_len;
        }

        return (len);
}

void
camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf)
{
        struct camdd_buf_data *data;

        assert(buf->buf_type == CAMDD_BUF_DATA);

        data = &buf->buf_type_spec.data;

        STAILQ_INSERT_TAIL(&buf->src_list, child_buf, src_links);
        buf->src_count++;

        data->fill_len += camdd_buf_get_len(child_buf);
}

typedef enum {
        CAMDD_TS_MAX_BLK,
        CAMDD_TS_MIN_BLK,
        CAMDD_TS_BLK_GRAN,
        CAMDD_TS_EFF_IOSIZE
} camdd_status_item_index;

static struct camdd_status_items {
        const char *name;
        struct mt_status_entry *entry;
} req_status_items[] = {
        { "max_blk", NULL },
        { "min_blk", NULL },
        { "blk_gran", NULL },
        { "max_effective_iosize", NULL }
};

int
camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize,
                 uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran)
{
        struct mt_status_data status_data;
        char *xml_str = NULL;
        unsigned int i;
        int retval = 0;
        
        retval = mt_get_xml_str(fd, MTIOCEXTGET, &xml_str);
        if (retval != 0)
                err(1, "Couldn't get XML string from %s", filename);

        retval = mt_get_status(xml_str, &status_data);
        if (retval != XML_STATUS_OK) {
                warn("couldn't get status for %s", filename);
                retval = 1;
                goto bailout;
        } else
                retval = 0;

        if (status_data.error != 0) {
                warnx("%s", status_data.error_str);
                retval = 1;
                goto bailout;
        }

        for (i = 0; i < nitems(req_status_items); i++) {
                char *name;

                name = __DECONST(char *, req_status_items[i].name);
                req_status_items[i].entry = mt_status_entry_find(&status_data,
                    name);
                if (req_status_items[i].entry == NULL) {
                        errx(1, "Cannot find status entry %s",
                            req_status_items[i].name);
                }
        }

        *max_iosize = req_status_items[CAMDD_TS_EFF_IOSIZE].entry->value_unsigned;
        *max_blk= req_status_items[CAMDD_TS_MAX_BLK].entry->value_unsigned;
        *min_blk= req_status_items[CAMDD_TS_MIN_BLK].entry->value_unsigned;
        *blk_gran = req_status_items[CAMDD_TS_BLK_GRAN].entry->value_unsigned;
bailout:

        free(xml_str);
        mt_status_free(&status_data);

        return (retval);
}

struct camdd_dev *
camdd_probe_file(int fd, struct camdd_io_opts *io_opts, int retry_count,
    int timeout)
{
        struct camdd_dev *dev = NULL;
        struct camdd_dev_file *file_dev;
        uint64_t blocksize = io_opts->blocksize;

        dev = camdd_alloc_dev(CAMDD_DEV_FILE, NULL, 0, retry_count, timeout);
        if (dev == NULL)
                goto bailout;

        file_dev = &dev->dev_spec.file;
        file_dev->fd = fd;
        strlcpy(file_dev->filename, io_opts->dev_name,
            sizeof(file_dev->filename));
        strlcpy(dev->device_name, io_opts->dev_name, sizeof(dev->device_name));
        if (blocksize == 0)
                dev->blocksize = CAMDD_FILE_DEFAULT_BLOCK;
        else
                dev->blocksize = blocksize;

        if ((io_opts->queue_depth != 0)
         && (io_opts->queue_depth != 1)) {
                warnx("Queue depth %ju for %s ignored, only 1 outstanding "
                    "command supported", (uintmax_t)io_opts->queue_depth,
                    io_opts->dev_name);
        }
        dev->target_queue_depth = CAMDD_FILE_DEFAULT_DEPTH;
        dev->run = camdd_file_run;
        dev->fetch = NULL;

        /*
         * We can effectively access files on byte boundaries.  We'll reset
         * this for devices like disks that can be accessed on sector
         * boundaries.
         */
        dev->sector_size = 1;

        if ((fd != STDIN_FILENO)
         && (fd != STDOUT_FILENO)) {
                int retval;

                retval = fstat(fd, &file_dev->sb);
                if (retval != 0) {
                        warn("Cannot stat %s", dev->device_name);
                        goto bailout_error;
                }
                if (S_ISREG(file_dev->sb.st_mode)) {
                        file_dev->file_type = CAMDD_FILE_REG;
                } else if (S_ISCHR(file_dev->sb.st_mode)) {
                        int type;

                        if (ioctl(fd, FIODTYPE, &type) == -1)
                                err(1, "FIODTYPE ioctl failed on %s",
                                    dev->device_name);
                        else {
                                if (type & D_TAPE)
                                        file_dev->file_type = CAMDD_FILE_TAPE;
                                else if (type & D_DISK)
                                        file_dev->file_type = CAMDD_FILE_DISK;
                                else if (type & D_MEM)
                                        file_dev->file_type = CAMDD_FILE_MEM;
                                else if (type & D_TTY)
                                        file_dev->file_type = CAMDD_FILE_TTY;
                        }
                } else if (S_ISDIR(file_dev->sb.st_mode)) {
                        errx(1, "cannot operate on directory %s",
                            dev->device_name);
                } else if (S_ISFIFO(file_dev->sb.st_mode)) {
                        file_dev->file_type = CAMDD_FILE_PIPE;
                } else
                        errx(1, "Cannot determine file type for %s",
                            dev->device_name);

                switch (file_dev->file_type) {
                case CAMDD_FILE_REG:
                        if (file_dev->sb.st_size != 0)
                                dev->max_sector = file_dev->sb.st_size - 1;
                        else
                                dev->max_sector = 0;
                        file_dev->file_flags |= CAMDD_FF_CAN_SEEK;
                        break;
                case CAMDD_FILE_TAPE: {
                        uint64_t max_iosize, max_blk, min_blk, blk_gran;
                        /*
                         * Check block limits and maximum effective iosize.
                         * Make sure the blocksize is within the block
                         * limits (and a multiple of the minimum blocksize)
                         * and that the blocksize is <= maximum effective
                         * iosize.
                         */
                        retval = camdd_probe_tape(fd, dev->device_name,
                            &max_iosize, &max_blk, &min_blk, &blk_gran);
                        if (retval != 0)
                                errx(1, "Unable to probe tape %s",
                                    dev->device_name);

                        /*
                         * The blocksize needs to be <= the maximum
                         * effective I/O size of the tape device.  Note
                         * that this also takes into account the maximum
                         * blocksize reported by READ BLOCK LIMITS.
                         */
                        if (dev->blocksize > max_iosize) {
                                warnx("Blocksize %u too big for %s, limiting "
                                    "to %ju", dev->blocksize, dev->device_name,
                                    max_iosize);
                                dev->blocksize = max_iosize;
                        }

                        /*
                         * The blocksize needs to be at least min_blk;
                         */
                        if (dev->blocksize < min_blk) {
                                warnx("Blocksize %u too small for %s, "
                                    "increasing to %ju", dev->blocksize,
                                    dev->device_name, min_blk);
                                dev->blocksize = min_blk;
                        }

                        /*
                         * And the blocksize needs to be a multiple of
                         * the block granularity.
                         */
                        if ((blk_gran != 0)
                         && (dev->blocksize % (1 << blk_gran))) {
                                warnx("Blocksize %u for %s not a multiple of "
                                    "%d, adjusting to %d", dev->blocksize,
                                    dev->device_name, (1 << blk_gran),
                                    dev->blocksize & ~((1 << blk_gran) - 1));
                                dev->blocksize &= ~((1 << blk_gran) - 1);
                        }

                        if (dev->blocksize == 0) {
                                errx(1, "Unable to derive valid blocksize for "
                                    "%s", dev->device_name);
                        }

                        /*
                         * For tape drives, set the sector size to the
                         * blocksize so that we make sure not to write
                         * less than the blocksize out to the drive.
                         */
                        dev->sector_size = dev->blocksize;
                        break;
                }
                case CAMDD_FILE_DISK: {
                        off_t media_size;
                        unsigned int sector_size;

                        file_dev->file_flags |= CAMDD_FF_CAN_SEEK;

                        if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) == -1) {
                                err(1, "DIOCGSECTORSIZE ioctl failed on %s",
                                    dev->device_name);
                        }

                        if (sector_size == 0) {
                                errx(1, "DIOCGSECTORSIZE ioctl returned "
                                    "invalid sector size %u for %s",
                                    sector_size, dev->device_name);
                        }

                        if (ioctl(fd, DIOCGMEDIASIZE, &media_size) == -1) {
                                err(1, "DIOCGMEDIASIZE ioctl failed on %s",
                                    dev->device_name);
                        }

                        if (media_size == 0) {
                                errx(1, "DIOCGMEDIASIZE ioctl returned "
                                    "invalid media size %ju for %s",
                                    (uintmax_t)media_size, dev->device_name);
                        }

                        if (dev->blocksize % sector_size) {
                                errx(1, "%s blocksize %u not a multiple of "
                                    "sector size %u", dev->device_name,
                                    dev->blocksize, sector_size);
                        }

                        dev->sector_size = sector_size;
                        dev->max_sector = (media_size / sector_size) - 1;
                        break;
                }
                case CAMDD_FILE_MEM:
                        file_dev->file_flags |= CAMDD_FF_CAN_SEEK;
                        break;
                default:
                        break;
                }
        }

        if ((io_opts->offset != 0)
         && ((file_dev->file_flags & CAMDD_FF_CAN_SEEK) == 0)) {
                warnx("Offset %ju specified for %s, but we cannot seek on %s",
                    io_opts->offset, io_opts->dev_name, io_opts->dev_name);
                goto bailout_error;
        }
#if 0
        else if ((io_opts->offset != 0)
                && ((io_opts->offset % dev->sector_size) != 0)) {
                warnx("Offset %ju for %s is not a multiple of the "
                      "sector size %u", io_opts->offset, 
                      io_opts->dev_name, dev->sector_size);
                goto bailout_error;
        } else {
                dev->start_offset_bytes = io_opts->offset;
        }
#endif

bailout:
        return (dev);

bailout_error:
        camdd_free_dev(dev);
        return (NULL);
}

/*
 * Get a get device CCB for the specified device.
 */
int
camdd_get_cgd(struct cam_device *device, struct ccb_getdev *cgd)
{
        union ccb *ccb;
        int retval = 0;

        ccb = cam_getccb(device);
 
        if (ccb == NULL) {
                warnx("%s: couldn't allocate CCB", __func__);
                return -1;
        }

        CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cgd);

        ccb->ccb_h.func_code = XPT_GDEV_TYPE;
 
        if (cam_send_ccb(device, ccb) < 0) {
                warn("%s: error sending Get Device Information CCB", __func__);
                        cam_error_print(device, ccb, CAM_ESF_ALL,
                                        CAM_EPF_ALL, stderr);
                retval = -1;
                goto bailout;
        }

        if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
                        cam_error_print(device, ccb, CAM_ESF_ALL,
                                        CAM_EPF_ALL, stderr);
                retval = -1;
                goto bailout;
        }

        bcopy(&ccb->cgd, cgd, sizeof(struct ccb_getdev));

bailout:
        cam_freeccb(ccb);
 
        return retval;
}

int
camdd_probe_pass_scsi(struct cam_device *cam_dev, union ccb *ccb,
                 camdd_argmask arglist, int probe_retry_count,
                 int probe_timeout, uint64_t *maxsector, uint32_t *block_len)
{
        struct scsi_read_capacity_data rcap;
        struct scsi_read_capacity_data_long rcaplong;
        int retval = -1;

        if (ccb == NULL) {
                warnx("%s: error passed ccb is NULL", __func__);
                goto bailout;
        }

        CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);

        scsi_read_capacity(&ccb->csio,
                           /*retries*/ probe_retry_count,
                           /*cbfcnp*/ NULL,
                           /*tag_action*/ MSG_SIMPLE_Q_TAG,
                           &rcap,
                           SSD_FULL_SIZE,
                           /*timeout*/ probe_timeout ? probe_timeout : 5000);

        /* Disable freezing the device queue */
        ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;

        if (arglist & CAMDD_ARG_ERR_RECOVER)
                ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;

        if (cam_send_ccb(cam_dev, ccb) < 0) {
                warn("error sending READ CAPACITY command");

                cam_error_print(cam_dev, ccb, CAM_ESF_ALL,
                                CAM_EPF_ALL, stderr);

                goto bailout;
        }

        if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
                cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
                goto bailout;
        }

        *maxsector = scsi_4btoul(rcap.addr);
        *block_len = scsi_4btoul(rcap.length);

        /*
         * A last block of 2^32-1 means that the true capacity is over 2TB,
         * and we need to issue the long READ CAPACITY to get the real
         * capacity.  Otherwise, we're all set.
         */
        if (*maxsector != 0xffffffff) {
                retval = 0;
                goto bailout;
        }

        scsi_read_capacity_16(&ccb->csio,
                              /*retries*/ probe_retry_count,
                              /*cbfcnp*/ NULL,
                              /*tag_action*/ MSG_SIMPLE_Q_TAG,
                              /*lba*/ 0,
                              /*reladdr*/ 0,
                              /*pmi*/ 0,
                              (uint8_t *)&rcaplong,
                              sizeof(rcaplong),
                              /*sense_len*/ SSD_FULL_SIZE,
                              /*timeout*/ probe_timeout ? probe_timeout : 5000);

        /* Disable freezing the device queue */
        ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;

        if (arglist & CAMDD_ARG_ERR_RECOVER)
                ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;

        if (cam_send_ccb(cam_dev, ccb) < 0) {
                warn("error sending READ CAPACITY (16) command");
                cam_error_print(cam_dev, ccb, CAM_ESF_ALL,
                                CAM_EPF_ALL, stderr);
                goto bailout;
        }

        if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
                cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
                goto bailout;
        }

        *maxsector = scsi_8btou64(rcaplong.addr);
        *block_len = scsi_4btoul(rcaplong.length);

        retval = 0;

bailout:
        return retval;
}

int
camdd_probe_pass_nvme(struct cam_device *cam_dev, union ccb *ccb,
                 camdd_argmask arglist, int probe_retry_count,
                 int probe_timeout, uint64_t *maxsector, uint32_t *block_len)
{
        struct nvme_command *nc = NULL;
        struct nvme_namespace_data nsdata;
        uint32_t nsid = cam_dev->target_lun & UINT32_MAX;
        uint8_t format = 0, lbads = 0;
        int retval = -1;

        if (ccb == NULL) {
                warnx("%s: error passed ccb is NULL", __func__);
                goto bailout;
        }

        CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->nvmeio);

        /* Send Identify Namespace to get block size and capacity */
        nc = &ccb->nvmeio.cmd;
        nc->opc = NVME_OPC_IDENTIFY;

        nc->nsid = nsid;
        nc->cdw10 = 0; /* Identify Namespace is CNS = 0 */

        cam_fill_nvmeadmin(&ccb->nvmeio,
                        /*retries*/ probe_retry_count,
                        /*cbfcnp*/ NULL,
                        CAM_DIR_IN,
                        (uint8_t *)&nsdata,
                        sizeof(nsdata),
                        probe_timeout);

        /* Disable freezing the device queue */
        ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;

        if (arglist & CAMDD_ARG_ERR_RECOVER)
                ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;

        if (cam_send_ccb(cam_dev, ccb) < 0) {
                warn("error sending Identify Namespace command");

                cam_error_print(cam_dev, ccb, CAM_ESF_ALL,
                                CAM_EPF_ALL, stderr);

                goto bailout;
        }

        if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
                cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
                goto bailout;
        }

        *maxsector = nsdata.nsze;
        /* The LBA Data Size (LBADS) is reported as a power of 2 */
        format = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, nsdata.flbas);
        lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, nsdata.lbaf[format]);
        *block_len = 1 << lbads;

        retval = 0;

bailout:
        return retval;
}

/*
 * Need to implement this.  Do a basic probe:
 * - Check the inquiry data, make sure we're talking to a device that we
 *   can reasonably expect to talk to -- direct, RBC, CD, WORM.
 * - Send a test unit ready, make sure the device is available.
 * - Get the capacity and block size.
 */
struct camdd_dev *
camdd_probe_pass(struct cam_device *cam_dev, struct camdd_io_opts *io_opts,
                 camdd_argmask arglist, int probe_retry_count,
                 int probe_timeout, int io_retry_count, int io_timeout)
{
        union ccb *ccb;
        uint64_t maxsector = 0;
        uint32_t cpi_maxio, max_iosize, pass_numblocks;
        uint32_t block_len = 0;
        struct camdd_dev *dev = NULL;
        struct camdd_dev_pass *pass_dev;
        struct kevent ke;
        struct ccb_getdev cgd;
        int retval;
        int scsi_dev_type = T_NODEVICE;

        if ((retval = camdd_get_cgd(cam_dev, &cgd)) != 0) {
                warnx("%s: error retrieving CGD", __func__);
                return NULL;
        }

        ccb = cam_getccb(cam_dev);

        if (ccb == NULL) {
                warnx("%s: error allocating ccb", __func__);
                goto bailout;
        }

        switch (cgd.protocol) {
        case PROTO_SCSI:
                scsi_dev_type = SID_TYPE(&cam_dev->inq_data);

                /*
                 * For devices that support READ CAPACITY, we'll attempt to get the
                 * capacity.  Otherwise, we really don't support tape or other
                 * devices via SCSI passthrough, so just return an error in that case.
                 */
                switch (scsi_dev_type) {
                case T_DIRECT:
                case T_WORM:
                case T_CDROM:
                case T_OPTICAL:
                case T_RBC:
                case T_ZBC_HM:
                        break;
                default:
                        errx(1, "Unsupported SCSI device type %d", scsi_dev_type);
                        break; /*NOTREACHED*/
                }

                if ((retval = camdd_probe_pass_scsi(cam_dev, ccb, probe_retry_count,
                                                arglist, probe_timeout, &maxsector,
                                                &block_len))) {
                        goto bailout;
                }
                break;
        case PROTO_NVME:
                if ((retval = camdd_probe_pass_nvme(cam_dev, ccb, probe_retry_count,
                                                arglist, probe_timeout, &maxsector,
                                                &block_len))) {
                        goto bailout;
                }
                break;
        default:
                errx(1, "Unsupported PROTO type %d", cgd.protocol);
                break; /*NOTREACHED*/
        }

        if (block_len == 0) {
                warnx("Sector size for %s%u is 0, cannot continue",
                    cam_dev->device_name, cam_dev->dev_unit_num);
                goto bailout_error;
        }

        CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi);

        ccb->ccb_h.func_code = XPT_PATH_INQ;
        ccb->ccb_h.flags = CAM_DIR_NONE;
        ccb->ccb_h.retry_count = 1;
        
        if (cam_send_ccb(cam_dev, ccb) < 0) {
                warn("error sending XPT_PATH_INQ CCB");

                cam_error_print(cam_dev, ccb, CAM_ESF_ALL,
                                CAM_EPF_ALL, stderr);
                goto bailout;
        }

        EV_SET(&ke, cam_dev->fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, 0);

        dev = camdd_alloc_dev(CAMDD_DEV_PASS, &ke, 1, io_retry_count,
                              io_timeout);
        if (dev == NULL)
                goto bailout;

        pass_dev = &dev->dev_spec.pass;
        pass_dev->scsi_dev_type = scsi_dev_type;
        pass_dev->protocol = cgd.protocol;
        pass_dev->dev = cam_dev;
        pass_dev->max_sector = maxsector;
        pass_dev->block_len = block_len;
        pass_dev->cpi_maxio = ccb->cpi.maxio;
        snprintf(dev->device_name, sizeof(dev->device_name), "%s%u",
                 pass_dev->dev->device_name, pass_dev->dev->dev_unit_num);
        dev->sector_size = block_len;
        dev->max_sector = maxsector;
        

        /*
         * Determine the optimal blocksize to use for this device.
         */

        /*
         * If the controller has not specified a maximum I/O size,
         * just go with 128K as a somewhat conservative value.
         */
        if (pass_dev->cpi_maxio == 0)
                cpi_maxio = 131072;
        else
                cpi_maxio = pass_dev->cpi_maxio;

        /*
         * If the controller has a large maximum I/O size, limit it
         * to something smaller so that the kernel doesn't have trouble
         * allocating buffers to copy data in and out for us.
         * XXX KDM this is until we have unmapped I/O support in the kernel.
         */
        max_iosize = min(cpi_maxio, CAMDD_PASS_MAX_BLOCK);

        /*
         * If we weren't able to get a block size for some reason,
         * default to 512 bytes.
         */
        block_len = pass_dev->block_len;
        if (block_len == 0)
                block_len = 512;

        /*
         * Figure out how many blocksize chunks will fit in the
         * maximum I/O size.
         */
        pass_numblocks = max_iosize / block_len;

        /*
         * And finally, multiple the number of blocks by the LBA
         * length to get our maximum block size;
         */
        dev->blocksize = pass_numblocks * block_len;

        if (io_opts->blocksize != 0) {
                if ((io_opts->blocksize % dev->sector_size) != 0) {
                        warnx("Blocksize %ju for %s is not a multiple of "
                              "sector size %u", (uintmax_t)io_opts->blocksize, 
                              dev->device_name, dev->sector_size);
                        goto bailout_error;
                }
                dev->blocksize = io_opts->blocksize;
        }
        dev->target_queue_depth = CAMDD_PASS_DEFAULT_DEPTH;
        if (io_opts->queue_depth != 0)
                dev->target_queue_depth = io_opts->queue_depth;

        if (io_opts->offset != 0) {
                if (io_opts->offset > (dev->max_sector * dev->sector_size)) {
                        warnx("Offset %ju is past the end of device %s",
                            io_opts->offset, dev->device_name);
                        goto bailout_error;
                }
#if 0
                else if ((io_opts->offset % dev->sector_size) != 0) {
                        warnx("Offset %ju for %s is not a multiple of the "
                              "sector size %u", io_opts->offset, 
                              dev->device_name, dev->sector_size);
                        goto bailout_error;
                }
                dev->start_offset_bytes = io_opts->offset;
#endif
        }

        dev->min_cmd_size = io_opts->min_cmd_size;

        dev->run = camdd_pass_run;
        dev->fetch = camdd_pass_fetch;

bailout:
        cam_freeccb(ccb);

        return (dev);

bailout_error:
        cam_freeccb(ccb);

        camdd_free_dev(dev);

        return (NULL);
}

void
nvme_read_write(struct ccb_nvmeio *nvmeio, uint32_t retries,
                void (*cbfcnp)(struct cam_periph *, union ccb *),
                uint32_t nsid, int readop, uint64_t lba,
                uint32_t block_count, uint8_t *data_ptr, uint32_t dxfer_len,
                uint32_t timeout)
{
        struct nvme_command *nc = &nvmeio->cmd;

        nc->opc = readop ? NVME_OPC_READ : NVME_OPC_WRITE;

        nc->nsid = nsid;

        nc->cdw10 = lba & UINT32_MAX;
        nc->cdw11 = lba >> 32;

        /* NLB (bits 15:0) is a zero based value */
        nc->cdw12 = (block_count - 1) & UINT16_MAX;

        cam_fill_nvmeio(nvmeio,
                        retries,
                        cbfcnp,
                        readop ? CAM_DIR_IN : CAM_DIR_OUT,
                        data_ptr,
                        dxfer_len,
                        timeout);
}

void *
camdd_worker(void *arg)
{
        struct camdd_dev *dev = arg;
        struct camdd_buf *buf;
        struct timespec ts, *kq_ts;

        ts.tv_sec = 0;
        ts.tv_nsec = 0;

        pthread_mutex_lock(&dev->mutex);

        dev->flags |= CAMDD_DEV_FLAG_ACTIVE;

        for (;;) {
                struct kevent ke;
                int retval = 0;

                /*
                 * XXX KDM check the reorder queue depth?
                 */
                if (dev->write_dev == 0) {
                        uint32_t our_depth, peer_depth, peer_bytes, our_bytes;
                        uint32_t target_depth = dev->target_queue_depth;
                        uint32_t peer_target_depth =
                            dev->peer_dev->target_queue_depth;
                        uint32_t peer_blocksize = dev->peer_dev->blocksize;

                        camdd_get_depth(dev, &our_depth, &peer_depth,
                                        &our_bytes, &peer_bytes);

#if 0
                        while (((our_depth < target_depth)
                             && (peer_depth < peer_target_depth))
                            || ((peer_bytes + our_bytes) <
                                 (peer_blocksize * 2))) {
#endif
                        while (((our_depth + peer_depth) <
                                (target_depth + peer_target_depth))
                            || ((peer_bytes + our_bytes) <
                                (peer_blocksize * 3))) {

                                retval = camdd_queue(dev, NULL);
                                if (retval == 1)
                                        break;
                                else if (retval != 0) {
                                        error_exit = 1;
                                        goto bailout;
                                }

                                camdd_get_depth(dev, &our_depth, &peer_depth,
                                                &our_bytes, &peer_bytes);
                        }
                }
                /*
                 * See if we have any I/O that is ready to execute.
                 */
                buf = STAILQ_FIRST(&dev->run_queue);
                if (buf != NULL) {
                        while (dev->target_queue_depth > dev->cur_active_io) {
                                retval = dev->run(dev);
                                if (retval == -1) {
                                        dev->flags |= CAMDD_DEV_FLAG_EOF;
                                        error_exit = 1;
                                        break;
                                } else if (retval != 0) {
                                        break;
                                }
                        }
                }

                /*
                 * We've reached EOF, or our partner has reached EOF.
                 */
                if ((dev->flags & CAMDD_DEV_FLAG_EOF)
                 || (dev->flags & CAMDD_DEV_FLAG_PEER_EOF)) {
                        if (dev->write_dev != 0) {
                                if ((STAILQ_EMPTY(&dev->work_queue))
                                 && (dev->num_run_queue == 0)
                                 && (dev->cur_active_io == 0)) {
                                        goto bailout;
                                }
                        } else {
                                /*
                                 * If we're the reader, and the writer
                                 * got EOF, he is already done.  If we got
                                 * the EOF, then we need to wait until
                                 * everything is flushed out for the writer.
                                 */
                                if (dev->flags & CAMDD_DEV_FLAG_PEER_EOF) {
                                        goto bailout;
                                } else if ((dev->num_peer_work_queue == 0)
                                        && (dev->num_peer_done_queue == 0)
                                        && (dev->cur_active_io == 0)
                                        && (dev->num_run_queue == 0)) {
                                        goto bailout;
                                }
                        }
                        /*
                         * XXX KDM need to do something about the pending
                         * queue and cleanup resources.
                         */
                } 

                if ((dev->write_dev == 0)
                 && (dev->cur_active_io == 0)
                 && (dev->peer_bytes_queued < dev->peer_dev->blocksize))
                        kq_ts = &ts;
                else
                        kq_ts = NULL;

                /*
                 * Run kevent to see if there are events to process.
                 */
                pthread_mutex_unlock(&dev->mutex);
                retval = kevent(dev->kq, NULL, 0, &ke, 1, kq_ts);
                pthread_mutex_lock(&dev->mutex);
                if (retval == -1) {
                        warn("%s: error returned from kevent",__func__);
                        goto bailout;
                } else if (retval != 0) {
                        switch (ke.filter) {
                        case EVFILT_READ:
                                if (dev->fetch != NULL) {
                                        retval = dev->fetch(dev);
                                        if (retval == -1) {
                                                error_exit = 1;
                                                goto bailout;
                                        }
                                }
                                break;
                        case EVFILT_SIGNAL:
                                /*
                                 * We register for this so we don't get
                                 * an error as a result of a SIGINFO or a
                                 * SIGINT.  It will actually get handled
                                 * by the signal handler.  If we get a
                                 * SIGINT, bail out without printing an
                                 * error message.  Any other signals 
                                 * will result in the error message above.
                                 */
                                if (ke.ident == SIGINT)
                                        goto bailout;
                                break;
                        case EVFILT_USER:
                                retval = 0;
                                /*
                                 * Check to see if the other thread has
                                 * queued any I/O for us to do.  (In this
                                 * case we're the writer.)
                                 */
                                for (buf = STAILQ_FIRST(&dev->work_queue);
                                     buf != NULL;
                                     buf = STAILQ_FIRST(&dev->work_queue)) {
                                        STAILQ_REMOVE_HEAD(&dev->work_queue,
                                                           work_links);
                                        retval = camdd_queue(dev, buf);
                                        /*
                                         * We keep going unless we get an
                                         * actual error.  If we get EOF, we
                                         * still want to remove the buffers
                                         * from the queue and send the back
                                         * to the reader thread.
                                         */
                                        if (retval == -1) {
                                                error_exit = 1;
                                                goto bailout;
                                        } else
                                                retval = 0;
                                }

                                /*
                                 * Next check to see if the other thread has
                                 * queued any completed buffers back to us.
                                 * (In this case we're the reader.)
                                 */
                                for (buf = STAILQ_FIRST(&dev->peer_done_queue);
                                     buf != NULL;
                                     buf = STAILQ_FIRST(&dev->peer_done_queue)){
                                        STAILQ_REMOVE_HEAD(
                                            &dev->peer_done_queue, work_links);
                                        dev->num_peer_done_queue--;
                                        camdd_peer_done(buf);
                                }
                                break;
                        default:
                                warnx("%s: unknown kevent filter %d",
                                      __func__, ke.filter);
                                break;
                        }
                }
        }

bailout:

        dev->flags &= ~CAMDD_DEV_FLAG_ACTIVE;

        /* XXX KDM cleanup resources here? */

        pthread_mutex_unlock(&dev->mutex);

        need_exit = 1;
        sem_post(&camdd_sem);

        return (NULL);
}

/*
 * Simplistic translation of CCB status to our local status.
 */
camdd_buf_status
camdd_ccb_status(union ccb *ccb, int protocol)
{
        camdd_buf_status status = CAMDD_STATUS_NONE;
        cam_status ccb_status;

        ccb_status = ccb->ccb_h.status & CAM_STATUS_MASK;

        switch (protocol) {
        case PROTO_SCSI:
                switch (ccb_status) {
                case CAM_REQ_CMP: {
                        if (ccb->csio.resid == 0) {
                                status = CAMDD_STATUS_OK;
                        } else if (ccb->csio.dxfer_len > ccb->csio.resid) {
                                status = CAMDD_STATUS_SHORT_IO;
                        } else {
                                status = CAMDD_STATUS_EOF;
                        }
                        break;
                }
                case CAM_SCSI_STATUS_ERROR: {
                        switch (ccb->csio.scsi_status) {
                        case SCSI_STATUS_OK:
                        case SCSI_STATUS_COND_MET:
                        case SCSI_STATUS_INTERMED:
                        case SCSI_STATUS_INTERMED_COND_MET:
                                status = CAMDD_STATUS_OK;
                                break;
                        case SCSI_STATUS_CMD_TERMINATED:
                        case SCSI_STATUS_CHECK_COND:
                        case SCSI_STATUS_QUEUE_FULL:
                        case SCSI_STATUS_BUSY:
                        case SCSI_STATUS_RESERV_CONFLICT:
                        default:
                                status = CAMDD_STATUS_ERROR;
                                break;
                        }
                        break;
                }
                default:
                        status = CAMDD_STATUS_ERROR;
                        break;
                }
                break;
        case PROTO_NVME:
                switch (ccb_status) {
                case CAM_REQ_CMP:
                        status = CAMDD_STATUS_OK;
                        break;
                default:
                        status = CAMDD_STATUS_ERROR;
                        break;
                }
                break;
        default:
                status = CAMDD_STATUS_ERROR;
                break;
        }

        return (status);
}

/*
 * Queue a buffer to our peer's work thread for writing.
 *
 * Returns 0 for success, -1 for failure, 1 if the other thread exited.
 */
int
camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf)
{
        struct kevent ke;
        STAILQ_HEAD(, camdd_buf) local_queue;
        struct camdd_buf *buf1, *buf2;
        struct camdd_buf_data *data = NULL;
        uint64_t peer_bytes_queued = 0;
        int active = 1;
        int retval = 0;

        STAILQ_INIT(&local_queue);

        /*
         * Since we're the reader, we need to queue our I/O to the writer
         * in sequential order in order to make sure it gets written out
         * in sequential order.
         *
         * Check the next expected I/O starting offset.  If this doesn't
         * match, put it on the reorder queue.
         */
        if ((buf->lba * dev->sector_size) != dev->next_completion_pos_bytes) {

                /*
                 * If there is nothing on the queue, there is no sorting
                 * needed.
                 */
                if (STAILQ_EMPTY(&dev->reorder_queue)) {
                        STAILQ_INSERT_TAIL(&dev->reorder_queue, buf, links);
                        dev->num_reorder_queue++;
                        goto bailout;
                }

                /*
                 * Sort in ascending order by starting LBA.  There should
                 * be no identical LBAs.
                 */
                for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL;
                     buf1 = buf2) {
                        buf2 = STAILQ_NEXT(buf1, links);
                        if (buf->lba < buf1->lba) {
                                /*
                                 * If we're less than the first one, then
                                 * we insert at the head of the list
                                 * because this has to be the first element
                                 * on the list.
                                 */
                                STAILQ_INSERT_HEAD(&dev->reorder_queue,
                                                   buf, links);
                                dev->num_reorder_queue++;
                                break;
                        } else if (buf->lba > buf1->lba) {
                                if (buf2 == NULL) {
                                        STAILQ_INSERT_TAIL(&dev->reorder_queue, 
                                            buf, links);
                                        dev->num_reorder_queue++;
                                        break;
                                } else if (buf->lba < buf2->lba) {
                                        STAILQ_INSERT_AFTER(&dev->reorder_queue,
                                            buf1, buf, links);
                                        dev->num_reorder_queue++;
                                        break;
                                }
                        } else {
                                errx(1, "Found buffers with duplicate LBA %ju!",
                                     buf->lba);
                        }
                }
                goto bailout;
        } else {

                /*
                 * We're the next expected I/O completion, so put ourselves
                 * on the local queue to be sent to the writer.  We use
                 * work_links here so that we can queue this to the 
                 * peer_work_queue before taking the buffer off of the
                 * local_queue.
                 */
                dev->next_completion_pos_bytes += buf->len;
                STAILQ_INSERT_TAIL(&local_queue, buf, work_links);

                /*
                 * Go through the reorder queue looking for more sequential
                 * I/O and add it to the local queue.
                 */
                for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL;
                     buf1 = STAILQ_FIRST(&dev->reorder_queue)) {
                        /*
                         * As soon as we see an I/O that is out of sequence,
                         * we're done.
                         */
                        if ((buf1->lba * dev->sector_size) !=
                             dev->next_completion_pos_bytes)
                                break;

                        STAILQ_REMOVE_HEAD(&dev->reorder_queue, links);
                        dev->num_reorder_queue--;
                        STAILQ_INSERT_TAIL(&local_queue, buf1, work_links);
                        dev->next_completion_pos_bytes += buf1->len;
                }
        }

        /*
         * Setup the event to let the other thread know that it has work
         * pending.
         */
        EV_SET(&ke, (uintptr_t)&dev->peer_dev->work_queue, EVFILT_USER, 0,
               NOTE_TRIGGER, 0, NULL);

        /*
         * Put this on our shadow queue so that we know what we've queued
         * to the other thread.
         */
        STAILQ_FOREACH_SAFE(buf1, &local_queue, work_links, buf2) {
                if (buf1->buf_type != CAMDD_BUF_DATA) {
                        errx(1, "%s: should have a data buffer, not an "
                            "indirect buffer", __func__);
                }
                data = &buf1->buf_type_spec.data;

                /*
                 * We only need to send one EOF to the writer, and don't
                 * need to continue sending EOFs after that.
                 */
                if (buf1->status == CAMDD_STATUS_EOF) {
                        if (dev->flags & CAMDD_DEV_FLAG_EOF_SENT) {
                                STAILQ_REMOVE(&local_queue, buf1, camdd_buf,
                                    work_links);
                                camdd_release_buf(buf1);
                                retval = 1;
                                continue;
                        }
                        dev->flags |= CAMDD_DEV_FLAG_EOF_SENT;
                }


                STAILQ_INSERT_TAIL(&dev->peer_work_queue, buf1, links);
                peer_bytes_queued += (data->fill_len - data->resid);
                dev->peer_bytes_queued += (data->fill_len - data->resid);
                dev->num_peer_work_queue++;
        }

        if (STAILQ_FIRST(&local_queue) == NULL)
                goto bailout;

        /*
         * Drop our mutex and pick up the other thread's mutex.  We need to
         * do this to avoid deadlocks.
         */
        pthread_mutex_unlock(&dev->mutex);
        pthread_mutex_lock(&dev->peer_dev->mutex);

        if (dev->peer_dev->flags & CAMDD_DEV_FLAG_ACTIVE) {
                /*
                 * Put the buffers on the other thread's incoming work queue.
                 */
                for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL;
                     buf1 = STAILQ_FIRST(&local_queue)) {
                        STAILQ_REMOVE_HEAD(&local_queue, work_links);
                        STAILQ_INSERT_TAIL(&dev->peer_dev->work_queue, buf1,
                                           work_links);
                }
                /*
                 * Send an event to the other thread's kqueue to let it know
                 * that there is something on the work queue.
                 */
                retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL);
                if (retval == -1)
                        warn("%s: unable to add peer work_queue kevent",
                             __func__);
                else
                        retval = 0;
        } else
                active = 0;

        pthread_mutex_unlock(&dev->peer_dev->mutex);
        pthread_mutex_lock(&dev->mutex);

        /*
         * If the other side isn't active, run through the queue and
         * release all of the buffers.
         */
        if (active == 0) {
                for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL;
                     buf1 = STAILQ_FIRST(&local_queue)) {
                        STAILQ_REMOVE_HEAD(&local_queue, work_links);
                        STAILQ_REMOVE(&dev->peer_work_queue, buf1, camdd_buf,
                                      links);
                        dev->num_peer_work_queue--;
                        camdd_release_buf(buf1);
                }
                dev->peer_bytes_queued -= peer_bytes_queued;
                retval = 1;
        }

bailout:
        return (retval);
}

/*
 * Return a buffer to the reader thread when we have completed writing it.
 */
int
camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf)
{
        struct kevent ke;
        int retval = 0;

        /*
         * Setup the event to let the other thread know that we have
         * completed a buffer.
         */
        EV_SET(&ke, (uintptr_t)&dev->peer_dev->peer_done_queue, EVFILT_USER, 0,
               NOTE_TRIGGER, 0, NULL);

        /*
         * Drop our lock and acquire the other thread's lock before
         * manipulating 
         */
        pthread_mutex_unlock(&dev->mutex);
        pthread_mutex_lock(&dev->peer_dev->mutex);

        /*
         * Put the buffer on the reader thread's peer done queue now that
         * we have completed it.
         */
        STAILQ_INSERT_TAIL(&dev->peer_dev->peer_done_queue, peer_buf,
                           work_links);
        dev->peer_dev->num_peer_done_queue++;

        /*
         * Send an event to the peer thread to let it know that we've added
         * something to its peer done queue.
         */
        retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL);
        if (retval == -1)
                warn("%s: unable to add peer_done_queue kevent", __func__);
        else
                retval = 0;

        /*
         * Drop the other thread's lock and reacquire ours.
         */
        pthread_mutex_unlock(&dev->peer_dev->mutex);
        pthread_mutex_lock(&dev->mutex);

        return (retval);
}

/*
 * Free a buffer that was written out by the writer thread and returned to
 * the reader thread.
 */
void
camdd_peer_done(struct camdd_buf *buf)
{
        struct camdd_dev *dev;
        struct camdd_buf_data *data;

        dev = buf->dev;
        if (buf->buf_type != CAMDD_BUF_DATA) {
                errx(1, "%s: should have a data buffer, not an "
                    "indirect buffer", __func__);
        }

        data = &buf->buf_type_spec.data;

        STAILQ_REMOVE(&dev->peer_work_queue, buf, camdd_buf, links);
        dev->num_peer_work_queue--;
        dev->peer_bytes_queued -= (data->fill_len - data->resid);

        if (buf->status == CAMDD_STATUS_EOF)
                dev->flags |= CAMDD_DEV_FLAG_PEER_EOF;

        STAILQ_INSERT_TAIL(&dev->free_queue, buf, links);
}

/*
 * Assumes caller holds the lock for this device.
 */
void
camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf,
                   int *error_count)
{
        int retval = 0;

        /*
         * If we're the reader, we need to send the completed I/O
         * to the writer.  If we're the writer, we need to just
         * free up resources, or let the reader know if we've
         * encountered an error.
         */
        if (dev->write_dev == 0) {
                retval = camdd_queue_peer_buf(dev, buf);
                if (retval != 0)
                        (*error_count)++;
        } else {
                struct camdd_buf *tmp_buf, *next_buf;

                STAILQ_FOREACH_SAFE(tmp_buf, &buf->src_list, src_links,
                                    next_buf) {
                        struct camdd_buf *src_buf;
                        struct camdd_buf_indirect *indirect;

                        STAILQ_REMOVE(&buf->src_list, tmp_buf,
                                      camdd_buf, src_links);

                        tmp_buf->status = buf->status;

                        if (tmp_buf->buf_type == CAMDD_BUF_DATA) {
                                camdd_complete_peer_buf(dev, tmp_buf);
                                continue;
                        }

                        indirect = &tmp_buf->buf_type_spec.indirect;
                        src_buf = indirect->src_buf;
                        src_buf->refcount--;
                        /*
                         * XXX KDM we probably need to account for
                         * exactly how many bytes we were able to
                         * write.  Allocate the residual to the
                         * first N buffers?  Or just track the
                         * number of bytes written?  Right now the reader
                         * doesn't do anything with a residual.
                         */
                        src_buf->status = buf->status;
                        if (src_buf->refcount <= 0)
                                camdd_complete_peer_buf(dev, src_buf);
                        STAILQ_INSERT_TAIL(&dev->free_indirect_queue,
                                           tmp_buf, links);
                }

                STAILQ_INSERT_TAIL(&dev->free_queue, buf, links);
        }
}

/*
 * Fetch all completed commands from the pass(4) device.
 *
 * Returns the number of commands received, or -1 if any of the commands
 * completed with an error.  Returns 0 if no commands are available.
 */
int
camdd_pass_fetch(struct camdd_dev *dev)
{
        struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass;
        union ccb ccb;
        int retval = 0, num_fetched = 0, error_count = 0;

        pthread_mutex_unlock(&dev->mutex);
        /*
         * XXX KDM we don't distinguish between EFAULT and ENOENT.
         */
        while ((retval = ioctl(pass_dev->dev->fd, CAMIOGET, &ccb)) != -1) {
                struct camdd_buf *buf;
                struct camdd_buf_data *data;
                cam_status ccb_status;
                union ccb *buf_ccb;

                buf = ccb.ccb_h.ccb_buf;
                data = &buf->buf_type_spec.data;
                buf_ccb = &data->ccb;

                num_fetched++;

                /*
                 * Copy the CCB back out so we get status, sense data, etc.
                 */
                bcopy(&ccb, buf_ccb, sizeof(ccb));

                pthread_mutex_lock(&dev->mutex);

                /*
                 * We're now done, so take this off the active queue.
                 */
                STAILQ_REMOVE(&dev->active_queue, buf, camdd_buf, links);
                dev->cur_active_io--;

                ccb_status = ccb.ccb_h.status & CAM_STATUS_MASK;
                if (ccb_status != CAM_REQ_CMP) {
                        cam_error_print(pass_dev->dev, &ccb, CAM_ESF_ALL,
                                        CAM_EPF_ALL, stderr);
                }

                switch (pass_dev->protocol) {
                case PROTO_SCSI:
                        data->resid = ccb.csio.resid;
                        dev->bytes_transferred += (ccb.csio.dxfer_len - ccb.csio.resid);
                        break;
                case PROTO_NVME:
                        data->resid = 0;
                        dev->bytes_transferred += ccb.nvmeio.dxfer_len;
                        break;
                default:
                        return -1;
                        break;
                }

                if (buf->status == CAMDD_STATUS_NONE)
                        buf->status = camdd_ccb_status(&ccb, pass_dev->protocol);
                if (buf->status == CAMDD_STATUS_ERROR)
                        error_count++;
                else if (buf->status == CAMDD_STATUS_EOF) {
                        /*
                         * Once we queue this buffer to our partner thread,
                         * he will know that we've hit EOF.
                         */
                        dev->flags |= CAMDD_DEV_FLAG_EOF;
                }

                camdd_complete_buf(dev, buf, &error_count);

                /*
                 * Unlock in preparation for the ioctl call.
                 */
                pthread_mutex_unlock(&dev->mutex);
        }

        pthread_mutex_lock(&dev->mutex);

        if (error_count > 0)
                return (-1);
        else
                return (num_fetched);
}

/*
 * Returns -1 for error, 0 for success/continue, and 1 for resource
 * shortage/stop processing.
 */
int
camdd_file_run(struct camdd_dev *dev)
{
        struct camdd_dev_file *file_dev = &dev->dev_spec.file;
        struct camdd_buf_data *data;
        struct camdd_buf *buf;
        off_t io_offset;
        int retval = 0, write_dev = dev->write_dev;
        int error_count = 0, no_resources = 0, double_buf_needed = 0;
        uint32_t num_sectors = 0, db_len = 0;

        buf = STAILQ_FIRST(&dev->run_queue);
        if (buf == NULL) {
                no_resources = 1;
                goto bailout;
        } else if ((dev->write_dev == 0)
                && (dev->flags & (CAMDD_DEV_FLAG_EOF |
                                  CAMDD_DEV_FLAG_EOF_SENT))) {
                STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links);
                dev->num_run_queue--;
                buf->status = CAMDD_STATUS_EOF;
                error_count++;
                goto bailout;
        }

        /*
         * If we're writing, we need to go through the source buffer list
         * and create an S/G list.
         */
        if (write_dev != 0) {
                retval = camdd_buf_sg_create(buf, /*iovec*/ 1,
                    dev->sector_size, &num_sectors, &double_buf_needed);
                if (retval != 0) {
                        no_resources = 1;
                        goto bailout;
                }
        }

        STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links);
        dev->num_run_queue--;

        data = &buf->buf_type_spec.data;

        /*
         * pread(2) and pwrite(2) offsets are byte offsets.
         */
        io_offset = buf->lba * dev->sector_size;

        /*
         * Unlock the mutex while we read or write.
         */
        pthread_mutex_unlock(&dev->mutex);

        /*
         * Note that we don't need to double buffer if we're the reader
         * because in that case, we have allocated a single buffer of
         * sufficient size to do the read.  This copy is necessary on
         * writes because if one of the components of the S/G list is not
         * a sector size multiple, the kernel will reject the write.  This
         * is unfortunate but not surprising.  So this will make sure that
         * we're using a single buffer that is a multiple of the sector size.
         */
        if ((double_buf_needed != 0)
         && (data->sg_count > 1)
         && (write_dev != 0)) {
                uint32_t cur_offset;
                int i;

                if (file_dev->tmp_buf == NULL)
                        file_dev->tmp_buf = calloc(dev->blocksize, 1);
                if (file_dev->tmp_buf == NULL) {
                        buf->status = CAMDD_STATUS_ERROR;
                        error_count++;
                        pthread_mutex_lock(&dev->mutex);
                        goto bailout;
                }
                for (i = 0, cur_offset = 0; i < data->sg_count; i++) {
                        bcopy(data->iovec[i].iov_base,
                            &file_dev->tmp_buf[cur_offset],
                            data->iovec[i].iov_len);
                        cur_offset += data->iovec[i].iov_len;
                }
                db_len = cur_offset;
        }

        if (file_dev->file_flags & CAMDD_FF_CAN_SEEK) {
                if (write_dev == 0) {
                        /*
                         * XXX KDM is there any way we would need a S/G
                         * list here?
                         */
                        retval = pread(file_dev->fd, data->buf,
                            buf->len, io_offset);
                } else {
                        if (double_buf_needed != 0) {
                                retval = pwrite(file_dev->fd, file_dev->tmp_buf,
                                    db_len, io_offset);
                        } else if (data->sg_count == 0) {
                                retval = pwrite(file_dev->fd, data->buf,
                                    data->fill_len, io_offset);
                        } else {
                                retval = pwritev(file_dev->fd, data->iovec,
                                    data->sg_count, io_offset);
                        }
                }
        } else {
                if (write_dev == 0) {
                        /*
                         * XXX KDM is there any way we would need a S/G
                         * list here?
                         */
                        retval = read(file_dev->fd, data->buf, buf->len);
                } else {
                        if (double_buf_needed != 0) {
                                retval = write(file_dev->fd, file_dev->tmp_buf,
                                    db_len);
                        } else if (data->sg_count == 0) {
                                retval = write(file_dev->fd, data->buf,
                                    data->fill_len);
                        } else {
                                retval = writev(file_dev->fd, data->iovec,
                                    data->sg_count);
                        }
                }
        }

        /* We're done, re-acquire the lock */
        pthread_mutex_lock(&dev->mutex);

        if (retval >= (ssize_t)data->fill_len) {
                /*
                 * If the bytes transferred is more than the request size,
                 * that indicates an overrun, which should only happen at
                 * the end of a transfer if we have to round up to a sector
                 * boundary.
                 */
                if (buf->status == CAMDD_STATUS_NONE)
                        buf->status = CAMDD_STATUS_OK;
                data->resid = 0;
                dev->bytes_transferred += retval;
        } else if (retval == -1) {
                warn("Error %s %s", (write_dev) ? "writing to" :
                    "reading from", file_dev->filename);

                buf->status = CAMDD_STATUS_ERROR;
                data->resid = data->fill_len;
                error_count++;

                if (dev->debug == 0)
                        goto bailout;

                if ((double_buf_needed != 0)
                 && (write_dev != 0)) {
                        fprintf(stderr, "%s: fd %d, DB buf %p, len %u lba %ju "
                            "offset %ju\n", __func__, file_dev->fd,
                            file_dev->tmp_buf, db_len, (uintmax_t)buf->lba,
                            (uintmax_t)io_offset);
                } else if (data->sg_count == 0) {
                        fprintf(stderr, "%s: fd %d, buf %p, len %u, lba %ju "
                            "offset %ju\n", __func__, file_dev->fd, data->buf,
                            data->fill_len, (uintmax_t)buf->lba,
                            (uintmax_t)io_offset);
                } else {
                        int i;

                        fprintf(stderr, "%s: fd %d, len %u, lba %ju "
                            "offset %ju\n", __func__, file_dev->fd, 
                            data->fill_len, (uintmax_t)buf->lba,
                            (uintmax_t)io_offset);

                        for (i = 0; i < data->sg_count; i++) {
                                fprintf(stderr, "index %d ptr %p len %zu\n",
                                    i, data->iovec[i].iov_base,
                                    data->iovec[i].iov_len);
                        }
                }
        } else if (retval == 0) {
                buf->status = CAMDD_STATUS_EOF;
                if (dev->debug != 0)
                        printf("%s: got EOF from %s!\n", __func__,
                            file_dev->filename);
                data->resid = data->fill_len;
                error_count++;
        } else if (retval < (ssize_t)data->fill_len) {
                if (buf->status == CAMDD_STATUS_NONE)
                        buf->status = CAMDD_STATUS_SHORT_IO;
                data->resid = data->fill_len - retval;
                dev->bytes_transferred += retval;
        }

bailout:
        if (buf != NULL) {
                if (buf->status == CAMDD_STATUS_EOF) {
                        struct camdd_buf *buf2;
                        dev->flags |= CAMDD_DEV_FLAG_EOF;
                        STAILQ_FOREACH(buf2, &dev->run_queue, links)
                                buf2->status = CAMDD_STATUS_EOF;
                }

                camdd_complete_buf(dev, buf, &error_count);
        }

        if (error_count != 0)
                return (-1);
        else if (no_resources != 0)
                return (1);
        else
                return (0);
}

/*
 * Execute one command from the run queue.  Returns 0 for success, 1 for
 * stop processing, and -1 for error.
 */
int
camdd_pass_run(struct camdd_dev *dev)
{
        struct camdd_buf *buf = NULL;
        struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass;
        struct camdd_buf_data *data;
        uint32_t num_blocks, sectors_used = 0;
        union ccb *ccb;
        int retval = 0, is_write = dev->write_dev;
        int double_buf_needed = 0;

        buf = STAILQ_FIRST(&dev->run_queue);
        if (buf == NULL) {
                retval = 1;
                goto bailout;
        }

        /*
         * If we're writing, we need to go through the source buffer list
         * and create an S/G list.
         */
        if (is_write != 0) {
                retval = camdd_buf_sg_create(buf, /*iovec*/ 0,dev->sector_size,
                    &sectors_used, &double_buf_needed);
                if (retval != 0) {
                        retval = -1;
                        goto bailout;
                }
        }

        STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links);
        dev->num_run_queue--;

        data = &buf->buf_type_spec.data;

        /*
         * In almost every case the number of blocks should be the device
         * block size.  The exception may be at the end of an I/O stream
         * for a partial block or at the end of a device.
         */
        if (is_write != 0)
                num_blocks = sectors_used;
        else
                num_blocks = data->fill_len / pass_dev->block_len;

        ccb = &data->ccb;

        switch (pass_dev->protocol) {
        case PROTO_SCSI:
                CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);

                scsi_read_write(&ccb->csio,
                                /*retries*/ dev->retry_count,
                                /*cbfcnp*/ NULL,
                                /*tag_action*/ MSG_SIMPLE_Q_TAG,
                                /*readop*/ (dev->write_dev == 0) ? SCSI_RW_READ :
                                           SCSI_RW_WRITE,
                                /*byte2*/ 0,
                                /*minimum_cmd_size*/ dev->min_cmd_size,
                                /*lba*/ buf->lba,
                                /*block_count*/ num_blocks,
                                /*data_ptr*/ (data->sg_count != 0) ?
                                             (uint8_t *)data->segs : data->buf,
                                /*dxfer_len*/ (num_blocks * pass_dev->block_len),
                                /*sense_len*/ SSD_FULL_SIZE,
                                /*timeout*/ dev->io_timeout);

                if (data->sg_count != 0) {
                        ccb->csio.sglist_cnt = data->sg_count;
                }
                break;
        case PROTO_NVME:
                CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->nvmeio);

                nvme_read_write(&ccb->nvmeio,
                                /*retries*/ dev->retry_count,
                                /*cbfcnp*/ NULL,
                                /*nsid*/ pass_dev->dev->target_lun & UINT32_MAX,
                                /*readop*/ dev->write_dev == 0,
                                /*lba*/ buf->lba,
                                /*block_count*/ num_blocks,
                                /*data_ptr*/ (data->sg_count != 0) ?
                                             (uint8_t *)data->segs : data->buf,
                                /*dxfer_len*/ (num_blocks * pass_dev->block_len),
                                /*timeout*/ dev->io_timeout);

                ccb->nvmeio.sglist_cnt = data->sg_count;
                break;
        default:
                retval = -1;
                goto bailout;
        }

        /* Disable freezing the device queue */
        ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;

        if (dev->retry_count != 0)
                ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;

        if (data->sg_count != 0) {
                ccb->ccb_h.flags |= CAM_DATA_SG;
        }

        /*
         * Store a pointer to the buffer in the CCB.  The kernel will
         * restore this when we get it back, and we'll use it to identify
         * the buffer this CCB came from.
         */
        ccb->ccb_h.ccb_buf = buf;

        /*
         * Unlock our mutex in preparation for issuing the ioctl.
         */
        pthread_mutex_unlock(&dev->mutex);
        /*
         * Queue the CCB to the pass(4) driver.
         */
        if (ioctl(pass_dev->dev->fd, CAMIOQUEUE, ccb) == -1) {
                pthread_mutex_lock(&dev->mutex);

                warn("%s: error sending CAMIOQUEUE ioctl to %s%u", __func__,
                     pass_dev->dev->device_name, pass_dev->dev->dev_unit_num);
                warn("%s: CCB address is %p", __func__, ccb);
                retval = -1;

                STAILQ_INSERT_TAIL(&dev->free_queue, buf, links);
        } else {
                pthread_mutex_lock(&dev->mutex);

                dev->cur_active_io++;
                STAILQ_INSERT_TAIL(&dev->active_queue, buf, links);
        }

bailout:
        return (retval);
}

int
camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len)
{
        uint32_t num_blocks;
        int retval = 0;

        *lba = dev->next_io_pos_bytes / dev->sector_size;
        *len = dev->blocksize;
        num_blocks = *len / dev->sector_size;

        /*
         * If max_sector is 0, then we have no set limit.  This can happen
         * if we're writing to a file in a filesystem, or reading from
         * something like /dev/zero.
         */
        if ((dev->max_sector != 0)
         || (dev->sector_io_limit != 0)) {
                uint64_t max_sector;

                if ((dev->max_sector != 0)
                 && (dev->sector_io_limit != 0)) 
                        max_sector = min(dev->sector_io_limit, dev->max_sector);
                else if (dev->max_sector != 0)
                        max_sector = dev->max_sector;
                else
                        max_sector = dev->sector_io_limit;


                /*
                 * Check to see whether we're starting off past the end of
                 * the device.  If so, we need to just send an EOF      
                 * notification to the writer.
                 */
                if (*lba > max_sector) {
                        *len = 0;
                        retval = 1;
                } else if (((*lba + num_blocks) > max_sector + 1)
                        || ((*lba + num_blocks) < *lba)) {
                        /*
                         * If we get here (but pass the first check), we
                         * can trim the request length down to go to the
                         * end of the device.
                         */
                        num_blocks = (max_sector + 1) - *lba;
                        *len = num_blocks * dev->sector_size;
                        retval = 1;
                }
        }

        dev->next_io_pos_bytes += *len;

        return (retval);
}

/*
 * Returns 0 for success, 1 for EOF detected, and -1 for failure.
 */
int
camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf)
{
        struct camdd_buf *buf = NULL;
        struct camdd_buf_data *data;
        size_t new_len;
        struct camdd_buf_data *rb_data;
        int is_write = dev->write_dev;
        int eof_flush_needed = 0;
        int retval = 0;

        /*
         * If we've gotten EOF or our partner has, we should not continue
         * queueing I/O.  If we're a writer, though, we should continue
         * to write any buffers that don't have EOF status.
         */
        if ((dev->flags & CAMDD_DEV_FLAG_EOF)
         || ((dev->flags & CAMDD_DEV_FLAG_PEER_EOF)
          && (is_write == 0))) {
                /*
                 * Tell the worker thread that we have seen EOF.
                 */
                retval = 1;

                /*
                 * If we're the writer, send the buffer back with EOF status.
                 */
                if (is_write) {
                        read_buf->status = CAMDD_STATUS_EOF;
                        
                        camdd_complete_peer_buf(dev, read_buf);
                }
                goto bailout;
        }

        if (is_write == 0) {
                buf = camdd_get_buf(dev, CAMDD_BUF_DATA);
                if (buf == NULL) {
                        retval = -1;
                        goto bailout;
                }
                data = &buf->buf_type_spec.data;

                retval = camdd_get_next_lba_len(dev, &buf->lba, &buf->len);
                if (retval != 0) {
                        buf->status = CAMDD_STATUS_EOF;

                        if ((buf->len == 0)
                         && ((dev->flags & (CAMDD_DEV_FLAG_EOF_SENT |
                             CAMDD_DEV_FLAG_EOF_QUEUED)) != 0)) {
                                camdd_release_buf(buf);
                                goto bailout;
                        }
                        dev->flags |= CAMDD_DEV_FLAG_EOF_QUEUED;
                }

                data->fill_len = buf->len;
                data->src_start_offset = buf->lba * dev->sector_size;

                /*
                 * Put this on the run queue.
                 */
                STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
                dev->num_run_queue++;

                /* We're done. */
                goto bailout;
        }

        /*
         * Check for new EOF status from the reader.
         */
        if ((read_buf->status == CAMDD_STATUS_EOF)
         || (read_buf->status == CAMDD_STATUS_ERROR)) {
                dev->flags |= CAMDD_DEV_FLAG_PEER_EOF;
                if ((STAILQ_FIRST(&dev->pending_queue) == NULL)
                 && (read_buf->len == 0)) {
                        camdd_complete_peer_buf(dev, read_buf);
                        retval = 1;
                        goto bailout;
                } else
                        eof_flush_needed = 1;
        }

        /*
         * See if we have a buffer we're composing with pieces from our
         * partner thread.
         */
        buf = STAILQ_FIRST(&dev->pending_queue);
        if (buf == NULL) {
                uint64_t lba;
                ssize_t len;

                retval = camdd_get_next_lba_len(dev, &lba, &len);
                if (retval != 0) {
                        read_buf->status = CAMDD_STATUS_EOF;

                        if (len == 0) {
                                dev->flags |= CAMDD_DEV_FLAG_EOF;
                                camdd_complete_peer_buf(dev, read_buf);
                                goto bailout;
                        }
                }

                /*
                 * If we don't have a pending buffer, we need to grab a new
                 * one from the free list or allocate another one.
                 */
                buf = camdd_get_buf(dev, CAMDD_BUF_DATA);
                if (buf == NULL) {
                        retval = 1;
                        goto bailout;
                }

                buf->lba = lba;
                buf->len = len;

                STAILQ_INSERT_TAIL(&dev->pending_queue, buf, links);
                dev->num_pending_queue++;
        }

        data = &buf->buf_type_spec.data;

        rb_data = &read_buf->buf_type_spec.data;

        if ((rb_data->src_start_offset != dev->next_peer_pos_bytes)
         && (dev->debug != 0)) {
                printf("%s: WARNING: reader offset %#jx != expected offset "
                    "%#jx\n", __func__, (uintmax_t)rb_data->src_start_offset,
                    (uintmax_t)dev->next_peer_pos_bytes);
        }
        dev->next_peer_pos_bytes = rb_data->src_start_offset +
            (rb_data->fill_len - rb_data->resid);

        new_len = (rb_data->fill_len - rb_data->resid) + data->fill_len;
        if (new_len < buf->len) {
                /*
                 * There are three cases here:
                 * 1. We need more data to fill up a block, so we put 
                 *    this I/O on the queue and wait for more I/O.
                 * 2. We have a pending buffer in the queue that is
                 *    smaller than our blocksize, but we got an EOF.  So we
                 *    need to go ahead and flush the write out.
                 * 3. We got an error.
                 */

                /*
                 * Increment our fill length.
                 */
                data->fill_len += (rb_data->fill_len - rb_data->resid);

                /*
                 * Add the new read buffer to the list for writing.
                 */
                STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links);

                /* Increment the count */
                buf->src_count++;

                if (eof_flush_needed == 0) {
                        /*
                         * We need to exit, because we don't have enough
                         * data yet.
                         */
                        goto bailout;
                } else {
                        /*
                         * Take the buffer off of the pending queue.
                         */
                        STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf,
                                      links);
                        dev->num_pending_queue--;

                        /*
                         * If we need an EOF flush, but there is no data
                         * to flush, go ahead and return this buffer.
                         */
                        if (data->fill_len == 0) {
                                camdd_complete_buf(dev, buf, /*error_count*/0);
                                retval = 1;
                                goto bailout;
                        }

                        /*
                         * Put this on the next queue for execution.
                         */
                        STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
                        dev->num_run_queue++;
                }
        } else if (new_len == buf->len) {
                /*
                 * We have enough data to completey fill one block,
                 * so we're ready to issue the I/O.
                 */

                /*
                 * Take the buffer off of the pending queue.
                 */
                STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, links);
                dev->num_pending_queue--;

                /*
                 * Add the new read buffer to the list for writing.
                 */
                STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links);

                /* Increment the count */
                buf->src_count++;

                /*
                 * Increment our fill length.
                 */
                data->fill_len += (rb_data->fill_len - rb_data->resid);

                /*
                 * Put this on the next queue for execution.
                 */
                STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
                dev->num_run_queue++;
        } else {
                struct camdd_buf *idb;
                struct camdd_buf_indirect *indirect;
                uint32_t len_to_go, cur_offset;

                
                idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT);
                if (idb == NULL) {
                        retval = 1;
                        goto bailout;
                }
                indirect = &idb->buf_type_spec.indirect;
                indirect->src_buf = read_buf;
                read_buf->refcount++;
                indirect->offset = 0;
                indirect->start_ptr = rb_data->buf;
                /*
                 * We've already established that there is more
                 * data in read_buf than we have room for in our
                 * current write request.  So this particular chunk
                 * of the request should just be the remainder
                 * needed to fill up a block.
                 */
                indirect->len = buf->len - (data->fill_len - data->resid);

                camdd_buf_add_child(buf, idb);

                /*
                 * This buffer is ready to execute, so we can take
                 * it off the pending queue and put it on the run
                 * queue.
                 */
                STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf,
                              links);
                dev->num_pending_queue--;
                STAILQ_INSERT_TAIL(&dev->run_queue, buf, links);
                dev->num_run_queue++;

                cur_offset = indirect->offset + indirect->len;

                /*
                 * The resulting I/O would be too large to fit in
                 * one block.  We need to split this I/O into
                 * multiple pieces.  Allocate as many buffers as needed.
                 */
                for (len_to_go = rb_data->fill_len - rb_data->resid -
                     indirect->len; len_to_go > 0;) {
                        struct camdd_buf *new_buf;
                        struct camdd_buf_data *new_data;
                        uint64_t lba;
                        ssize_t len;

                        retval = camdd_get_next_lba_len(dev, &lba, &len);
                        if ((retval != 0)
                         && (len == 0)) {
                                /*
                                 * The device has already been marked
                                 * as EOF, and there is no space left.
                                 */
                                goto bailout;
                        }

                        new_buf = camdd_get_buf(dev, CAMDD_BUF_DATA);
                        if (new_buf == NULL) {
                                retval = 1;
                                goto bailout;
                        }

                        new_buf->lba = lba;
                        new_buf->len = len;

                        idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT);
                        if (idb == NULL) {
                                retval = 1;
                                goto bailout;
                        }

                        indirect = &idb->buf_type_spec.indirect;

                        indirect->src_buf = read_buf;
                        read_buf->refcount++;
                        indirect->offset = cur_offset;
                        indirect->start_ptr = rb_data->buf + cur_offset;
                        indirect->len = min(len_to_go, new_buf->len);
#if 0
                        if (((indirect->len % dev->sector_size) != 0)
                         || ((indirect->offset % dev->sector_size) != 0)) {
                                warnx("offset %ju len %ju not aligned with "
                                    "sector size %u", indirect->offset,
                                    (uintmax_t)indirect->len, dev->sector_size);
                        }
#endif
                        cur_offset += indirect->len;
                        len_to_go -= indirect->len;

                        camdd_buf_add_child(new_buf, idb);

                        new_data = &new_buf->buf_type_spec.data;

                        if ((new_data->fill_len == new_buf->len)
                         || (eof_flush_needed != 0)) {
                                STAILQ_INSERT_TAIL(&dev->run_queue,
                                                   new_buf, links);
                                dev->num_run_queue++;
                        } else if (new_data->fill_len < buf->len) {
                                STAILQ_INSERT_TAIL(&dev->pending_queue,
                                                new_buf, links);
                                dev->num_pending_queue++;
                        } else {
                                warnx("%s: too much data in new "
                                      "buffer!", __func__);
                                retval = 1;
                                goto bailout;
                        }
                }
        }

bailout:
        return (retval);
}

void
camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth,
                uint32_t *peer_depth, uint32_t *our_bytes, uint32_t *peer_bytes)
{
        *our_depth = dev->cur_active_io + dev->num_run_queue;
        if (dev->num_peer_work_queue >
            dev->num_peer_done_queue)
                *peer_depth = dev->num_peer_work_queue -
                              dev->num_peer_done_queue;
        else
                *peer_depth = 0;
        *our_bytes = *our_depth * dev->blocksize;
        *peer_bytes = dev->peer_bytes_queued;
}

void
camdd_sig_handler(int sig)
{
        if (sig == SIGINFO)
                need_status = 1;
        else {
                need_exit = 1;
                error_exit = 1;
        }

        sem_post(&camdd_sem);
}

void
camdd_print_status(struct camdd_dev *camdd_dev, struct camdd_dev *other_dev, 
                   struct timespec *start_time)
{
        struct timespec done_time;
        uint64_t total_ns;
        long double mb_sec, total_sec;
        int error = 0;

        error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &done_time);
        if (error != 0) {
                warn("Unable to get done time");
                return;
        }

        timespecsub(&done_time, start_time, &done_time);
        
        total_ns = done_time.tv_nsec + (done_time.tv_sec * 1000000000);
        total_sec = total_ns;
        total_sec /= 1000000000;

        fprintf(stderr, "%ju bytes %s %s\n%ju bytes %s %s\n"
                "%.4Lf seconds elapsed\n",
                (uintmax_t)camdd_dev->bytes_transferred,
                (camdd_dev->write_dev == 0) ?  "read from" : "written to",
                camdd_dev->device_name,
                (uintmax_t)other_dev->bytes_transferred,
                (other_dev->write_dev == 0) ? "read from" : "written to",
                other_dev->device_name, total_sec);

        mb_sec = min(other_dev->bytes_transferred,camdd_dev->bytes_transferred);
        mb_sec /= 1024 * 1024;
        mb_sec *= 1000000000;
        mb_sec /= total_ns;
        fprintf(stderr, "%.2Lf MB/sec\n", mb_sec);
}

int
camdd_rw(struct camdd_io_opts *io_opts, camdd_argmask arglist, int num_io_opts,
         uint64_t max_io, int retry_count, int timeout)
{
        struct cam_device *new_cam_dev = NULL;
        struct camdd_dev *devs[2];
        struct timespec start_time;
        pthread_t threads[2];
        int unit = 0;
        int error = 0;
        int i;

        bzero(devs, sizeof(devs));

        if (num_io_opts != 2) {
                warnx("Must have one input and one output path");
                error = 1;
                goto bailout;
        }

        for (i = 0; i < num_io_opts; i++) {
                switch (io_opts[i].dev_type) {
                case CAMDD_DEV_PASS: {
                        if (isdigit(io_opts[i].dev_name[0])) {
                                int bus = 0, target = 0, lun = 0;
                                int rv;

                                /* device specified as bus:target[:lun] */
                                rv = parse_btl(io_opts[i].dev_name, &bus,
                                    &target, &lun);
                                if (rv < 2) {
                                        warnx("numeric device specification "
                                             "must be either bus:target, or "
                                             "bus:target:lun");
                                        error = 1;
                                        goto bailout;
                                }
                                /* default to 0 if lun was not specified */
                                if (rv == 2) {
                                        lun = 0;
                                }
                                new_cam_dev = cam_open_btl(bus, target, lun,
                                    O_RDWR, NULL);
                        } else {
                                char name[30];

                                if (cam_get_device(io_opts[i].dev_name, name,
                                                   sizeof name, &unit) == -1) {
                                        warnx("%s", cam_errbuf);
                                        error = 1;
                                        goto bailout;
                                }
                                new_cam_dev = cam_open_spec_device(name, unit,
                                    O_RDWR, NULL);
                        }

                        if (new_cam_dev == NULL) {
                                warnx("%s", cam_errbuf);
                                error = 1;
                                goto bailout;
                        }

                        devs[i] = camdd_probe_pass(new_cam_dev,
                            /*io_opts*/ &io_opts[i],
                            arglist, 
                            /*probe_retry_count*/ 3,
                            /*probe_timeout*/ 5000,
                            /*io_retry_count*/ retry_count,
                            /*io_timeout*/ timeout);
                        if (devs[i] == NULL) {
                                warn("Unable to probe device %s%u",
                                     new_cam_dev->device_name,
                                     new_cam_dev->dev_unit_num);
                                error = 1;
                                goto bailout;
                        }
                        break;
                }
                case CAMDD_DEV_FILE: {
                        int fd = -1;

                        if (io_opts[i].dev_name[0] == '-') {
                                if (io_opts[i].write_dev != 0)
                                        fd = STDOUT_FILENO;
                                else
                                        fd = STDIN_FILENO;
                        } else {
                                if (io_opts[i].write_dev != 0) {
                                        fd = open(io_opts[i].dev_name,
                                            O_RDWR | O_CREAT, S_IWUSR |S_IRUSR);
                                } else {
                                        fd = open(io_opts[i].dev_name,
                                            O_RDONLY);
                                }
                        }
                        if (fd == -1) {
                                warn("error opening file %s",
                                    io_opts[i].dev_name);
                                error = 1;
                                goto bailout;
                        }

                        devs[i] = camdd_probe_file(fd, &io_opts[i],
                            retry_count, timeout);
                        if (devs[i] == NULL) {
                                error = 1;
                                goto bailout;
                        }

                        break;
                }
                default:
                        warnx("Unknown device type %d (%s)",
                            io_opts[i].dev_type, io_opts[i].dev_name);
                        error = 1;
                        goto bailout;
                        break; /*NOTREACHED */
                }

                devs[i]->write_dev = io_opts[i].write_dev;

                devs[i]->start_offset_bytes = io_opts[i].offset;

                if (max_io != 0) {
                        devs[i]->sector_io_limit =
                            (devs[i]->start_offset_bytes /
                            devs[i]->sector_size) +
                            (max_io / devs[i]->sector_size) - 1;
                }

                devs[i]->next_io_pos_bytes = devs[i]->start_offset_bytes;
                devs[i]->next_completion_pos_bytes =devs[i]->start_offset_bytes;
        }

        devs[0]->peer_dev = devs[1];
        devs[1]->peer_dev = devs[0];
        devs[0]->next_peer_pos_bytes = devs[0]->peer_dev->next_io_pos_bytes;
        devs[1]->next_peer_pos_bytes = devs[1]->peer_dev->next_io_pos_bytes;

        sem_init(&camdd_sem, /*pshared*/ 0, 0);

        signal(SIGINFO, camdd_sig_handler);
        signal(SIGINT, camdd_sig_handler);

        error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &start_time);
        if (error != 0) {
                warn("Unable to get start time");
                goto bailout;
        }

        for (i = 0; i < num_io_opts; i++) {
                error = pthread_create(&threads[i], NULL, camdd_worker,
                                       (void *)devs[i]);
                if (error != 0) {
                        warnc(error, "pthread_create() failed");
                        goto bailout;
                }
        }

        for (;;) {
                if ((sem_wait(&camdd_sem) == -1)
                 || (need_exit != 0)) {
                        struct kevent ke;

                        for (i = 0; i < num_io_opts; i++) {
                                EV_SET(&ke, (uintptr_t)&devs[i]->work_queue,
                                    EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);

                                devs[i]->flags |= CAMDD_DEV_FLAG_EOF;

                                error = kevent(devs[i]->kq, &ke, 1, NULL, 0,
                                                NULL);
                                if (error == -1)
                                        warn("%s: unable to wake up thread",
                                            __func__);
                                error = 0;
                        }
                        break;
                } else if (need_status != 0) {
                        camdd_print_status(devs[0], devs[1], &start_time);
                        need_status = 0;
                }
        } 
        for (i = 0; i < num_io_opts; i++) {
                pthread_join(threads[i], NULL);
        }

        camdd_print_status(devs[0], devs[1], &start_time);

bailout:

        for (i = 0; i < num_io_opts; i++)
                camdd_free_dev(devs[i]);

        return (error + error_exit);
}

void
usage(void)
{
        fprintf(stderr,
"usage:  camdd <-i|-o pass=pass0,bs=1M,offset=1M,depth=4>\n"
"              <-i|-o file=/tmp/file,bs=512K,offset=1M>\n"
"              <-i|-o file=/dev/da0,bs=512K,offset=1M>\n"
"              <-i|-o file=/dev/nsa0,bs=512K>\n"
"              [-C retry_count][-E][-m max_io_amt][-t timeout_secs][-v][-h]\n"
"Option description\n"
"-i <arg=val>  Specify input device/file and parameters\n"
"-o <arg=val>  Specify output device/file and parameters\n"
"Input and Output parameters\n"
"pass=name     Specify a pass(4) device like pass0 or /dev/pass0\n"
"file=name     Specify a file or device, /tmp/foo, /dev/da0, /dev/null\n"
"              or - for stdin/stdout\n"
"bs=blocksize  Specify blocksize in bytes, or using K, M, G, etc. suffix\n"
"offset=len    Specify starting offset in bytes or using K, M, G suffix\n"
"              NOTE: offset cannot be specified on tapes, pipes, stdin/out\n"
"depth=N       Specify a numeric queue depth.  This only applies to pass(4)\n"
"mcs=N         Specify a minimum cmd size for pass(4) read/write commands\n"
"Optional arguments\n"
"-C retry_cnt  Specify a retry count for pass(4) devices\n"
"-E            Enable CAM error recovery for pass(4) devices\n"
"-m max_io     Specify the maximum amount to be transferred in bytes or\n"
"              using K, G, M, etc. suffixes\n"
"-t timeout    Specify the I/O timeout to use with pass(4) devices\n"
"-v            Enable verbose error recovery\n"
"-h            Print this message\n");
}


int
camdd_parse_io_opts(char *args, int is_write, struct camdd_io_opts *io_opts)
{
        char *tmpstr, *tmpstr2;
        char *orig_tmpstr = NULL;
        int retval = 0;

        io_opts->write_dev = is_write;

        tmpstr = strdup(args);
        if (tmpstr == NULL) {
                warn("strdup failed");
                retval = 1;
                goto bailout;
        }
        orig_tmpstr = tmpstr;
        while ((tmpstr2 = strsep(&tmpstr, ",")) != NULL) {
                char *name, *value;

                /*
                 * If the user creates an empty parameter by putting in two
                 * commas, skip over it and look for the next field.
                 */
                if (*tmpstr2 == '\0')
                        continue;

                name = strsep(&tmpstr2, "=");
                if (*name == '\0') {
                        warnx("Got empty I/O parameter name");
                        retval = 1;
                        goto bailout;
                }
                value = strsep(&tmpstr2, "=");
                if ((value == NULL)
                 || (*value == '\0')) {
                        warnx("Empty I/O parameter value for %s", name);
                        retval = 1;
                        goto bailout;
                }
                if (strncasecmp(name, "file", 4) == 0) {
                        io_opts->dev_type = CAMDD_DEV_FILE;
                        io_opts->dev_name = strdup(value);
                        if (io_opts->dev_name == NULL) {
                                warn("Error allocating memory");
                                retval = 1;
                                goto bailout;
                        }
                } else if (strncasecmp(name, "pass", 4) == 0) {
                        io_opts->dev_type = CAMDD_DEV_PASS;
                        io_opts->dev_name = strdup(value);
                        if (io_opts->dev_name == NULL) {
                                warn("Error allocating memory");
                                retval = 1;
                                goto bailout;
                        }
                } else if ((strncasecmp(name, "bs", 2) == 0)
                        || (strncasecmp(name, "blocksize", 9) == 0)) {
                        retval = expand_number(value, &io_opts->blocksize);
                        if (retval == -1) {
                                warn("expand_number(3) failed on %s=%s", name,
                                    value);
                                retval = 1;
                                goto bailout;
                        }
                } else if (strncasecmp(name, "depth", 5) == 0) {
                        char *endptr;

                        io_opts->queue_depth = strtoull(value, &endptr, 0);
                        if (*endptr != '\0') {
                                warnx("invalid queue depth %s", value);
                                retval = 1;
                                goto bailout;
                        }
                } else if (strncasecmp(name, "mcs", 3) == 0) {
                        char *endptr;

                        io_opts->min_cmd_size = strtol(value, &endptr, 0);
                        if ((*endptr != '\0')
                         || ((io_opts->min_cmd_size > 16)
                          || (io_opts->min_cmd_size < 0))) {
                                warnx("invalid minimum cmd size %s", value);
                                retval = 1;
                                goto bailout;
                        }
                } else if (strncasecmp(name, "offset", 6) == 0) {
                        retval = expand_number(value, &io_opts->offset);
                        if (retval == -1) {
                                warn("expand_number(3) failed on %s=%s", name,
                                    value);
                                retval = 1;
                                goto bailout;
                        }
                } else if (strncasecmp(name, "debug", 5) == 0) {
                        char *endptr;

                        io_opts->debug = strtoull(value, &endptr, 0);
                        if (*endptr != '\0') {
                                warnx("invalid debug level %s", value);
                                retval = 1;
                                goto bailout;
                        }
                } else {
                        warnx("Unrecognized parameter %s=%s", name, value);
                }
        }
bailout:
        free(orig_tmpstr);

        return (retval);
}

int
main(int argc, char **argv)
{
        int c;
        camdd_argmask arglist = CAMDD_ARG_NONE;
        int timeout = 0, retry_count = 1;
        int error = 0;
        uint64_t max_io = 0;
        struct camdd_io_opts *opt_list = NULL;

        if (argc == 1) {
                usage();
                exit(1);
        }

        opt_list = calloc(2, sizeof(struct camdd_io_opts));
        if (opt_list == NULL) {
                warn("Unable to allocate option list");
                error = 1;
                goto bailout;
        }

        while ((c = getopt(argc, argv, "C:Ehi:m:o:t:v")) != -1){
                switch (c) {
                case 'C':
                        retry_count = strtol(optarg, NULL, 0);
                        if (retry_count < 0)
                                errx(1, "retry count %d is < 0",
                                     retry_count);
                        break;
                case 'E':
                        arglist |= CAMDD_ARG_ERR_RECOVER;
                        break;
                case 'i':
                case 'o':
                        if (((c == 'i')
                          && (opt_list[0].dev_type != CAMDD_DEV_NONE))
                         || ((c == 'o')
                          && (opt_list[1].dev_type != CAMDD_DEV_NONE))) {
                                errx(1, "Only one input and output path "
                                    "allowed");
                        }
                        error = camdd_parse_io_opts(optarg, (c == 'o') ? 1 : 0,
                            (c == 'o') ? &opt_list[1] : &opt_list[0]);
                        if (error != 0)
                                goto bailout;
                        break;
                case 'm':
                        error = expand_number(optarg, &max_io);
                        if (error == -1) {
                                warn("invalid maximum I/O amount %s", optarg);
                                error = 1;
                                goto bailout;
                        }
                        break;
                case 't':
                        timeout = strtol(optarg, NULL, 0);
                        if (timeout < 0)
                                errx(1, "invalid timeout %d", timeout);
                        /* Convert the timeout from seconds to ms */
                        timeout *= 1000;
                        break;
                case 'v':
                        arglist |= CAMDD_ARG_VERBOSE;
                        break;
                case 'h':
                default:
                        usage();
                        exit(1);
                        break; /*NOTREACHED*/
                }
        }

        if ((opt_list[0].dev_type == CAMDD_DEV_NONE)
         || (opt_list[1].dev_type == CAMDD_DEV_NONE))
                errx(1, "Must specify both -i and -o");

        /*
         * Set the timeout if the user hasn't specified one.
         */
        if (timeout == 0)
                timeout = CAMDD_PASS_RW_TIMEOUT;

        error = camdd_rw(opt_list, arglist, 2, max_io, retry_count, timeout);

bailout:
        free(opt_list);

        exit(error);
}