root/sys/geom/stripe/g_stripe.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/stripe/g_stripe.h>

/* Advertise the "geom_stripe" kernel feature. */
FEATURE(geom_stripe, "GEOM striping support");

/* malloc(9) type used in this file for the softc and its disk array. */
static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data");

/*
 * UMA zone of maxphys-sized buffers; created in g_stripe_init() and used by
 * g_stripe_start_fast() as a per-request bounce buffer.
 */
static uma_zone_t g_stripe_zone;

/* Forward declarations: both are referenced before their definitions. */
static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force);
static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);

/* Class callbacks wired into g_stripe_class below. */
static g_taste_t g_stripe_taste;
static g_ctl_req_t g_stripe_config;
static g_dumpconf_t g_stripe_dumpconf;
static g_init_t g_stripe_init;
static g_fini_t g_stripe_fini;

/*
 * GEOM class descriptor for the stripe class.  It names the class and
 * points GEOM at this file's control-request, taste, destroy, init and
 * fini callbacks.
 */
struct g_class g_stripe_class = {
        .name = G_STRIPE_CLASS_NAME,
        .version = G_VERSION,
        .ctlreq = g_stripe_config,
        .taste = g_stripe_taste,
        .destroy_geom = g_stripe_destroy_geom,
        .init = g_stripe_init,
        .fini = g_stripe_fini
};

/* Knobs and counters exported under kern.geom.stripe. */
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_STRIPE stuff");
/* Debug verbosity; writable at run time and settable as a tunable. */
static u_int g_stripe_debug = 0;
SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0,
    "Debug level");
/*
 * Non-zero enables "fast" mode: g_stripe_start() may then use
 * g_stripe_start_fast(), and new providers do not advertise unmapped I/O.
 */
static int g_stripe_fast = 0;
SYSCTL_INT(_kern_geom_stripe, OID_AUTO, fast,
    CTLFLAG_RWTUN, &g_stripe_fast, 0,
    "Fast, but memory-consuming, mode");
/*
 * Memory budget for "fast" mode.  Marked CTLFLAG_NOFETCH, so the tunable is
 * read by hand in g_stripe_init() after the default has been computed.
 */
static u_long g_stripe_maxmem;
SYSCTL_ULONG(_kern_geom_stripe, OID_AUTO, maxmem,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &g_stripe_maxmem, 0,
    "Maximum memory that can be allocated in \"fast\" mode (in bytes)");
/* Incremented by g_stripe_start() whenever g_stripe_start_fast() fails. */
static u_int g_stripe_fast_failed = 0;
SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD,
    &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed");

/*
 * Greatest Common Divisor (Euclid's algorithm).
 *
 * gcd(x, 0) and gcd(0, x) both return x; gcd(0, 0) returns 0.
 */
static u_int
gcd(u_int a, u_int b)
{
        u_int c;

        while (b != 0) {
                c = a;
                a = b;
                b = (c % b);
        }
        return (a);
}

/*
 * Least Common Multiple.
 *
 * Returns 0 if either argument is 0 (this also avoids dividing by
 * gcd(0, 0) == 0).  The division is done before the multiplication so that
 * the intermediate value cannot overflow when the result itself fits in a
 * u_int.
 */
static u_int
lcm(u_int a, u_int b)
{

        if (a == 0 || b == 0)
                return (0);
        return ((a / gcd(a, b)) * b);
}

/*
 * Class initialization: size and create the bounce-buffer zone used by
 * "fast" mode.
 *
 * The default budget is 100 * maxphys bytes and may be overridden with the
 * kern.geom.stripe.maxmem loader tunable.  The sysctl is declared with
 * CTLFLAG_NOFETCH, so this explicit fetch is the only place the tunable is
 * read; the name must therefore match the sysctl name exactly.
 */
static void
g_stripe_init(struct g_class *mp __unused)
{

        g_stripe_maxmem = maxphys * 100;
        TUNABLE_ULONG_FETCH("kern.geom.stripe.maxmem", &g_stripe_maxmem);
        g_stripe_zone = uma_zcreate("g_stripe_zone", maxphys, NULL, NULL,
            NULL, NULL, 0, 0);
        /* Round the budget down to a whole number of maxphys-sized items. */
        g_stripe_maxmem -= g_stripe_maxmem % maxphys;
        uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / maxphys);
}

/*
 * Class teardown: destroy the bounce-buffer zone created by
 * g_stripe_init().
 */
static void
g_stripe_fini(struct g_class *mp __unused)
{

        uma_zdestroy(g_stripe_zone);
}

/*
 * Count how many of the device's disk slots currently hold a consumer.
 */
static u_int
g_stripe_nvalid(struct g_stripe_softc *sc)
{
        u_int count, idx;

        count = 0;
        idx = 0;
        while (idx < sc->sc_ndisks) {
                if (sc->sc_disks[idx] != NULL)
                        count++;
                idx++;
        }
        return (count);
}

static void
g_stripe_remove_disk(struct g_consumer *cp)
{
        struct g_stripe_softc *sc;

        g_topology_assert();
        KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
        sc = (struct g_stripe_softc *)cp->geom->softc;
        KASSERT(sc != NULL, ("NULL sc in %s.", __func__));

        if (cp->private == NULL) {
                G_STRIPE_DEBUG(0, "Disk %s removed from %s.",
                    cp->provider->name, sc->sc_name);
                cp->private = (void *)(uintptr_t)-1;
        }

        if (sc->sc_provider != NULL) {
                G_STRIPE_DEBUG(0, "Device %s deactivated.",
                    sc->sc_provider->name);
                g_wither_provider(sc->sc_provider, ENXIO);
                sc->sc_provider = NULL;
        }

        if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
                return;
        sc->sc_disks[cp->index] = NULL;
        cp->index = 0;
        g_detach(cp);
        g_destroy_consumer(cp);
        /* If there are no valid disks anymore, remove device. */
        if (LIST_EMPTY(&sc->sc_geom->consumer))
                g_stripe_destroy(sc, 1);
}

/*
 * Orphan/spoil handler: drop the affected component, unless the geom has
 * no softc (nothing to do in that case).
 */
static void
g_stripe_orphan(struct g_consumer *cp)
{

        g_topology_assert();
        if (cp->geom->softc != NULL)
                g_stripe_remove_disk(cp);
}

/*
 * Access method of the stripe provider: apply the requested change in
 * read/write/exclusive counts (dr, dw, de) to every component consumer.
 *
 * While the provider is open, one additional exclusive count is held on
 * each consumer; it is taken on first open and released on last close.
 *
 * Returns 0 on success.  If g_access() fails on a consumer, the change is
 * undone on the consumers already processed and that error is returned.
 */
static int
g_stripe_access(struct g_provider *pp, int dr, int dw, int de)
{
        struct g_consumer *cp1, *cp2, *tmp;
        struct g_stripe_softc *sc __diagused;
        struct g_geom *gp;
        int error;

        g_topology_assert();
        gp = pp->geom;
        sc = gp->softc;
        KASSERT(sc != NULL, ("NULL sc in %s.", __func__));

        /* On first open, grab an extra "exclusive" bit */
        if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
                de++;
        /* ... and let go of it on last close */
        if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
                de--;

        /*
         * The _SAFE variant is required: g_stripe_remove_disk() may unlink
         * and destroy cp1 while we iterate.
         */
        LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
                error = g_access(cp1, dr, dw, de);
                if (error != 0)
                        goto fail;
                /*
                 * A consumer tagged as removed (non-NULL private) that has
                 * just been fully closed can now be torn down.
                 */
                if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
                    cp1->private != NULL) {
                        g_stripe_remove_disk(cp1); /* May destroy geom. */
                }
        }
        return (0);

fail:
        /* Roll back every consumer that precedes the one that failed. */
        LIST_FOREACH(cp2, &gp->consumer, consumer) {
                if (cp1 == cp2)
                        break;
                g_access(cp2, -dr, -dw, -de);
        }
        return (error);
}

/*
 * Copy "length" bytes between a contiguous buffer and a strided one, one
 * stripe-sized chunk at a time.
 *
 * After each chunk one side additionally skips stripesize * (ndisks - 1)
 * bytes: the destination when "mode" is non-zero, the source when it is
 * zero.  The first chunk is shortened so that it ends on a stripe boundary
 * relative to "offset".
 */
static void
g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset,
    off_t length, int mode)
{
        off_t gap, stripesize;
        size_t chunk;

        stripesize = sc->sc_stripesize;
        /* Bytes that belong to the other disks between two of our chunks. */
        gap = stripesize * (sc->sc_ndisks - 1);
        chunk = (size_t)(stripesize - (offset & (stripesize - 1)));
        do {
                bcopy(src, dst, chunk);
                src += chunk;
                dst += chunk;
                if (mode)
                        dst += gap;
                else
                        src += gap;
                length -= chunk;
                KASSERT(length >= 0,
                    ("Length < 0 (stripesize=%ju, offset=%ju, length=%jd).",
                    (uintmax_t)stripesize, (uintmax_t)offset, (intmax_t)length));
                chunk = (length > stripesize) ? stripesize : length;
        } while (length > 0);
}

/*
 * Completion handler for the child requests issued by this class.
 *
 * Folds the child's status into the parent bio and, when the last child
 * has completed, delivers the parent.
 */
static void
g_stripe_done(struct bio *bp)
{
        struct g_stripe_softc *sc;
        struct bio *pbp;

        pbp = bp->bio_parent;
        sc = pbp->bio_to->geom->softc;
        /*
         * A read with bio_caller1 set went through the "fast" path into a
         * temporary buffer (see g_stripe_start_fast()): spread the data back
         * into the buffer saved in bio_caller1 and restore bio_data.
         */
        if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) {
                g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset,
                    bp->bio_length, 1);
                bp->bio_data = bp->bio_caller1;
                bp->bio_caller1 = NULL;
        }
        /* The parent's accounting is shared by all children. */
        mtx_lock(&sc->sc_lock);
        /* Keep the first error reported by any child. */
        if (pbp->bio_error == 0)
                pbp->bio_error = bp->bio_error;
        pbp->bio_completed += bp->bio_completed;
        pbp->bio_inbed++;
        if (pbp->bio_children == pbp->bio_inbed) {
                mtx_unlock(&sc->sc_lock);
                /* Release the "fast" mode buffer, if one was attached. */
                if (pbp->bio_driver1 != NULL)
                        uma_zfree(g_stripe_zone, pbp->bio_driver1);
                /*
                 * BIO_SPEEDUP is sent to every disk (g_stripe_pushdown()),
                 * so the summed child counts are replaced by the parent's
                 * own length.
                 */
                if (bp->bio_cmd == BIO_SPEEDUP)
                        pbp->bio_completed = pbp->bio_length;
                g_io_deliver(pbp, pbp->bio_error);
        } else
                mtx_unlock(&sc->sc_lock);
        g_destroy_bio(bp);
}

/*
 * "Fast" request path: issue at most one child request per disk.
 *
 * bp     - the parent request
 * no     - index of the disk holding the first piece
 * offset - offset of the first piece on that disk
 * length - length of the first piece (up to the end of its stripe)
 *
 * The first sc_ndisks pieces each get their own clone.  Any further piece
 * is appended to the clone already created for that disk; such a clone can
 * no longer point into the caller's buffer directly, so it is given a slice
 * of one temporary buffer from g_stripe_zone instead.  The buffer is
 * attached to bp->bio_driver1 and released in g_stripe_done().
 *
 * Returns 0 once all children have been issued, or ENOMEM if a clone or
 * the temporary buffer could not be allocated; in that case nothing has
 * been issued and the clones created so far are destroyed.
 */
static int
g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length)
{
        TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
        struct g_stripe_softc *sc;
        char *addr, *data = NULL;
        struct bio *cbp;
        off_t stripesize;
        u_int nparts = 0;
        int error;

        sc = bp->bio_to->geom->softc;

        addr = bp->bio_data;
        stripesize = sc->sc_stripesize;

        /* First piece: from the request's start to the end of its stripe. */
        cbp = g_clone_bio(bp);
        if (cbp == NULL) {
                error = ENOMEM;
                goto failure;
        }
        TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
        nparts++;
        /*
         * Fill in the component buf structure.
         */
        cbp->bio_done = g_stripe_done;
        cbp->bio_offset = offset;
        cbp->bio_data = addr;
        cbp->bio_caller1 = NULL;
        cbp->bio_length = length;
        /* bio_caller2 temporarily carries the target consumer. */
        cbp->bio_caller2 = sc->sc_disks[no];

        /* offset -= offset % stripesize; */
        offset -= offset & (stripesize - 1);
        addr += length;
        length = bp->bio_length - length;
        /* Walk the remaining data one stripe at a time, round-robin. */
        for (no++; length > 0; no++, length -= stripesize, addr += stripesize) {
                /* Wrapped past the last disk: next row of stripes. */
                if (no > sc->sc_ndisks - 1) {
                        no = 0;
                        offset += stripesize;
                }
                if (nparts >= sc->sc_ndisks) {
                        /*
                         * Every disk already has a clone; reuse them in the
                         * order they were queued, wrapping to the first.
                         */
                        cbp = TAILQ_NEXT(cbp, bio_queue);
                        if (cbp == NULL)
                                cbp = TAILQ_FIRST(&queue);
                        nparts++;
                        /*
                         * Update bio structure.
                         */
                        /*
                         * MIN() is in case when
                         * (bp->bio_length % sc->sc_stripesize) != 0.
                         */
                        cbp->bio_length += MIN(stripesize, length);
                        /*
                         * First time this clone is extended: remember the
                         * caller's address in bio_caller1; bio_data is
                         * assigned from the temporary buffer when firing.
                         */
                        if (cbp->bio_caller1 == NULL) {
                                cbp->bio_caller1 = cbp->bio_data;
                                cbp->bio_data = NULL;
                                /* One buffer is shared by all clones. */
                                if (data == NULL) {
                                        data = uma_zalloc(g_stripe_zone,
                                            M_NOWAIT);
                                        if (data == NULL) {
                                                error = ENOMEM;
                                                goto failure;
                                        }
                                }
                        }
                } else {
                        cbp = g_clone_bio(bp);
                        if (cbp == NULL) {
                                error = ENOMEM;
                                goto failure;
                        }
                        TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
                        nparts++;
                        /*
                         * Fill in the component buf structure.
                         */
                        cbp->bio_done = g_stripe_done;
                        cbp->bio_offset = offset;
                        cbp->bio_data = addr;
                        cbp->bio_caller1 = NULL;
                        /*
                         * MIN() is in case when
                         * (bp->bio_length % sc->sc_stripesize) != 0.
                         */
                        cbp->bio_length = MIN(stripesize, length);
                        cbp->bio_caller2 = sc->sc_disks[no];
                }
        }
        /* Hand the buffer to the parent so g_stripe_done() can free it. */
        if (data != NULL)
                bp->bio_driver1 = data;
        /*
         * Fire off all allocated requests!
         */
        while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
                struct g_consumer *cp;

                TAILQ_REMOVE(&queue, cbp, bio_queue);
                cp = cbp->bio_caller2;
                cbp->bio_caller2 = NULL;
                cbp->bio_to = cp->provider;
                if (cbp->bio_caller1 != NULL) {
                        /* Give this clone the next slice of the buffer. */
                        cbp->bio_data = data;
                        /* Writes: gather the caller's pieces first. */
                        if (bp->bio_cmd == BIO_WRITE) {
                                g_stripe_copy(sc, cbp->bio_caller1, data,
                                    cbp->bio_offset, cbp->bio_length, 0);
                        }
                        data += cbp->bio_length;
                }
                G_STRIPE_LOGREQ(cbp, "Sending request.");
                g_io_request(cbp, cp);
        }
        return (0);
failure:
        if (data != NULL)
                uma_zfree(g_stripe_zone, data);
        while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
                TAILQ_REMOVE(&queue, cbp, bio_queue);
                /* Undo the bio_data/bio_caller1 swap before destroying. */
                if (cbp->bio_caller1 != NULL) {
                        cbp->bio_data = cbp->bio_caller1;
                        cbp->bio_caller1 = NULL;
                }
                /* Drop the child from the parent's count. */
                bp->bio_children--;
                g_destroy_bio(cbp);
        }
        return (error);
}

/*
 * "Economic" request path: issue one child request per stripe-sized piece,
 * each pointing directly into the parent's data (no copying).
 *
 * bp     - the parent request
 * no     - index of the disk holding the first piece
 * offset - offset of the first piece on that disk
 * length - length of the first piece (up to the end of its stripe)
 *
 * Returns 0 once all children have been issued, or ENOMEM if a clone could
 * not be allocated; in that case nothing has been issued and the clones
 * created so far are destroyed.
 */
static int
g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length)
{
        TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
        struct g_stripe_softc *sc;
        off_t stripesize;
        struct bio *cbp;
        char *addr;
        int error;

        sc = bp->bio_to->geom->softc;

        stripesize = sc->sc_stripesize;

        cbp = g_clone_bio(bp);
        if (cbp == NULL) {
                error = ENOMEM;
                goto failure;
        }
        TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
        /*
         * Fill in the component buf structure.
         */
        if (bp->bio_length == length)
                cbp->bio_done = g_std_done;     /* Optimized lockless case. */
        else
                cbp->bio_done = g_stripe_done;
        cbp->bio_offset = offset;
        cbp->bio_length = length;
        /*
         * For unmapped requests "addr" starts at NULL and is used below
         * only as a running byte offset into the parent's page array.
         */
        if ((bp->bio_flags & BIO_UNMAPPED) != 0)
                addr = NULL;
        else
                addr = bp->bio_data;
        /* bio_caller2 temporarily carries the target consumer. */
        cbp->bio_caller2 = sc->sc_disks[no];

        /* offset -= offset % stripesize; */
        offset -= offset & (stripesize - 1);
        /* "addr" is not advanced for BIO_DELETE requests. */
        if (bp->bio_cmd != BIO_DELETE)
                addr += length;
        length = bp->bio_length - length;
        /* Walk the remaining data one stripe at a time, round-robin. */
        for (no++; length > 0; no++, length -= stripesize) {
                /* Wrapped past the last disk: next row of stripes. */
                if (no > sc->sc_ndisks - 1) {
                        no = 0;
                        offset += stripesize;
                }
                cbp = g_clone_bio(bp);
                if (cbp == NULL) {
                        error = ENOMEM;
                        goto failure;
                }
                TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);

                /*
                 * Fill in the component buf structure.
                 */
                cbp->bio_done = g_stripe_done;
                cbp->bio_offset = offset;
                /*
                 * MIN() is in case when
                 * (bp->bio_length % sc->sc_stripesize) != 0.
                 */
                cbp->bio_length = MIN(stripesize, length);
                if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
                        /*
                         * Advance the clone's page-array window by the byte
                         * offset accumulated in "addr", then renormalize to
                         * a page pointer plus an in-page offset and
                         * recompute the page count for this piece.
                         */
                        cbp->bio_ma_offset += (uintptr_t)addr;
                        cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
                        cbp->bio_ma_offset %= PAGE_SIZE;
                        cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
                            cbp->bio_length) / PAGE_SIZE;
                } else
                        cbp->bio_data = addr;

                cbp->bio_caller2 = sc->sc_disks[no];

                if (bp->bio_cmd != BIO_DELETE)
                        addr += stripesize;
        }
        /*
         * Fire off all allocated requests!
         */
        while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
                struct g_consumer *cp;

                TAILQ_REMOVE(&queue, cbp, bio_queue);
                cp = cbp->bio_caller2;
                cbp->bio_caller2 = NULL;
                cbp->bio_to = cp->provider;
                G_STRIPE_LOGREQ(cbp, "Sending request.");
                g_io_request(cbp, cp);
        }
        return (0);
failure:
        while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
                TAILQ_REMOVE(&queue, cbp, bio_queue);
                /* Drop the child from the parent's count. */
                bp->bio_children--;
                g_destroy_bio(cbp);
        }
        return (error);
}

static void
g_stripe_pushdown(struct g_stripe_softc *sc, struct bio *bp)
{
        struct bio_queue_head queue;
        struct g_consumer *cp;
        struct bio *cbp;
        u_int no;

        bioq_init(&queue);
        for (no = 0; no < sc->sc_ndisks; no++) {
                cbp = g_clone_bio(bp);
                if (cbp == NULL) {
                        for (cbp = bioq_first(&queue); cbp != NULL;
                            cbp = bioq_first(&queue)) {
                                bioq_remove(&queue, cbp);
                                g_destroy_bio(cbp);
                        }
                        if (bp->bio_error == 0)
                                bp->bio_error = ENOMEM;
                        g_io_deliver(bp, bp->bio_error);
                        return;
                }
                bioq_insert_tail(&queue, cbp);
                cbp->bio_done = g_stripe_done;
                cbp->bio_caller2 = sc->sc_disks[no];
                cbp->bio_to = sc->sc_disks[no]->provider;
        }
        for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                G_STRIPE_LOGREQ(cbp, "Sending request.");
                cp = cbp->bio_caller2;
                cbp->bio_caller2 = NULL;
                g_io_request(cbp, cp);
        }
}

/*
 * Start method of the stripe provider: entry point for every I/O request.
 *
 * BIO_FLUSH and BIO_SPEEDUP are sent to all disks, the "GEOM::candelete"
 * attribute is answered locally, and any other non-data request is rejected
 * with EOPNOTSUPP.  BIO_READ, BIO_WRITE and BIO_DELETE are mapped onto the
 * disks and issued through the "fast" or the "economic" path.
 */
static void
g_stripe_start(struct bio *bp)
{
        off_t offset, start, length, nstripe, stripesize;
        struct g_stripe_softc *sc;
        u_int no;
        int error, fast = 0;

        sc = bp->bio_to->geom->softc;
        /*
         * If sc == NULL, provider's error should be set and g_stripe_start()
         * should not be called at all.
         */
        KASSERT(sc != NULL,
            ("Provider's error should be set (error=%d)(device=%s).",
            bp->bio_to->error, bp->bio_to->name));

        G_STRIPE_LOGREQ(bp, "Request received.");

        switch (bp->bio_cmd) {
        case BIO_READ:
        case BIO_WRITE:
        case BIO_DELETE:
                break;
        case BIO_SPEEDUP:
        case BIO_FLUSH:
                g_stripe_pushdown(sc, bp);
                return;
        case BIO_GETATTR:
                if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
                        int val = (sc->sc_flags & G_STRIPE_FLAG_CANDELETE) != 0;
                        g_handleattr(bp, "GEOM::candelete", &val, sizeof(val));
                        return;
                }
                /* otherwise: To which provider it should be delivered? */
                /* FALLTHROUGH */
        default:
                g_io_deliver(bp, EOPNOTSUPP);
                return;
        }

        stripesize = sc->sc_stripesize;

        /*
         * Calculations are quite messy, but fast I hope.
         */

        /* Stripe number. */
        /* nstripe = bp->bio_offset / stripesize; */
        nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits;
        /* Disk number. */
        no = nstripe % sc->sc_ndisks;
        /* Start position in stripe. */
        /* start = bp->bio_offset % stripesize; */
        start = bp->bio_offset & (stripesize - 1);
        /* Start position in disk. */
        /* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */
        offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start;
        /* Length of data to operate. */
        length = MIN(bp->bio_length, stripesize - start);

        /*
         * Do use "fast" mode when:
         * 1. "Fast" mode is ON.
         * and
         * 2. Request size is less than or equal to maxphys,
         *    which should always be true.
         * and
         * 3. Request size is at least stripesize * ndisks. If it is smaller,
         *    there will be no need to send more than one I/O request to
         *    a provider, so there is nothing to optimize.
         * and
         * 4. Request is not unmapped.
         * and
         * 5. It is not a BIO_DELETE.
         */
        if (g_stripe_fast && bp->bio_length <= maxphys &&
            bp->bio_length >= stripesize * sc->sc_ndisks &&
            (bp->bio_flags & BIO_UNMAPPED) == 0 &&
            bp->bio_cmd != BIO_DELETE) {
                fast = 1;
        }
        error = 0;
        if (fast) {
                error = g_stripe_start_fast(bp, no, offset, length);
                /* Exported as kern.geom.stripe.fast_failed. */
                if (error != 0)
                        g_stripe_fast_failed++;
        }
        /*
         * Do use "economic" when:
         * 1. "Economic" mode is ON.
         * or
         * 2. "Fast" mode failed. It can only fail if there is no memory.
         */
        if (!fast || error != 0)
                error = g_stripe_start_economic(bp, no, offset, length);
        /* Neither path could issue the request: fail the parent. */
        if (error != 0) {
                if (bp->bio_error == 0)
                        bp->bio_error = error;
                g_io_deliver(bp, bp->bio_error);
        }
}

/*
 * Create and activate the stripe provider once every disk slot is filled.
 *
 * Does nothing while any disk is still missing.  Otherwise the provider is
 * created with:
 *  - mediasize: the smallest usable component size (minus the metadata
 *    sector for automatic devices, rounded down to a stripe multiple)
 *    times the number of disks;
 *  - sectorsize: the least common multiple of all component sector sizes;
 *  - G_PF_ACCEPT_UNMAPPED: only when "fast" mode is off and every
 *    component, including the first one, accepts unmapped I/O.
 *
 * Must be called with the topology lock held.
 */
static void
g_stripe_check_and_run(struct g_stripe_softc *sc)
{
        struct g_provider *dp;
        off_t mediasize, ms;
        u_int no, sectorsize;

        g_topology_assert();
        if (g_stripe_nvalid(sc) != sc->sc_ndisks)
                return;

        sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s",
            sc->sc_name);
        sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
        if (g_stripe_fast == 0)
                sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED;

        /*
         * Walk all disks, starting with the first one so that its
         * capabilities are checked as well, and find the smallest disk.
         */
        mediasize = 0;
        sectorsize = 0;
        for (no = 0; no < sc->sc_ndisks; no++) {
                dp = sc->sc_disks[no]->provider;
                ms = dp->mediasize;
                /* Automatic devices keep metadata in the last sector. */
                if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
                        ms -= dp->sectorsize;
                ms -= ms % sc->sc_stripesize;
                if (no == 0) {
                        mediasize = ms;
                        sectorsize = dp->sectorsize;
                } else {
                        if (ms < mediasize)
                                mediasize = ms;
                        sectorsize = lcm(sectorsize, dp->sectorsize);
                }

                /* A provider underneath us doesn't support unmapped */
                if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
                        G_STRIPE_DEBUG(1, "Cancelling unmapped "
                            "because of %s.", dp->name);
                        sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED;
                }
        }
        sc->sc_provider->sectorsize = sectorsize;
        sc->sc_provider->mediasize = mediasize * sc->sc_ndisks;
        sc->sc_provider->stripesize = sc->sc_stripesize;
        sc->sc_provider->stripeoffset = 0;
        g_error_provider(sc->sc_provider, 0);

        G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name);
}

static int
g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md)
{
        struct g_provider *pp;
        u_char *buf;
        int error;

        g_topology_assert();

        error = g_access(cp, 1, 0, 0);
        if (error != 0)
                return (error);
        pp = cp->provider;
        g_topology_unlock();
        buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
            &error);
        g_topology_lock();
        g_access(cp, -1, 0, 0);
        if (buf == NULL)
                return (error);

        /* Decode metadata. */
        stripe_metadata_decode(buf, md);
        g_free(buf);

        return (0);
}

/*
 * Add disk to given device.
 *
 * Attaches a new consumer to provider "pp" and installs it in slot "no".
 * If the device already has open consumers, the new one is opened with the
 * same access counts.  For automatic devices the on-disk metadata is
 * re-read and must still match this device's magic, name and id.  On
 * success the component's "GEOM::candelete" attribute is folded into the
 * device flags and g_stripe_check_and_run() is given a chance to activate
 * the provider.
 *
 * Returns 0 on success, EINVAL if "no" is out of range or the metadata no
 * longer matches, EEXIST if the slot is already occupied, or the error
 * from g_attach()/g_access()/metadata read.  On any failure the new
 * consumer is released and the device is left unchanged.
 *
 * Must be called with the topology lock held.
 */
static int
g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no)
{
        struct g_consumer *cp, *fcp;
        struct g_geom *gp;
        int error;

        g_topology_assert();
        /* Metadata corrupted? */
        if (no >= sc->sc_ndisks)
                return (EINVAL);

        /* Check if disk is not already attached. */
        if (sc->sc_disks[no] != NULL)
                return (EEXIST);

        gp = sc->sc_geom;
        /* Sampled before our consumer is added; used as access template. */
        fcp = LIST_FIRST(&gp->consumer);

        cp = g_new_consumer(gp);
        cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
        cp->private = NULL;
        cp->index = no;
        error = g_attach(cp, pp);
        if (error != 0) {
                g_destroy_consumer(cp);
                return (error);
        }

        if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
                error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
                if (error != 0) {
                        g_detach(cp);
                        g_destroy_consumer(cp);
                        return (error);
                }
        }
        if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) {
                struct g_stripe_metadata md;

                /* Reread metadata. */
                error = g_stripe_read_metadata(cp, &md);
                if (error != 0)
                        goto fail;

                if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 ||
                    strcmp(md.md_name, sc->sc_name) != 0 ||
                    md.md_id != sc->sc_id) {
                        G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name);
                        /*
                         * The read itself succeeded, so "error" is 0 here;
                         * set it so the caller does not see success.
                         */
                        error = EINVAL;
                        goto fail;
                }
        }

        sc->sc_disks[no] = cp;

        /* cascade candelete */
        error = g_access(cp, 1, 0, 0);
        if (error == 0) {
                /* Stays 0 (for the log below) if g_getattr() fails. */
                int can_delete = 0;

                error = g_getattr("GEOM::candelete", cp, &can_delete);
                if (error == 0 && can_delete != 0)
                        sc->sc_flags |= G_STRIPE_FLAG_CANDELETE;
                G_STRIPE_DEBUG(1, "Provider %s candelete %i.", pp->name,
                    can_delete);
                g_access(cp, -1, 0, 0);
        }

        G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
        g_stripe_check_and_run(sc);

        /* A failed candelete probe does not fail the attach. */
        return (0);
fail:
        /* Give back the access counts taken above, then drop the consumer. */
        if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
                g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
        g_detach(cp);
        g_destroy_consumer(cp);
        return (error);
}

/*
 * Create a new, still empty, stripe device from the given metadata.
 *
 * Validates the metadata (at least two disks; stripe size a non-zero power
 * of two; no existing device with the same name), then allocates the geom
 * and its softc with all disk slots empty.  The provider is created later,
 * by g_stripe_check_and_run(), once all disks have been added.
 *
 * Returns the new geom, or NULL if the metadata is rejected.
 *
 * Must be called with the topology lock held.
 */
static struct g_geom *
g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md,
    u_int type)
{
        struct g_stripe_softc *sc;
        struct g_geom *gp;

        g_topology_assert();
        G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
            md->md_id);

        /* Two disks is minimum. */
        if (md->md_all < 2) {
                G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name);
                return (NULL);
        }
#if 0
        /* Stripe size has to be greater than or equal to sector size. */
        if (md->md_stripesize < sectorsize) {
                G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
                return (NULL);
        }
#endif
        /*
         * Stripe size has to be a non-zero power of 2.  powerof2(0) is true,
         * so zero must be rejected explicitly: the stripe size is used as a
         * divisor when the provider is set up.
         */
        if (md->md_stripesize == 0 || !powerof2(md->md_stripesize)) {
                G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
                return (NULL);
        }

        /* Check for duplicate unit */
        LIST_FOREACH(gp, &mp->geom, geom) {
                sc = gp->softc;
                if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
                        G_STRIPE_DEBUG(0, "Device %s already configured.",
                            sc->sc_name);
                        return (NULL);
                }
        }
        gp = g_new_geom(mp, md->md_name);
        sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO);
        gp->start = g_stripe_start;
        gp->spoiled = g_stripe_orphan;
        gp->orphan = g_stripe_orphan;
        gp->access = g_stripe_access;
        gp->dumpconf = g_stripe_dumpconf;

        sc->sc_id = md->md_id;
        sc->sc_stripesize = md->md_stripesize;
        /* For a power of two, the set bits of (size - 1) give log2(size). */
        sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1);
        sc->sc_ndisks = md->md_all;
        /* M_ZERO leaves every disk slot NULL, i.e. empty. */
        sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
            M_STRIPE, M_WAITOK | M_ZERO);
        sc->sc_type = type;
        mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF);

        gp->softc = sc;
        sc->sc_geom = gp;
        sc->sc_provider = NULL;

        G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);

        return (gp);
}

/*
 * Destroy a stripe device.
 *
 * sc    - the device's softc; NULL yields ENXIO
 * force - when zero, a device whose provider is still open is left alone
 *         and EBUSY is returned; when non-zero, teardown proceeds anyway
 *
 * Every consumer is passed to g_stripe_remove_disk().  That function calls
 * back into g_stripe_destroy() when the last consumer goes away, so the
 * final teardown below may already have run in the nested call by the
 * time the loop finishes.
 *
 * Returns 0 on success, ENXIO, EBUSY, or EINPROGRESS if consumers remain.
 *
 * Must be called with the topology lock held.
 */
static int
g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force)
{
        struct g_provider *pp;
        struct g_consumer *cp, *cp1;
        struct g_geom *gp;

        g_topology_assert();

        if (sc == NULL)
                return (ENXIO);

        pp = sc->sc_provider;
        if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
                if (force) {
                        G_STRIPE_DEBUG(0, "Device %s is still open, so it "
                            "can't be definitely removed.", pp->name);
                } else {
                        G_STRIPE_DEBUG(1,
                            "Device %s is still open (r%dw%de%d).", pp->name,
                            pp->acr, pp->acw, pp->ace);
                        return (EBUSY);
                }
        }

        gp = sc->sc_geom;
        LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
                g_stripe_remove_disk(cp);
                /*
                 * After the last consumer has been handled, sc must not be
                 * touched again: the nested g_stripe_destroy() call may have
                 * freed it.
                 */
                if (cp1 == NULL)
                        return (0);     /* Recursion happened. */
        }
        if (!LIST_EMPTY(&gp->consumer))
                return (EINPROGRESS);

        /* No consumers are attached: release the softc and the geom. */
        gp->softc = NULL;
        KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
            gp->name));
        free(sc->sc_disks, M_STRIPE);
        mtx_destroy(&sc->sc_lock);
        free(sc, M_STRIPE);
        G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name);
        g_wither_geom(gp, ENXIO);
        return (0);
}

/*
 * Class destroy_geom method: non-forced destruction of the device behind
 * the given geom.  Returns whatever g_stripe_destroy() returns.
 */
static int
g_stripe_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{

        return (g_stripe_destroy(gp->softc, 0));
}

/*
 * Taste method: read stripe metadata from provider pp and, when the label
 * is acceptable, add the provider to the automatic device with the same
 * name and id, creating that device first if no such geom exists yet.
 *
 * Returns the geom the provider was added to, or NULL when the provider is
 * skipped or any step fails.  A device created by this call is destroyed
 * again if the provider cannot be added to it; a pre-existing device is
 * left untouched.
 */
static struct g_geom *
g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
        struct g_stripe_metadata md;
        struct g_stripe_softc *sc;
        struct g_consumer *cp;
        struct g_geom *gp;
        int error, fresh;

        g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
        g_topology_assert();

        /* Providers currently open for writing are not tasted. */
        if (pp->acw > 0)
                return (NULL);

        G_STRIPE_DEBUG(3, "Tasting %s.", pp->name);

        /* Temporary geom and consumer, used only to read the metadata. */
        gp = g_new_geom(mp, "stripe:taste");
        gp->start = g_stripe_start;
        gp->access = g_stripe_access;
        gp->orphan = g_stripe_orphan;
        cp = g_new_consumer(gp);
        cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
        error = g_attach(cp, pp);
        if (error == 0) {
                error = g_stripe_read_metadata(cp, &md);
                g_detach(cp);
        }
        g_destroy_consumer(cp);
        g_destroy_geom(gp);
        gp = NULL;
        if (error != 0)
                return (NULL);

        /* Not our label, or written by a newer implementation. */
        if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0)
                return (NULL);
        if (md.md_version > G_STRIPE_VERSION) {
                printf("geom_stripe.ko module is too old to handle %s.\n",
                    pp->name);
                return (NULL);
        }

        /*
         * Older metadata layouts: md_provider appeared in version 2 and
         * md_provsize in version 3, so fill in neutral values for them.
         */
        if (md.md_version < 2)
                bzero(md.md_provider, sizeof(md.md_provider));
        if (md.md_version < 3)
                md.md_provsize = pp->mediasize;

        /* The label must match this provider's name (if set) and size. */
        if ((md.md_provider[0] != '\0' &&
            !g_compare_names(md.md_provider, pp->name)) ||
            md.md_provsize != pp->mediasize)
                return (NULL);

        /*
         * Search for an automatic device with the same name and id; gp is
         * NULL after the loop when there is none.
         */
        sc = NULL;
        LIST_FOREACH(gp, &mp->geom, geom) {
                sc = gp->softc;
                if (sc != NULL && sc->sc_type == G_STRIPE_TYPE_AUTOMATIC &&
                    strcmp(md.md_name, sc->sc_name) == 0 &&
                    md.md_id == sc->sc_id)
                        break;
        }

        fresh = (gp == NULL);
        if (fresh) {
                gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC);
                if (gp == NULL) {
                        G_STRIPE_DEBUG(0, "Cannot create device %s.",
                            md.md_name);
                        return (NULL);
                }
                sc = gp->softc;
        }

        G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
        error = g_stripe_add_disk(sc, pp, md.md_no);
        if (error == 0)
                return (gp);

        G_STRIPE_DEBUG(0, "Cannot add disk %s to %s (error=%d).", pp->name,
            gp->name, error);
        /* Only a device created by this call is torn down again. */
        if (fresh)
                g_stripe_destroy(sc, 1);
        return (NULL);
}

static void
g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp)
{
        u_int attached, no;
        struct g_stripe_metadata md;
        struct g_provider *pp;
        struct g_stripe_softc *sc;
        struct g_geom *gp;
        struct sbuf *sb;
        off_t *stripesize;
        const char *name;
        char param[16];
        int *nargs;

        g_topology_assert();
        nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
        if (nargs == NULL) {
                gctl_error(req, "No '%s' argument.", "nargs");
                return;
        }
        if (*nargs <= 2) {
                gctl_error(req, "Too few arguments.");
                return;
        }

        strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic));
        md.md_version = G_STRIPE_VERSION;
        name = gctl_get_asciiparam(req, "arg0");
        if (name == NULL) {
                gctl_error(req, "No 'arg%u' argument.", 0);
                return;
        }
        strlcpy(md.md_name, name, sizeof(md.md_name));
        md.md_id = arc4random();
        md.md_no = 0;
        md.md_all = *nargs - 1;
        stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize));
        if (stripesize == NULL) {
                gctl_error(req, "No '%s' argument.", "stripesize");
                return;
        }
        md.md_stripesize = (uint32_t)*stripesize;
        bzero(md.md_provider, sizeof(md.md_provider));
        /* This field is not important here. */
        md.md_provsize = 0;

        /* Check all providers are valid */
        for (no = 1; no < *nargs; no++) {
                snprintf(param, sizeof(param), "arg%u", no);
                pp = gctl_get_provider(req, param);
                if (pp == NULL)
                        return;
        }

        gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL);
        if (gp == NULL) {
                gctl_error(req, "Can't configure %s.", md.md_name);
                return;
        }

        sc = gp->softc;
        sb = sbuf_new_auto();
        sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
        for (attached = 0, no = 1; no < *nargs; no++) {
                snprintf(param, sizeof(param), "arg%u", no);
                pp  = gctl_get_provider(req, param);
                if (pp == NULL) {
                        name = gctl_get_asciiparam(req, param);
                        MPASS(name != NULL);
                        sbuf_printf(sb, " %s", name);
                        continue;
                }
                if (g_stripe_add_disk(sc, pp, no - 1) != 0) {
                        G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.",
                            no, pp->name, gp->name);
                        sbuf_printf(sb, " %s", pp->name);
                        continue;
                }
                attached++;
        }
        sbuf_finish(sb);
        if (md.md_all != attached) {
                g_stripe_destroy(gp->softc, 1);
                gctl_error(req, "%s", sbuf_data(sb));
        }
        sbuf_delete(sb);
}

/*
 * Look up a stripe device by name among the geoms of class mp.
 *
 * Geoms without a softc are ignored.  Returns the softc of the first geom
 * whose sc_name equals name, or NULL if there is no such geom.
 */
static struct g_stripe_softc *
g_stripe_find_device(struct g_class *mp, const char *name)
{
        struct g_stripe_softc *sc;
        struct g_geom *gp;

        LIST_FOREACH(gp, &mp->geom, geom) {
                sc = gp->softc;
                if (sc != NULL && strcmp(sc->sc_name, name) == 0)
                        return (sc);
        }
        return (NULL);
}

/*
 * Handle the "destroy" and "stop" verbs: destroy every device named in
 * arg0..arg(nargs-1).
 *
 * Request parameters read here:
 *   nargs - number of device names; must be positive.
 *   force - passed on to g_stripe_destroy() as its force flag.
 *
 * Processing stops at the first missing argument, unknown device or
 * failed destroy; the problem is reported through gctl_error() and
 * devices destroyed before that point stay destroyed.
 */
static void
g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
        struct g_stripe_softc *sc;
        const char *devname;
        char argname[16];
        int *force, *nargs, error;
        u_int idx;

        g_topology_assert();

        nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
        if (nargs == NULL) {
                gctl_error(req, "No '%s' argument.", "nargs");
                return;
        }
        if (*nargs <= 0) {
                gctl_error(req, "Missing device(s).");
                return;
        }
        force = gctl_get_paraml(req, "force", sizeof(*force));
        if (force == NULL) {
                gctl_error(req, "No '%s' argument.", "force");
                return;
        }

        for (idx = 0; idx < (u_int)*nargs; idx++) {
                snprintf(argname, sizeof(argname), "arg%u", idx);
                devname = gctl_get_asciiparam(req, argname);
                if (devname == NULL) {
                        gctl_error(req, "No 'arg%u' argument.", idx);
                        return;
                }

                sc = g_stripe_find_device(mp, devname);
                if (sc == NULL) {
                        gctl_error(req, "No such device: %s.", devname);
                        return;
                }

                error = g_stripe_destroy(sc, *force);
                if (error == 0)
                        continue;

                /* sc is only freed on success, so it is still valid here. */
                gctl_error(req, "Cannot destroy device %s (error=%d).",
                    sc->sc_name, error);
                return;
        }
}

/*
 * Control request entry point for the class.
 *
 * Checks that the "version" parameter supplied with the request equals
 * G_STRIPE_VERSION, then dispatches on verb: "create" goes to
 * g_stripe_ctl_create(), "destroy" and "stop" go to
 * g_stripe_ctl_destroy().  Any other verb is reported as an error.
 */
static void
g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
        uint32_t *version;

        g_topology_assert();

        version = gctl_get_paraml(req, "version", sizeof(*version));
        if (version == NULL) {
                gctl_error(req, "No '%s' argument.", "version");
                return;
        }
        if (*version != G_STRIPE_VERSION) {
                gctl_error(req, "Userland and kernel parts are out of sync.");
                return;
        }

        if (strcmp(verb, "create") == 0)
                g_stripe_ctl_create(req, mp);
        else if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0)
                g_stripe_ctl_destroy(req, mp);
        else
                gctl_error(req, "Unknown verb.");
}

static void
g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
        struct g_stripe_softc *sc;

        sc = gp->softc;
        if (sc == NULL)
                return;
        if (pp != NULL) {
                /* Nothing here. */
        } else if (cp != NULL) {
                sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
                    (u_int)cp->index);
        } else {
                sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
                sbuf_printf(sb, "%s<Stripesize>%ju</Stripesize>\n", indent,
                    (uintmax_t)sc->sc_stripesize);
                sbuf_printf(sb, "%s<Type>", indent);
                switch (sc->sc_type) {
                case G_STRIPE_TYPE_AUTOMATIC:
                        sbuf_cat(sb, "AUTOMATIC");
                        break;
                case G_STRIPE_TYPE_MANUAL:
                        sbuf_cat(sb, "MANUAL");
                        break;
                default:
                        sbuf_cat(sb, "UNKNOWN");
                        break;
                }
                sbuf_cat(sb, "</Type>\n");
                sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
                    indent, sc->sc_ndisks, g_stripe_nvalid(sc));
                sbuf_printf(sb, "%s<State>", indent);
                if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
                        sbuf_cat(sb, "UP");
                else
                        sbuf_cat(sb, "DOWN");
                sbuf_cat(sb, "</State>\n");
        }
}

/*
 * Hook g_stripe_class into GEOM under the module name g_stripe and declare
 * version 0 of the geom_stripe module.
 */
DECLARE_GEOM_CLASS(g_stripe_class, g_stripe);
MODULE_VERSION(geom_stripe, 0);