root/drivers/md/dm-switch.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries each of which is <region_table_entry_bits> in size.
 */
typedef unsigned long region_table_slot_t;

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
        struct dm_dev *dmdev;
        sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
        struct dm_target *ti;

        unsigned int nr_paths;          /* Number of paths in path_list. */

        unsigned int region_size;               /* Region size in 512-byte sectors */
        unsigned long nr_regions;       /* Number of regions making up the device */
        signed char region_size_bits;   /* log2 of region_size or -1 */

        unsigned char region_table_entry_bits;  /* Number of bits in one region table entry */
        unsigned char region_entries_per_slot;  /* Number of entries in one region table slot */
        signed char region_entries_per_slot_bits;       /* log2 of region_entries_per_slot or -1 */

        region_table_slot_t *region_table;      /* Region table */

        /*
         * Array of dm devices to switch between.
         */
        struct switch_path path_list[];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned int nr_paths,
                                           unsigned int region_size)
{
        struct switch_ctx *sctx;

        sctx = kzalloc_flex(*sctx, path_list, nr_paths);
        if (!sctx)
                return NULL;

        sctx->ti = ti;
        sctx->region_size = region_size;

        ti->private = sctx;

        return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths)
{
        struct switch_ctx *sctx = ti->private;
        sector_t nr_regions = ti->len;
        sector_t nr_slots;

        if (!(sctx->region_size & (sctx->region_size - 1)))
                sctx->region_size_bits = __ffs(sctx->region_size);
        else
                sctx->region_size_bits = -1;

        sctx->region_table_entry_bits = 1;
        while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
               (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
                sctx->region_table_entry_bits++;

        sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
        if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
                sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
        else
                sctx->region_entries_per_slot_bits = -1;

        if (sector_div(nr_regions, sctx->region_size))
                nr_regions++;

        if (nr_regions >= ULONG_MAX) {
                ti->error = "Region table too large";
                return -EINVAL;
        }
        sctx->nr_regions = nr_regions;

        nr_slots = nr_regions;
        if (sector_div(nr_slots, sctx->region_entries_per_slot))
                nr_slots++;

        if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
                ti->error = "Region table too large";
                return -EINVAL;
        }

        sctx->region_table = vmalloc_array(nr_slots,
                                           sizeof(region_table_slot_t));
        if (!sctx->region_table) {
                ti->error = "Cannot allocate region table";
                return -ENOMEM;
        }

        return 0;
}

static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
                                unsigned long *region_index, unsigned int *bit)
{
        if (sctx->region_entries_per_slot_bits >= 0) {
                *region_index = region_nr >> sctx->region_entries_per_slot_bits;
                *bit = region_nr & (sctx->region_entries_per_slot - 1);
        } else {
                *region_index = region_nr / sctx->region_entries_per_slot;
                *bit = region_nr % sctx->region_entries_per_slot;
        }

        *bit *= sctx->region_table_entry_bits;
}

static unsigned int switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
        unsigned long region_index;
        unsigned int bit;

        switch_get_position(sctx, region_nr, &region_index, &bit);

        return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
                ((1 << sctx->region_table_entry_bits) - 1);
}

/*
 * Find which path to use at given offset.
 */
static unsigned int switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
        unsigned int path_nr;
        sector_t p;

        p = offset;
        if (sctx->region_size_bits >= 0)
                p >>= sctx->region_size_bits;
        else
                sector_div(p, sctx->region_size);

        path_nr = switch_region_table_read(sctx, p);

        /* This can only happen if the processor uses non-atomic stores. */
        if (unlikely(path_nr >= sctx->nr_paths))
                path_nr = 0;

        return path_nr;
}

static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
                                      unsigned int value)
{
        unsigned long region_index;
        unsigned int bit;
        region_table_slot_t pte;

        switch_get_position(sctx, region_nr, &region_index, &bit);

        pte = sctx->region_table[region_index];
        pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
        pte |= (region_table_slot_t)value << bit;
        sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
        unsigned int path_nr = 0;
        unsigned long region_nr;

        for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
                switch_region_table_write(sctx, region_nr, path_nr);
                if (++path_nr >= sctx->nr_paths)
                        path_nr = 0;
        }
}

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
        struct switch_ctx *sctx = ti->private;
        unsigned long long start;
        int r;

        r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
                          &sctx->path_list[sctx->nr_paths].dmdev);
        if (r) {
                ti->error = "Device lookup failed";
                return r;
        }

        if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
                ti->error = "Invalid device starting offset";
                dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
                return -EINVAL;
        }

        sctx->path_list[sctx->nr_paths].start = start;

        sctx->nr_paths++;

        return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
        struct switch_ctx *sctx = ti->private;

        while (sctx->nr_paths--)
                dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

        vfree(sctx->region_table);
        kfree(sctx);
}

/*
 * Constructor arguments:
 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *   [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
static int switch_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        static const struct dm_arg _args[] = {
                {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
                {1, UINT_MAX, "Invalid region size"},
                {0, 0, "Invalid number of optional args"},
        };

        struct switch_ctx *sctx;
        struct dm_arg_set as;
        unsigned int nr_paths, region_size, nr_optional_args;
        int r;

        as.argc = argc;
        as.argv = argv;

        r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
        if (r)
                return -EINVAL;

        r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
        if (r)
                return r;

        r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
        if (r)
                return r;
        /* parse optional arguments here, if we add any */

        if (as.argc != nr_paths * 2) {
                ti->error = "Incorrect number of path arguments";
                return -EINVAL;
        }

        sctx = alloc_switch_ctx(ti, nr_paths, region_size);
        if (!sctx) {
                ti->error = "Cannot allocate redirection context";
                return -ENOMEM;
        }

        r = dm_set_target_max_io_len(ti, region_size);
        if (r)
                goto error;

        while (as.argc) {
                r = parse_path(&as, ti);
                if (r)
                        goto error;
        }

        r = alloc_region_table(ti, nr_paths);
        if (r)
                goto error;

        initialise_region_table(sctx);

        /* For UNMAP, sending the request down any path is sufficient */
        ti->num_discard_bios = 1;

        return 0;

error:
        switch_dtr(ti);

        return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
        struct switch_ctx *sctx = ti->private;
        sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
        unsigned int path_nr = switch_get_path_nr(sctx, offset);

        bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
        bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

        return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance.
 * It improves a time to load 1000000 entries compared to the condition-based
 * parser.
 *              table-based parser      condition-based parser
 * PA-RISC      0.29s                   0.31s
 * Opteron      0.0495s                 0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
        unsigned char d;
        unsigned long r = 0;

        while ((d = hex_table[(unsigned char)**string]) < 16) {
                r = (r << 4) | d;
                (*string)++;
        }

        return r;
}

static int process_set_region_mappings(struct switch_ctx *sctx,
                                       unsigned int argc, char **argv)
{
        unsigned int i;
        unsigned long region_index = 0;

        for (i = 1; i < argc; i++) {
                unsigned long path_nr;
                const char *string = argv[i];

                if ((*string & 0xdf) == 'R') {
                        unsigned long cycle_length, num_write;

                        string++;
                        if (unlikely(*string == ',')) {
                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                                return -EINVAL;
                        }
                        cycle_length = parse_hex(&string);
                        if (unlikely(*string != ',')) {
                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                                return -EINVAL;
                        }
                        string++;
                        if (unlikely(!*string)) {
                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                                return -EINVAL;
                        }
                        num_write = parse_hex(&string);
                        if (unlikely(*string)) {
                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                                return -EINVAL;
                        }

                        if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
                                DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
                                       cycle_length - 1, region_index);
                                return -EINVAL;
                        }
                        if (unlikely(region_index + num_write < region_index) ||
                            unlikely(region_index + num_write >= sctx->nr_regions)) {
                                DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
                                       region_index, num_write, sctx->nr_regions);
                                return -EINVAL;
                        }

                        while (num_write--) {
                                region_index++;
                                path_nr = switch_region_table_read(sctx, region_index - cycle_length);
                                switch_region_table_write(sctx, region_index, path_nr);
                        }

                        continue;
                }

                if (*string == ':')
                        region_index++;
                else {
                        region_index = parse_hex(&string);
                        if (unlikely(*string != ':')) {
                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                                return -EINVAL;
                        }
                }

                string++;
                if (unlikely(!*string)) {
                        DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                        return -EINVAL;
                }

                path_nr = parse_hex(&string);
                if (unlikely(*string)) {
                        DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
                        return -EINVAL;
                }
                if (unlikely(region_index >= sctx->nr_regions)) {
                        DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
                        return -EINVAL;
                }
                if (unlikely(path_nr >= sctx->nr_paths)) {
                        DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
                        return -EINVAL;
                }

                switch_region_table_write(sctx, region_index, path_nr);
        }

        return 0;
}

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned int argc, char **argv,
                          char *result, unsigned int maxlen)
{
        static DEFINE_MUTEX(message_mutex);

        struct switch_ctx *sctx = ti->private;
        int r = -EINVAL;

        mutex_lock(&message_mutex);

        if (!strcasecmp(argv[0], "set_region_mappings"))
                r = process_set_region_mappings(sctx, argc, argv);
        else
                DMWARN("Unrecognised message received.");

        mutex_unlock(&message_mutex);

        return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
                          unsigned int status_flags, char *result, unsigned int maxlen)
{
        struct switch_ctx *sctx = ti->private;
        unsigned int sz = 0;
        int path_nr;

        switch (type) {
        case STATUSTYPE_INFO:
                result[0] = '\0';
                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
                for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
                        DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
                               (unsigned long long)sctx->path_list[path_nr].start);
                break;

        case STATUSTYPE_IMA:
                result[0] = '\0';
                break;
        }
}

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
                                unsigned int cmd, unsigned long arg,
                                bool *forward)
{
        struct switch_ctx *sctx = ti->private;
        unsigned int path_nr;

        path_nr = switch_get_path_nr(sctx, 0);

        *bdev = sctx->path_list[path_nr].dmdev->bdev;

        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
        if (ti->len + sctx->path_list[path_nr].start !=
            bdev_nr_sectors((*bdev)))
                return 1;
        return 0;
}

static int switch_iterate_devices(struct dm_target *ti,
                                  iterate_devices_callout_fn fn, void *data)
{
        struct switch_ctx *sctx = ti->private;
        int path_nr;
        int r;

        for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
                r = fn(ti, sctx->path_list[path_nr].dmdev,
                         sctx->path_list[path_nr].start, ti->len, data);
                if (r)
                        return r;
        }

        return 0;
}

static struct target_type switch_target = {
        .name = "switch",
        .version = {1, 1, 0},
        .features = DM_TARGET_NOWAIT,
        .module = THIS_MODULE,
        .ctr = switch_ctr,
        .dtr = switch_dtr,
        .map = switch_map,
        .message = switch_message,
        .status = switch_status,
        .prepare_ioctl = switch_prepare_ioctl,
        .iterate_devices = switch_iterate_devices,
};
module_dm(switch);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");