#include <sys/zfs_context.h>
#include <sys/types.h>
#include <sys/zio.h>
#include <sys/debug.h>
#include <sys/zfs_debug.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/simd.h>
#ifndef isspace
/*
 * Fallback whitespace test, used only when the environment has not already
 * provided an isspace macro.  True for space, horizontal tab, newline,
 * carriage return, form feed and vertical tab.  The argument may be
 * evaluated several times, so it must not have side effects.
 */
#define	isspace(c)	((c) == ' ' || (c) == '\t' || (c) == '\n' || \
			(c) == '\r' || (c) == '\f' || (c) == '\v')
#endif
/* Support check used by the "original" entry; defined outside this file. */
extern boolean_t raidz_will_scalar_work(void);
/*
 * Placeholder ops for the "original" RAID-Z code.  Only the name and the
 * support check are populated; the remaining members are left
 * zero-initialized, so vdev_raidz_math_generate() and
 * vdev_raidz_math_reconstruct() find a NULL method in gen[]/rec[] and
 * return RAIDZ_ORIGINAL_IMPL to their caller.
 */
static const raidz_impl_ops_t vdev_raidz_original_impl = {
.name = "original",
.is_supported = raidz_will_scalar_work,
};
/*
 * Ops table behind the "fastest" selection.  It is writable because it is
 * filled in at run time: in the kernel benchmark_raidz_impl() stores the
 * quickest measured method into each gen[]/rec[] slot, and outside the
 * kernel benchmark_raidz() overwrites it with a copy of the last supported
 * implementation.
 */
static raidz_impl_ops_t vdev_raidz_fastest_impl = {
.name = "fastest"
};
/*
 * Every implementation compiled into this build.  benchmark_raidz() walks
 * this list in order and copies the supported entries into
 * raidz_supp_impl[], so the order here determines the indices used there.
 * IMPL_ORIGINAL (0) and IMPL_SCALAR (1) correspond to the first two
 * entries, provided both report themselves as supported.
 */
const raidz_impl_ops_t *raidz_all_maths[] = {
&vdev_raidz_original_impl,
&vdev_raidz_scalar_impl,
#if defined(__amd64)
&vdev_raidz_sse2_impl,
&vdev_raidz_ssse3_impl,
&vdev_raidz_avx2_impl,
#endif
};
/* Set to B_TRUE as the last step of vdev_raidz_math_init(). */
static boolean_t raidz_math_initialized = B_FALSE;
/*
 * Selector values stored in zfs_vdev_raidz_impl and user_sel_impl.  The two
 * special selectors sit at the top of the uint32_t range; any value that is
 * not one of these four is used as an index into raidz_supp_impl[].
 */
#define IMPL_FASTEST (UINT32_MAX)
#define IMPL_CYCLE (UINT32_MAX - 1)
#define IMPL_ORIGINAL (0)
#define IMPL_SCALAR (1)
/* Read a selector through a volatile-qualified lvalue. */
#define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
/* Active selector; vdev_raidz_math_init() replaces it with user_sel_impl. */
static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
/* Selection recorded by vdev_raidz_impl_set() while not yet initialized. */
static uint32_t user_sel_impl = IMPL_FASTEST;
/* Supported implementations and their count, filled by benchmark_raidz(). */
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
#if defined(_KERNEL)
/*
 * Benchmark results, one slot per implementation plus one extra.
 * benchmark_raidz_impl() uses the slot at index raidz_supp_impl_cnt to
 * record, per method, the index of the fastest implementation.
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
#endif
const raidz_impl_ops_t *
vdev_raidz_math_get_ops(void)
{
if (!kfpu_allowed())
return (&vdev_raidz_scalar_impl);
raidz_impl_ops_t *ops = NULL;
const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
switch (impl) {
case IMPL_FASTEST:
ASSERT(raidz_math_initialized);
ops = &vdev_raidz_fastest_impl;
break;
case IMPL_CYCLE:
ASSERT(raidz_math_initialized);
ASSERT3U(raidz_supp_impl_cnt, >, 0);
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
ops = raidz_supp_impl[idx];
break;
case IMPL_ORIGINAL:
ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
break;
case IMPL_SCALAR:
ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
break;
default:
ASSERT3U(impl, <, raidz_supp_impl_cnt);
ASSERT3U(raidz_supp_impl_cnt, >, 0);
if (impl < ARRAY_SIZE(raidz_all_maths))
ops = raidz_supp_impl[impl];
break;
}
ASSERT3P(ops, !=, NULL);
return (ops);
}
/*
 * Generate parity for rm with the implementation attached to it.
 *
 * The parity level reported by raidz_parity(rm) picks the gen[] slot of
 * rm->rm_ops (1 -> P, 2 -> PQ, 3 -> PQR).  Returns 0 after running the
 * generator, or RAIDZ_ORIGINAL_IMPL when that slot is NULL.  Any other
 * parity level is reported through cmn_err(CE_PANIC, ...).
 */
int
vdev_raidz_math_generate(raidz_map_t *rm)
{
	raidz_gen_f generator;
	int slot;

	switch (raidz_parity(rm)) {
	case 1:
		slot = RAIDZ_GEN_P;
		break;
	case 2:
		slot = RAIDZ_GEN_PQ;
		break;
	case 3:
		slot = RAIDZ_GEN_PQR;
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
		    (uint_t)raidz_parity(rm));
		/* No generator was chosen for this parity level. */
		return (RAIDZ_ORIGINAL_IMPL);
	}

	generator = rm->rm_ops->gen[slot];
	if (generator != NULL) {
		generator(rm);
		return (0);
	}

	/* An empty slot means this ops table supplies no generator. */
	return (RAIDZ_ORIGINAL_IMPL);
}
/*
 * Choose a reconstruction method for a single-parity (P) map.
 *
 * Only one combination is served: exactly one bad data column with P
 * marked valid.  Every other combination yields NULL.
 */
static raidz_rec_f
reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
    const int nbaddata)
{
	if (nbaddata != 1 || !parity_valid[CODE_P])
		return ((raidz_rec_f) NULL);

	return (rm->rm_ops->rec[RAIDZ_REC_P]);
}
/*
 * Choose a reconstruction method for a double-parity (PQ) map.
 *
 * One bad data column: P is preferred when valid, otherwise Q.
 * Two bad data columns: both P and Q must be valid.
 * Any other combination yields NULL.
 */
static raidz_rec_f
reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
    const int nbaddata)
{
	switch (nbaddata) {
	case 1:
		if (parity_valid[CODE_P])
			return (rm->rm_ops->rec[RAIDZ_REC_P]);
		if (parity_valid[CODE_Q])
			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
		break;
	case 2:
		if (parity_valid[CODE_P] && parity_valid[CODE_Q])
			return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
		break;
	default:
		break;
	}

	return ((raidz_rec_f) NULL);
}
/*
 * Choose a reconstruction method for a triple-parity (PQR) map.
 *
 * One bad data column: the first valid parity in the order P, Q, R.
 * Two bad data columns: the first valid pair in the order PQ, PR, QR.
 * Three bad data columns: P, Q and R must all be valid.
 * Any other combination yields NULL.
 */
static raidz_rec_f
reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
    const int nbaddata)
{
	switch (nbaddata) {
	case 1:
		if (parity_valid[CODE_P])
			return (rm->rm_ops->rec[RAIDZ_REC_P]);
		if (parity_valid[CODE_Q])
			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
		if (parity_valid[CODE_R])
			return (rm->rm_ops->rec[RAIDZ_REC_R]);
		break;
	case 2:
		if (parity_valid[CODE_P]) {
			/* With P available, pair it with Q first, then R. */
			if (parity_valid[CODE_Q])
				return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
			if (parity_valid[CODE_R])
				return (rm->rm_ops->rec[RAIDZ_REC_PR]);
		} else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
			return (rm->rm_ops->rec[RAIDZ_REC_QR]);
		}
		break;
	case 3:
		if (parity_valid[CODE_P] && parity_valid[CODE_Q] &&
		    parity_valid[CODE_R])
			return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
		break;
	default:
		break;
	}

	return ((raidz_rec_f) NULL);
}
/*
 * Reconstruct data columns of rm with the implementation attached to it.
 *
 * The parity level of the map picks one of the reconstruct_fun_*_sel()
 * helpers, which in turn picks a method from parity_valid and nbaddata.
 * Returns the value of that method, called as method(rm, dt), or
 * RAIDZ_ORIGINAL_IMPL when no method was selected.  An unexpected parity
 * level is reported through cmn_err(CE_PANIC, ...).
 */
int
vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
    const int *dt, const int nbaddata)
{
	raidz_rec_f (*selector)(raidz_map_t *, const int *, const int) = NULL;
	raidz_rec_f method = NULL;

	switch (raidz_parity(rm)) {
	case PARITY_P:
		selector = reconstruct_fun_p_sel;
		break;
	case PARITY_PQ:
		selector = reconstruct_fun_pq_sel;
		break;
	case PARITY_PQR:
		selector = reconstruct_fun_pqr_sel;
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
		    (uint_t)raidz_parity(rm));
		break;
	}

	if (selector != NULL)
		method = selector(rm, parity_valid, nbaddata);

	if (method != NULL)
		return (method(rm, dt));

	return (RAIDZ_ORIGINAL_IMPL);
}
/*
 * Printable names of the parity-generation methods.  Presumably indexed by
 * the RAIDZ_GEN_* constants (P, PQ, PQR); confirm against the header.
 */
const char *raidz_gen_name[] = {
"gen_p", "gen_pq", "gen_pqr"
};
/*
 * Printable names of the reconstruction methods.  Presumably indexed by
 * the RAIDZ_REC_* constants; confirm against the header.
 */
const char *raidz_rec_name[] = {
"rec_p", "rec_q", "rec_r",
"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
#if defined(_KERNEL)
/* Number of data columns in the benchmark map. */
#define BENCH_D_COLS (8ULL)
/* Total columns of the reconstruction benchmark map: data plus PARITY_PQR. */
#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
/* Size in bytes of the benchmark I/O buffer. */
#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
/* Lower bound on the time spent measuring each implementation. */
#define BENCH_NS MSEC2NSEC(1)
/*
 * Signature shared by the benchmark drivers; fn is the gen[]/rec[] slot
 * being measured.
 */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
/*
 * Benchmark driver for parity generation: runs vdev_raidz_generate_parity()
 * once on rm.  fn is accepted only to match benchmark_fn and is not used.
 */
static void
benchmark_gen_impl(raidz_map_t *rm, const int fn)
{
	vdev_raidz_generate_parity(rm);
	(void) fn;
}
/*
 * Benchmark driver for reconstruction: runs vdev_raidz_reconstruct() once
 * on rm with the three-entry target list that belongs to method fn.
 *
 * NOTE(review): the entries look like column indices to treat as missing,
 * one row per rec[] slot; confirm against vdev_raidz_reconstruct().
 */
static void
benchmark_rec_impl(raidz_map_t *rm, const int fn)
{
	static const int targets_by_fn[][3] = {
		{1, 2, 3},
		{0, 2, 3},
		{0, 1, 3},
		{2, 3, 4},
		{1, 3, 4},
		{0, 3, 4},
		{3, 4, 5}
	};
	const int *targets = targets_by_fn[fn];

	vdev_raidz_reconstruct(rm, targets, 3);
}
/*
 * Measure method slot fn of every supported implementation and remember
 * the fastest one.
 *
 * bench_rm  map the benchmark runs on; its rm_ops is overwritten with
 *           each implementation in turn
 * fn        index into gen[] or rec[]
 * bench_fn  driver to call; benchmark_gen_impl selects the gen[] tables,
 *           anything else selects the rec[] tables
 *
 * For each implementation the measured speed is stored in its kstat slot.
 * The index of the quickest implementation goes into the kstat slot at
 * raidz_supp_impl_cnt, and its method pointer into vdev_raidz_fastest_impl.
 */
static void
benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn,
    benchmark_fn bench_fn)
{
	const int measuring_gen = (bench_fn == benchmark_gen_impl);
	raidz_impl_kstat_t *fastest = &raidz_impl_kstats[raidz_supp_impl_cnt];
	uint64_t top_speed = 0;
	int impl;

	for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
		raidz_impl_ops_t *ops = raidz_supp_impl[impl];
		uint64_t runs = 0;
		uint64_t speed;
		hrtime_t started, elapsed;
		int rep;

		bench_rm->rm_ops = ops;
		started = gethrtime();

		/* Run in batches of five until BENCH_NS has elapsed. */
		do {
			for (rep = 0; rep < 5; rep++) {
				bench_fn(bench_rm, fn);
				runs++;
			}
			elapsed = gethrtime() - started;
		} while (elapsed < BENCH_NS);

		speed = runs * BENCH_ZIO_SIZE * NANOSEC;
		speed /= (elapsed * BENCH_COLS);

		if (measuring_gen)
			raidz_impl_kstats[impl].gen[fn] = speed;
		else
			raidz_impl_kstats[impl].rec[fn] = speed;

		/* Only a strictly faster result replaces the current best. */
		if (speed <= top_speed)
			continue;

		top_speed = speed;
		if (measuring_gen) {
			fastest->gen[fn] = impl;
			vdev_raidz_fastest_impl.gen[fn] = ops->gen[fn];
		} else {
			fastest->rec[fn] = impl;
			vdev_raidz_fastest_impl.rec[fn] = ops->rec[fn];
		}
	}
}
#endif
static void
benchmark_raidz(void)
{
raidz_impl_ops_t *curr_impl;
int i, c;
for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
if (curr_impl->init)
curr_impl->init();
if (curr_impl->is_supported())
raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
}
membar_producer();
raidz_supp_impl_cnt = c;
#if defined(_KERNEL)
zio_t *bench_zio = NULL;
raidz_map_t *bench_rm = NULL;
uint64_t bench_parity;
bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
bench_zio->io_offset = 0;
bench_zio->io_size = BENCH_ZIO_SIZE;
bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
bench_parity = fn + 1;
bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
BENCH_D_COLS + bench_parity, bench_parity);
benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
vdev_raidz_map_free(bench_rm);
}
bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
BENCH_COLS, PARITY_PQR);
for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
vdev_raidz_map_free(bench_rm);
abd_free(bench_zio->io_abd);
kmem_free(bench_zio, sizeof (zio_t));
#else
memcpy(&vdev_raidz_fastest_impl,
raidz_supp_impl[raidz_supp_impl_cnt - 1],
sizeof (vdev_raidz_fastest_impl));
strcpy(vdev_raidz_fastest_impl.name, "fastest");
#endif
}
/*
 * Initialize RAID-Z math: build the supported-implementation list and the
 * "fastest" ops table through benchmark_raidz(), then activate the
 * selection held in user_sel_impl.
 */
void
vdev_raidz_math_init(void)
{
benchmark_raidz();
/* Activate the pending selection (IMPL_FASTEST unless changed earlier). */
atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
/*
 * Set last.  vdev_raidz_impl_set() checks this flag to decide whether to
 * search raidz_supp_impl[] and whether to update the active selector or
 * user_sel_impl.
 */
raidz_math_initialized = B_TRUE;
}
void
vdev_raidz_math_fini(void)
{
raidz_impl_ops_t const *curr_impl;
for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
curr_impl = raidz_all_maths[i];
if (curr_impl->fini)
curr_impl->fini();
}
}
/*
 * Selector names that vdev_raidz_impl_set() accepts whether or not
 * initialization has completed.
 *
 * The order matters: zfs_vdev_raidz_impl_get() prints only the first
 * ARRAY_SIZE(math_impl_opts) - 2 rows, so "cycle" and "fastest" must stay
 * ahead of the last two entries.
 *
 * name points at string literals and is therefore declared pointer-to-const.
 */
static const struct {
	const char *name;
	uint32_t sel;
} math_impl_opts[] = {
	{ "cycle",	IMPL_CYCLE },
	{ "fastest",	IMPL_FASTEST },
	{ "original",	IMPL_ORIGINAL },
	{ "scalar",	IMPL_SCALAR }
};
/*
 * Select a RAID-Z implementation by name.
 *
 * val is copied, trailing whitespace is removed, and the result is looked
 * up first in math_impl_opts[] and then, once initialization has
 * completed, among the names in raidz_supp_impl[] (where the match's index
 * becomes the selector).
 *
 * On a match the selector is stored into zfs_vdev_raidz_impl if
 * initialization has completed, otherwise into user_sel_impl, and 0 is
 * returned.  Returns EINVAL when val is empty, does not fit in
 * RAIDZ_IMPL_NAME_MAX bytes including the terminator, or matches nothing.
 */
int
vdev_raidz_impl_set(const char *val)
{
	char req_name[RAIDZ_IMPL_NAME_MAX];
	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
	int err = EINVAL;
	size_t len, idx;

	len = strnlen(val, RAIDZ_IMPL_NAME_MAX);
	if (len == 0 || len == RAIDZ_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);

	/* Cut the copy off after its last non-whitespace character. */
	for (; len > 0; len--) {
		if (!isspace(req_name[len - 1]))
			break;
	}
	req_name[len] = '\0';

	for (idx = 0; err != 0 && idx < ARRAY_SIZE(math_impl_opts); idx++) {
		if (strcmp(req_name, math_impl_opts[idx].name) != 0)
			continue;
		impl = math_impl_opts[idx].sel;
		err = 0;
	}

	if (err != 0 && raidz_math_initialized) {
		for (idx = 0; err != 0 && idx < raidz_supp_impl_cnt; idx++) {
			if (strcmp(req_name, raidz_supp_impl[idx]->name) != 0)
				continue;
			impl = idx;
			err = 0;
		}
	}

	if (err != 0)
		return (err);

	if (raidz_math_initialized)
		atomic_swap_32(&zfs_vdev_raidz_impl, impl);
	else
		atomic_swap_32(&user_sel_impl, impl);

	return (0);
}
#if defined(_KERNEL) && defined(__linux__)
/*
 * Module-parameter "set" handler: hands val to vdev_raidz_impl_set() and
 * returns its result.  kp is not used.
 */
static int
zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	const int rc = vdev_raidz_impl_set(val);

	return (rc);
}
/*
 * Module-parameter "get" handler: writes a space-separated list of
 * selectable names into buffer, with the active one in square brackets.
 *
 * The leading ARRAY_SIZE(math_impl_opts) - 2 rows of math_impl_opts[] come
 * first, followed by every name in raidz_supp_impl[].  Returns the number
 * of characters written, not counting the terminating NUL.  kp is not
 * used.
 *
 * NOTE(review): the writes are not bounded by the size of buffer.
 */
static int
zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
	char *out = buffer;
	int i;

	ASSERT(raidz_math_initialized);

	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
		const int active = (impl == math_impl_opts[i].sel);

		out += sprintf(out, "%s%s%s ", active ? "[" : "",
		    math_impl_opts[i].name, active ? "]" : "");
	}

	for (i = 0; i < raidz_supp_impl_cnt; i++) {
		const int active = (i == impl);

		out += sprintf(out, "%s%s%s ", active ? "[" : "",
		    raidz_supp_impl[i]->name, active ? "]" : "");
	}

	return ((int)(out - buffer));
}
/*
 * Register zfs_vdev_raidz_impl as a module parameter with mode 0644, using
 * zfs_vdev_raidz_impl_set() and zfs_vdev_raidz_impl_get() as its handlers.
 */
module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
zfs_vdev_raidz_impl_get, NULL, 0644);
MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
#endif