#include <sys/queue.h>
#include <stdint.h>
#include <string.h>
#include "int.h"
#ifndef VOSS_X3_LOG2_COMBA
#define VOSS_X3_LOG2_COMBA 5
#endif
#if (VOSS_X3_LOG2_COMBA < 2)
#error "VOSS_X3_LOG2_COMBA must be greater than 1"
#endif
struct voss_x3_input_double {
double a;
double b;
} __aligned(16);
static void
voss_x3_multiply_sub_double(struct voss_x3_input_double *input, double *ptr_low, double *ptr_high,
const size_t stride, const uint8_t toggle)
{
size_t x;
size_t y;
if (stride >= (1UL << VOSS_X3_LOG2_COMBA)) {
const size_t strideh = stride >> 1;
if (toggle) {
for (x = 0; x != strideh; x++) {
double a, b, c, d;
a = ptr_low[x];
b = ptr_low[x + strideh];
c = ptr_high[x];
d = ptr_high[x + strideh];
ptr_low[x + strideh] = a + b;
ptr_high[x] = a + b + c + d;
}
voss_x3_multiply_sub_double(input, ptr_low, ptr_low + strideh, strideh, 1);
for (x = 0; x != strideh; x++)
ptr_low[x + strideh] = -ptr_low[x + strideh];
voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high + strideh, strideh, 1);
for (x = 0; x != strideh; x++) {
double a, b, c, d;
a = ptr_low[x];
b = ptr_low[x + strideh];
c = ptr_high[x];
d = ptr_high[x + strideh];
ptr_low[x + strideh] = -a - b;
ptr_high[x] = c + b - d;
input[x + strideh].a += input[x].a;
input[x + strideh].b += input[x].b;
}
voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high, strideh, 0);
} else {
voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high, strideh, 1);
for (x = 0; x != strideh; x++) {
double a, b, c, d;
a = ptr_low[x];
b = ptr_low[x + strideh];
c = ptr_high[x];
d = ptr_high[x + strideh];
ptr_low[x + strideh] = -a - b;
ptr_high[x] = a + b + c + d;
input[x + strideh].a -= input[x].a;
input[x + strideh].b -= input[x].b;
}
voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high + strideh, strideh, 0);
for (x = 0; x != strideh; x++)
ptr_low[x + strideh] = -ptr_low[x + strideh];
voss_x3_multiply_sub_double(input, ptr_low, ptr_low + strideh, strideh, 0);
for (x = 0; x != strideh; x++) {
double a, b, c, d;
a = ptr_low[x];
b = ptr_low[x + strideh];
c = ptr_high[x];
d = ptr_high[x + strideh];
ptr_low[x + strideh] = b - a;
ptr_high[x] = c - b - d;
}
}
} else {
for (x = 0; x != stride; x++) {
double value = input[x].a;
for (y = 0; y != (stride - x); y++) {
ptr_low[x + y] += input[y].b * value;
}
for (; y != stride; y++) {
ptr_high[x + y - stride] += input[y].b * value;
}
}
}
}
void
voss_x3_multiply_double(const int64_t *va, const double *vb, double *pc, const size_t max)
{
struct voss_x3_input_double input[max];
size_t x;
if (max & (max - 1))
return;
for (x = 0; x != max; x++) {
input[x].a = va[x];
input[x].b = vb[x];
}
voss_x3_multiply_sub_double(input, pc, pc + max, max, 1);
}