COPY_STRIDE
/*
 * Step through the buffers with index i advancing by 2 * COPY_STRIDE per
 * iteration, stopping once i reaches size / sizeof (v_t).
 * Presumably `size` is a byte count and i indexes v_t elements - verify
 * against the declarations of size, src and dst, which are not visible here.
 * NOTE(review): this excerpt shows only the accesses at offset
 * i + COPY_STRIDE; confirm that the accesses at offset i (the other half of
 * each 2 * COPY_STRIDE step) are present in the part of the body that lies
 * outside this view.
 */
for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) {
/* LOAD is a macro defined elsewhere; presumably reads from src + i + COPY_STRIDE into COPY_D - TODO confirm. */
LOAD(src + i + COPY_STRIDE, COPY_D);
/* STORE is a macro defined elsewhere; presumably writes COPY_D out to the same offset in dst - TODO confirm. */
STORE(dst + i + COPY_STRIDE, COPY_D);