LHALF
u[2] = LHALF(tmp.ul[H]);
u[4] = LHALF(tmp.ul[L]);
v[2] = LHALF(tmp.ul[H]);
v[4] = LHALF(tmp.ul[L]);
u[i + j] = LHALF(t);
u[j] = LHALF(t);
u[i + j] = LHALF(t);
u[j] = LHALF(u[j] + t);
LHALF(u[i - 1] << (HALF_BITS - d));
p[i] = LHALF(p[i] << sh) | (p[i + 1] >> (HALF_BITS - sh));
p[i] = LHALF(p[i] << sh);
u0 = LHALF(u);
v0 = LHALF(v);
u[2] = LHALF(tmp.ul[H]);
u[4] = LHALF(tmp.ul[L]);
v[2] = LHALF(tmp.ul[H]);
v[4] = LHALF(tmp.ul[L]);
u[i + j] = LHALF(t);
u[j] = LHALF(t);
u[i + j] = LHALF(t);
u[j] = LHALF(u[j] + t);
LHALF(u[i - 1] << (HALF_BITS - d));
p[i] = LHALF(p[i] << sh) | (p[i + 1] >> (HALF_BITS - sh));
p[i] = LHALF(p[i] << sh);