GET_U8_BE
(T0) = S1[GET_U8_BE(T0, 0)] ^ S2[GET_U8_BE(T0, 1)] ^ X1[GET_U8_BE(T0, 2)] ^ X2[GET_U8_BE(T0, 3)]; \
(T1) = S1[GET_U8_BE(T1, 0)] ^ S2[GET_U8_BE(T1, 1)] ^ X1[GET_U8_BE(T1, 2)] ^ X2[GET_U8_BE(T1, 3)]; \
(T2) = S1[GET_U8_BE(T2, 0)] ^ S2[GET_U8_BE(T2, 1)] ^ X1[GET_U8_BE(T2, 2)] ^ X2[GET_U8_BE(T2, 3)]; \
(T3) = S1[GET_U8_BE(T3, 0)] ^ S2[GET_U8_BE(T3, 1)] ^ X1[GET_U8_BE(T3, 2)] ^ X2[GET_U8_BE(T3, 3)]; \
(T0) = X1[GET_U8_BE(T0, 0)] ^ X2[GET_U8_BE(T0, 1)] ^ S1[GET_U8_BE(T0, 2)] ^ S2[GET_U8_BE(T0, 3)]; \
(T1) = X1[GET_U8_BE(T1, 0)] ^ X2[GET_U8_BE(T1, 1)] ^ S1[GET_U8_BE(T1, 2)] ^ S2[GET_U8_BE(T1, 3)]; \
(T2) = X1[GET_U8_BE(T2, 0)] ^ X2[GET_U8_BE(T2, 1)] ^ S1[GET_U8_BE(T2, 2)] ^ S2[GET_U8_BE(T2, 3)]; \
(T3) = X1[GET_U8_BE(T3, 0)] ^ X2[GET_U8_BE(T3, 1)] ^ S1[GET_U8_BE(T3, 2)] ^ S2[GET_U8_BE(T3, 3)]; \
((uint8_t *)(DEST))[IDX * 4] = GET_U8_BE(VAL, 0); \
((uint8_t *)(DEST))[IDX * 4 + 1] = GET_U8_BE(VAL, 1); \
((uint8_t *)(DEST))[IDX * 4 + 2] = GET_U8_BE(VAL, 2); \
reg0 = rk->u[0] ^ MAKE_U32((uint8_t)(X1[GET_U8_BE(reg0, 0)]), (uint8_t)(X2[GET_U8_BE(reg0, 1)] >> 8), (uint8_t)(S1[GET_U8_BE(reg0, 2)]), (uint8_t)(S2[GET_U8_BE(reg0, 3)]));
reg1 = rk->u[1] ^ MAKE_U32((uint8_t)(X1[GET_U8_BE(reg1, 0)]), (uint8_t)(X2[GET_U8_BE(reg1, 1)] >> 8), (uint8_t)(S1[GET_U8_BE(reg1, 2)]), (uint8_t)(S2[GET_U8_BE(reg1, 3)]));
reg2 = rk->u[2] ^ MAKE_U32((uint8_t)(X1[GET_U8_BE(reg2, 0)]), (uint8_t)(X2[GET_U8_BE(reg2, 1)] >> 8), (uint8_t)(S1[GET_U8_BE(reg2, 2)]), (uint8_t)(S2[GET_U8_BE(reg2, 3)]));
reg3 = rk->u[3] ^ MAKE_U32((uint8_t)(X1[GET_U8_BE(reg3, 0)]), (uint8_t)(X2[GET_U8_BE(reg3, 1)] >> 8), (uint8_t)(S1[GET_U8_BE(reg3, 2)]), (uint8_t)(S2[GET_U8_BE(reg3, 3)]));
((uint8_t *)(DEST))[IDX * 4 + 3] = GET_U8_BE(VAL, 3); \