B3
uint32_t B0, B1, B2, B3;
B3 = crypto_load_be32toh(&in[3 * 4]);
B0 ^= SM4_T_slow(B1 ^ B2 ^ B3 ^ ks->rk[0]);
B1 ^= SM4_T_slow(B0 ^ B2 ^ B3 ^ ks->rk[1]);
B2 ^= SM4_T_slow(B0 ^ B1 ^ B3 ^ ks->rk[2]);
B3 ^= SM4_T_slow(B0 ^ B1 ^ B2 ^ ks->rk[3]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[4]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[5]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[6]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[7]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[8]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[9]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[10]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[11]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[12]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[13]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[14]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[15]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[16]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[17]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[18]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[19]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[20]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[21]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[22]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[23]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[24]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[25]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[26]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[27]);
B0 ^= SM4_T_slow(B1 ^ B2 ^ B3 ^ ks->rk[28]);
B1 ^= SM4_T_slow(B0 ^ B2 ^ B3 ^ ks->rk[29]);
B2 ^= SM4_T_slow(B0 ^ B1 ^ B3 ^ ks->rk[30]);
B3 ^= SM4_T_slow(B0 ^ B1 ^ B2 ^ ks->rk[31]);
crypto_store_htobe32(&out[0 * 4], B3);
uint32_t B0, B1, B2, B3;
B3 = crypto_load_be32toh(&in[3 * 4]);
B0 ^= SM4_T_slow(B1 ^ B2 ^ B3 ^ ks->rk[31]);
B1 ^= SM4_T_slow(B0 ^ B2 ^ B3 ^ ks->rk[30]);
B2 ^= SM4_T_slow(B0 ^ B1 ^ B3 ^ ks->rk[29]);
B3 ^= SM4_T_slow(B0 ^ B1 ^ B2 ^ ks->rk[28]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[27]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[26]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[25]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[24]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[23]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[22]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[21]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[20]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[19]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[18]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[17]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[16]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[15]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[14]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[13]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[12]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[11]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[10]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[9]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[8]);
B0 ^= SM4_T(B1 ^ B2 ^ B3 ^ ks->rk[7]);
B1 ^= SM4_T(B0 ^ B2 ^ B3 ^ ks->rk[6]);
B2 ^= SM4_T(B0 ^ B1 ^ B3 ^ ks->rk[5]);
B3 ^= SM4_T(B0 ^ B1 ^ B2 ^ ks->rk[4]);
B0 ^= SM4_T_slow(B1 ^ B2 ^ B3 ^ ks->rk[3]);
B1 ^= SM4_T_slow(B0 ^ B2 ^ B3 ^ ks->rk[2]);
B2 ^= SM4_T_slow(B0 ^ B1 ^ B3 ^ ks->rk[1]);
B3 ^= SM4_T_slow(B0 ^ B1 ^ B2 ^ ks->rk[0]);
crypto_store_htobe32(&out[0 * 4], B3);
B3(r->base));
B3(r->length));
func(B3) \