root/arch/arm64/crypto/sm4-ce-core.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

/*
 * For every register number N in the list, define the assembler symbol
 * ".LvN.4s" with the value N.  The sm4e/sm4ekey macros below paste ".L" in
 * front of their operands (e.g. "v24.4s" -> ".Lv24.4s") to obtain the
 * register number for the hand-built instruction word, so only registers
 * that appear in this list can be used as operands of those macros.
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
                20, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn
 *
 * Emit the SM4E instruction as a raw instruction word: destination register
 * number in bits [4:0], source register number in bits [9:5].  Operands must
 * be written as "vN.4s" so that the matching .LvN.4s symbol resolves.
 */
.macro sm4e, vd, vn
        .inst 0xcec08400 | .L\vd | (.L\vn << 5)
.endm

/*
 * sm4ekey vd, vn, vm
 *
 * Emit the SM4EKEY instruction as a raw instruction word: vd in bits [4:0],
 * vn in bits [9:5], vm in bits [20:16].  Operands must be written as
 * "vN.4s" so that the matching .LvN.4s symbol resolves.
 */
.macro sm4ekey, vd, vn, vm
        .inst 0xce60c800 | .L\vd | (.L\vn << 5) | (.L\vm << 16)
.endm

/* Register macros */

/* Scratch vector registers (used as temporaries by tweak_next) */
#define RTMP0   v16
#define RTMP1   v17
#define RTMP2   v18
#define RTMP3   v19

/*
 * RIV (CBC chaining value) and RMAC (running MAC digest) are two names for
 * the same register, v20; no routine in this file uses both.
 */
#define RIV     v20
#define RMAC    v20
/* Per-lane constants {1, 0x87} consumed by tweak_next in the XTS routines */
#define RMASK   v21


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
        /*
         * Derive the encryption and decryption round-key schedules from a
         * 128-bit user key.
         *
         * input:
         *   x0: 128-bit key
         *   x1: rkey_enc (128 bytes written)
         *   x2: rkey_dec (128 bytes written)
         *   x3: fk array (16 bytes read)
         *   x4: ck array (128 bytes read)
         */

        /* fetch key and fk; the key is byte-swapped within each word */
        ld1             {v0.16b}, [x0]
        ld1             {v1.16b}, [x3]
        rev32           v0.16b, v0.16b

        /* v24-v31 = ck */
        ld1             {v24.16b-v27.16b}, [x4], #64
        ld1             {v28.16b-v31.16b}, [x4]

        /* key ^ fk seeds the schedule */
        eor             v0.16b, v0.16b, v1.16b

        /* each sm4ekey yields the next four round keys: v0..v7 = rk[0..31] */
        sm4ekey         v0.4s, v0.4s, v24.4s
        sm4ekey         v1.4s, v0.4s, v25.4s
        sm4ekey         v2.4s, v1.4s, v26.4s
        sm4ekey         v3.4s, v2.4s, v27.4s
        sm4ekey         v4.4s, v3.4s, v28.4s
        sm4ekey         v5.4s, v4.4s, v29.4s
        sm4ekey         v6.4s, v5.4s, v30.4s
        sm4ekey         v7.4s, v6.4s, v31.4s

        /* the ck words in v24 are no longer needed: reuse it for the mask */
        adr_l           x5, .Lbswap128_mask
        ld1             {v24.16b}, [x5]

        /*
         * Decryption schedule = encryption schedule in reverse order:
         * walk v7..v0 and reverse the four words inside each vector.
         */
        tbl             v16.16b, {v7.16b}, v24.16b
        tbl             v17.16b, {v6.16b}, v24.16b
        tbl             v18.16b, {v5.16b}, v24.16b
        tbl             v19.16b, {v4.16b}, v24.16b
        tbl             v20.16b, {v3.16b}, v24.16b
        tbl             v21.16b, {v2.16b}, v24.16b
        tbl             v22.16b, {v1.16b}, v24.16b
        tbl             v23.16b, {v0.16b}, v24.16b

        /* write rkey_enc first, then rkey_dec */
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1]
        st1             {v16.16b-v19.16b}, [x2], #64
        st1             {v20.16b-v23.16b}, [x2]

        ret
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
        /*
         * Run the SM4 block function over exactly one 16-byte block.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst (16 bytes written)
         *   x2: src (16 bytes read)
         */
        SM4_PREPARE(x0)

        /* load, transform in place in v0, store */
        ld1             {v0.16b}, [x2]
        SM4_CRYPT_BLK(v0)
        st1             {v0.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
        /*
         * Run the SM4 block function over nblocks consecutive 16-byte
         * blocks; every block is processed independently of the others.
         * Blocks are handled 8 at a time, then 4, then one by one.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         *
         * x1/x2 are advanced past the processed data and w3 is consumed.
         */
        SM4_PREPARE(x0)

        /*
         * Nothing to do for an empty request.  Without this check a zero
         * count would fall through to the single-block loop, where w3
         * wraps to 0xffffffff and the loop runs far beyond both buffers.
         */
        cbz             w3, .Lcrypt_end

.Lcrypt_loop_blk:
        /* w3 -= 8; a negative result means fewer than 8 blocks are left */
        sub             w3, w3, #8
        tbnz            w3, #31, .Lcrypt_tail8

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w3, .Lcrypt_end
        b               .Lcrypt_loop_blk

.Lcrypt_tail8:
        /* undo the speculative subtraction: w3 = 0..7 blocks remaining */
        add             w3, w3, #8
        cmp             w3, #4
        blt             .Lcrypt_tail4

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64
        SM4_CRYPT_BLK4(v0, v1, v2, v3)
        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w3, .Lcrypt_end

.Lcrypt_tail4:
        /* w3 = 1..3 here: finish one block per iteration */
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16
        SM4_CRYPT_BLK(v0)
        st1             {v0.16b}, [x1], #16

        cbnz            w3, .Lcrypt_tail4

.Lcrypt_end:
        ret
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
        /*
         * CBC encryption of nblocks 16-byte blocks.  Each plaintext block
         * is XORed with the previous ciphertext block (the IV for the
         * first one) before going through the block function, so blocks
         * are necessarily processed one after another.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit); updated with the last
         *       ciphertext block on return
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

        /*
         * Empty request: leave through the normal exit, which writes the
         * unchanged IV back.  Without this check a zero count would enter
         * the single-block loop, wrap w4 to 0xffffffff and run far beyond
         * both buffers.
         */
        cbz             w4, .Lcbc_enc_end

.Lcbc_enc_loop_4x:
        cmp             w4, #4
        blt             .Lcbc_enc_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        /* chain: C0 = E(P0 ^ IV), Ci = E(Pi ^ Ci-1) */
        eor             v0.16b, v0.16b, RIV.16b
        SM4_CRYPT_BLK(v0)
        eor             v1.16b, v1.16b, v0.16b
        SM4_CRYPT_BLK(v1)
        eor             v2.16b, v2.16b, v1.16b
        SM4_CRYPT_BLK(v2)
        eor             v3.16b, v3.16b, v2.16b
        SM4_CRYPT_BLK(v3)

        st1             {v0.16b-v3.16b}, [x1], #64
        /* last ciphertext block is the chaining value for what follows */
        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_enc_end
        b               .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
        /* w4 = 1..3 here */
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        /* encrypt in place in RIV so it directly becomes the next IV */
        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        st1             {RIV.16b}, [x1], #16

        cbnz            w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
        /*
         * CBC decryption of nblocks 16-byte blocks, 8 at a time, then 4,
         * then one by one.  Each output is D(Ci) ^ Ci-1, with the IV
         * standing in for C-1.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit); updated with the last
         *       ciphertext block on return
         *   w4: nblocks
         *
         * The ciphertext is kept untouched in v0-v7 for chaining while
         * byte-swapped copies are decrypted in v8-v15.  The *_BE macro
         * variants come from sm4-ce-asm.h (not visible here); presumably
         * they expect input that has already been rev32-swapped.
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

        /*
         * Empty request: leave through the normal exit, which writes the
         * unchanged IV back.  Without this check a zero count would enter
         * the single-block loop, wrap w4 to 0xffffffff and run far beyond
         * both buffers.
         */
        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_loop_8x:
        /* w4 -= 8; a negative result means fewer than 8 blocks are left */
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b
        rev32           v12.16b, v4.16b
        rev32           v13.16b, v5.16b
        rev32           v14.16b, v6.16b
        rev32           v15.16b, v7.16b

        SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

        /* XOR each decrypted block with the preceding ciphertext block */
        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b
        eor             v12.16b, v12.16b, v3.16b
        eor             v13.16b, v13.16b, v4.16b
        eor             v14.16b, v14.16b, v5.16b
        eor             v15.16b, v15.16b, v6.16b

        st1             {v8.16b-v11.16b}, [x1], #64
        st1             {v12.16b-v15.16b}, [x1], #64

        /* last ciphertext block chains into the next batch */
        mov             RIV.16b, v7.16b

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        /* undo the speculative subtraction: w4 = 0..7 blocks remaining */
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b

        SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b

        st1             {v8.16b-v11.16b}, [x1], #64

        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
        /* w4 = 1..3 here */
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        rev32           v8.16b, v0.16b

        SM4_CRYPT_BLK_BE(v8)

        eor             v8.16b, v8.16b, RIV.16b
        st1             {v8.16b}, [x1], #16

        mov             RIV.16b, v0.16b

        cbnz            w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
        /*
         * CBC ciphertext-stealing encryption of the final two blocks of a
         * message: one full 16-byte block followed by Ln = nbytes - 16
         * trailing bytes.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         *
         * NOTE(review): presumably 16 < nbytes <= 32 is guaranteed by the
         * caller; other values index outside the useful part of
         * .Lcts_permute_table. The IV at x3 is read but not written back.
         */
        SM4_PREPARE(x0)

        /* x5 = Ln, the length of the trailing partial block */
        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        /* RIV = En-1 = E(Pn-1 ^ IV) */
        ld1             {v0.16b}, [x2]
        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        /* load permute table */
        /* v3 = 16-byte window at table + Ln, v4 = window at table + 32 - Ln */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        /* v1 = src[Ln .. Ln+15]; its last Ln bytes are the partial block Pn */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        /* v0 = first Ln bytes of En-1 moved to the end, zeros in front */
        tbl             v0.16b, {RIV.16b}, v3.16b
        /* padding Pn with zeros */
        /* v1 = Pn moved to the front; 0xff indices make tbl write zeros */
        tbl             v1.16b, {v1.16b}, v4.16b

        eor             v1.16b, v1.16b, RIV.16b
        SM4_CRYPT_BLK(v1)

        /* overlapping stores */
        /*
         * Order matters: v0 goes to dst + Ln first, then the full block v1
         * to dst overwrites the leading zero bytes of that store, leaving
         * only the Ln stolen bytes at dst + 16.
         */
        add             x5, x1, x5
        st1             {v0.16b}, [x5]
        st1             {v1.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
        /*
         * CBC ciphertext-stealing decryption of the final two blocks of a
         * message: one full 16-byte block followed by Ln = nbytes - 16
         * trailing bytes.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         *
         * NOTE(review): presumably 16 < nbytes <= 32 is guaranteed by the
         * caller; other values index outside the useful part of
         * .Lcts_permute_table. The IV at x3 is read but not written back.
         */
        SM4_PREPARE(x0)

        /* x5 = Ln, the length of the trailing partial block */
        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        /* load permute table */
        /* v3 = 16-byte window at table + Ln, v4 = window at table + 32 - Ln */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        /*
         * v0 = first (full) ciphertext block, then x2 += Ln so that
         * v1 = src[Ln .. Ln+15], whose last Ln bytes are the partial Cn.
         */
        ld1             {v0.16b}, [x2], x5
        ld1             {v1.16b}, [x2]

        /* v0 = Xn, the block-function output for the first block */
        SM4_CRYPT_BLK(v0)
        /* select the first Ln bytes of Xn to create Pn */
        /* (placed in the last Ln byte positions, lined up with Cn in v1) */
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        /* overwrite the first Ln bytes with Cn to create En-1 */
        /* tbx leaves bytes with 0xff indices untouched: the tail of Xn stays */
        tbx             v0.16b, {v1.16b}, v4.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, RIV.16b

        /* overlapping stores */
        /*
         * Order matters: v2 goes to dst + Ln first, then the full block v0
         * to dst overwrites the leading 16 - Ln bytes of that store, which
         * hold no useful data.
         */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
        /*
         * CTR mode over nblocks 16-byte blocks: the block function is
         * applied to successive counter values and the result is XORed
         * with the source.  Blocks are handled 8 at a time, then 4, then
         * one by one.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit); updated to the next unused
         *       counter value on return
         *   w4: nblocks
         *
         * The counter lives in x7 (most significant half) and x8 (least
         * significant half) in native integer form while the loop runs.
         */
        SM4_PREPARE(x0)

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8

        /*
         * Empty request: leave through the normal exit, which writes the
         * unchanged counter back.  Without this check a zero count would
         * enter the single-block loop, wrap w4 to 0xffffffff and run far
         * beyond both buffers.
         */
        cbz             w4, .Lctr_end

/*
 * inc_le128(vctr): copy the current counter x7:x8 into vctr in its
 * big-endian memory layout (rev64 swaps the bytes of each 64-bit lane),
 * then increment x7:x8 by one as a 128-bit integer.  rev64 does not touch
 * the flags, so the carry produced by adds is still valid for adc.
 */
#define inc_le128(vctr)                                 \
                mov             vctr.d[1], x8;          \
                mov             vctr.d[0], x7;          \
                adds            x8, x8, #1;             \
                rev64           vctr.16b, vctr.16b;     \
                adc             x7, x7, xzr;

.Lctr_loop_8x:
        /* w4 -= 8; a negative result means fewer than 8 blocks are left */
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_4x

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        ld1             {v8.16b-v11.16b}, [x2], #64
        ld1             {v12.16b-v15.16b}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        /* keystream ^ input */
        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_end
        b               .Lctr_loop_8x

.Lctr_4x:
        /* undo the speculative subtraction: w4 = 0..7 blocks remaining */
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_loop_1x

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v8.16b-v11.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_end

.Lctr_loop_1x:
        /* w4 = 1..3 here */
        sub             w4, w4, #1

        /* construct CTRs */
        inc_le128(v0)

        ld1             {v8.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        cbnz            w4, .Lctr_loop_1x

.Lctr_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_ce_ctr_enc)


/*
 * tweak_next(vt, vin, RTMP): derive the next XTS tweak, vt = vin * x in
 * GF(2^128), treating vin as two 64-bit lanes (d[0] low, d[1] high).
 *
 *   sshr  - every lane becomes all-ones if its top bit was set, else zero
 *   and   - keep RMASK's constant for that lane (the XTS routines load
 *           RMASK with d[0] = 1, d[1] = 0x87)
 *   add   - vin + vin shifts each lane left by one bit
 *   ext   - swap the two lanes of RTMP, so a carry out of the low lane
 *           becomes bit 0 of the high lane and a carry out of the high
 *           lane becomes the 0x87 reduction term in the low lane
 *   eor   - fold those carries into the shifted value
 *
 * RTMP is clobbered.  vt may be the same register as vin: vin is read by
 * sshr and add before vt is written.
 */
#define tweak_next(vt, vin, RTMP)                                       \
                sshr            RTMP.2d, vin.2d, #63;                   \
                and             RTMP.16b, RTMP.16b, RMASK.16b;          \
                add             vt.2d, vin.2d, vin.2d;                  \
                ext             RTMP.16b, RTMP.16b, RTMP.16b, #8;       \
                eor             vt.16b, vt.16b, RTMP.16b;

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
        /*
         * XTS encryption of nbytes bytes.  Full blocks are handled 8 at a
         * time, then 4, then one by one; a trailing partial block (nbytes
         * not a multiple of 16) is handled with ciphertext stealing.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         *       (NULL: the value at x3 is used as the tweak as-is)
         *
         * The tweak at x3 is written back only when there is no partial
         * block.  v8 holds the current tweak, v9-v15 the following ones.
         *
         * NOTE: nbytes < 16 is not handled here: the block count below
         * becomes -1 and the 1x loop would not terminate sensibly.
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_enc_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
        SM4_PREPARE(x0)

        /*
         * x5 = nbytes % 16 (tail length), w4 = number of blocks for the
         * bulk loops: nbytes / 16, minus one when there is a tail, because
         * the last full block is then processed by the CTS code.
         */
        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5

        /* RMASK = { d[0] = 1, d[1] = 0x87 } for tweak_next */
        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
        /* w4 -= 8; a negative result means fewer than 8 blocks are left */
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_enc_4x

        /* v8..v15 = tweaks for the 8 blocks of this batch */
        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        /* XOR with tweak before the block function ... */
        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        /* ... and again with the same tweak afterwards */
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        /* v8 = tweak for the first block after this batch */
        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_enc_cts
        b               .Lxts_enc_loop_8x

.Lxts_enc_4x:
        /* undo the speculative subtraction: w4 = 0..7 blocks remaining */
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_enc_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        /* advance the tweak in place */
        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
        /* no tail bytes: just write the tweak back and return */
        cbz             x5, .Lxts_enc_end

        /* cipher text stealing */

        /* v8 = tweak for the last full block, v9 = tweak for the final one */
        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* load permute table */
        /* v3 = window at table + tail, v4 = window at table + 32 - tail */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        /* v1 = src[tail .. tail+15]; its last bytes are the partial Pn */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        /* tbx keeps the bytes of v0 whose index in v4 is 0xff */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b


        /* overlapping stores */
        /*
         * Order matters: the partial block goes to dst + tail first, then
         * the full block stored at dst overwrites its unused leading bytes.
         */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_enc_ret

.Lxts_enc_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_enc_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
        /*
         * XTS decryption of nbytes bytes.  Full blocks are handled 8 at a
         * time, then 4, then one by one; a trailing partial block (nbytes
         * not a multiple of 16) is handled with ciphertext stealing.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         *       (NULL: the value at x3 is used as the tweak as-is)
         *
         * The tweak at x3 is written back only when there is no partial
         * block.  v8 holds the current tweak, v9-v15 the following ones.
         *
         * NOTE: nbytes < 16 is not handled here: the block count below
         * becomes -1 and the 1x loop would not terminate sensibly.
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_dec_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
        SM4_PREPARE(x0)

        /*
         * x5 = nbytes % 16 (tail length), w4 = number of blocks for the
         * bulk loops: nbytes / 16, minus one when there is a tail, because
         * the last full block is then processed by the CTS code.
         */
        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5

        /* RMASK = { d[0] = 1, d[1] = 0x87 } for tweak_next */
        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
        /* w4 -= 8; a negative result means fewer than 8 blocks are left */
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_dec_4x

        /* v8..v15 = tweaks for the 8 blocks of this batch */
        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        /* XOR with tweak before the block function ... */
        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        /* ... and again with the same tweak afterwards */
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        /* v8 = tweak for the first block after this batch */
        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_dec_cts
        b               .Lxts_dec_loop_8x

.Lxts_dec_4x:
        /* undo the speculative subtraction: w4 = 0..7 blocks remaining */
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_dec_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        /* advance the tweak in place */
        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
        /* no tail bytes: just write the tweak back and return */
        cbz             x5, .Lxts_dec_end

        /* cipher text stealing */

        /*
         * Unlike the encrypt path, the tweaks are applied in swapped
         * order here: the last full ciphertext block is processed with
         * the later tweak (v9), the reassembled block with v8.
         */
        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* load permute table */
        /* v3 = window at table + tail, v4 = window at table + 32 - tail */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        /* v1 = src[tail .. tail+15]; its last bytes are the partial Cn */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* v2 = partial output block: the first tail bytes of v0, moved to the end */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* replace the first tail bytes of v0 with Cn; tbx keeps the rest */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b


        /* overlapping stores */
        /*
         * Order matters: the partial block goes to dst + tail first, then
         * the full block stored at dst overwrites its unused leading bytes.
         */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_dec_ret

.Lxts_dec_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_dec_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
        /*
         * Absorb nblocks 16-byte blocks into a CBC-MAC style digest:
         * digest = E(digest ^ block) for every block.
         *
         * input:
         *   x0: round key array, CTX
         *   x1: digest (16 bytes, read and written back)
         *   x2: src
         *   w3: nblocks
         *   w4: enc_before - nonzero: run the block function over the
         *                    digest once before absorbing any data
         *   w5: enc_after  - zero: the last block is only XORed into the
         *                    digest and left unencrypted
         */
        SM4_PREPARE(x0)

        ld1             {RMAC.16b}, [x1]

        /* optional leading encryption of the incoming digest */
        cbz             w4, .Lmac_update
        SM4_CRYPT_BLK(RMAC)

.Lmac_update:
        /* no data at all: write the digest straight back */
        cbz             w3, .Lmac_ret

        /* w3 = enc_after ? nblocks : nblocks - 1 (blocks to fully process) */
        sub             w6, w3, #1
        cmp             w5, #0
        csel            w3, w6, w3, eq

        cbz             w3, .Lmac_end

.Lmac_loop_4x:
        cmp             w3, #4
        blt             .Lmac_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        /* the four blocks are chained through RMAC one after another */
        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v1.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v2.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v3.16b
        SM4_CRYPT_BLK(RMAC)

        /* more to do: re-evaluate the 4x/1x split; otherwise finish up */
        cbnz            w3, .Lmac_loop_4x
        b               .Lmac_end

.Lmac_loop_1x:
        /* w3 = 1..3 here */
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)

        cbnz            w3, .Lmac_loop_1x

.Lmac_end:
        /* enc_after set: every block has already been absorbed */
        cbnz            w5, .Lmac_ret

        /* otherwise fold in the held-back last block without encrypting */
        ld1             {v0.16b}, [x2], #16
        eor             RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
        st1             {RMAC.16b}, [x1]
        ret
SYM_FUNC_END(sm4_ce_mac_update)


        .section        ".rodata", "a"
        .align 4
/*
 * TBL index vector that reverses the order of the four 32-bit words of a
 * vector while keeping the byte order inside each word.  Used by
 * sm4_ce_expand_key to build the decryption key schedule.
 */
.Lbswap128_mask:
        .byte           0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
        .byte           0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

/*
 * 48-byte table for the ciphertext-stealing code: 16 x 0xff, the identity
 * indices 0..15, then 16 x 0xff.  The CTS paths load two 16-byte windows
 * from it, at offset Ln and at offset 32 - Ln (Ln = length of the partial
 * block), and use them as TBL/TBX index vectors: a 0xff index is out of
 * range, so TBL writes zero and TBX leaves the destination byte unchanged.
 */
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff