/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in RFC 8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto
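
/*
 * sm4e is emitted by hand with .inst so that the file also assembles
 * with toolchains that lack SM4 support; the .irp block below maps the
 * vector register names used here to the numbers needed for encoding.
 */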

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH   v21
#define RRCONST v22
#define RZERO   v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128 bits of the product in r0, high 128 bits in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)                    \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                eor             T0.16b, T0.16b, T1.16b;         \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r1.16b, r1.16b, T0.16b;
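
/*
 * Illustrative pseudo-C for PMUL_128x128, where clmul(a, b) denotes a
 * 64x64 carry-less multiply with a 128-bit result: a schoolbook
 * multiply built from four pmull/pmull2 products, with the two cross
 * products combined into the middle term and split across the halves:
 *
 *	r0  = clmul(m0.lo, m1.lo);
 *	r1  = clmul(m0.hi, m1.hi);
 *	mid = clmul(m0.lo, m1.hi) ^ clmul(m0.hi, m1.lo);
 *	r0 ^= mid << 64;
 *	r1 ^= mid >> 64;
 */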

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,                 \
                        r2, r3, m2, m3, T2, T3,                 \
                        r4, r5, m4, m5, T4, T5,                 \
                        r6, r7, m6, m7, T6, T7)                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                ext             T2.16b, m3.16b, m3.16b, #8;     \
                ext             T4.16b, m5.16b, m5.16b, #8;     \
                ext             T6.16b, m7.16b, m7.16b, #8;     \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           r2.1q, m2.1d, m3.1d;            \
                pmull           r4.1q, m4.1d, m5.1d;            \
                pmull           r6.1q, m6.1d, m7.1d;            \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull           T3.1q, m2.1d, T2.1d;            \
                pmull           T5.1q, m4.1d, T4.1d;            \
                pmull           T7.1q, m6.1d, T6.1d;            \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          T2.1q, m2.2d, T2.2d;            \
                pmull2          T4.1q, m4.2d, T4.2d;            \
                pmull2          T6.1q, m6.2d, T6.2d;            \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                pmull2          r3.1q, m2.2d, m3.2d;            \
                pmull2          r5.1q, m4.2d, m5.2d;            \
                pmull2          r7.1q, m6.2d, m7.2d;            \
                eor             T0.16b, T0.16b, T1.16b;         \
                eor             T2.16b, T2.16b, T3.16b;         \
                eor             T4.16b, T4.16b, T5.16b;         \
                eor             T6.16b, T6.16b, T7.16b;         \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T3.16b, RZERO.16b, T2.16b, #8;  \
                ext             T5.16b, RZERO.16b, T4.16b, #8;  \
                ext             T7.16b, RZERO.16b, T6.16b, #8;  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                ext             T2.16b, T2.16b, RZERO.16b, #8;  \
                ext             T4.16b, T4.16b, RZERO.16b, #8;  \
                ext             T6.16b, T6.16b, RZERO.16b, #8;  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r2.16b, r2.16b, T3.16b;         \
                eor             r4.16b, r4.16b, T5.16b;         \
                eor             r6.16b, r6.16b, T7.16b;         \
                eor             r1.16b, r1.16b, T0.16b;         \
                eor             r3.16b, r3.16b, T2.16b;         \
                eor             r5.16b, r5.16b, T4.16b;         \
                eor             r7.16b, r7.16b, T6.16b;
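
/*
 * Same computation as PMUL_128x128, with four independent
 * multiplications interleaved instruction-by-instruction so that the
 * pmull/eor operations can issue back to back and hide each other's
 * latency.
 */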

/*
 * input: r0:r1 (low 128 bits of the product in r0, high 128 bits in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)                    \
                pmull2          T0.1q, r1.2d, rconst.2d;        \
                ext             T1.16b, T0.16b, RZERO.16b, #8;  \
                ext             T0.16b, RZERO.16b, T0.16b, #8;  \
                eor             r1.16b, r1.16b, T1.16b;         \
                eor             r0.16b, r0.16b, T0.16b;         \
                pmull           T0.1q, r1.1d, rconst.1d;        \
                eor             a.16b, r0.16b, T0.16b;
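
/*
 * Illustrative pseudo-C for REDUCTION (clmul as above): the 256-bit
 * product r1:r0 is folded down to 128 bits in two steps using the
 * reduction constant (0x87, see .Lghash_rconst):
 *
 *	t = clmul(r1.hi, 0x87);		// fold bits 255..192
 *	r1.lo ^= t >> 64;
 *	r0    ^= t << 64;
 *	a = r0 ^ clmul(r1.lo, 0x87);	// fold bits 191..128
 */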

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)  \
        rev32                   b0.16b, b0.16b;                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
        sm4e                    b0.4s, v24.4s;                  \
                pmull           r0.1q, m0.1d, m1.1d;            \
        sm4e                    b0.4s, v25.4s;                  \
                pmull           T1.1q, m0.1d, T0.1d;            \
        sm4e                    b0.4s, v26.4s;                  \
                pmull2          T0.1q, m0.2d, T0.2d;            \
        sm4e                    b0.4s, v27.4s;                  \
                pmull2          r1.1q, m0.2d, m1.2d;            \
        sm4e                    b0.4s, v28.4s;                  \
                eor             T0.16b, T0.16b, T1.16b;         \
        sm4e                    b0.4s, v29.4s;                  \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
        sm4e                    b0.4s, v30.4s;                  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
        sm4e                    b0.4s, v31.4s;                  \
                eor             r0.16b, r0.16b, T1.16b;         \
        rev64                   b0.4s, b0.4s;                   \
                eor             r1.16b, r1.16b, T0.16b;         \
        ext                     b0.16b, b0.16b, b0.16b, #8;     \
        rev32                   b0.16b, b0.16b;
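
/*
 * One SM4 block encryption (eight sm4e rounds on the round keys in
 * v24-v31) interleaved instruction-by-instruction with a PMUL_128x128,
 * so the crypto and pmull units work in parallel.  The trailing
 * rev64/ext/rev32 is SM4's output transform: it reverses the word
 * order of the state and restores byte order.  The three-block variant
 * below follows the same pattern.
 */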

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,                 \
                                    r0, r1, m0, m1, T0, T1,     \
                                    r2, r3, m2, m3, T2, T3,     \
                                    r4, r5, m4, m5, T4, T5)     \
        rev32                   b0.16b, b0.16b;                 \
        rev32                   b1.16b, b1.16b;                 \
        rev32                   b2.16b, b2.16b;                 \
                ext             T0.16b, m1.16b, m1.16b, #8;     \
                ext             T2.16b, m3.16b, m3.16b, #8;     \
                ext             T4.16b, m5.16b, m5.16b, #8;     \
        sm4e                    b0.4s, v24.4s;                  \
        sm4e                    b1.4s, v24.4s;                  \
        sm4e                    b2.4s, v24.4s;                  \
                pmull           r0.1q, m0.1d, m1.1d;            \
                pmull           r2.1q, m2.1d, m3.1d;            \
                pmull           r4.1q, m4.1d, m5.1d;            \
        sm4e                    b0.4s, v25.4s;                  \
        sm4e                    b1.4s, v25.4s;                  \
        sm4e                    b2.4s, v25.4s;                  \
                pmull           T1.1q, m0.1d, T0.1d;            \
                pmull           T3.1q, m2.1d, T2.1d;            \
                pmull           T5.1q, m4.1d, T4.1d;            \
        sm4e                    b0.4s, v26.4s;                  \
        sm4e                    b1.4s, v26.4s;                  \
        sm4e                    b2.4s, v26.4s;                  \
                pmull2          T0.1q, m0.2d, T0.2d;            \
                pmull2          T2.1q, m2.2d, T2.2d;            \
                pmull2          T4.1q, m4.2d, T4.2d;            \
        sm4e                    b0.4s, v27.4s;                  \
        sm4e                    b1.4s, v27.4s;                  \
        sm4e                    b2.4s, v27.4s;                  \
                pmull2          r1.1q, m0.2d, m1.2d;            \
                pmull2          r3.1q, m2.2d, m3.2d;            \
                pmull2          r5.1q, m4.2d, m5.2d;            \
        sm4e                    b0.4s, v28.4s;                  \
        sm4e                    b1.4s, v28.4s;                  \
        sm4e                    b2.4s, v28.4s;                  \
                eor             T0.16b, T0.16b, T1.16b;         \
                eor             T2.16b, T2.16b, T3.16b;         \
                eor             T4.16b, T4.16b, T5.16b;         \
        sm4e                    b0.4s, v29.4s;                  \
        sm4e                    b1.4s, v29.4s;                  \
        sm4e                    b2.4s, v29.4s;                  \
                ext             T1.16b, RZERO.16b, T0.16b, #8;  \
                ext             T3.16b, RZERO.16b, T2.16b, #8;  \
                ext             T5.16b, RZERO.16b, T4.16b, #8;  \
        sm4e                    b0.4s, v30.4s;                  \
        sm4e                    b1.4s, v30.4s;                  \
        sm4e                    b2.4s, v30.4s;                  \
                ext             T0.16b, T0.16b, RZERO.16b, #8;  \
                ext             T2.16b, T2.16b, RZERO.16b, #8;  \
                ext             T4.16b, T4.16b, RZERO.16b, #8;  \
        sm4e                    b0.4s, v31.4s;                  \
        sm4e                    b1.4s, v31.4s;                  \
        sm4e                    b2.4s, v31.4s;                  \
                eor             r0.16b, r0.16b, T1.16b;         \
                eor             r2.16b, r2.16b, T3.16b;         \
                eor             r4.16b, r4.16b, T5.16b;         \
        rev64                   b0.4s, b0.4s;                   \
        rev64                   b1.4s, b1.4s;                   \
        rev64                   b2.4s, b2.4s;                   \
                eor             r1.16b, r1.16b, T0.16b;         \
                eor             r3.16b, r3.16b, T2.16b;         \
                eor             r5.16b, r5.16b, T4.16b;         \
        ext                     b0.16b, b0.16b, b0.16b, #8;     \
        ext                     b1.16b, b1.16b, b1.16b, #8;     \
        ext                     b2.16b, b2.16b, b2.16b, #8;     \
                eor             r0.16b, r0.16b, r2.16b;         \
                eor             r1.16b, r1.16b, r3.16b;         \
        rev32                   b0.16b, b0.16b;                 \
        rev32                   b1.16b, b1.16b;                 \
        rev32                   b2.16b, b2.16b;                 \
                eor             r0.16b, r0.16b, r4.16b;         \
                eor             r1.16b, r1.16b, r5.16b;

#define inc32_le128(vctr)                                       \
                mov             vctr.d[1], x9;                  \
                add             w6, w9, #1;                     \
                mov             vctr.d[0], x8;                  \
                bfi             x9, x6, #0, #32;                \
                rev64           vctr.16b, vctr.16b;
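
/*
 * Illustrative pseudo-C for inc32_le128, assuming x8/x9 hold the two
 * 64-bit halves of the counter block as native integers (the 32-bit
 * block counter sits in the low 32 bits of x9 and wraps mod 2^32, as
 * GCM's inc32 requires):
 *
 *	vctr = ((u128)x8 << 64) | x9;			// this block's counter
 *	x9 = (x9 & ~0xffffffffULL) | (u32)(x9 + 1);	// bump for next block
 *	// rev64 byte-swaps each 64-bit half back to big endian
 */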

#define GTAG_HASH_LENGTHS(vctr0, vlen)                                  \
                ld1             {vlen.16b}, [x7];                       \
                /* construct CTR0 */                                    \
                /* the low 32 bits of the initial counter block are always be32(1) */ \
                mov             x6, #0x1;                               \
                bfi             x9, x6, #0, #32;                        \
                mov             vctr0.d[0], x8;                         \
                mov             vctr0.d[1], x9;                         \
                rbit            vlen.16b, vlen.16b;                     \
                rev64           vctr0.16b, vctr0.16b;                   \
                /* authtag = GCTR(CTR0, GHASH) */                       \
                eor             RHASH.16b, RHASH.16b, vlen.16b;         \
                SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
                                           RTMP0, RTMP1);               \
                REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);      \
                rbit            RHASH.16b, RHASH.16b;                   \
                eor             RHASH.16b, RHASH.16b, vctr0.16b;
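
/*
 * Final-tag step of GCM: the lengths block (the bit lengths of the AAD
 * and the ciphertext) is the last GHASH input, and the result is
 * encrypted with the initial counter block:
 *
 *	S   = GHASH(H, A, C)
 *	tag = E(K, CTR0) ^ S	// CTR0 = IV with low 32 bits = be32(1)
 *
 * The SM4 encryption of CTR0 and the final GHASH multiplication are
 * interleaved via SM4_CRYPT_PMUL_128x128_BLK.
 */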


/* Register macros for encrypt and ghash */

/* the outputs may safely overlap the input registers v0-v3 */
#define RR1     v0
#define RR3     v1
#define RR5     v2
#define RR7     v3

#define RR0     v4
#define RR2     v5
#define RR4     v6
#define RR6     v7

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11
#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RH1     v16
#define RH2     v17
#define RH3     v18
#define RH4     v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
        /* input:
         *   x0: round key array, CTX
         *   x1: ghash table
         */
        SM4_PREPARE(x0)

        adr_l           x2, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x2]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        /* H = E(K, 0^128) */
        rev32           v0.16b, RZERO.16b
        SM4_CRYPT_BLK_BE(v0)

        /* H ^ 1 */
        rbit            RH1.16b, v0.16b

        /* H ^ 2 */
        PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
        REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 3 */
        PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
        REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

        /* H ^ 4 */
        PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
        REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1             {RH1.16b-RH4.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
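
/*
 * Storing H^1..H^4 enables the aggregated-reduction method used by the
 * 4-way loops: four GHASH multiplications are summed before a single
 * REDUCTION, since
 *
 *	X' = ((((X ^ in0)*H ^ in1)*H ^ in2)*H ^ in3)*H
 *	   = (X ^ in0)*H^4 ^ in1*H^3 ^ in2*H^2 ^ in3*H
 */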

.align 3
SYM_FUNC_START(pmull_ghash_update)
        /* input:
         *   x0: ghash table
         *   x1: ghash result
         *   x2: src
         *   w3: nblocks
         */
        ld1             {RH1.16b-RH4.16b}, [x0]

        ld1             {RHASH.16b}, [x1]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x4, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x4]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
        cmp             w3, #4
        blt             .Lghash_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rbit            v0.16b, v0.16b
        rbit            v1.16b, v1.16b
        rbit            v2.16b, v2.16b
        rbit            v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1)        * H^3 => rr2:rr3
         * (in2)        * H^2 => rr4:rr5
         * (in3)        * H^1 => rr6:rr7
         */
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor             RR0.16b, RR0.16b, RR2.16b
        eor             RR1.16b, RR1.16b, RR3.16b
        eor             RR0.16b, RR0.16b, RR4.16b
        eor             RR1.16b, RR1.16b, RR5.16b
        eor             RR0.16b, RR0.16b, RR6.16b
        eor             RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz             w3, .Lghash_end
        b               .Lghash_loop_4x

.Lghash_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbnz            w3, .Lghash_loop_1x

.Lghash_end:
        rbit            RHASH.16b, RHASH.16b
        st1             {RHASH.2d}, [x1]

        ret
SYM_FUNC_END(pmull_ghash_update)
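
/*
 * A C-side prototype for the function above would mirror the register
 * comment directly (a sketch; the authoritative types live in the C
 * glue code):
 *
 *	asmlinkage void pmull_ghash_update(const u8 *ghash_table,
 *					   u8 *result, const u8 *src,
 *					   unsigned int nblocks);
 */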

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (128-bit, big endian)
         *   w4: nbytes
         *   x5: ghash result
         *   x6: ghash table
         *   x7: lengths block (non-NULL only for the last call)
         */
        SM4_PREPARE(x0)

        ldp             x8, x9, [x3]
        rev             x8, x8
        rev             x9, x9

        ld1             {RH1.16b-RH4.16b}, [x6]

        ld1             {RHASH.16b}, [x5]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x6, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x6]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        cbz             w4, .Lgcm_enc_hash_len

.Lgcm_enc_loop_4x:
        cmp             w4, #(4 * 16)
        blt             .Lgcm_enc_loop_1x

        sub             w4, w4, #(4 * 16)

        /* construct CTRs */
        inc32_le128(v0)                 /* +0 */
        inc32_le128(v1)                 /* +1 */
        inc32_le128(v2)                 /* +2 */
        inc32_le128(v3)                 /* +3 */

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        /* ghash update */

        rbit            v0.16b, v0.16b
        rbit            v1.16b, v1.16b
        rbit            v2.16b, v2.16b
        rbit            v3.16b, v3.16b

        /*
         * (in0 ^ HASH) * H^4 => rr0:rr1
         * (in1)        * H^3 => rr2:rr3
         * (in2)        * H^2 => rr4:rr5
         * (in3)        * H^1 => rr6:rr7
         */
        eor             RHASH.16b, RHASH.16b, v0.16b

        PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                        RR2, RR3, v1, RH3, RTMP2, RTMP3,
                        RR4, RR5, v2, RH2, RTMP4, RTMP5,
                        RR6, RR7, v3, RH1, RTMP6, RTMP7)

        eor             RR0.16b, RR0.16b, RR2.16b
        eor             RR1.16b, RR1.16b, RR3.16b
        eor             RR0.16b, RR0.16b, RR4.16b
        eor             RR1.16b, RR1.16b, RR5.16b
        eor             RR0.16b, RR0.16b, RR6.16b
        eor             RR1.16b, RR1.16b, RR7.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        cbz             w4, .Lgcm_enc_hash_len
        b               .Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
        cmp             w4, #16
        blt             .Lgcm_enc_tail

        sub             w4, w4, #16

        /* construct CTRs */
        inc32_le128(v0)

        ld1             {RTMP0.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, RTMP0.16b
        st1             {v0.16b}, [x1], #16

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        cbz             w4, .Lgcm_enc_hash_len
        b               .Lgcm_enc_loop_1x

.Lgcm_enc_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l           x0, .Lcts_permute_table
        add             x0, x0, #32
        sub             x0, x0, w4, uxtw
        ld1             {v3.16b}, [x0]

.Lgcm_enc_tail_loop:
        /* encrypt one byte at a time */
        ldrb            w0, [x2], #1    /* load one plaintext byte */
        umov            w6, v0.b[0]     /* take the next keystream byte */
        eor             w6, w6, w0      /* w6 = keystream ^ plaintext = ciphertext */
        strb            w6, [x1], #1    /* store the ciphertext byte */

        /* rotate the consumed keystream byte out */
        ext             v0.16b, v0.16b, v0.16b, #1
        /* collect the ciphertext bytes at the high end for GHASH */
        ins             v0.b[15], w6

        subs            w4, w4, #1
        bne             .Lgcm_enc_tail_loop

        /* move the partial block into the low bytes and zero-pad it */
        tbl             v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
        cbz             x7, .Lgcm_enc_end

        GTAG_HASH_LENGTHS(v1, v3)

        b               .Lgcm_enc_ret

.Lgcm_enc_end:
        /* store new CTR */
        rev             x8, x8
        rev             x9, x9
        stp             x8, x9, [x3]

        rbit            RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
        /* store new MAC */
        st1             {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
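
/*
 * Corresponding C-side sketch for this AEAD entry point (argument
 * types assumed from the register comment; the authoritative
 * prototype lives in the C glue code):
 *
 *	asmlinkage void sm4_ce_pmull_gcm_enc(const u32 *rkey_enc, u8 *dst,
 *					     const u8 *src, u8 *iv,
 *					     unsigned int nbytes, u8 *ghash,
 *					     const u8 *ghash_table,
 *					     const u8 *lengths);
 */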

#undef  RR1
#undef  RR3
#undef  RR5
#undef  RR7
#undef  RR0
#undef  RR2
#undef  RR4
#undef  RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef  RH1
#undef  RH2
#undef  RH3
#undef  RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1     v6
#define RR3     v7
#define RR5     v8

#define RR0     v9
#define RR2     v10
#define RR4     v11

#define RTMP0   v12
#define RTMP1   v13
#define RTMP2   v14
#define RTMP3   v15
#define RTMP4   v16
#define RTMP5   v17

#define RH1     v18
#define RH2     v19
#define RH3     v20

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (128-bit, big endian)
         *   w4: nbytes
         *   x5: ghash result
         *   x6: ghash table
         *   x7: lengths block (non-NULL only for the last call)
         */
        SM4_PREPARE(x0)

        ldp             x8, x9, [x3]
        rev             x8, x8
        rev             x9, x9

        ld1             {RH1.16b-RH3.16b}, [x6]

        ld1             {RHASH.16b}, [x5]
        rbit            RHASH.16b, RHASH.16b

        adr_l           x6, .Lghash_rconst
        ld1r            {RRCONST.2d}, [x6]

        eor             RZERO.16b, RZERO.16b, RZERO.16b

        cbz             w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
        cmp             w4, #(3 * 16)
        blt             .Lgcm_dec_loop_1x

        sub             w4, w4, #(3 * 16)

        ld1             {v3.16b-v5.16b}, [x2], #(3 * 16)

        /* construct CTRs */
        inc32_le128(v0)                 /* +0 */
        rbit            v6.16b, v3.16b
        inc32_le128(v1)                 /* +1 */
        rbit            v7.16b, v4.16b
        inc32_le128(v2)                 /* +2 */
        rbit            v8.16b, v5.16b

        eor             RHASH.16b, RHASH.16b, v6.16b

        /* decrypt & ghash update */
        SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
                                    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
                                    RR2, RR3, v7, RH2, RTMP2, RTMP3,
                                    RR4, RR5, v8, RH1, RTMP4, RTMP5)

        eor             v0.16b, v0.16b, v3.16b
        eor             v1.16b, v1.16b, v4.16b
        eor             v2.16b, v2.16b, v5.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

        st1             {v0.16b-v2.16b}, [x1], #(3 * 16)

        cbz             w4, .Lgcm_dec_hash_len
        b               .Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
        cmp             w4, #16
        blt             .Lgcm_dec_tail

        sub             w4, w4, #16

        ld1             {v3.16b}, [x2], #16

        /* construct CTRs */
        inc32_le128(v0)
        rbit            v6.16b, v3.16b

        eor             RHASH.16b, RHASH.16b, v6.16b

        SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

        eor             v0.16b, v0.16b, v3.16b

        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

        st1             {v0.16b}, [x1], #16

        cbz             w4, .Lgcm_dec_hash_len
        b               .Lgcm_dec_loop_1x

.Lgcm_dec_tail:
        /* construct CTRs */
        inc32_le128(v0)
        SM4_CRYPT_BLK(v0)

        /* load permute table */
        adr_l           x0, .Lcts_permute_table
        add             x0, x0, #32
        sub             x0, x0, w4, uxtw
        ld1             {v3.16b}, [x0]

.Lgcm_dec_tail_loop:
        /* decrypt one byte at a time */
        ldrb            w0, [x2], #1    /* load one ciphertext byte */
        umov            w6, v0.b[0]     /* take the next keystream byte */
        eor             w6, w6, w0      /* w6 = keystream ^ ciphertext = plaintext */
        strb            w6, [x1], #1    /* store the plaintext byte */

        /* rotate the consumed keystream byte out */
        ext             v0.16b, v0.16b, v0.16b, #1
        /* collect the ciphertext (input) bytes at the high end for GHASH */
        ins             v0.b[15], w0

        subs            w4, w4, #1
        bne             .Lgcm_dec_tail_loop

        /* move the partial block into the low bytes and zero-pad it */
        tbl             v0.16b, {v0.16b}, v3.16b

        /* ghash update */
        rbit            v0.16b, v0.16b
        eor             RHASH.16b, RHASH.16b, v0.16b
        PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
        REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
        cbz             x7, .Lgcm_dec_end

        GTAG_HASH_LENGTHS(v1, v3)

        b               .Lgcm_dec_ret

.Lgcm_dec_end:
        /* store new CTR */
        rev             x8, x8
        rev             x9, x9
        stp             x8, x9, [x3]

        rbit            RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
        /* store new MAC */
        st1             {RHASH.2d}, [x5]

        ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

        .section        ".rodata", "a"
        .align 4
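/*
 * tbl yields zero for out-of-range (0xff) indices, so reading 16 bytes
 * at offset (32 - nbytes) of this table produces a permutation that
 * moves the nbytes tail bytes from the high end of the register to the
 * low end and zero-fills the remainder.
 */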
.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

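/*
 * GHASH reduction constant.  GCM's field GF(2^128) uses the polynomial
 * x^128 + x^7 + x^2 + x + 1; with all inputs bit-reflected by rbit,
 * the multiplication becomes a plain carry-less multiply and x^128 can
 * be reduced using the low part of the polynomial directly:
 * 0x87 = x^7 + x^2 + x + 1.
 */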
.Lghash_rconst:
        .quad           0x87