/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        XH2             .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        T2              .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

        .text

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm
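
        /*
         * For reference, __pmull_p64 and __pmull_p8 above compute the same
         * 64x64 -> 128 bit carry-less (polynomial) product. A minimal C
         * model of that operation (illustrative only, not part of the
         * build) could look like this:
         *
         *      static void clmul64(u64 a, u64 b, u64 r[2])
         *      {
         *              int i;
         *
         *              r[0] = r[1] = 0;
         *              for (i = 0; i < 64; i++) {
         *                      if ((b >> i) & 1) {
         *                              r[0] ^= a << i;
         *                              if (i)
         *                                      r[1] ^= a >> (64 - i);
         *                      }
         *              }
         *      }
         */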

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm
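
        //
        // Both reduction macros above reduce the 256-bit Karatsuba product
        // accumulated in XL/XM/XH modulo the GHASH polynomial
        // x^128 + x^7 + x^2 + x + 1; the caller completes the fold with the
        // two veor instructions that follow each invocation. The p64 variant
        // multiplies by MASK (0xe1 << 57, i.e. the polynomial's low terms in
        // the bit order used here), while the p8 variant uses the equivalent
        // shifts by 63, 62 and 57 bits (64 minus 1, 2 and 7) and XORs.
        //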

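        //
        // GHASH/GCM core loop. Macro arguments:
        //
        //   pn         multiply flavour: p64 (vmull.p64) or p8 (vmull.p8)
        //   enc        if non-blank, the prefix of the \enc\()_1x/\enc\()_4x
        //              macros used to en/decrypt each block before it is
        //              hashed (used by the GCM entry points below)
        //   aggregate  use 4-way aggregation when #blocks is a multiple of 4
        //              (p64 only)
        //   head       process an optional head block whose pointer is
        //              passed on the stack
        //
        // Register usage, as set up by the callers: r0 = #blocks, r1 = dg,
        // r2 = src, r3 = key.
        //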
        .macro          ghash_update, pn, enc, aggregate=1, head=1
        vld1.64         {XL}, [r1]

        .if             \head
        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f
        .endif

0:      .ifc            \pn, p64
        .if             \aggregate
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T2-T3}, [r2]!

        .ifnb           \enc
        \enc\()_4x      XL2, XM2, T2, T3

        add             ip, r3, #16
        vld1.64         {HH}, [ip, :128]!
        vld1.64         {HH3-HH4}, [ip, :128]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57
        .endif

        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T1, T3
        vrev64.8        T3, T2

        vmull.p64       XH, HH4_H, XL_H                 // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L                 // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H               // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L               // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H               // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L              // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L                 // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H                 // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L             // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L              // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H              // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L           // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif
        .endif

2:      vld1.8          {T1}, [r2]!

        .ifnb           \enc
        \enc\()_1x      T1
        veor            SHASH2_p64, SHASH_L, SHASH_H
        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57
        .endif

        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
        vrev64.8        T1, T1

        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
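
        /*
         * The key struct is expected to start with four consecutive 16-byte
         * values, loaded below as SHASH, HH and HH3-HH4: the hash key H and,
         * for the 4-way aggregated p64 path, what appear to be its powers
         * H^2, H^3 and H^4.
         *
         * Rough usage sketch (illustrative only), typically bracketed by
         * kernel_neon_begin()/kernel_neon_end():
         *
         *      u64 dg[2] = {};
         *
         *      pmull_ghash_update_p64(nblocks, dg, src, key, NULL);
         */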
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
        vst1.64         {XL}, [r1]

        bx              lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
        vst1.64         {XL}, [r1]

        bx              lr
ENDPROC(pmull_ghash_update_p8)

        e0              .req    q9
        e1              .req    q10
        e2              .req    q11
        e3              .req    q12
        e0l             .req    d18
        e0h             .req    d19
        e2l             .req    d22
        e2h             .req    d23
        e3l             .req    d24
        e3h             .req    d25
        ctr             .req    q13
        ctr0            .req    d26
        ctr1            .req    d27

        ek0             .req    q14
        ek1             .req    q15

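        // Apply one full AES round (AddRoundKey + SubBytes + ShiftRows via
        // AESE, followed by MixColumns via AESMC) to each of the given q
        // registers, using the round key \rk.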
        .macro          round, rk:req, regs:vararg
        .irp            r, \regs
        aese.8          \r, \rk
        aesmc.8         \r, \r
        .endr
        .endm

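        // Encrypt the blocks in \regs in parallel using the round keys at
        // \rkp. \rounds selects 10, 12 or 14 rounds for AES-128/192/256;
        // the last round omits MixColumns and ends with the final
        // AddRoundKey (the trailing veor with ek0).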
        .macro          aes_encrypt, rkp, rounds, regs:vararg
        vld1.8          {ek0-ek1}, [\rkp, :128]!
        cmp             \rounds, #12
        blt             .L\@                    // AES-128

        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]!
        round           ek1, \regs
        vld1.8          {ek1}, [\rkp, :128]!

        beq             .L\@                    // AES-192

        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]!
        round           ek1, \regs
        vld1.8          {ek1}, [\rkp, :128]!

.L\@:   .rept           4
        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]!
        round           ek1, \regs
        vld1.8          {ek1}, [\rkp, :128]!
        .endr

        round           ek0, \regs
        vld1.8          {ek0}, [\rkp, :128]

        .irp            r, \regs
        aese.8          \r, ek1
        .endr
        .irp            r, \regs
        veor            \r, \r, ek0
        .endr
        .endm

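        /*
         * Generate a single block of AES-CTR keystream in e0: the 96-bit IV
         * at [r5] is combined with the 32-bit counter in r7 (stored big
         * endian in the last word of the block, and incremented here), and
         * encrypted using r6 rounds with the round keys at r3 + 64.
         */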
pmull_aes_encrypt:
        add             ip, r5, #4
        vld1.8          {ctr0}, [r5]            // load 12 byte IV
        vld1.8          {ctr1}, [ip]
        rev             r8, r7
        vext.8          ctr1, ctr1, ctr1, #4
        add             r7, r7, #1
        vmov.32         ctr1[1], r8
        vmov            e0, ctr

        add             ip, r3, #64
        aes_encrypt     ip, r6, e0
        bx              lr
ENDPROC(pmull_aes_encrypt)

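        /*
         * As above, but generate four consecutive blocks of keystream in
         * e0-e3, incrementing the counter in r7 four times.
         */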
pmull_aes_encrypt_4x:
        add             ip, r5, #4
        vld1.8          {ctr0}, [r5]
        vld1.8          {ctr1}, [ip]
        rev             r8, r7
        vext.8          ctr1, ctr1, ctr1, #4
        add             r7, r7, #1
        vmov.32         ctr1[1], r8
        rev             ip, r7
        vmov            e0, ctr
        add             r7, r7, #1
        vmov.32         ctr1[1], ip
        rev             r8, r7
        vmov            e1, ctr
        add             r7, r7, #1
        vmov.32         ctr1[1], r8
        rev             ip, r7
        vmov            e2, ctr
        add             r7, r7, #1
        vmov.32         ctr1[1], ip
        vmov            e3, ctr

        add             ip, r3, #64
        aes_encrypt     ip, r6, e0, e1, e2, e3
        bx              lr
ENDPROC(pmull_aes_encrypt_4x)

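        /*
         * Generate the two blocks needed to finalize GCM: e0 holds the
         * keystream block for any remaining tail bytes (current counter in
         * r7), and e1 holds E(K, IV || 1), the block used to mask the
         * authentication tag.
         */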
pmull_aes_encrypt_final:
        add             ip, r5, #4
        vld1.8          {ctr0}, [r5]
        vld1.8          {ctr1}, [ip]
        rev             r8, r7
        vext.8          ctr1, ctr1, ctr1, #4
        mov             r7, #1 << 24            // BE #1 for the tag
        vmov.32         ctr1[1], r8
        vmov            e0, ctr
        vmov.32         ctr1[1], r7
        vmov            e1, ctr

        add             ip, r3, #64
        aes_encrypt     ip, r6, e0, e1
        bx              lr
ENDPROC(pmull_aes_encrypt_final)

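        //
        // En/decryption hooks invoked by ghash_update via \enc\()_1x and
        // \enc\()_4x: XOR the input block(s) with fresh keystream and store
        // the result at the destination pointer in r4. Either way, the
        // ciphertext ends up in the registers that are subsequently hashed:
        // the XORed result for encryption, the untouched input registers
        // for decryption.
        //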
        .macro          enc_1x, in0
        bl              pmull_aes_encrypt
        veor            \in0, \in0, e0
        vst1.8          {\in0}, [r4]!
        .endm

        .macro          dec_1x, in0
        bl              pmull_aes_encrypt
        veor            e0, e0, \in0
        vst1.8          {e0}, [r4]!
        .endm

        .macro          enc_4x, in0, in1, in2, in3
        bl              pmull_aes_encrypt_4x

        veor            \in0, \in0, e0
        veor            \in1, \in1, e1
        veor            \in2, \in2, e2
        veor            \in3, \in3, e3

        vst1.8          {\in0-\in1}, [r4]!
        vst1.8          {\in2-\in3}, [r4]!
        .endm

        .macro          dec_4x, in0, in1, in2, in3
        bl              pmull_aes_encrypt_4x

        veor            e0, e0, \in0
        veor            e1, e1, \in1
        veor            e2, e2, \in2
        veor            e3, e3, \in3

        vst1.8          {e0-e1}, [r4]!
        vst1.8          {e2-e3}, [r4]!
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
         *                        struct gcm_key const *k, char *dst,
         *                        char *iv, int rounds, u32 counter)
         */
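        /*
         * For both pmull_gcm_encrypt() and pmull_gcm_decrypt() below, the
         * first four arguments arrive in r0-r3; dst, iv, rounds and counter
         * are pulled from the stack into r4-r7.
         */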
ENTRY(pmull_gcm_encrypt)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        vld1.64         {SHASH}, [r3]

        ghash_update    p64, enc, head=0
        vst1.64         {XL}, [r1]

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
         *                        struct gcm_key const *k, char *dst,
         *                        char *iv, int rounds, u32 counter)
         */
ENTRY(pmull_gcm_decrypt)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        vld1.64         {SHASH}, [r3]

        ghash_update    p64, dec, head=0
        vst1.64         {XL}, [r1]

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

        /*
         * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
         *                          struct gcm_key const *k, char *head,
         *                          char *iv, int rounds, u32 counter)
         */
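        /*
         * Finalize GCM encryption: any remaining tail bytes (r0) are
         * encrypted in place using the .Lpermute table, the zero-padded
         * ciphertext tail is hashed, and finally the 16-byte block supplied
         * in the tag buffer (in GCM this would be the len(A) || len(C)
         * block prepared by the caller) is hashed and the resulting tag
         * stored over it.
         */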
ENTRY(pmull_gcm_enc_final)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        bl              pmull_aes_encrypt_final

        cmp             r0, #0
        beq             .Lenc_final

        mov_l           ip, .Lpermute
        sub             r4, r4, #16
        add             r8, ip, r0
        add             ip, ip, #32
        add             r4, r4, r0
        sub             ip, ip, r0

        vld1.8          {e3}, [r8]              // permute vector for key stream
        vld1.8          {e2}, [ip]              // permute vector for ghash input

        vtbl.8          e3l, {e0}, e3l
        vtbl.8          e3h, {e0}, e3h

        vld1.8          {e0}, [r4]              // encrypt tail block
        veor            e0, e0, e3
        vst1.8          {e0}, [r4]

        vtbl.8          T1_L, {e0}, e2l
        vtbl.8          T1_H, {e0}, e2h

        vld1.64         {XL}, [r1]
.Lenc_final:
        vld1.64         {SHASH}, [r3, :128]
        vmov.i8         MASK, #0xe1
        veor            SHASH2_p64, SHASH_L, SHASH_H
        vshl.u64        MASK, MASK, #57
        mov             r0, #1
        bne             3f                      // process head block first
        ghash_update    p64, aggregate=0, head=0

        vrev64.8        XL, XL
        vext.8          XL, XL, XL, #8
        veor            XL, XL, e1

        sub             r2, r2, #16             // rewind src pointer
        vst1.8          {XL}, [r2]              // store tag

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

        /*
         * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
         *                         struct gcm_key const *k, char *head,
         *                         char *iv, int rounds, u32 counter,
         *                         const char *otag, int authsize)
         */
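        /*
         * Finalize GCM decryption: as above, except that the ciphertext
         * tail is hashed before being decrypted, and instead of storing the
         * tag it is compared against the first authsize bytes of otag.
         * Returns zero in r0 if the tags match and a nonzero value
         * otherwise.
         */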
ENTRY(pmull_gcm_dec_final)
        push            {r4-r8, lr}
        ldrd            r4, r5, [sp, #24]
        ldrd            r6, r7, [sp, #32]

        bl              pmull_aes_encrypt_final

        cmp             r0, #0
        beq             .Ldec_final

        mov_l           ip, .Lpermute
        sub             r4, r4, #16
        add             r8, ip, r0
        add             ip, ip, #32
        add             r4, r4, r0
        sub             ip, ip, r0

        vld1.8          {e3}, [r8]              // permute vector for key stream
        vld1.8          {e2}, [ip]              // permute vector for ghash input

        vtbl.8          e3l, {e0}, e3l
        vtbl.8          e3h, {e0}, e3h

        vld1.8          {e0}, [r4]

        vtbl.8          T1_L, {e0}, e2l
        vtbl.8          T1_H, {e0}, e2h

        veor            e0, e0, e3
        vst1.8          {e0}, [r4]

        vld1.64         {XL}, [r1]
.Ldec_final:
        vld1.64         {SHASH}, [r3]
        vmov.i8         MASK, #0xe1
        veor            SHASH2_p64, SHASH_L, SHASH_H
        vshl.u64        MASK, MASK, #57
        mov             r0, #1
        bne             3f                      // process head block first
        ghash_update    p64, aggregate=0, head=0

        vrev64.8        XL, XL
        vext.8          XL, XL, XL, #8
        veor            XL, XL, e1

        mov_l           ip, .Lpermute
        ldrd            r2, r3, [sp, #40]       // otag and authsize
        vld1.8          {T1}, [r2]
        add             ip, ip, r3
        vceq.i8         T1, T1, XL              // compare tags
        vmvn            T1, T1                  // 0 for eq, -1 for ne

        vld1.8          {e0}, [ip]
        vtbl.8          XL_L, {T1}, e0l         // keep authsize bytes only
        vtbl.8          XL_H, {T1}, e0h

        vpmin.s8        XL_L, XL_L, XL_H        // take the minimum s8 across the vector
        vpmin.s8        XL_L, XL_L, XL_L
        vmov.32         r0, XL_L[0]             // fail if != 0x0

        pop             {r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

        .section        ".rodata", "a", %progbits
        .align          5
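        /*
         * Permute table used for handling partial final blocks: 16 bytes of
         * 0xff, the identity bytes 0x00-0x0f, then 16 more bytes of 0xff.
         * Loading a vtbl index vector at (.Lpermute + n) or
         * (.Lpermute + 32 - n) shifts a block by n bytes while zeroing the
         * remaining lanes, since vtbl returns zero for the out-of-range
         * 0xff indices.
         */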
.Lpermute:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
        .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff