/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd.
 * Copyright (C) 2024 Google LLC
 *
 * Author: Ard Biesheuvel <ardb@kernel.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text
        .arch   armv8-a+crypto

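        /*
         * Load the round keys for an \nr-round (10/12/14) key schedule. The
         * first four keys go in v10-v13; the last eleven are loaded from the
         * end of the schedule into v14-v21 and v3-v5, so that v3-v5 always
         * hold the final three round keys regardless of the key size.
         */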
        .macro  load_round_keys, rk, nr, tmp
        sub     w\tmp, \nr, #10
        add     \tmp, \rk, w\tmp, sxtw #4
        ld1     {v10.4s-v13.4s}, [\rk]
        ld1     {v14.4s-v17.4s}, [\tmp], #64
        ld1     {v18.4s-v21.4s}, [\tmp], #64
        ld1     {v3.4s-v5.4s}, [\tmp]
        .endm

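        /*
         * One full AES round on two blocks in parallel: AESE performs
         * AddRoundKey, SubBytes and ShiftRows; AESMC performs MixColumns.
         */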
        .macro  dround, va, vb, vk
        aese    \va\().16b, \vk\().16b
        aesmc   \va\().16b, \va\().16b
        aese    \vb\().16b, \vk\().16b
        aesmc   \vb\().16b, \vb\().16b
        .endm

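        /*
         * Encrypt two blocks (\va, \vb) in parallel. Testing bits #2 and #1
         * of \nr skips the leading rounds that a 128- or 192-bit key does not
         * need. The last round uses AESE only (no MixColumns), and the final
         * AddRoundKey with rk[\nr] (in v5) is left to the caller so it can be
         * folded into a later XOR with the data.
         */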
        .macro  aes_encrypt, va, vb, nr
        tbz     \nr, #2, .L\@
        dround  \va, \vb, v10
        dround  \va, \vb, v11
        tbz     \nr, #1, .L\@
        dround  \va, \vb, v12
        dround  \va, \vb, v13
.L\@:   .irp    v, v14, v15, v16, v17, v18, v19, v20, v21, v3
        dround  \va, \vb, \v
        .endr
        aese    \va\().16b, v4.16b
        aese    \vb\().16b, v4.16b
        .endm

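        /*
         * CTR encryption/decryption fused with the CBC-MAC update, shared by
         * the two entry points below. Arguments arrive per the AAPCS64:
         *      x0: out, x1: in, w2: cbytes, x3: round keys, w4: # of rounds,
         *      x5: mac, x6: ctr, x7: final_iv (may be NULL)
         */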
        .macro  aes_ccm_do_crypt, enc
        load_round_keys x3, w4, x10

        ld1     {v0.16b}, [x5]                  /* load mac */
        cbz     x2, ce_aes_ccm_final
        ldr     x8, [x6, #8]                    /* load lower ctr */
CPU_LE( rev     x8, x8                  )       /* keep swabbed ctr in reg */
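        /*
         * Each iteration encrypts the CTR block and the CBC-MAC state in
         * parallel with a single aes_encrypt. The low 64 bits of the counter
         * are kept byte-swapped in x8 so they can be bumped with a plain add;
         * the increment never carries out of the low 64 bits.
         */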
0:      /* outer loop */
        ld1     {v1.8b}, [x6]                   /* load upper ctr */
        prfm    pldl1strm, [x1]
        add     x8, x8, #1
        rev     x9, x8
        ins     v1.d[1], x9                     /* no carry in lower ctr */

        aes_encrypt     v0, v1, w4

        subs    w2, w2, #16
        bmi     ce_aes_ccm_crypt_tail
        ld1     {v2.16b}, [x1], #16             /* load next input block */
        .if     \enc == 1
        eor     v2.16b, v2.16b, v5.16b          /* final round enc+mac */
        eor     v6.16b, v1.16b, v2.16b          /* xor with crypted ctr */
        .else
        eor     v2.16b, v2.16b, v1.16b          /* xor with crypted ctr */
        eor     v6.16b, v2.16b, v5.16b          /* final round enc */
        .endif
        eor     v0.16b, v0.16b, v2.16b          /* xor mac with pt ^ rk[last] */
        st1     {v6.16b}, [x0], #16             /* write output block */
        bne     0b
CPU_LE( rev     x8, x8                  )
        str     x8, [x6, #8]                    /* store lsb end of ctr (BE) */
        cbnz    x7, ce_aes_ccm_final
        st1     {v0.16b}, [x5]                  /* store mac */
        ret
        .endm

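        /*
         * Handle a final, partial block. On entry, w2 holds the (negative)
         * number of bytes by which the input falls short of a full block,
         * v1 holds the encrypted counter block and v6 the previous block of
         * output.
         */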
SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail)
        eor     v0.16b, v0.16b, v5.16b          /* final round mac */
        eor     v1.16b, v1.16b, v5.16b          /* final round enc */

        add     x1, x1, w2, sxtw                /* rewind the input pointer (w2 < 0) */
        add     x0, x0, w2, sxtw                /* rewind the output pointer */

        adr_l   x8, .Lpermute                   /* load permute vectors */
        add     x9, x8, w2, sxtw
        sub     x8, x8, w2, sxtw
        ld1     {v7.16b-v8.16b}, [x9]
        ld1     {v9.16b}, [x8]

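        /*
         * v7 shifts the keystream to the end of the register, v8 re-inserts
         * the tail of the previous output block, and v9 left-aligns and
         * zero-pads the partial plaintext for the CBC-MAC: out-of-range
         * (0xff) indices produce zero with TBL and leave the destination
         * unchanged with TBX.
         */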
        ld1     {v2.16b}, [x1]                  /* load a full block of input */
        tbl     v1.16b, {v1.16b}, v7.16b        /* move keystream to end of register */
        eor     v7.16b, v2.16b, v1.16b          /* encrypt partial input block */
        bif     v2.16b, v7.16b, v22.16b         /* select plaintext */
        tbx     v7.16b, {v6.16b}, v8.16b        /* insert output from previous iteration */
        tbl     v2.16b, {v2.16b}, v9.16b        /* copy plaintext to start of v2 */
        eor     v0.16b, v0.16b, v2.16b          /* fold plaintext into mac */

        st1     {v7.16b}, [x0]                  /* store output block */
        cbz     x7, 0f

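        /*
         * Combine the CBC-MAC with the encrypted final counter block: both
         * are encrypted in parallel without the final AddRoundKey, so
         * rk[rounds] cancels in the XOR below.
         */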
SYM_INNER_LABEL(ce_aes_ccm_final, SYM_L_LOCAL)
        ld1     {v1.16b}, [x7]                  /* load 1st ctriv */

        aes_encrypt     v0, v1, w4

        /* final round key cancels out */
        eor     v0.16b, v0.16b, v1.16b          /* en-/decrypt the mac */
0:      st1     {v0.16b}, [x5]                  /* store result */
        ret
SYM_FUNC_END(ce_aes_ccm_crypt_tail)

        /*
         * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
         *                         u8 const rk[], u32 rounds, u8 mac[],
         *                         u8 ctr[], u8 const final_iv[]);
         * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
         *                         u8 const rk[], u32 rounds, u8 mac[],
         *                         u8 ctr[], u8 const final_iv[]);
         */
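        /*
         * Hypothetical C call sequence for illustration only; the real glue
         * lives in aes-ce-ccm-glue.c, and names such as num_rounds() and
         * ctr0 are assumptions here, not part of this file. NEON must be
         * made available around the call, e.g.:
         *
         *      kernel_neon_begin();
         *      ce_aes_ccm_encrypt(dst, src, nbytes, ctx->key_enc,
         *                         num_rounds(ctx), mac, ctr,
         *                         final ? ctr0 : NULL);
         *      kernel_neon_end();
         *
         * Passing a non-NULL final_iv makes the tail encrypt the accumulated
         * MAC with that block to produce the tag.
         */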
SYM_FUNC_START(ce_aes_ccm_encrypt)
        movi    v22.16b, #255                   /* bif mask: plaintext is the input (tail) */
        aes_ccm_do_crypt        1
SYM_FUNC_END(ce_aes_ccm_encrypt)

SYM_FUNC_START(ce_aes_ccm_decrypt)
        movi    v22.16b, #0                     /* bif mask: plaintext is the output (tail) */
        aes_ccm_do_crypt        0
SYM_FUNC_END(ce_aes_ccm_decrypt)

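        /*
         * Permute table for the partial-block handling above: loading 16
         * bytes at a negative or positive offset from .Lpermute yields a
         * shifted index vector whose out-of-range lanes are 0xff.
         */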
        .section ".rodata", "a"
        .align  6
        .fill   15, 1, 0xff
.Lpermute:
        .byte   0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        .byte   0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
        .fill   15, 1, 0xff