arch/arm64/kernel/vdso/vgetrandom-chacha.S
// SPDX-License-Identifier: GPL-2.0

#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>

        .text

#define state0          v0
#define state1          v1
#define state2          v2
#define state3          v3
#define copy0           v4
#define copy0_q         q4
#define copy1           v5
#define copy2           v6
#define copy3           v7
#define copy3_d         d7
#define one_d           d16
#define one_q           q16
#define one_v           v16
#define tmp             v17
#define rot8            v18

/*
 * ARM64 ChaCha20 implementation meant for the vDSO.  Produces a given
 * positive number of blocks of output with nonce 0, taking an input key and
 * an 8-byte counter.  Importantly, it does not spill to the stack.
 *
 * This implementation avoids d8-d15 because they are callee-saved in
 * userspace.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *                                     const uint8_t *key,
 *                                     uint32_t *counter,
 *                                     size_t nblocks)
 *
 *      x0: output bytes
 *      x1: 32-byte key input
 *      x2: 8-byte counter input/output
 *      x3: number of 64-byte blocks to write to the output
 */
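/*
 * Illustrative call from C (hypothetical caller shown for clarity; the real
 * caller is the generic vDSO getrandom code, and the key would be filled in
 * beforehand):
 *
 *      uint32_t counter[2] = { 0, 0 };
 *      uint8_t key[32], out[2 * 64];
 *      __arch_chacha20_blocks_nostack(out, key, counter, 2);
 *
 * On return 'out' holds two 64-byte ChaCha20 blocks and the 64-bit counter
 * in 'counter' has been advanced by 2, one per block produced.
 */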
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
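        /*
         * ChaCha state matrix, one row per 128-bit vector:
         *
         *      copy0: "expand 32-byte k" constants
         *      copy1: key words 0-3
         *      copy2: key words 4-7
         *      copy3: 64-bit block counter, then a zero nonce
         */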

        /* copy0 = "expand 32-byte k" */
        mov_q           x8, 0x3320646e61707865
        mov_q           x9, 0x6b20657479622d32
        mov             copy0.d[0], x8
        mov             copy0.d[1], x9

        /* copy1,copy2 = key */
        ld1             { copy1.4s, copy2.4s }, [x1]
        /* copy3 = counter || zero nonce  */
        ld1             { copy3.2s }, [x2]

        movi            one_v.2s, #1
        uzp1            one_v.4s, one_v.4s, one_v.4s
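        /*
         * one_v is now { 1, 0, 1, 0 }, so its low 64 bits (one_d) hold the
         * 64-bit constant 1 used below to step the block counter.
         */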

.Lblock:
        /* copy state to auxiliary vectors for the final add after the permute.  */
        mov             state0.16b, copy0.16b
        mov             state1.16b, copy1.16b
        mov             state2.16b, copy2.16b
        mov             state3.16b, copy3.16b

        mov             w4, 20
.Lpermute:
        /*
         * Permute one 64-byte block where the state matrix is stored in the four NEON
         * registers state0-state3.  It performs matrix operations on four words in parallel,
         * but requires shuffling to rearrange the words after each round.
         */
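        /*
         * For reference, each add/eor/rotate group below is one step of the
         * standard ChaCha quarter-round, roughly (C pseudo-code, with rol32
         * a 32-bit left rotate):
         *
         *      a += b; d ^= a; d = rol32(d, 16);
         *      c += d; b ^= c; b = rol32(b, 12);
         *      a += b; d ^= a; d = rol32(d, 8);
         *      c += d; b ^= c; b = rol32(b, 7);
         *
         * applied to all four columns (and then diagonals) in parallel, with
         * a,b,c,d taken from state0-state3.
         */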

.Ldoubleround:
        /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
        add             state0.4s, state0.4s, state1.4s
        eor             state3.16b, state3.16b, state0.16b
        rev32           state3.8h, state3.8h

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #12
        sri             state1.4s, tmp.4s, #20

        /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
        add             state0.4s, state0.4s, state1.4s
        eor             tmp.16b, state3.16b, state0.16b
        shl             state3.4s, tmp.4s, #8
        sri             state3.4s, tmp.4s, #24

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #7
        sri             state1.4s, tmp.4s, #25

        /* state1[0,1,2,3] = state1[1,2,3,0] */
        ext             state1.16b, state1.16b, state1.16b, #4
        /* state2[0,1,2,3] = state2[2,3,0,1] */
        ext             state2.16b, state2.16b, state2.16b, #8
        /* state3[0,1,2,3] = state3[3,0,1,2] */
        ext             state3.16b, state3.16b, state3.16b, #12
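        /*
         * The rows are now rotated so that the same four quarter-round steps
         * operate on the diagonals of the state matrix.
         */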

        /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
        add             state0.4s, state0.4s, state1.4s
        eor             state3.16b, state3.16b, state0.16b
        rev32           state3.8h, state3.8h

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #12
        sri             state1.4s, tmp.4s, #20

        /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
        add             state0.4s, state0.4s, state1.4s
        eor             tmp.16b, state3.16b, state0.16b
        shl             state3.4s, tmp.4s, #8
        sri             state3.4s, tmp.4s, #24

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #7
        sri             state1.4s, tmp.4s, #25

        /* state1[0,1,2,3] = state1[3,0,1,2] */
        ext             state1.16b, state1.16b, state1.16b, #12
        /* state2[0,1,2,3] = state2[2,3,0,1] */
        ext             state2.16b, state2.16b, state2.16b, #8
        /* state3[0,1,2,3] = state3[1,2,3,0] */
        ext             state3.16b, state3.16b, state3.16b, #4
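        /*
         * Undo the rotation so the words are back in column order before the
         * next double round.
         */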

        subs            w4, w4, #2
        b.ne            .Ldoubleround

        /* output0 = state0 + copy0 */
        add             state0.4s, state0.4s, copy0.4s
        /* output1 = state1 + copy1 */
        add             state1.4s, state1.4s, copy1.4s
        /* output2 = state2 + copy2 */
        add             state2.4s, state2.4s, copy2.4s
        /* output3 = state3 + copy3 */
        add             state3.4s, state3.4s, copy3.4s
        st1             { state0.16b - state3.16b }, [x0]

        /*
         * ++copy3.counter; the 'add' clears the upper half of the SIMD
         * register, which is the expected behaviour here.
         */
        add             copy3_d, copy3_d, one_d

        /* output += 64, --nblocks */
        add             x0, x0, 64
        subs            x3, x3, #1
        b.ne            .Lblock

        /* counter = copy3.counter */
        st1             { copy3.2s }, [x2]

        /* Zero out the potentially sensitive regs, in case nothing uses these again. */
        movi            state0.16b, #0
        movi            state1.16b, #0
        movi            state2.16b, #0
        movi            state3.16b, #0
        movi            copy1.16b, #0
        movi            copy2.16b, #0
        ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

emit_aarch64_feature_1_and