root/arch/powerpc/kernel/vdso/vgetrandom-chacha.S
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2024 Christophe Leroy <christophe.leroy@csgroup.eu>, CS GROUP France
 */

#include <linux/linkage.h>

#include <asm/ppc_asm.h>

/*
 * Incoming arguments. Several of these registers are reused under other
 * names further down once the argument has been consumed.
 */
#define dst_bytes       r3
#define key             r4
#define counter         r5
#define nblocks         r6

/*
 * Loop bookkeeping:
 *   idx_r0 - number of blocks still to produce (copied from nblocks).
 *   val4   - the constant 4; shares r4 with the key pointer, so it is only
 *            valid after all eight key words have been loaded.
 */
#define idx_r0          r0
#define val4            r4

/*
 * The four constant words that open the ChaCha state: the ASCII string
 * "expand 32-byte k" read as little-endian 32-bit words.
 */
#define const0          0x61707865
#define const1          0x3320646e
#define const2          0x79622d32
#define const3          0x6b206574

/*
 * The eight 32-bit key words, kept in registers for the whole function.
 * key0 and key1 share r5 and r6 with the counter and nblocks arguments, so
 * those two arguments have to be consumed before the key is loaded.
 */
#define key0            r5
#define key1            r6
#define key2            r7
#define key3            r8
#define key4            r9
#define key5            r10
#define key6            r11
#define key7            r12

/*
 * Block counter: counter0 is the word at offset 0 of the counter buffer and
 * receives the increment, counter1 is the word at offset 4 and receives the
 * carry.
 */
#define counter0        r14
#define counter1        r15

/*
 * The sixteen 32-bit words of ChaCha working state. Together with
 * counter0/counter1 these occupy r14-r31, which the function saves on entry
 * and restores before returning.
 */
#define state0          r16
#define state1          r17
#define state2          r18
#define state3          r19
#define state4          r20
#define state5          r21
#define state6          r22
#define state7          r23
#define state8          r24
#define state9          r25
#define state10         r26
#define state11         r27
#define state12         r28
#define state13         r29
#define state14         r30
#define state15         r31

/*
 * One add/xor/rotate step, applied to four independent lanes:
 *
 *      x += y;  z ^= x;  z = rotl32(z, rot)
 *
 * Each instruction is emitted for all four lanes before moving on to the
 * next instruction, so neighbouring instructions work on different
 * registers as long as the lanes do not share registers.
 */
.macro chacha_arx4 rot, x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4
        add     \x1, \x1, \y1
        add     \x2, \x2, \y2
        add     \x3, \x3, \y3
        add     \x4, \x4, \y4
        xor     \z1, \z1, \x1
        xor     \z2, \z2, \x2
        xor     \z3, \z3, \x3
        xor     \z4, \z4, \x4
        rotlwi  \z1, \z1, \rot
        rotlwi  \z2, \z2, \rot
        rotlwi  \z3, \z3, \rot
        rotlwi  \z4, \z4, \rot
.endm

/*
 * Four ChaCha quarter rounds executed side by side. Lane N works on the
 * registers (aN, bN, cN, dN) and performs:
 *
 *      a += b;  d ^= a;  d = rotl32(d, 16)
 *      c += d;  b ^= c;  b = rotl32(b, 12)
 *      a += b;  d ^= a;  d = rotl32(d, 8)
 *      c += d;  b ^= c;  b = rotl32(b, 7)
 *
 * All sixteen registers are read and written; no other register is used.
 */
.macro quarterround4 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4
        chacha_arx4 16, \a1, \b1, \d1, \a2, \b2, \d2, \a3, \b3, \d3, \a4, \b4, \d4
        chacha_arx4 12, \c1, \d1, \b1, \c2, \d2, \b2, \c3, \d3, \b3, \c4, \d4, \b4
        chacha_arx4 8, \a1, \b1, \d1, \a2, \b2, \d2, \a3, \b3, \d3, \a4, \b4, \d4
        chacha_arx4 7, \c1, \d1, \b1, \c2, \d2, \b2, \c3, \d3, \b3, \c4, \d4, \b4
.endm

/*
 * Preprocessor front end for the quarterround4 assembler macro. It takes
 * sixteen state word indices (0-15), four per lane in a/b/c/d order, and
 * pastes each one onto the "state" prefix so that it resolves through the
 * stateN defines to the register holding that word.
 */
#define QUARTERROUND4(a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,a4,b4,c4,d4) \
        quarterround4 state##a1 state##b1 state##c1 state##d1 \
                      state##a2 state##b2 state##c2 state##d2 \
                      state##a3 state##b3 state##c3 state##d3 \
                      state##a4 state##b4 state##c4 state##d4

/*
 * Very basic 32-bit implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly, no key or state word is ever written to the stack:
 * the only stack stores happen on entry, before the key is loaded, and they
 * save the caller's r14-r31 and the counter pointer. Its arguments are:
 *
 *      r3: output bytes
 *      r4: 32-byte key input
 *      r5: 8-byte counter input/output (pointer saved on stack)
 *      r6: number of 64-byte blocks to write to output; must not be 0,
 *          because the block count is only tested after a block has been
 *          produced
 *
 * Register usage inside the function:
 *
 *      r0: counter of blocks (initialised with r6)
 *      r4: Value '4' after key has been read.
 *      r5-r12: key
 *      r14-r15: counter
 *      r16-r31: state
 *
 * On return the incremented counter has been stored back through the
 * pointer originally passed in r5, r6-r12 have been set to zero and r14-r31
 * have been reloaded with the values they had on entry.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
#ifdef __powerpc64__
        /*
         * 64-bit: no frame is allocated, everything is saved at negative
         * offsets from r1.
         * NOTE(review): this presumably relies on the 64-bit ABI's protected
         * area below the stack pointer - confirm that -216 is within it.
         */
        std     counter, -216(r1)

        /* Save r14-r31, eighteen doublewords ending just below r1. */
        std     r14, -144(r1)
        std     r15, -136(r1)
        std     r16, -128(r1)
        std     r17, -120(r1)
        std     r18, -112(r1)
        std     r19, -104(r1)
        std     r20, -96(r1)
        std     r21, -88(r1)
        std     r22, -80(r1)
        std     r23, -72(r1)
        std     r24, -64(r1)
        std     r25, -56(r1)
        std     r26, -48(r1)
        std     r27, -40(r1)
        std     r28, -32(r1)
        std     r29, -24(r1)
        std     r30, -16(r1)
        std     r31, -8(r1)
#else
        /*
         * 32-bit: allocate a 96-byte frame. Offset 20 holds the counter
         * pointer, offsets 24-92 hold r14-r31 (18 words).
         */
        stwu    r1, -96(r1)
        stw     counter, 20(r1)
#ifdef __BIG_ENDIAN__
        /* Store multiple: writes r14 through r31 to consecutive words. */
        stmw    r14, 24(r1)
#else
        /*
         * NOTE(review): the store-multiple form is presumably avoided here
         * because it is not usable in little-endian mode - confirm.
         */
        stw     r14, 24(r1)
        stw     r15, 28(r1)
        stw     r16, 32(r1)
        stw     r17, 36(r1)
        stw     r18, 40(r1)
        stw     r19, 44(r1)
        stw     r20, 48(r1)
        stw     r21, 52(r1)
        stw     r22, 56(r1)
        stw     r23, 60(r1)
        stw     r24, 64(r1)
        stw     r25, 68(r1)
        stw     r26, 72(r1)
        stw     r27, 76(r1)
        stw     r28, 80(r1)
        stw     r29, 84(r1)
        stw     r30, 88(r1)
        stw     r31, 92(r1)
#endif
#endif  /* __powerpc64__ */

        /* Read the counter as two 32-bit words. */
        lwz     counter0, 0(counter)
        lwz     counter1, 4(counter)
#ifdef __powerpc64__
        /*
         * Put counter1 into the upper half of counter0 so that the 64-bit
         * increment at the end of each block carries from the low word into
         * the high word. Only the low 32 bits of counter0 reach the state:
         * every state word is rotated and stored as a 32-bit quantity.
         */
        rldimi  counter0, counter1, 32, 0
#endif
        /* Copy the block count out of r6 before key1 overwrites it. */
        mr      idx_r0, nblocks
        /*
         * Bias the output pointer by -4: the stores below address a block
         * as offsets 4..64 from dst_bytes (or as index 4 / index 0 in the
         * big-endian path).
         */
        subi    dst_bytes, dst_bytes, 4

        /*
         * Load the eight key words. key0 overwrites the counter pointer in
         * r5 (it was saved on the stack above) and key1 overwrites nblocks
         * in r6 (copied to r0 above).
         */
        lwz     key0, 0(key)
        lwz     key1, 4(key)
        lwz     key2, 8(key)
        lwz     key3, 12(key)
        lwz     key4, 16(key)
        lwz     key5, 20(key)
        lwz     key6, 24(key)
        lwz     key7, 28(key)

        /* The key pointer is dead now; r4 becomes the constant 4. */
        li      val4, 4
.Lblock:
        /*
         * Loop count for .Lpermute: 10 passes of two rounds each. r31 is
         * only a temporary here; it is reinitialised as state15 below, after
         * its value has been moved to CTR.
         */
        li      r31, 10

        /* state0-3 = the four ChaCha constants, built from high/low halves. */
        lis     state0, const0@ha
        lis     state1, const1@ha
        lis     state2, const2@ha
        lis     state3, const3@ha
        addi    state0, state0, const0@l
        addi    state1, state1, const1@l
        addi    state2, state2, const2@l
        addi    state3, state3, const3@l

        mtctr   r31

        /* state4-11 = key */
        mr      state4, key0
        mr      state5, key1
        mr      state6, key2
        mr      state7, key3
        mr      state8, key4
        mr      state9, key5
        mr      state10, key6
        mr      state11, key7

        /* state12-13 = block counter */
        mr      state12, counter0
        mr      state13, counter1

        /* state14-15 = nonce, fixed at 0 */
        li      state14, 0
        li      state15, 0

.Lpermute:
        /* Four quarter rounds on the columns, then four on the diagonals. */
        QUARTERROUND4( 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15)
        QUARTERROUND4( 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14)

        bdnz    .Lpermute

        /*
         * Add the initial state back onto the permuted state. Words 14 and
         * 15 started as 0, so they need no addition.
         */
        addis   state0, state0, const0@ha
        addis   state1, state1, const1@ha
        addis   state2, state2, const2@ha
        addis   state3, state3, const3@ha
        addi    state0, state0, const0@l
        addi    state1, state1, const1@l
        addi    state2, state2, const2@l
        addi    state3, state3, const3@l

        add     state4, state4, key0
        add     state5, state5, key1
        add     state6, state6, key2
        add     state7, state7, key3
        add     state8, state8, key4
        add     state9, state9, key5
        add     state10, state10, key6
        add     state11, state11, key7

        add     state12, state12, counter0
        add     state13, state13, counter1

#ifdef __BIG_ENDIAN__
        /*
         * Byte-reversed stores write each word in little-endian order.
         * stwbrx only exists in indexed form, so words are written in pairs:
         * one at dst_bytes + r4 (= +4), then, after advancing dst_bytes by
         * 8, one at dst_bytes + 0. Eight such steps advance dst_bytes by 64.
         */
        stwbrx  state0, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state1, 0, dst_bytes
        stwbrx  state2, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state3, 0, dst_bytes
        stwbrx  state4, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state5, 0, dst_bytes
        stwbrx  state6, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state7, 0, dst_bytes
        stwbrx  state8, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state9, 0, dst_bytes
        stwbrx  state10, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state11, 0, dst_bytes
        stwbrx  state12, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state13, 0, dst_bytes
        stwbrx  state14, val4, dst_bytes
        addi    dst_bytes, dst_bytes, 8
        stwbrx  state15, 0, dst_bytes
#else
        /*
         * Plain word stores at offsets 4..64 from the biased pointer; the
         * final store uses the update form to advance dst_bytes by 64.
         */
        stw     state0, 4(dst_bytes)
        stw     state1, 8(dst_bytes)
        stw     state2, 12(dst_bytes)
        stw     state3, 16(dst_bytes)
        stw     state4, 20(dst_bytes)
        stw     state5, 24(dst_bytes)
        stw     state6, 28(dst_bytes)
        stw     state7, 32(dst_bytes)
        stw     state8, 36(dst_bytes)
        stw     state9, 40(dst_bytes)
        stw     state10, 44(dst_bytes)
        stw     state11, 48(dst_bytes)
        stw     state12, 52(dst_bytes)
        stw     state13, 56(dst_bytes)
        stw     state14, 60(dst_bytes)
        stwu    state15, 64(dst_bytes)
#endif

        /*
         * Decrement the remaining-block count and record the result in CR0
         * for the bne below. None of the counter-update instructions in
         * between is a record form, so CR0 survives until the branch.
         */
        subic.  idx_r0, idx_r0, 1       /* subi. can't use r0 as source */

#ifdef __powerpc64__
        /* 64-bit increment, then refresh counter1 from the upper half. */
        addi    counter0, counter0, 1
        srdi    counter1, counter0, 32
#else
        /*
         * Increment the low word and propagate the carry into the high
         * word. addic sets CA afresh, replacing the CA left by subic.
         */
        addic   counter0, counter0, 1
        addze   counter1, counter1
#endif

        bne     .Lblock

        /*
         * Reload the counter pointer saved on entry (this overwrites key0
         * in r5) and write the updated counter back to the caller.
         */
#ifdef __powerpc64__
        ld      counter, -216(r1)
#else
        lwz     counter, 20(r1)
#endif
        stw     counter0, 0(counter)
        stw     counter1, 4(counter)

        /*
         * Clear the registers that still hold key words (key1-key7). key0
         * shared r5 and has just been overwritten by the counter pointer.
         */
        li      r6, 0
        li      r7, 0
        li      r8, 0
        li      r9, 0
        li      r10, 0
        li      r11, 0
        li      r12, 0

        /*
         * Restore r14-r31 from the slots written on entry; this also
         * replaces the counter and state values they held.
         */
#ifdef __powerpc64__
        ld      r14, -144(r1)
        ld      r15, -136(r1)
        ld      r16, -128(r1)
        ld      r17, -120(r1)
        ld      r18, -112(r1)
        ld      r19, -104(r1)
        ld      r20, -96(r1)
        ld      r21, -88(r1)
        ld      r22, -80(r1)
        ld      r23, -72(r1)
        ld      r24, -64(r1)
        ld      r25, -56(r1)
        ld      r26, -48(r1)
        ld      r27, -40(r1)
        ld      r28, -32(r1)
        ld      r29, -24(r1)
        ld      r30, -16(r1)
        ld      r31, -8(r1)
#else
#ifdef __BIG_ENDIAN__
        lmw     r14, 24(r1)
#else
        lwz     r14, 24(r1)
        lwz     r15, 28(r1)
        lwz     r16, 32(r1)
        lwz     r17, 36(r1)
        lwz     r18, 40(r1)
        lwz     r19, 44(r1)
        lwz     r20, 48(r1)
        lwz     r21, 52(r1)
        lwz     r22, 56(r1)
        lwz     r23, 60(r1)
        lwz     r24, 64(r1)
        lwz     r25, 68(r1)
        lwz     r26, 72(r1)
        lwz     r27, 76(r1)
        lwz     r28, 80(r1)
        lwz     r29, 84(r1)
        lwz     r30, 88(r1)
        lwz     r31, 92(r1)
#endif
        /* Release the 96-byte frame allocated on entry. */
        addi    r1, r1, 96
#endif  /* __powerpc64__ */
        blr
SYM_FUNC_END(__arch_chacha20_blocks_nostack)