/* root/arch/s390/kernel/vdso/vgetrandom-chacha.S */
/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/stringify.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/dwarf.h>
#include <asm/fpu-insn.h>

#define STATE0  %v0
#define STATE1  %v1
#define STATE2  %v2
#define STATE3  %v3
#define COPY0   %v4
#define COPY1   %v5
#define COPY2   %v6
#define COPY3   %v7
#define BEPERM  %v19
#define TMP0    %v20
#define TMP1    %v21
#define TMP2    %v22
#define TMP3    %v23

        .section .rodata

        /*
         * 32 bytes of read-only constants, loaded as two 16-byte vectors
         * (offsets 0 and 16 from %r1 in the function below).
         */
        .balign 32
SYM_DATA_START_LOCAL(chacha20_constants)
        .long   0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral: "expand 32-byte k" (ChaCha sigma)
        .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap: VPERM selectors reversing each 32-bit word (BEPERM)
SYM_DATA_END(chacha20_constants)

        .text
/*
 * s390 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-bytes
 * counter. Does not spill to the stack.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *                                     const uint8_t *key,
 *                                     uint32_t *counter,
 *                                     size_t nblocks)
 *
 * Register roles (s390 ELF ABI: args arrive in %r2-%r5):
 *   %r2 = dst_bytes (advanced by 64 per block)
 *   %r3 = key pointer; reused as counter scratch after the key is loaded
 *   %r4 = counter pointer (read once up front, written back at the end)
 *   %r5 = nblocks, used directly as the block-loop count (assumed > 0)
 *   %r1 = constants pointer during setup, then constant 0 (see alcr below)
 *   %r0 = double-round loop count
 *
 * Two code paths are selected at patch time via ALTERNATIVE on facility
 * 148 (vector-enhancements facility 2): with it, VSTBRF stores the state
 * words byte-reversed directly; without it, output is byte-swapped through
 * VPERM using the BEPERM selector vector.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
        CFI_STARTPROC
        larl    %r1,chacha20_constants

        /* COPY0 = "expand 32-byte k" (ChaCha sigma constant) */
        VL      COPY0,0,,%r1

        /*
         * BEPERM = byte selectors for VPERM. Only the non-VSTBRF path
         * needs it, so with facility 148 the load is patched to a nop
         * (brcl 0,0 — same 6-byte length as the VL).
         */
        ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148)

        /* COPY1,COPY2 = key (32 bytes) */
        VLM     COPY1,COPY2,0,%r3

        /* COPY3 = counter || zero nonce (counter in doubleword element 0) */
        lg      %r3,0(%r4)
        VZERO   COPY3
        VLVGG   COPY3,%r3,0

        lghi    %r1,0                   # %r1 = 0 for the carry-propagating alcr below
.Lblock:
        /* Working state = copies; COPY0-COPY3 stay intact for the final feed-forward add */
        VLR     STATE0,COPY0
        VLR     STATE1,COPY1
        VLR     STATE2,COPY2
        VLR     STATE3,COPY3

        lghi    %r0,10                  # 10 double rounds = 20 ChaCha rounds
.Ldoubleround:
        /*
         * Column round: the four column quarter-rounds run in parallel,
         * one 32-bit lane per column.
         */
        /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
        VAF     STATE0,STATE0,STATE1
        VX      STATE3,STATE3,STATE0
        VERLLF  STATE3,STATE3,16

        /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
        VAF     STATE2,STATE2,STATE3
        VX      STATE1,STATE1,STATE2
        VERLLF  STATE1,STATE1,12

        /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
        VAF     STATE0,STATE0,STATE1
        VX      STATE3,STATE3,STATE0
        VERLLF  STATE3,STATE3,8

        /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
        VAF     STATE2,STATE2,STATE3
        VX      STATE1,STATE1,STATE2
        VERLLF  STATE1,STATE1,7

        /* Rotate rows so the diagonals line up in columns for the next round */
        /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
        VSLDB   STATE1,STATE1,STATE1,4
        /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
        VSLDB   STATE2,STATE2,STATE2,8
        /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
        VSLDB   STATE3,STATE3,STATE3,12

        /* Diagonal round: same quarter-round sequence on the rotated state */
        /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
        VAF     STATE0,STATE0,STATE1
        VX      STATE3,STATE3,STATE0
        VERLLF  STATE3,STATE3,16

        /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
        VAF     STATE2,STATE2,STATE3
        VX      STATE1,STATE1,STATE2
        VERLLF  STATE1,STATE1,12

        /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
        VAF     STATE0,STATE0,STATE1
        VX      STATE3,STATE3,STATE0
        VERLLF  STATE3,STATE3,8

        /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
        VAF     STATE2,STATE2,STATE3
        VX      STATE1,STATE1,STATE2
        VERLLF  STATE1,STATE1,7

        /* Undo the row rotation (inverse shifts of the ones above) */
        /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
        VSLDB   STATE1,STATE1,STATE1,12
        /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
        VSLDB   STATE2,STATE2,STATE2,8
        /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
        VSLDB   STATE3,STATE3,STATE3,4
        brctg   %r0,.Ldoubleround

        /* Feed-forward: add the initial state back into the final state */
        /* OUTPUT0 = STATE0 + COPY0 */
        VAF     STATE0,STATE0,COPY0
        /* OUTPUT1 = STATE1 + COPY1 */
        VAF     STATE1,STATE1,COPY1
        /* OUTPUT2 = STATE2 + COPY2 */
        VAF     STATE2,STATE2,COPY2
        /* OUTPUT3 = STATE3 + COPY3 */
        VAF     STATE3,STATE3,COPY3

        /*
         * Store the 64-byte block as little-endian 32-bit words. Both
         * alternative sequences are padded to equal length (hence the
         * trailing brcl 0,0 in the VSTBRF variant).
         */
        ALTERNATIVE                                                     \
                __stringify(                                            \
                /* Convert STATE to little endian and store to OUTPUT */\
                VPERM   TMP0,STATE0,STATE0,BEPERM;                      \
                VPERM   TMP1,STATE1,STATE1,BEPERM;                      \
                VPERM   TMP2,STATE2,STATE2,BEPERM;                      \
                VPERM   TMP3,STATE3,STATE3,BEPERM;                      \
                VSTM    TMP0,TMP3,0,%r2),                               \
                __stringify(                                            \
                /* 32 bit wise little endian store to OUTPUT */         \
                VSTBRF  STATE0,0,,%r2;                                  \
                VSTBRF  STATE1,16,,%r2;                                 \
                VSTBRF  STATE2,32,,%r2;                                 \
                VSTBRF  STATE3,48,,%r2;                                 \
                brcl    0,0),                                           \
                ALT_FACILITY(148)

        /*
         * ++COPY3.COUNTER: the counter is two little-endian 32-bit words
         * held in a big-endian 64-bit GPR, so counter word 0 sits in the
         * high half of %r3. alsih adds 1 to the high 32 bits and sets the
         * carry, which alcr then propagates into the low word (%r1 == 0).
         * alsih is emitted via .insn — presumably for toolchains lacking
         * the mnemonic; TODO confirm minimum binutils.
         */
        /* alsih %r3,1 */
        .insn   rilu,0xcc0a00000000,%r3,1
        alcr    %r3,%r1
        VLVGG   COPY3,%r3,0

        /* OUTPUT += 64, --NBLOCKS */
        aghi    %r2,64
        brctg   %r5,.Lblock

        /* COUNTER = COPY3.COUNTER (write the advanced counter back) */
        stg     %r3,0(%r4)

        /*
         * Zero out potentially sensitive regs: the working state and the
         * key copies. COPY0 (public constant), COPY3 (counter, already
         * written back) and BEPERM (public selectors) need no clearing.
         */
        VZERO   STATE0
        VZERO   STATE1
        VZERO   STATE2
        VZERO   STATE3
        VZERO   COPY1
        VZERO   COPY2

        /*
         * Early exit if TMP0-TMP3 have not been used: on the VSTBRF path
         * (facility 148) they were never written, so return here.
         */
        ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148)

        VZERO   TMP0
        VZERO   TMP1
        VZERO   TMP2
        VZERO   TMP3

        br      %r14
        CFI_ENDPROC
SYM_FUNC_END(__arch_chacha20_blocks_nostack)