/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Original implementation written by Andy Polyakov, @dot-asm.
 * This is an adaptation of the original code for kernel use.
 *
 * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/nospec-insn.h>
#include <asm/fpu-insn.h>

#define SP      %r15
#define FRAME   (16 * 8 + 4 * 8)
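
# FRAME is one standard 160-byte s390x stack frame (back-chain plus
# register save area); the tail code of both routines below spills up
# to 64 bytes of keystream at offset 8*8 from the stack pointer.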

        .data
        .balign 32

SYM_DATA_START_LOCAL(sigma)
        .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
        .long   1,0,0,0
        .long   2,0,0,0
        .long   3,0,0,0
        .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c     # byte swap

        .long   0,1,2,3
        .long   0x61707865,0x61707865,0x61707865,0x61707865     # smashed sigma
        .long   0x3320646e,0x3320646e,0x3320646e,0x3320646e
        .long   0x79622d32,0x79622d32,0x79622d32,0x79622d32
        .long   0x6b206574,0x6b206574,0x6b206574,0x6b206574
SYM_DATA_END(sigma)
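
# Offsets into the table above, as used by the loads below:
#   0x00  sigma ("expa", "nd 3", "2-by", "te k")
#   0x10  {1,0,0,0} {2,0,0,0} {3,0,0,0}  block-counter increments
#   0x40  byte-swap permutation for VPERM (little-endian output)
#   0x50  {0,1,2,3}  per-lane counter offsets (CTR)
#   0x60  sigma with each word replicated across all lanes ("smashed")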

        .previous

        GEN_BR_THUNK %r14

        .text

#############################################################################
# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
#                     const u32 *key, const u32 *counter)
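#
# Computes up to four consecutive 64-byte ChaCha20 blocks and XORs them
# into the output; reached from chacha20_vx for lengths of at most 256
# bytes.  As a rough, informational C model (chacha20_core() is an
# illustrative name, not a real helper):
#
#       for (n = 0; len > 0; n++, out += 64, inp += 64, len -= 64) {
#               u32 x[16] = { sigma[0..3], key[0..7],
#                             counter[0] + n, counter[1..3] };
#               u8 ks[64] = chacha20_core(x); /* 20 rounds + feed-forward */
#               XOR the first min(len, 64) bytes of ks into out from inp;
#       }
#
# The four blocks are computed in parallel, one state word per vector
# register with lane n belonging to block n ("smashed" layout), and are
# transposed back to block order before the feed-forward and stores.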

#define OUT             %r2
#define INP             %r3
#define LEN             %r4
#define KEY             %r5
#define COUNTER         %r6

#define BEPERM          %v31
#define CTR             %v26

#define K0              %v16
#define K1              %v17
#define K2              %v18
#define K3              %v19

#define XA0             %v0
#define XA1             %v1
#define XA2             %v2
#define XA3             %v3

#define XB0             %v4
#define XB1             %v5
#define XB2             %v6
#define XB3             %v7

#define XC0             %v8
#define XC1             %v9
#define XC2             %v10
#define XC3             %v11

#define XD0             %v12
#define XD1             %v13
#define XD2             %v14
#define XD3             %v15

#define XT0             %v27
#define XT1             %v28
#define XT2             %v29
#define XT3             %v30

SYM_FUNC_START(chacha20_vx_4x)
        stmg    %r6,%r7,6*8(SP)

        larl    %r7,sigma
        lhi     %r0,10
        lhi     %r1,0

        VL      K0,0,,%r7               # load sigma
        VL      K1,0,,KEY               # load key
        VL      K2,16,,KEY
        VL      K3,0,,COUNTER           # load counter

        VL      BEPERM,0x40,,%r7
        VL      CTR,0x50,,%r7

        VLM     XA0,XA3,0x60,%r7,4      # load [smashed] sigma

        VREPF   XB0,K1,0                # smash the key
        VREPF   XB1,K1,1
        VREPF   XB2,K1,2
        VREPF   XB3,K1,3

        VREPF   XD0,K3,0
        VREPF   XD1,K3,1
        VREPF   XD2,K3,2
        VREPF   XD3,K3,3
        VAF     XD0,XD0,CTR

        VREPF   XC0,K2,0
        VREPF   XC1,K2,1
        VREPF   XC2,K2,2
        VREPF   XC3,K2,3
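
# The state is now held column-major: XA0-XA3, XB0-XB3, XC0-XC3 and
# XD0-XD3 each hold one 32-bit state word per row, with vector lane n
# belonging to block n; the {0,1,2,3} offsets added to XD0 above make
# the four block counters consecutive.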

.Loop_4x:
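# Each iteration is one ChaCha double round: a column round followed by
# a diagonal round; %r0 = 10 iterations gives the full 20 rounds.  Each
# quarter round on (a,b,c,d) computes, per lane:
#       a += b; d ^= a; d <<<= 16;
#       c += d; b ^= c; b <<<= 12;
#       a += b; d ^= a; d <<<=  8;
#       c += d; b ^= c; b <<<=  7;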
        VAF     XA0,XA0,XB0
        VX      XD0,XD0,XA0
        VERLLF  XD0,XD0,16

        VAF     XA1,XA1,XB1
        VX      XD1,XD1,XA1
        VERLLF  XD1,XD1,16

        VAF     XA2,XA2,XB2
        VX      XD2,XD2,XA2
        VERLLF  XD2,XD2,16

        VAF     XA3,XA3,XB3
        VX      XD3,XD3,XA3
        VERLLF  XD3,XD3,16

        VAF     XC0,XC0,XD0
        VX      XB0,XB0,XC0
        VERLLF  XB0,XB0,12

        VAF     XC1,XC1,XD1
        VX      XB1,XB1,XC1
        VERLLF  XB1,XB1,12

        VAF     XC2,XC2,XD2
        VX      XB2,XB2,XC2
        VERLLF  XB2,XB2,12

        VAF     XC3,XC3,XD3
        VX      XB3,XB3,XC3
        VERLLF  XB3,XB3,12

        VAF     XA0,XA0,XB0
        VX      XD0,XD0,XA0
        VERLLF  XD0,XD0,8

        VAF     XA1,XA1,XB1
        VX      XD1,XD1,XA1
        VERLLF  XD1,XD1,8

        VAF     XA2,XA2,XB2
        VX      XD2,XD2,XA2
        VERLLF  XD2,XD2,8

        VAF     XA3,XA3,XB3
        VX      XD3,XD3,XA3
        VERLLF  XD3,XD3,8

        VAF     XC0,XC0,XD0
        VX      XB0,XB0,XC0
        VERLLF  XB0,XB0,7

        VAF     XC1,XC1,XD1
        VX      XB1,XB1,XC1
        VERLLF  XB1,XB1,7

        VAF     XC2,XC2,XD2
        VX      XB2,XB2,XC2
        VERLLF  XB2,XB2,7

        VAF     XC3,XC3,XD3
        VX      XB3,XB3,XC3
        VERLLF  XB3,XB3,7

        VAF     XA0,XA0,XB1
        VX      XD3,XD3,XA0
        VERLLF  XD3,XD3,16

        VAF     XA1,XA1,XB2
        VX      XD0,XD0,XA1
        VERLLF  XD0,XD0,16

        VAF     XA2,XA2,XB3
        VX      XD1,XD1,XA2
        VERLLF  XD1,XD1,16

        VAF     XA3,XA3,XB0
        VX      XD2,XD2,XA3
        VERLLF  XD2,XD2,16

        VAF     XC2,XC2,XD3
        VX      XB1,XB1,XC2
        VERLLF  XB1,XB1,12

        VAF     XC3,XC3,XD0
        VX      XB2,XB2,XC3
        VERLLF  XB2,XB2,12

        VAF     XC0,XC0,XD1
        VX      XB3,XB3,XC0
        VERLLF  XB3,XB3,12

        VAF     XC1,XC1,XD2
        VX      XB0,XB0,XC1
        VERLLF  XB0,XB0,12

        VAF     XA0,XA0,XB1
        VX      XD3,XD3,XA0
        VERLLF  XD3,XD3,8

        VAF     XA1,XA1,XB2
        VX      XD0,XD0,XA1
        VERLLF  XD0,XD0,8

        VAF     XA2,XA2,XB3
        VX      XD1,XD1,XA2
        VERLLF  XD1,XD1,8

        VAF     XA3,XA3,XB0
        VX      XD2,XD2,XA3
        VERLLF  XD2,XD2,8

        VAF     XC2,XC2,XD3
        VX      XB1,XB1,XC2
        VERLLF  XB1,XB1,7

        VAF     XC3,XC3,XD0
        VX      XB2,XB2,XC3
        VERLLF  XB2,XB2,7

        VAF     XC0,XC0,XD1
        VX      XB3,XB3,XC0
        VERLLF  XB3,XB3,7

        VAF     XC1,XC1,XD2
        VX      XB0,XB0,XC1
        VERLLF  XB0,XB0,7
        brct    %r0,.Loop_4x
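
# Add the {0,1,2,3} lane offsets back into the counter word for the
# feed-forward; the sigma/key/counter base (K0-K3) is added per block
# after the transpose below.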

        VAF     XD0,XD0,CTR

        VMRHF   XT0,XA0,XA1             # transpose data
        VMRHF   XT1,XA2,XA3
        VMRLF   XT2,XA0,XA1
        VMRLF   XT3,XA2,XA3
        VPDI    XA0,XT0,XT1,0b0000
        VPDI    XA1,XT0,XT1,0b0101
        VPDI    XA2,XT2,XT3,0b0000
        VPDI    XA3,XT2,XT3,0b0101
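
# VMRHF/VMRLF interleave word pairs and VPDI selects doubleword halves;
# together they form a 4x4 word transpose, so XA0-XA3 now hold the
# A-row words of blocks 0-3 in block order (likewise for B, C and D
# below).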

        VMRHF   XT0,XB0,XB1
        VMRHF   XT1,XB2,XB3
        VMRLF   XT2,XB0,XB1
        VMRLF   XT3,XB2,XB3
        VPDI    XB0,XT0,XT1,0b0000
        VPDI    XB1,XT0,XT1,0b0101
        VPDI    XB2,XT2,XT3,0b0000
        VPDI    XB3,XT2,XT3,0b0101

        VMRHF   XT0,XC0,XC1
        VMRHF   XT1,XC2,XC3
        VMRLF   XT2,XC0,XC1
        VMRLF   XT3,XC2,XC3
        VPDI    XC0,XT0,XT1,0b0000
        VPDI    XC1,XT0,XT1,0b0101
        VPDI    XC2,XT2,XT3,0b0000
        VPDI    XC3,XT2,XT3,0b0101

        VMRHF   XT0,XD0,XD1
        VMRHF   XT1,XD2,XD3
        VMRLF   XT2,XD0,XD1
        VMRLF   XT3,XD2,XD3
        VPDI    XD0,XT0,XT1,0b0000
        VPDI    XD1,XT0,XT1,0b0101
        VPDI    XD2,XT2,XT3,0b0000
        VPDI    XD3,XT2,XT3,0b0101

        VAF     XA0,XA0,K0
        VAF     XB0,XB0,K1
        VAF     XC0,XC0,K2
        VAF     XD0,XD0,K3

        VPERM   XA0,XA0,XA0,BEPERM
        VPERM   XB0,XB0,XB0,BEPERM
        VPERM   XC0,XC0,XC0,BEPERM
        VPERM   XD0,XD0,XD0,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_4x

        VLM     XT0,XT3,0,INP,0

        VX      XT0,XT0,XA0
        VX      XT1,XT1,XB0
        VX      XT2,XT2,XC0
        VX      XT3,XT3,XD0

        VSTM    XT0,XT3,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_4x

        VAF     XA0,XA1,K0
        VAF     XB0,XB1,K1
        VAF     XC0,XC1,K2
        VAF     XD0,XD1,K3

        VPERM   XA0,XA0,XA0,BEPERM
        VPERM   XB0,XB0,XB0,BEPERM
        VPERM   XC0,XC0,XC0,BEPERM
        VPERM   XD0,XD0,XD0,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_4x

        VLM     XT0,XT3,0,INP,0

        VX      XT0,XT0,XA0
        VX      XT1,XT1,XB0
        VX      XT2,XT2,XC0
        VX      XT3,XT3,XD0

        VSTM    XT0,XT3,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_4x

        VAF     XA0,XA2,K0
        VAF     XB0,XB2,K1
        VAF     XC0,XC2,K2
        VAF     XD0,XD2,K3

        VPERM   XA0,XA0,XA0,BEPERM
        VPERM   XB0,XB0,XB0,BEPERM
        VPERM   XC0,XC0,XC0,BEPERM
        VPERM   XD0,XD0,XD0,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_4x

        VLM     XT0,XT3,0,INP,0

        VX      XT0,XT0,XA0
        VX      XT1,XT1,XB0
        VX      XT2,XT2,XC0
        VX      XT3,XT3,XD0

        VSTM    XT0,XT3,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_4x

        VAF     XA0,XA3,K0
        VAF     XB0,XB3,K1
        VAF     XC0,XC3,K2
        VAF     XD0,XD3,K3

        VPERM   XA0,XA0,XA0,BEPERM
        VPERM   XB0,XB0,XB0,BEPERM
        VPERM   XC0,XC0,XC0,BEPERM
        VPERM   XD0,XD0,XD0,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_4x

        VLM     XT0,XT3,0,INP,0

        VX      XT0,XT0,XA0
        VX      XT1,XT1,XB0
        VX      XT2,XT2,XC0
        VX      XT3,XT3,XD0

        VSTM    XT0,XT3,0,OUT,0

.Ldone_4x:
        lmg     %r6,%r7,6*8(SP)
        BR_EX   %r14

.Ltail_4x:
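# Spill the keystream of the last, partial block (in XA0/XB0/XC0/XD0)
# into the stack save area, then XOR the remaining LEN bytes into the
# output one byte at a time.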
        VLR     XT0,XC0
        VLR     XT1,XD0

        VST     XA0,8*8+0x00,,SP
        VST     XB0,8*8+0x10,,SP
        VST     XT0,8*8+0x20,,SP
        VST     XT1,8*8+0x30,,SP

        lghi    %r1,0

.Loop_tail_4x:
        llgc    %r5,0(%r1,INP)
        llgc    %r6,8*8(%r1,SP)
        xr      %r6,%r5
        stc     %r6,0(%r1,OUT)
        la      %r1,1(%r1)
        brct    LEN,.Loop_tail_4x

        lmg     %r6,%r7,6*8(SP)
        BR_EX   %r14
SYM_FUNC_END(chacha20_vx_4x)

#undef  OUT
#undef  INP
#undef  LEN
#undef  KEY
#undef  COUNTER

#undef BEPERM

#undef K0
#undef K1
#undef K2
#undef K3


#############################################################################
# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
#                  const u32 *key, const u32 *counter)
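#
# Top-level entry point.  Lengths of at most 256 bytes are delegated to
# chacha20_vx_4x above; otherwise each outer iteration produces six
# blocks (384 bytes) in parallel, holding each 16-word block state as
# four 4-word row vectors.  Informally:
#
#       void chacha20_vx(u8 *out, const u8 *inp, size_t len,
#                        const u32 *key, const u32 *counter)
#       {
#               if (len <= 256)
#                       return chacha20_vx_4x(out, inp, len, key, counter);
#               generate six blocks per outer iteration, storing each
#               finalized 64-byte block and diverting to a byte-wise
#               tail for a final partial block;
#       }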

#define OUT             %r2
#define INP             %r3
#define LEN             %r4
#define KEY             %r5
#define COUNTER         %r6

#define BEPERM          %v31

#define K0              %v27
#define K1              %v24
#define K2              %v25
#define K3              %v26

#define A0              %v0
#define B0              %v1
#define C0              %v2
#define D0              %v3

#define A1              %v4
#define B1              %v5
#define C1              %v6
#define D1              %v7

#define A2              %v8
#define B2              %v9
#define C2              %v10
#define D2              %v11

#define A3              %v12
#define B3              %v13
#define C3              %v14
#define D3              %v15

#define A4              %v16
#define B4              %v17
#define C4              %v18
#define D4              %v19

#define A5              %v20
#define B5              %v21
#define C5              %v22
#define D5              %v23

#define T0              %v27
#define T1              %v28
#define T2              %v29
#define T3              %v30

SYM_FUNC_START(chacha20_vx)
        clgfi   LEN,256
        jle     chacha20_vx_4x
        stmg    %r6,%r7,6*8(SP)

        lghi    %r1,-FRAME
        lgr     %r0,SP
        la      SP,0(%r1,SP)
        stg     %r0,0(SP)               # back-chain

        larl    %r7,sigma
        lhi     %r0,10

        VLM     K1,K2,0,KEY,0           # load key
        VL      K3,0,,COUNTER           # load counter

        VLM     K0,BEPERM,0,%r7,4       # load sigma, increments, ...
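
# v27-v31 now hold sigma (K0), the {1,0,0,0}..{3,0,0,0} counter
# increments (aliased as T1-T3) and the byte-swap permutation (BEPERM);
# key and counter went into K1-K3 above.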

.Loop_outer_vx:
        VLR     A0,K0
        VLR     B0,K1
        VLR     A1,K0
        VLR     B1,K1
        VLR     A2,K0
        VLR     B2,K1
        VLR     A3,K0
        VLR     B3,K1
        VLR     A4,K0
        VLR     B4,K1
        VLR     A5,K0
        VLR     B5,K1

        VLR     D0,K3
        VAF     D1,K3,T1                # K[3]+1
        VAF     D2,K3,T2                # K[3]+2
        VAF     D3,K3,T3                # K[3]+3
        VAF     D4,D2,T2                # K[3]+4
        VAF     D5,D2,T3                # K[3]+5

        VLR     C0,K2
        VLR     C1,K2
        VLR     C2,K2
        VLR     C3,K2
        VLR     C4,K2
        VLR     C5,K2

        VLR     T1,D1
        VLR     T2,D2
        VLR     T3,D3
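
# T1-T3 now save the initial D1-D3 rows (counter +1..+3) for the
# feed-forward after the rounds; the increment constants they held are
# reloaded from the table once the first block has been stored.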

.Loop_vx:
        VAF     A0,A0,B0
        VAF     A1,A1,B1
        VAF     A2,A2,B2
        VAF     A3,A3,B3
        VAF     A4,A4,B4
        VAF     A5,A5,B5
        VX      D0,D0,A0
        VX      D1,D1,A1
        VX      D2,D2,A2
        VX      D3,D3,A3
        VX      D4,D4,A4
        VX      D5,D5,A5
        VERLLF  D0,D0,16
        VERLLF  D1,D1,16
        VERLLF  D2,D2,16
        VERLLF  D3,D3,16
        VERLLF  D4,D4,16
        VERLLF  D5,D5,16

        VAF     C0,C0,D0
        VAF     C1,C1,D1
        VAF     C2,C2,D2
        VAF     C3,C3,D3
        VAF     C4,C4,D4
        VAF     C5,C5,D5
        VX      B0,B0,C0
        VX      B1,B1,C1
        VX      B2,B2,C2
        VX      B3,B3,C3
        VX      B4,B4,C4
        VX      B5,B5,C5
        VERLLF  B0,B0,12
        VERLLF  B1,B1,12
        VERLLF  B2,B2,12
        VERLLF  B3,B3,12
        VERLLF  B4,B4,12
        VERLLF  B5,B5,12

        VAF     A0,A0,B0
        VAF     A1,A1,B1
        VAF     A2,A2,B2
        VAF     A3,A3,B3
        VAF     A4,A4,B4
        VAF     A5,A5,B5
        VX      D0,D0,A0
        VX      D1,D1,A1
        VX      D2,D2,A2
        VX      D3,D3,A3
        VX      D4,D4,A4
        VX      D5,D5,A5
        VERLLF  D0,D0,8
        VERLLF  D1,D1,8
        VERLLF  D2,D2,8
        VERLLF  D3,D3,8
        VERLLF  D4,D4,8
        VERLLF  D5,D5,8

        VAF     C0,C0,D0
        VAF     C1,C1,D1
        VAF     C2,C2,D2
        VAF     C3,C3,D3
        VAF     C4,C4,D4
        VAF     C5,C5,D5
        VX      B0,B0,C0
        VX      B1,B1,C1
        VX      B2,B2,C2
        VX      B3,B3,C3
        VX      B4,B4,C4
        VX      B5,B5,C5
        VERLLF  B0,B0,7
        VERLLF  B1,B1,7
        VERLLF  B2,B2,7
        VERLLF  B3,B3,7
        VERLLF  B4,B4,7
        VERLLF  B5,B5,7
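
# Rotate the B, C and D rows within each block state so that the next
# round mixes the diagonals instead of the columns.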

        VSLDB   C0,C0,C0,8
        VSLDB   C1,C1,C1,8
        VSLDB   C2,C2,C2,8
        VSLDB   C3,C3,C3,8
        VSLDB   C4,C4,C4,8
        VSLDB   C5,C5,C5,8
        VSLDB   B0,B0,B0,4
        VSLDB   B1,B1,B1,4
        VSLDB   B2,B2,B2,4
        VSLDB   B3,B3,B3,4
        VSLDB   B4,B4,B4,4
        VSLDB   B5,B5,B5,4
        VSLDB   D0,D0,D0,12
        VSLDB   D1,D1,D1,12
        VSLDB   D2,D2,D2,12
        VSLDB   D3,D3,D3,12
        VSLDB   D4,D4,D4,12
        VSLDB   D5,D5,D5,12

        VAF     A0,A0,B0
        VAF     A1,A1,B1
        VAF     A2,A2,B2
        VAF     A3,A3,B3
        VAF     A4,A4,B4
        VAF     A5,A5,B5
        VX      D0,D0,A0
        VX      D1,D1,A1
        VX      D2,D2,A2
        VX      D3,D3,A3
        VX      D4,D4,A4
        VX      D5,D5,A5
        VERLLF  D0,D0,16
        VERLLF  D1,D1,16
        VERLLF  D2,D2,16
        VERLLF  D3,D3,16
        VERLLF  D4,D4,16
        VERLLF  D5,D5,16

        VAF     C0,C0,D0
        VAF     C1,C1,D1
        VAF     C2,C2,D2
        VAF     C3,C3,D3
        VAF     C4,C4,D4
        VAF     C5,C5,D5
        VX      B0,B0,C0
        VX      B1,B1,C1
        VX      B2,B2,C2
        VX      B3,B3,C3
        VX      B4,B4,C4
        VX      B5,B5,C5
        VERLLF  B0,B0,12
        VERLLF  B1,B1,12
        VERLLF  B2,B2,12
        VERLLF  B3,B3,12
        VERLLF  B4,B4,12
        VERLLF  B5,B5,12

        VAF     A0,A0,B0
        VAF     A1,A1,B1
        VAF     A2,A2,B2
        VAF     A3,A3,B3
        VAF     A4,A4,B4
        VAF     A5,A5,B5
        VX      D0,D0,A0
        VX      D1,D1,A1
        VX      D2,D2,A2
        VX      D3,D3,A3
        VX      D4,D4,A4
        VX      D5,D5,A5
        VERLLF  D0,D0,8
        VERLLF  D1,D1,8
        VERLLF  D2,D2,8
        VERLLF  D3,D3,8
        VERLLF  D4,D4,8
        VERLLF  D5,D5,8

        VAF     C0,C0,D0
        VAF     C1,C1,D1
        VAF     C2,C2,D2
        VAF     C3,C3,D3
        VAF     C4,C4,D4
        VAF     C5,C5,D5
        VX      B0,B0,C0
        VX      B1,B1,C1
        VX      B2,B2,C2
        VX      B3,B3,C3
        VX      B4,B4,C4
        VX      B5,B5,C5
        VERLLF  B0,B0,7
        VERLLF  B1,B1,7
        VERLLF  B2,B2,7
        VERLLF  B3,B3,7
        VERLLF  B4,B4,7
        VERLLF  B5,B5,7
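
# ...and rotate the rows back into column order before the next double
# round.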

        VSLDB   C0,C0,C0,8
        VSLDB   C1,C1,C1,8
        VSLDB   C2,C2,C2,8
        VSLDB   C3,C3,C3,8
        VSLDB   C4,C4,C4,8
        VSLDB   C5,C5,C5,8
        VSLDB   B0,B0,B0,12
        VSLDB   B1,B1,B1,12
        VSLDB   B2,B2,B2,12
        VSLDB   B3,B3,B3,12
        VSLDB   B4,B4,B4,12
        VSLDB   B5,B5,B5,12
        VSLDB   D0,D0,D0,4
        VSLDB   D1,D1,D1,4
        VSLDB   D2,D2,D2,4
        VSLDB   D3,D3,D3,4
        VSLDB   D4,D4,D4,4
        VSLDB   D5,D5,D5,4
        brct    %r0,.Loop_vx

        VAF     A0,A0,K0
        VAF     B0,B0,K1
        VAF     C0,C0,K2
        VAF     D0,D0,K3
        VAF     A1,A1,K0
        VAF     D1,D1,T1                # +K[3]+1

        VPERM   A0,A0,A0,BEPERM
        VPERM   B0,B0,B0,BEPERM
        VPERM   C0,C0,C0,BEPERM
        VPERM   D0,D0,D0,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_vx

        VAF     D2,D2,T2                # +K[3]+2
        VAF     D3,D3,T3                # +K[3]+3
        VLM     T0,T3,0,INP,0

        VX      A0,A0,T0
        VX      B0,B0,T1
        VX      C0,C0,T2
        VX      D0,D0,T3

        VLM     K0,T3,0,%r7,4           # re-load sigma and increments

        VSTM    A0,D0,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_vx

        VAF     B1,B1,K1
        VAF     C1,C1,K2

        VPERM   A0,A1,A1,BEPERM
        VPERM   B0,B1,B1,BEPERM
        VPERM   C0,C1,C1,BEPERM
        VPERM   D0,D1,D1,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_vx

        VLM     A1,D1,0,INP,0

        VX      A0,A0,A1
        VX      B0,B0,B1
        VX      C0,C0,C1
        VX      D0,D0,D1

        VSTM    A0,D0,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_vx

        VAF     A2,A2,K0
        VAF     B2,B2,K1
        VAF     C2,C2,K2

        VPERM   A0,A2,A2,BEPERM
        VPERM   B0,B2,B2,BEPERM
        VPERM   C0,C2,C2,BEPERM
        VPERM   D0,D2,D2,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_vx

        VLM     A1,D1,0,INP,0

        VX      A0,A0,A1
        VX      B0,B0,B1
        VX      C0,C0,C1
        VX      D0,D0,D1

        VSTM    A0,D0,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_vx

        VAF     A3,A3,K0
        VAF     B3,B3,K1
        VAF     C3,C3,K2
        VAF     D2,K3,T3                # K[3]+3

        VPERM   A0,A3,A3,BEPERM
        VPERM   B0,B3,B3,BEPERM
        VPERM   C0,C3,C3,BEPERM
        VPERM   D0,D3,D3,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_vx

        VAF     D3,D2,T1                # K[3]+4
        VLM     A1,D1,0,INP,0

        VX      A0,A0,A1
        VX      B0,B0,B1
        VX      C0,C0,C1
        VX      D0,D0,D1

        VSTM    A0,D0,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_vx

        VAF     A4,A4,K0
        VAF     B4,B4,K1
        VAF     C4,C4,K2
        VAF     D4,D4,D3                # +K[3]+4
        VAF     D3,D3,T1                # K[3]+5
        VAF     K3,D2,T3                # K[3]+=6

        VPERM   A0,A4,A4,BEPERM
        VPERM   B0,B4,B4,BEPERM
        VPERM   C0,C4,C4,BEPERM
        VPERM   D0,D4,D4,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_vx

        VLM     A1,D1,0,INP,0

        VX      A0,A0,A1
        VX      B0,B0,B1
        VX      C0,C0,C1
        VX      D0,D0,D1

        VSTM    A0,D0,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        aghi    LEN,-0x40
        je      .Ldone_vx

        VAF     A5,A5,K0
        VAF     B5,B5,K1
        VAF     C5,C5,K2
        VAF     D5,D5,D3                # +K[3]+5

        VPERM   A0,A5,A5,BEPERM
        VPERM   B0,B5,B5,BEPERM
        VPERM   C0,C5,C5,BEPERM
        VPERM   D0,D5,D5,BEPERM

        clgfi   LEN,0x40
        jl      .Ltail_vx

        VLM     A1,D1,0,INP,0

        VX      A0,A0,A1
        VX      B0,B0,B1
        VX      C0,C0,C1
        VX      D0,D0,D1

        VSTM    A0,D0,0,OUT,0

        la      INP,0x40(INP)
        la      OUT,0x40(OUT)
        lhi     %r0,10
        aghi    LEN,-0x40
        jne     .Loop_outer_vx

.Ldone_vx:
        lmg     %r6,%r7,FRAME+6*8(SP)
        la      SP,FRAME(SP)
        BR_EX   %r14

.Ltail_vx:
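# Spill the keystream of the last, partial block to the stack, then XOR
# the remaining LEN bytes into the output one byte at a time, as in the
# 4x tail above.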
        VSTM    A0,D0,8*8,SP,3
        lghi    %r1,0

.Loop_tail_vx:
        llgc    %r5,0(%r1,INP)
        llgc    %r6,8*8(%r1,SP)
        xr      %r6,%r5
        stc     %r6,0(%r1,OUT)
        la      %r1,1(%r1)
        brct    LEN,.Loop_tail_vx

        lmg     %r6,%r7,FRAME+6*8(SP)
        la      SP,FRAME(SP)
        BR_EX   %r14
SYM_FUNC_END(chacha20_vx)

.previous