#include <linux/linkage.h>
.text
.option arch, +zvkb
#define STATEP a0
#define INP a1
#define OUTP a2
#define NBLOCKS a3
#define NROUNDS a4
#define CONSTS0 a5
#define CONSTS1 a6
#define CONSTS2 a7
#define CONSTS3 t0
#define TMP t1
#define VL t2
#define STRIDE t3
#define ROUND_CTR t4
#define KEY0 t5
#define KEY1 s1
#define KEY2 s2
#define KEY3 s3
#define KEY4 s4
#define KEY5 s5
#define KEY6 s6
#define KEY7 s7
#define COUNTER s8
#define NONCE0 s9
#define NONCE1 s10
#define NONCE2 s11
.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \
a2, b2, c2, d2, a3, b3, c3, d3
vadd.vv \a0, \a0, \b0
vadd.vv \a1, \a1, \b1
vadd.vv \a2, \a2, \b2
vadd.vv \a3, \a3, \b3
vxor.vv \d0, \d0, \a0
vxor.vv \d1, \d1, \a1
vxor.vv \d2, \d2, \a2
vxor.vv \d3, \d3, \a3
vror.vi \d0, \d0, 32 - 16
vror.vi \d1, \d1, 32 - 16
vror.vi \d2, \d2, 32 - 16
vror.vi \d3, \d3, 32 - 16
vadd.vv \c0, \c0, \d0
vadd.vv \c1, \c1, \d1
vadd.vv \c2, \c2, \d2
vadd.vv \c3, \c3, \d3
vxor.vv \b0, \b0, \c0
vxor.vv \b1, \b1, \c1
vxor.vv \b2, \b2, \c2
vxor.vv \b3, \b3, \c3
vror.vi \b0, \b0, 32 - 12
vror.vi \b1, \b1, 32 - 12
vror.vi \b2, \b2, 32 - 12
vror.vi \b3, \b3, 32 - 12
vadd.vv \a0, \a0, \b0
vadd.vv \a1, \a1, \b1
vadd.vv \a2, \a2, \b2
vadd.vv \a3, \a3, \b3
vxor.vv \d0, \d0, \a0
vxor.vv \d1, \d1, \a1
vxor.vv \d2, \d2, \a2
vxor.vv \d3, \d3, \a3
vror.vi \d0, \d0, 32 - 8
vror.vi \d1, \d1, 32 - 8
vror.vi \d2, \d2, 32 - 8
vror.vi \d3, \d3, 32 - 8
vadd.vv \c0, \c0, \d0
vadd.vv \c1, \c1, \d1
vadd.vv \c2, \c2, \d2
vadd.vv \c3, \c3, \d3
vxor.vv \b0, \b0, \c0
vxor.vv \b1, \b1, \c1
vxor.vv \b2, \b2, \c2
vxor.vv \b3, \b3, \c3
vror.vi \b0, \b0, 32 - 7
vror.vi \b1, \b1, 32 - 7
vror.vi \b2, \b2, 32 - 7
vror.vi \b3, \b3, 32 - 7
.endm
SYM_FUNC_START(chacha_zvkb)
addi sp, sp, -96
sd s1, 8(sp)
sd s2, 16(sp)
sd s3, 24(sp)
sd s4, 32(sp)
sd s5, 40(sp)
sd s6, 48(sp)
sd s7, 56(sp)
sd s8, 64(sp)
sd s9, 72(sp)
sd s10, 80(sp)
sd s11, 88(sp)
li STRIDE, 64
lw CONSTS0, 0(STATEP)
lw CONSTS1, 4(STATEP)
lw CONSTS2, 8(STATEP)
lw CONSTS3, 12(STATEP)
lw KEY0, 16(STATEP)
lw KEY1, 20(STATEP)
lw KEY2, 24(STATEP)
lw KEY3, 28(STATEP)
lw KEY4, 32(STATEP)
lw KEY5, 36(STATEP)
lw KEY6, 40(STATEP)
lw KEY7, 44(STATEP)
lw COUNTER, 48(STATEP)
lw NONCE0, 52(STATEP)
lw NONCE1, 56(STATEP)
lw NONCE2, 60(STATEP)
.Lblock_loop:
vsetvli VL, NBLOCKS, e32, m1, ta, ma
vmv.v.x v0, CONSTS0
vmv.v.x v1, CONSTS1
vmv.v.x v2, CONSTS2
vmv.v.x v3, CONSTS3
vmv.v.x v4, KEY0
vmv.v.x v5, KEY1
vmv.v.x v6, KEY2
vmv.v.x v7, KEY3
vmv.v.x v8, KEY4
vmv.v.x v9, KEY5
vmv.v.x v10, KEY6
vmv.v.x v11, KEY7
vid.v v12
vadd.vx v12, v12, COUNTER
vmv.v.x v13, NONCE0
vmv.v.x v14, NONCE1
vmv.v.x v15, NONCE2
vlsseg8e32.v v16, (INP), STRIDE
mv ROUND_CTR, NROUNDS
.Lnext_doubleround:
addi ROUND_CTR, ROUND_CTR, -2
chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \
v2, v6, v10, v14, v3, v7, v11, v15
chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \
v2, v7, v8, v13, v3, v4, v9, v14
bnez ROUND_CTR, .Lnext_doubleround
addi TMP, INP, 32
vlsseg8e32.v v24, (TMP), STRIDE
vadd.vx v0, v0, CONSTS0
vadd.vx v1, v1, CONSTS1
vadd.vx v2, v2, CONSTS2
vadd.vx v3, v3, CONSTS3
vadd.vx v4, v4, KEY0
vadd.vx v5, v5, KEY1
vadd.vx v6, v6, KEY2
vadd.vx v7, v7, KEY3
vxor.vv v16, v16, v0
vxor.vv v17, v17, v1
vxor.vv v18, v18, v2
vxor.vv v19, v19, v3
vxor.vv v20, v20, v4
vxor.vv v21, v21, v5
vxor.vv v22, v22, v6
vxor.vv v23, v23, v7
vssseg8e32.v v16, (OUTP), STRIDE
vadd.vx v8, v8, KEY4
vadd.vx v9, v9, KEY5
vadd.vx v10, v10, KEY6
vadd.vx v11, v11, KEY7
vid.v v0
vadd.vx v12, v12, COUNTER
vadd.vx v13, v13, NONCE0
vadd.vx v14, v14, NONCE1
vadd.vx v15, v15, NONCE2
vadd.vv v12, v12, v0
vxor.vv v24, v24, v8
vxor.vv v25, v25, v9
vxor.vv v26, v26, v10
vxor.vv v27, v27, v11
vxor.vv v29, v29, v13
vxor.vv v28, v28, v12
vxor.vv v30, v30, v14
vxor.vv v31, v31, v15
addi TMP, OUTP, 32
vssseg8e32.v v24, (TMP), STRIDE
add COUNTER, COUNTER, VL
sub NBLOCKS, NBLOCKS, VL
slli TMP, VL, 6
add OUTP, OUTP, TMP
add INP, INP, TMP
bnez NBLOCKS, .Lblock_loop
sw COUNTER, 48(STATEP)
ld s1, 8(sp)
ld s2, 16(sp)
ld s3, 24(sp)
ld s4, 32(sp)
ld s5, 40(sp)
ld s6, 48(sp)
ld s7, 56(sp)
ld s8, 64(sp)
ld s9, 72(sp)
ld s10, 80(sp)
ld s11, 88(sp)
addi sp, sp, 96
ret
SYM_FUNC_END(chacha_zvkb)