#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>
.text
#define state0 v0
#define state1 v1
#define state2 v2
#define state3 v3
#define copy0 v4
#define copy0_q q4
#define copy1 v5
#define copy2 v6
#define copy3 v7
#define copy3_d d7
#define one_d d16
#define one_q q16
#define one_v v16
#define tmp v17
#define rot8 v18
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
mov_q x8, 0x3320646e61707865
mov_q x9, 0x6b20657479622d32
mov copy0.d[0], x8
mov copy0.d[1], x9
ld1 { copy1.4s, copy2.4s }, [x1]
ld1 { copy3.2s }, [x2]
movi one_v.2s, #1
uzp1 one_v.4s, one_v.4s, one_v.4s
.Lblock:
mov state0.16b, copy0.16b
mov state1.16b, copy1.16b
mov state2.16b, copy2.16b
mov state3.16b, copy3.16b
mov w4, 20
.Lpermute:
.Ldoubleround:
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
rev32 state3.8h, state3.8h
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
ext state1.16b, state1.16b, state1.16b, #4
ext state2.16b, state2.16b, state2.16b, #8
ext state3.16b, state3.16b, state3.16b, #12
add state0.4s, state0.4s, state1.4s
eor state3.16b, state3.16b, state0.16b
rev32 state3.8h, state3.8h
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #12
sri state1.4s, tmp.4s, #20
add state0.4s, state0.4s, state1.4s
eor tmp.16b, state3.16b, state0.16b
shl state3.4s, tmp.4s, #8
sri state3.4s, tmp.4s, #24
add state2.4s, state2.4s, state3.4s
eor tmp.16b, state1.16b, state2.16b
shl state1.4s, tmp.4s, #7
sri state1.4s, tmp.4s, #25
ext state1.16b, state1.16b, state1.16b, #12
ext state2.16b, state2.16b, state2.16b, #8
ext state3.16b, state3.16b, state3.16b, #4
subs w4, w4, #2
b.ne .Ldoubleround
add state0.4s, state0.4s, copy0.4s
add state1.4s, state1.4s, copy1.4s
add state2.4s, state2.4s, copy2.4s
add state3.4s, state3.4s, copy3.4s
st1 { state0.16b - state3.16b }, [x0]
add copy3_d, copy3_d, one_d
add x0, x0, 64
subs x3, x3, #1
b.ne .Lblock
st1 { copy3.2s }, [x2]
movi state0.16b, #0
movi state1.16b, #0
movi state2.16b, #0
movi state3.16b, #0
movi copy1.16b, #0
movi copy2.16b, #0
ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
emit_aarch64_feature_1_and