/*
 * Register aliases used throughout this file (expanded by the C
 * preprocessor before assembly).
 *
 * Only x0-x2, w11 and v16-v29 are named here; under AAPCS64 these are
 * all caller-saved, so none of them needs to be preserved.
 */

/* Arguments: the first three integer argument registers. */
#define ctx x0
#define in x1
#define num x2
/* Running hash state: hc0 holds four 32-bit words, hc1 the fifth
 * (hc1s is the scalar view of the same register). */
#define hc0 v16
#define hc1 v17
#define hc1s s17
/* Per-block working copy of the state. hs0q / hs1s are the q / s views
 * of hs0 / hs1; the round macros build these names by token pasting. */
#define hs0 v18
#define hs0q q18
#define hs1 v19
#define hs1s s19
/* Message schedule, four 32-bit words per register.
 * NOTE: these names shadow the general-purpose register names w0-w3
 * for everything that follows in this file. */
#define w0 v20
#define w1 v21
#define w2 v22
#define w3 v23
/* Round constants, each replicated across all four 32-bit lanes. */
#define k0 v24
#define k1 v25
#define k2 v26
#define k3 v27
/* Scratch: tmp0 is a vector temporary, tmp1 a scalar SIMD temporary,
 * tmp2 a general-purpose temporary used to build the constants. */
#define tmp0 v28
#define tmp1 s29
#define tmp2 w11
/*
 * Advance the message schedule by four words.
 *
 * m0 is overwritten with the next four schedule words, derived from
 * m0, m1, m2 (sha1su0) and m3 (sha1su1). Call sites rotate the four
 * schedule registers so that m0 is always the oldest one.
 *
 * Arguments are vector register aliases without an arrangement suffix;
 * the macro appends .4s itself.
 */
#define sha1_message_schedule_update(m0, m1, m2, m3) \
sha1su0 m0.4s, m1.4s, m2.4s; \
sha1su1 m0.4s, m3.4s
/*
 * Four SHA-1 rounds using sha1c (the "choose" round function).
 *
 *   tmp0 = w + k (lane-wise)
 *   tmp1 = lane 0 of h0, saved before sha1c overwrites h0
 *   sha1c updates h0 from h0, h1 and tmp0
 *   sha1h produces the new h1 from the saved lane
 *
 * h0 and h1 must be alias names that have q- / s-suffixed companions
 * (h0##q, h1##s), i.e. hs0 / hs1. Pass them directly: operands of ##
 * are not macro-expanded first, so forwarding them through another
 * macro would paste onto the raw register name instead.
 *
 * Clobbers tmp0 and tmp1.
 */
#define sha1_round1(h0, h1, w, k) \
add tmp0.4s, w.4s, k.4s; \
mov tmp1, h0.s[0]; \
sha1c h0##q, h1##s, tmp0.4s; \
sha1h h1##s, tmp1
/*
 * Four SHA-1 rounds using sha1p (the "parity" round function).
 *
 *   tmp0 = w + k (lane-wise)
 *   tmp1 = lane 0 of h0, saved before sha1p overwrites h0
 *   sha1p updates h0 from h0, h1 and tmp0
 *   sha1h produces the new h1 from the saved lane
 *
 * h0 and h1 must be alias names with q- / s-suffixed companions
 * (h0##q, h1##s), i.e. hs0 / hs1, passed directly rather than through
 * another macro.
 *
 * Clobbers tmp0 and tmp1.
 */
#define sha1_round2(h0, h1, w, k) \
add tmp0.4s, w.4s, k.4s; \
mov tmp1, h0.s[0]; \
sha1p h0##q, h1##s, tmp0.4s; \
sha1h h1##s, tmp1
/*
 * Four SHA-1 rounds using sha1m (the "majority" round function).
 *
 *   tmp0 = w + k (lane-wise)
 *   tmp1 = lane 0 of h0, saved before sha1m overwrites h0
 *   sha1m updates h0 from h0, h1 and tmp0
 *   sha1h produces the new h1 from the saved lane
 *
 * h0 and h1 must be alias names with q- / s-suffixed companions
 * (h0##q, h1##s), i.e. hs0 / hs1, passed directly rather than through
 * another macro.
 *
 * Clobbers tmp0 and tmp1.
 */
#define sha1_round3(h0, h1, w, k) \
add tmp0.4s, w.4s, k.4s; \
mov tmp1, h0.s[0]; \
sha1m h0##q, h1##s, tmp0.4s; \
sha1h h1##s, tmp1
/*
 * Four SHA-1 rounds using sha1p (the "parity" round function).
 *
 * The instruction sequence is the same as sha1_round2; the two macros
 * differ only in which constant register the call sites pass as k.
 *
 *   tmp0 = w + k (lane-wise)
 *   tmp1 = lane 0 of h0, saved before sha1p overwrites h0
 *   sha1p updates h0 from h0, h1 and tmp0
 *   sha1h produces the new h1 from the saved lane
 *
 * h0 and h1 must be alias names with q- / s-suffixed companions
 * (h0##q, h1##s), i.e. hs0 / hs1, passed directly rather than through
 * another macro.
 *
 * Clobbers tmp0 and tmp1.
 */
#define sha1_round4(h0, h1, w, k) \
add tmp0.4s, w.4s, k.4s; \
mov tmp1, h0.s[0]; \
sha1p h0##q, h1##s, tmp0.4s; \
sha1h h1##s, tmp1
/* Target ARMv8-A with the +sha2 extension; the sha1* instructions used
 * in this file are assembled under it. */
.arch armv8-a+sha2
.section .text
/* Export sha1_block_ce and mark it as a function symbol (ELF syntax). */
.globl sha1_block_ce
.type sha1_block_ce,@function
/*
 * sha1_block_ce(ctx, in, num)
 *
 * SHA-1 compression of num consecutive 64-byte blocks using the ARMv8
 * SHA-1 instructions.
 *
 * ABI:   AAPCS64, leaf function, no stack usage.
 *        Presumably declared in C as taking a state pointer, a data
 *        pointer and a block count - confirm against the C prototype.
 * In:    x0 (ctx) = address of five 32-bit state words (bytes 0..19
 *                   are read and written back)
 *        x1 (in)  = address of the input; 64 bytes are consumed per
 *                   block
 *        x2 (num) = number of 64-byte blocks; 0 returns immediately
 *                   without touching the state
 * Out:   the state at [ctx] is updated in place
 * Clobb: x1 (advanced past the consumed input), x2 (zero on return),
 *        w11, v16-v29. Condition flags are not modified.
 */
sha1_block_ce:
        /*
         * The loop below tests num only after processing a block, so a
         * zero count would wrap to 2^64 - 1 and run far past the input.
         */
        cbz     num, .Lsha1_block_ce_done

        /* Broadcast the round constants: k0 serves rounds 0-19, k1
         * rounds 20-39, k2 rounds 40-59 and k3 rounds 60-79. */
        movz    tmp2, #0x5a82, lsl #16
        movk    tmp2, #0x7999
        dup     k0.4s, tmp2
        movz    tmp2, #0x6ed9, lsl #16
        movk    tmp2, #0xeba1
        dup     k1.4s, tmp2
        movz    tmp2, #0x8f1b, lsl #16
        movk    tmp2, #0xbcdc
        dup     k2.4s, tmp2
        movz    tmp2, #0xca62, lsl #16
        movk    tmp2, #0xc1d6
        dup     k3.4s, tmp2

        /* Load the running hash: words 0-3 into hc0, word 4 into hc1. */
        ld1     {hc0.4s}, [ctx]
        ldr     hc1s, [ctx, #(4*4)]

.Lblock_loop:
        /* Work on a copy so the pre-block state can be added back in. */
        mov     hs0.16b, hc0.16b
        mov     hs1s, hc1.s[0]

        /* Load one 64-byte block and byte swap each 32-bit word. */
        ld1     {w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
        rev32   w0.16b, w0.16b
        rev32   w1.16b, w1.16b
        rev32   w2.16b, w2.16b
        rev32   w3.16b, w3.16b

        /* Rounds 0 through 15 (each macro performs four rounds). */
        sha1_round1(hs0, hs1, w0, k0)
        sha1_round1(hs0, hs1, w1, k0)
        sha1_round1(hs0, hs1, w2, k0)
        sha1_round1(hs0, hs1, w3, k0)

        /* Rounds 16 through 31. */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round1(hs0, hs1, w0, k0)
        sha1_round2(hs0, hs1, w1, k1)
        sha1_round2(hs0, hs1, w2, k1)
        sha1_round2(hs0, hs1, w3, k1)

        /* Rounds 32 through 47. */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round2(hs0, hs1, w0, k1)
        sha1_round2(hs0, hs1, w1, k1)
        sha1_round3(hs0, hs1, w2, k2)
        sha1_round3(hs0, hs1, w3, k2)

        /* Rounds 48 through 63. */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round3(hs0, hs1, w0, k2)
        sha1_round3(hs0, hs1, w1, k2)
        sha1_round3(hs0, hs1, w2, k2)
        sha1_round4(hs0, hs1, w3, k3)

        /* Rounds 64 through 79. */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round4(hs0, hs1, w0, k3)
        sha1_round4(hs0, hs1, w1, k3)
        sha1_round4(hs0, hs1, w2, k3)
        sha1_round4(hs0, hs1, w3, k3)

        /* Fold the working state back into the running hash. */
        add     hc0.4s, hc0.4s, hs0.4s
        add     hc1.4s, hc1.4s, hs1.4s

        sub     num, num, #1
        cbnz    num, .Lblock_loop

        /* Write the updated hash back to the context. */
        st1     {hc0.4s}, [ctx]
        str     hc1s, [ctx, #(4*4)]

.Lsha1_block_ce_done:
        ret
.size sha1_block_ce, .-sha1_block_ce