/*
 * Register aliases. These are C preprocessor macros, so every later use of
 * one of these identifiers in the file is replaced by the register on the
 * right. In particular w0-w3 no longer refer to the 32-bit general-purpose
 * register views once these definitions are in effect.
 */

/* General-purpose registers. */
#define ctx		x0	/* address the hash state is loaded from and stored to */
#define in		x1	/* input cursor, post-incremented by 64 for each block */
#define num		x2	/* number of blocks still to process */
#define k256_base	x9	/* address of the K256 table */
#define k256		x10	/* cursor into K256, reset from k256_base per block */

/* Hash state carried from block to block. */
#define hc0		v16
#define hc1		v17

/* Working copy of the hash state for the block being processed. */
#define hs0		v18
#define hs1		v19

/*
 * 128-bit scalar (q) views of the working state. The round macro pastes a
 * "q" suffix onto the already expanded register name (v18/v19), so the
 * aliases are keyed on the register name rather than on hs0/hs1. They must
 * be kept in sync with hs0/hs1 above.
 */
#define v18q		q18
#define v19q		q19

/* Message schedule vectors, four 32-bit words each. */
#define w0		v20
#define w1		v21
#define w2		v22
#define w3		v23

/* Round constant vectors, four 32-bit words each. */
#define k0		v24
#define k1		v25
#define k2		v26
#define k3		v27

/* Scratch registers used inside the round macro. */
#define tmp0		v28
#define tmp1		v29
#define tmp1q		q29	/* q view of tmp1 */
/*
 * sha256_message_schedule_update(m0, m1, m2, m3)
 *
 * Replaces the four schedule words in m0 with the next four, computed by
 * sha256su0/sha256su1 from m0 and the three following schedule vectors
 * m1, m2 and m3. Only m0 is written.
 */
#define sha256_message_schedule_update(m0, m1, m2, m3) \
	sha256su0 m0.4s, m1.4s; \
	sha256su1 m0.4s, m2.4s, m3.4s

/*
 * sha256_round_expanded(h0, h1, w, k)
 *
 * Implementation of sha256_round. h0 and h1 must already be plain vector
 * register names (vN) for which a vNq alias is defined, because the "q"
 * suffix is token-pasted onto them and ## does not macro-expand its
 * operands. Do not invoke directly; use sha256_round.
 *
 * Clobbers tmp0 and tmp1.
 */
#define sha256_round_expanded(h0, h1, w, k) \
	add tmp0.4s, w.4s, k.4s;		/* tmp0 = w + k */ \
	mov tmp1.16b, h0.16b;			/* save h0: sha256h overwrites it */ \
	sha256h h0##q, h1##q, tmp0.4s; \
	sha256h2 h1##q, tmp1##q, tmp0.4s

/*
 * sha256_round(h0, h1, w, k)
 *
 * Updates the state pair h0/h1 using schedule vector w and constant vector k.
 * The extra macro level forces h0/h1 to be expanded to their vN register
 * names before sha256_round_expanded pastes the "q" suffix, so this macro
 * works whether it is given an alias (hs0) or a register name (v18).
 */
#define sha256_round(h0, h1, w, k) \
	sha256_round_expanded(h0, h1, w, k)

/* Round step that uses the schedule vector w as it stands. */
#define sha256_round_initial(h0, h1, w, k) \
	sha256_round(h0, h1, w, k)

/*
 * Round step that first advances the message schedule: m0 is regenerated
 * from m0..m3 and the new m0 is then fed to the round.
 */
#define sha256_round_update(h0, h1, m0, m1, m2, m3, k) \
	sha256_message_schedule_update(m0, m1, m2, m3); \
	sha256_round(h0, h1, m0, k)
/*
 * sha256_block_ce
 *
 * SHA-256 block compression using the Armv8 SHA2 instructions.
 *
 * In:    ctx (x0) = address of the 32-byte hash state (eight 32-bit words),
 *                   read on entry and written back on exit
 *        in  (x1) = address of the input, consumed 64 bytes per block
 *        num (x2) = number of 64-byte blocks to process
 * Out:   hash state at [ctx] updated; nothing is returned in a register
 * Clobb: x1, x2, x9, x10, v16-v29. The stack and the condition flags are
 *        not touched.
 *
 * When num is zero the function returns immediately and [ctx] is left
 * unmodified.
 *
 * NOTE(review): presumably declared in C as
 *   void sha256_block_ce(<state> *ctx, const void *in, size_t num);
 * confirm against the caller. The rev32 byte swaps assume a little-endian
 * data layout - confirm for the build target.
 */
.arch armv8-a+sha2
.section .text
.p2align 4
.globl sha256_block_ce
.type sha256_block_ce,@function
sha256_block_ce:
	/*
	 * Guard against an empty request: the loop below decrements before
	 * testing, so num == 0 would wrap around and run 2^64 iterations.
	 */
	cbz	num, .Lblock_done

	/* k256_base = &K256 (PC-relative page address plus low 12 bits). */
	adrp	k256_base, K256
	add	k256_base, k256_base, :lo12:K256

	/* Load the current hash state. */
	ld1	{hc0.4s, hc1.4s}, [ctx]

.Lblock_loop:
	mov	k256, k256_base			/* restart at the first constant */

	/* Work on a copy so the incoming state can be added back afterwards. */
	mov	hs0.16b, hc0.16b
	mov	hs1.16b, hc1.16b

	/* Load one 64-byte block and byte-swap each 32-bit word. */
	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
	rev32	w0.16b, w0.16b
	rev32	w1.16b, w1.16b
	rev32	w2.16b, w2.16b
	rev32	w3.16b, w3.16b

	/* Constants 0-15: the message words are used directly. */
	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
	sha256_round_initial(hs0, hs1, w0, k0)
	sha256_round_initial(hs0, hs1, w1, k1)
	sha256_round_initial(hs0, hs1, w2, k2)
	sha256_round_initial(hs0, hs1, w3, k3)

	/*
	 * Constants 16-63: each step regenerates its schedule vector from the
	 * four most recent ones before using it, rotating through w0..w3.
	 */
	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
	sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
	sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
	sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
	sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)

	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
	sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
	sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
	sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
	sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)

	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64
	sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
	sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
	sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
	sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)

	/* Add the working state back into the running hash state. */
	add	hc0.4s, hc0.4s, hs0.4s
	add	hc1.4s, hc1.4s, hs1.4s

	sub	num, num, #1
	cbnz	num, .Lblock_loop

	/* Write the updated hash state back. */
	st1	{hc0.4s, hc1.4s}, [ctx]

.Lblock_done:
	ret
.size sha256_block_ce,.-sha256_block_ce
/*
 * K256: the 64 32-bit SHA-256 round constants, in round order. The table is
 * 256 bytes long and is read by sha256_block_ce as four 64-byte groups of
 * four 4-word vectors. Each row below is one vector; the trailing comment
 * gives the indices of the constants it holds.
 */
.section .rodata
.p2align 4
.type K256,%object
K256:
	.word	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5	/*  0 -  3 */
	.word	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5	/*  4 -  7 */
	.word	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3	/*  8 - 11 */
	.word	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174	/* 12 - 15 */
	.word	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc	/* 16 - 19 */
	.word	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da	/* 20 - 23 */
	.word	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7	/* 24 - 27 */
	.word	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967	/* 28 - 31 */
	.word	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13	/* 32 - 35 */
	.word	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85	/* 36 - 39 */
	.word	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3	/* 40 - 43 */
	.word	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070	/* 44 - 47 */
	.word	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5	/* 48 - 51 */
	.word	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3	/* 52 - 55 */
	.word	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208	/* 56 - 59 */
	.word	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2	/* 60 - 63 */
.size K256,.-K256