#include "crypto_assembly.h"

/*
 * SHA-1 block compression using the Intel SHA extensions (SHA-NI).
 *
 * Register aliases - the SysV AMD64 argument registers, plus one
 * callee-saved register for the end-of-message pointer.
 */
#define ctx %rdi
#define in %rsi
#define num %rdx
#define end %rbx

/* Copies of the hash state, folded back in after each 80-round block. */
#define xabcd_save %xmm0
#define xe_save %xmm1

/* Working state: a/b/c/d packed in one register, two alternating e values. */
#define xabcd %xmm2
#define xe0 %xmm3
#define xe1 %xmm4

/* Four groups of four 32-bit message schedule words (W0..W15 rotating). */
#define xmsg0 %xmm5
#define xmsg1 %xmm6
#define xmsg2 %xmm7
#define xmsg3 %xmm8

/* pshufb control mask used to byte-swap big-endian message words on load. */
#define xshufmask %xmm9

/*
 * Load 16 bytes of the message block (words idx*4 .. idx*4+3) from m
 * and byte-swap them via xshufmask.
 */
#define sha1_message_schedule_load(idx, m, xmsg) \
movdqu (idx*16)(m), xmsg; \
pshufb xshufmask, xmsg

/*
 * Compute the next four message schedule words in place:
 * xm0 = sha1msg2(sha1msg1(xm0, xm1) ^ xm2, xm3).
 */
#define sha1_message_schedule_update(xm0, xm1, xm2, xm3) \
sha1msg1 xm1, xm0; \
pxor xm2, xm0; \
sha1msg2 xm3, xm0

/*
 * Perform four SHA-1 rounds: sha1nexte adds the rotated e to the
 * message words, a copy of abcd is captured as the e input for the
 * following group (xe_next), and sha1rnds4 advances abcd four rounds
 * using function/constant selector fn (0..3).
 */
#define sha1_shani_round(fn, xmsg, xe, xe_next) \
sha1nexte xmsg, xe; \
movdqa xabcd, xe_next; \
sha1rnds4 fn, xe, xabcd

/* Four rounds driven by a freshly loaded group of message words. */
#define sha1_shani_round_load(fn, idx, m, xmsg, xe, xe_next) \
sha1_message_schedule_load(idx, m, xmsg); \
sha1_shani_round(fn, xmsg, xe, xe_next)

/* Four rounds driven by a newly computed message schedule group. */
#define sha1_shani_round_update(fn, xm0, xm1, xm2, xm3, xe, xe_next) \
sha1_message_schedule_update(xm0, xm1, xm2, xm3); \
sha1_shani_round(fn, xm0, xe, xe_next)
.section .text

/*
 * void sha1_block_shani(SHA1_CTX *ctx, const uint8_t *in, size_t num)
 *
 * Compress num 64-byte message blocks into the SHA-1 hash state using
 * the Intel SHA extensions (SHA-NI).
 *
 * ABI:      SysV AMD64
 * In:       ctx (%rdi) - hash state: five 32-bit words, h0..h3 at
 *                        offset 0, h4 at offset 16
 *           in  (%rsi) - message, num * 64 bytes
 *           num (%rdx) - number of 64-byte blocks; zero is a no-op
 * Clobbers: rdx, xmm0-xmm9, flags; rbx is saved and restored.
 */
.align 16
.globl sha1_block_shani
.type sha1_block_shani,@function
sha1_block_shani:
	_CET_ENDBR

	/*
	 * No blocks to process: return without reading the message or
	 * touching the state (the loop below is do-while shaped and
	 * would otherwise compress one out-of-bounds block).
	 */
	testq	num, num
	jz	.Lshani_ret

	/* Save callee-saved register used as the end pointer. */
	pushq	%rbx

	/* Compute end of message: end = in + num * 64. */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Mask that byte-swaps the big-endian message words on load. */
	movdqa	shufmask(%rip), xshufmask

	/* Load hash state: abcd = h0..h3 (a in the top lane), e = h4. */
	movdqu	(0*16)(ctx), xabcd
	pshufd	$0x1b, xabcd, xabcd
	pxor	xe0, xe0
	pinsrd	$3, (1*16)(ctx), xe0

	jmp	.Lshani_block_loop

	.align 16
.Lshani_block_loop:
	/* Snapshot the state so it can be folded back in after 80 rounds. */
	movdqa	xabcd, xabcd_save
	movdqa	xe0, xe_save

	/* Rounds 0-3: the first group adds e to W0 directly. */
	sha1_message_schedule_load(0, in, xmsg0)
	paddd	xmsg0, xe0
	movdqa	xabcd, xe1
	sha1rnds4	$0, xe0, xabcd

	/* Rounds 4-15: consume the remainder of the message block. */
	sha1_shani_round_load($0, 1, in, xmsg1, xe1, xe0)
	sha1_shani_round_load($0, 2, in, xmsg2, xe0, xe1)
	sha1_shani_round_load($0, 3, in, xmsg3, xe1, xe0)

	/* Rounds 16-79: message schedule computed on the fly, 4 per step. */
	sha1_shani_round_update($0, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
	sha1_shani_round_update($1, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
	sha1_shani_round_update($1, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
	sha1_shani_round_update($1, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
	sha1_shani_round_update($1, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
	sha1_shani_round_update($1, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
	sha1_shani_round_update($2, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
	sha1_shani_round_update($2, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
	sha1_shani_round_update($2, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
	sha1_shani_round_update($2, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
	sha1_shani_round_update($2, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
	sha1_shani_round_update($3, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)
	sha1_shani_round_update($3, xmsg0, xmsg1, xmsg2, xmsg3, xe0, xe1)
	sha1_shani_round_update($3, xmsg1, xmsg2, xmsg3, xmsg0, xe1, xe0)
	sha1_shani_round_update($3, xmsg2, xmsg3, xmsg0, xmsg1, xe0, xe1)
	sha1_shani_round_update($3, xmsg3, xmsg0, xmsg1, xmsg2, xe1, xe0)

	/*
	 * Fold the snapshot back in: abcd += saved abcd, and sha1nexte
	 * yields e = rol(e, 30) + saved e in the top lane.
	 */
	paddd	xabcd_save, xabcd
	sha1nexte	xe_save, xe0

	/* Advance to the next block, if any. */
	addq	$64, in
	cmpq	end, in
	jb	.Lshani_block_loop

	/* Store the updated hash state back into ctx. */
	pshufd	$0x1b, xabcd, xabcd
	movdqu	xabcd, (0*16)(ctx)
	pextrd	$3, xe0, (1*16)(ctx)

	popq	%rbx
.Lshani_ret:
	ret
.size sha1_block_shani, .-sha1_block_shani
.section .rodata

/*
 * pshufb control mask that reverses all 16 bytes of an XMM register:
 * .octa stores little-endian, so the control bytes in memory are
 * 0x0f, 0x0e, ..., 0x00. One shuffle thus byte-swaps each big-endian
 * 32-bit message word and reverses the word order at the same time.
 */
.align 16
.type shufmask,@object
shufmask:
.octa 0x000102030405060708090a0b0c0d0e0f
.size shufmask,.-shufmask