#include "crypto_assembly.h"
/*
 * Register assignments used throughout this file.
 *
 * ctx, in and num are read as the function's incoming arguments (they are
 * the first three System V AMD64 integer argument registers).
 */

/* Base address of the five 32-bit hash state words (read and written). */
#define ctx %rdi
/* Pointer to the current 64-byte input block; advanced by 64 per block. */
#define in %rsi
/* Number of 64-byte blocks on entry; scaled to a byte count in place. */
#define num %rdx
/* Address one past the last input byte (in + num * 64); loop bound. */
#define end %rbp
/*
 * The five 32-bit working variables (a..e of SHA-1). They are loaded from
 * ctx before the block loop and written back after every block.
 */
#define hs0 %r8d
#define hs1 %r9d
#define hs2 %r10d
#define hs3 %r11d
#define hs4 %r12d
/* Scratch: holds the current message schedule word Wt. */
#define tmp0 %eax
/* Scratch: holds rol(a, 5) inside sha1_round. */
#define tmp1 %ebx
/* Scratch: used while evaluating the round function f(b, c, d). */
#define tmp2 %ecx
/*
 * Scratch for the Maj round function. NOTE: %edx is the low half of num
 * (%rdx), so num is destroyed by the first Maj round; it must not be used
 * once the block loop has been entered.
 */
#define tmp3 %edx
/*
 * Load message word idx from the input block at m, byte swap it (the input
 * is treated as big endian 32-bit words) and store the result in slot
 * (idx & 0xf) of the 16-word message schedule at w.
 *
 * On exit wt holds the byte-swapped word. Clobbers: wt.
 */
#define sha1_message_schedule_load(idx, m, w, wt) \
movl ((idx&0xf)*4)(m), wt; /* wt = raw input word idx */ \
bswapl wt; /* big endian -> host byte order */ \
movl wt, ((idx&0xf)*4)(w) /* W[idx & 0xf] = wt */
/*
 * Compute the next message schedule word
 *
 *   W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)    (t = idx)
 *
 * using the 16-word circular buffer at w, where word n lives in slot
 * (n & 0xf). W[t-16] shares its slot with W[t], so the old value is read
 * first and then overwritten by the new one.
 *
 * On exit wt holds W[t]. Clobbers: wt, flags.
 */
#define sha1_message_schedule_update(idx, w, wt) \
movl (((idx-3)&0xf)*4)(w), wt; /* W[t-3] */ \
xorl (((idx-8)&0xf)*4)(w), wt; /* ^ W[t-8] */ \
xorl (((idx-14)&0xf)*4)(w), wt; /* ^ W[t-14] */ \
xorl (((idx)&0xf)*4)(w), wt; /* ^ W[t-16] (same slot as W[t]) */ \
roll $1, wt; /* rotate left by one bit */ \
\
movl wt, ((idx&0xf)*4)(w) /* store W[t] over W[t-16] */
/*
 * Common tail shared by every round type:
 *
 *   e += kt + wt + rol(a, 5)
 *   b  = rol(b, 30)
 *
 * The round-specific wrappers add f(b, c, d) into e before expanding this
 * macro, so b is still unrotated when they read it. c and d are accepted
 * only to keep the argument lists uniform; they are not referenced here.
 *
 * No registers are moved between rounds: the invocations in the block loop
 * permute the a..e arguments instead.
 *
 * Clobbers: tmp1, flags.
 */
#define sha1_round(a, b, c, d, e, kt, wt) \
leal kt(wt, e, 1), e; /* e += Kt + Wt (lea: single instruction) */ \
\
movl a, tmp1; /* tmp1 = a */ \
roll $5, tmp1; /* tmp1 = rol(a, 5) */ \
addl tmp1, e; /* e += rol(a, 5) */ \
\
roll $30, b; /* b = rol(b, 30) */
/*
 * One round using the "choose" function:
 *
 *   Ch(b, c, d) = ((c ^ d) & b) ^ d
 *
 * which selects bits of c where b is 1 and bits of d where b is 0.
 * Adds Ch(b, c, d) into e, then performs the common round tail.
 *
 * Clobbers: tmp1, tmp2, flags.
 */
#define sha1_round_ch(a, b, c, d, e, kt, wt) \
movl c, tmp2; /* tmp2 = c */ \
xorl d, tmp2; /* tmp2 = c ^ d */ \
andl b, tmp2; /* tmp2 = (c ^ d) & b */ \
xorl d, tmp2; /* tmp2 = ((c ^ d) & b) ^ d = Ch(b, c, d) */ \
addl tmp2, e; /* e += Ch(b, c, d) */ \
\
sha1_round(a, b, c, d, e, kt, wt)
/*
 * One round using the parity function:
 *
 *   Parity(b, c, d) = b ^ c ^ d
 *
 * Adds Parity(b, c, d) into e, then performs the common round tail.
 *
 * Clobbers: tmp1, tmp2, flags.
 */
#define sha1_round_parity(a, b, c, d, e, kt, wt) \
movl b, tmp2; /* tmp2 = b */ \
xorl c, tmp2; /* tmp2 = b ^ c */ \
xorl d, tmp2; /* tmp2 = b ^ c ^ d = Parity(b, c, d) */ \
addl tmp2, e; /* e += Parity(b, c, d) */ \
\
sha1_round(a, b, c, d, e, kt, wt)
/*
 * One round using the majority function, evaluated as
 *
 *   Maj(b, c, d) = ((c ^ d) & b) ^ (c & d)
 *
 * Adds Maj(b, c, d) into e, then performs the common round tail.
 *
 * Clobbers: tmp1, tmp2, tmp3, flags. tmp3 is %edx, so this also destroys
 * the num argument register.
 */
#define sha1_round_maj(a, b, c, d, e, kt, wt) \
movl c, tmp2; /* tmp2 = c */ \
xorl d, tmp2; /* tmp2 = c ^ d */ \
andl b, tmp2; /* tmp2 = (c ^ d) & b */ \
movl c, tmp3; /* tmp3 = c */ \
andl d, tmp3; /* tmp3 = c & d */ \
xorl tmp2, tmp3; /* tmp3 = ((c ^ d) & b) ^ (c & d) = Maj(b, c, d) */ \
addl tmp3, e; /* e += Maj(b, c, d) */ \
\
sha1_round(a, b, c, d, e, kt, wt)
/*
 * Round with W[idx] taken directly from the input block (used for rounds
 * 0-15 in the block loop): byte swap the input word into the message
 * schedule at %rsp, then run a Ch round with the constant 0x5a827999
 * (the SHA-1 K value for rounds 0-19).
 *
 * Clobbers: tmp0, tmp1, tmp2, flags.
 */
#define sha1_round1_load(idx, a, b, c, d, e) \
sha1_message_schedule_load(idx, in, %rsp, tmp0); \
sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
/*
 * Round with W[idx] derived from earlier schedule words (used for rounds
 * 16-19 in the block loop): update the message schedule at %rsp, then run
 * a Ch round with the constant 0x5a827999 (the SHA-1 K value for rounds
 * 0-19).
 *
 * Clobbers: tmp0, tmp1, tmp2, flags.
 */
#define sha1_round1_update(idx, a, b, c, d, e) \
sha1_message_schedule_update(idx, %rsp, tmp0); \
sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
/*
 * Round used for rounds 20-39 in the block loop: update the message
 * schedule at %rsp, then run a Parity round with the constant 0x6ed9eba1
 * (the SHA-1 K value for rounds 20-39).
 *
 * Clobbers: tmp0, tmp1, tmp2, flags.
 */
#define sha1_round2_update(idx, a, b, c, d, e) \
sha1_message_schedule_update(idx, %rsp, tmp0); \
sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)
/*
 * Round used for rounds 40-59 in the block loop: update the message
 * schedule at %rsp, then run a Maj round with the constant 0x8f1bbcdc
 * (the SHA-1 K value for rounds 40-59).
 *
 * Clobbers: tmp0, tmp1, tmp2, tmp3 (and therefore num), flags.
 */
#define sha1_round3_update(idx, a, b, c, d, e) \
sha1_message_schedule_update(idx, %rsp, tmp0); \
sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)
/*
 * Round used for rounds 60-79 in the block loop: update the message
 * schedule at %rsp, then run a Parity round with the constant 0xca62c1d6
 * (the SHA-1 K value for rounds 60-79).
 *
 * Clobbers: tmp0, tmp1, tmp2, flags.
 */
#define sha1_round4_update(idx, a, b, c, d, e) \
sha1_message_schedule_update(idx, %rsp, tmp0); \
sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)
/*
 * sha1_block_generic - SHA-1 compression function over whole 64-byte blocks.
 *
 * Syntax: AT&T (GAS), preprocessed with cpp. ABI: System V AMD64.
 *
 * In:
 *   ctx (%rdi) - address of five consecutive 32-bit hash state words;
 *                read before the first block, updated after every block.
 *   in  (%rsi) - input data, num * 64 bytes; no alignment is required.
 *   num (%rdx) - number of 64-byte blocks to process. Zero is a no-op.
 *
 * Out: nothing in registers; the hash state at ctx is updated in place.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.
 * Preserves: %rbx, %rbp, %r12 (saved and restored here), %rdi.
 *
 * Stack: a 64-byte aligned frame holding the 16-word message schedule at
 * 0(%rsp)..63(%rsp) and the caller's %rsp at 64(%rsp). This is a leaf
 * function; it makes no calls.
 */
.section .text
.align 16
.globl sha1_block_generic
.type sha1_block_generic,@function
sha1_block_generic:
	/* _CET_ENDBR comes from crypto_assembly.h (presumably endbr64 under CET). */
	_CET_ENDBR

	/*
	 * The block loop below is bottom-tested, so without this check a zero
	 * block count would still consume 64 bytes of input and modify the
	 * hash state. Nothing has been pushed yet, so return directly.
	 */
	testq	num, num
	jz	.Lblock_return

	/* Save the callee-saved registers used as tmp1, end and hs4. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12

	/*
	 * Allocate the message schedule: 64 bytes plus one 8-byte slot for the
	 * original stack pointer, with %rsp rounded down to a 64-byte boundary.
	 * At least 72 bytes are reserved before rounding down, so 64(%rsp) is
	 * always inside the new frame.
	 */
	movq	%rsp, %rax
	subq	$(64+1*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+0*8)(%rsp)

	/* end = in + num * 64. num (%rdx) is dead after this (reused as tmp3). */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Load the current hash state into the working variables. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4

	/* Jump over the alignment padding in front of the loop head. */
	jmp	.Lblock_loop

.align 16
.Lblock_loop:
	/*
	 * Each round passes the working variables rotated by one position
	 * relative to the previous round, so no register moves are needed.
	 * After 80 rounds (a multiple of 5) the roles line up with hs0..hs4
	 * again.
	 */

	/* Rounds 0 through 15: Ch, message words loaded from the input. */
	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)

	/* Rounds 16 through 19: Ch, message words from the schedule. */
	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0)

	/* Rounds 20 through 39: Parity. */
	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0)
	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4)
	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3)
	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2)
	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1)
	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0)

	/* Rounds 40 through 59: Maj (clobbers %edx via tmp3). */
	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0)
	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4)
	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3)
	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2)
	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1)
	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0)

	/* Rounds 60 through 79: Parity. */
	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0)
	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4)
	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3)
	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2)
	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1)
	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0)

	/* Add the previous hash state to the working variables. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4

	/*
	 * Store the new hash state. hs0..hs4 stay live in registers and serve
	 * as the starting working variables for the next block.
	 */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)

	/* Advance to the next block; unsigned compare since these are addresses. */
	addq	$64, in
	cmpq	end, in
	jb	.Lblock_loop

	/* Drop the aligned frame by reloading the saved stack pointer. */
	movq	(64+0*8)(%rsp), %rsp

	/* Restore callee-saved registers in reverse push order. */
	popq	%r12
	popq	%rbp
	popq	%rbx

.Lblock_return:
	ret
.size sha1_block_generic,.-sha1_block_generic