#include "crypto_assembly.h"
/*
 * Register aliases used throughout this file.
 *
 * ctx, in and num are the first three System V AMD64 integer argument
 * registers (%rdi, %rsi, %rdx).
 */
#define ctx %rdi
#define in %rsi
#define num %rdx
/*
 * end and k256 are held in %rbx and %rbp, which are callee-saved under the
 * System V AMD64 ABI; sha256_block_shani pushes and pops both.
 */
#define end %rbx
#define k256 %rbp
/*
 * xmsg must be %xmm0: sha256rnds2 takes its message-plus-constant operand
 * implicitly from XMM0.
 */
#define xmsg %xmm0
/* Working hash state; the two registers swap roles inside each round pair. */
#define xhs0 %xmm1
#define xhs1 %xmm2
/* Copies of the state taken at the top of each block, added back at the end. */
#define xabef %xmm3
#define xcdgh %xmm4
/* xmsgtmp0-3 hold message schedule words, four 32-bit words per register. */
#define xmsgtmp0 %xmm6
#define xmsgtmp1 %xmm7
#define xmsgtmp2 %xmm8
#define xmsgtmp3 %xmm9
/* Scratch register used by sha256_message_schedule_update. */
#define xmsgtmp4 %xmm10
/* pshufb control loaded from shufmask. */
#define xshufmask %xmm11
/* Scratch register used while rearranging the state words. */
#define xtmp0 %xmm12
/*
 * sha256_message_schedule_load(chunk, src, xsave)
 *
 * Fetch the 16-byte chunk number `chunk` at address `src` (unaligned load),
 * byte-shuffle it through xshufmask, and leave the result in both xmsg and
 * `xsave`.
 *
 * Clobbers: xmsg, xsave.
 */
#define sha256_message_schedule_load(chunk, src, xsave)		\
	movdqu	(chunk*16)(src), xmsg;				\
	pshufb	xshufmask, xmsg;				\
	movdqa	xmsg, xsave
/*
 * sha256_message_schedule_update(xw0, xw1, xw2, xw3)
 *
 * Produce the next four message schedule words. xw0..xw3 are the four
 * schedule registers, passed in rotating order by the callers; xw0 is
 * overwritten with the new words while xw1..xw3 are only read.
 *
 * Steps: sha256msg1 starts the update of xw0 from xw1, palignr builds the
 * four-word window straddling xw2 and xw3 in xmsgtmp4, that window is added
 * to xw0, and sha256msg2 finishes the update using xw3.
 *
 * Clobbers: xw0, xmsgtmp4.
 */
#define sha256_message_schedule_update(xw0, xw1, xw2, xw3)	\
	sha256msg1 xw1, xw0;					\
	movdqa	xw3, xmsgtmp4;					\
	palignr	$4, xw2, xmsgtmp4;				\
	paddd	xmsgtmp4, xw0;					\
	sha256msg2 xw3, xw0
/*
 * sha256_shani_round(grp)
 *
 * Run four SHA-256 rounds on the state in xhs0/xhs1. The four constants of
 * 16-byte group `grp` of the table at k256 are added to the schedule words
 * in xmsg. The first sha256rnds2 uses the low two dwords of xmsg; pshufd
 * $0x0e then moves the high two dwords down for the second sha256rnds2.
 * The two instructions use xhs0/xhs1 with source and destination swapped,
 * so the state registers end up in the same roles they started in.
 *
 * Clobbers: xmsg, xhs0, xhs1.
 */
#define sha256_shani_round(grp)					\
	paddd	(grp*16)(k256), xmsg;				\
	sha256rnds2 xmsg, xhs0, xhs1;				\
	pshufd	$0x0e, xmsg, xmsg;				\
	sha256rnds2 xmsg, xhs1, xhs0
/*
 * sha256_shani_round_load(grp, src, xsave)
 *
 * Four rounds fed directly from the input: load and byte-shuffle 16-byte
 * chunk `grp` at `src` into xmsg (keeping a copy in `xsave`), then run
 * sha256_shani_round with constant group `grp`.
 */
#define sha256_shani_round_load(grp, src, xsave)		\
	sha256_message_schedule_load(grp, src, xsave);		\
	sha256_shani_round(grp)
/*
 * sha256_shani_round_update(grp, xw0, xw1, xw2, xw3)
 *
 * Four rounds fed from the message schedule: compute the next four schedule
 * words into xw0 with sha256_message_schedule_update, copy them to xmsg,
 * then run sha256_shani_round with constant group `grp`.
 */
#define sha256_shani_round_update(grp, xw0, xw1, xw2, xw3)	\
	sha256_message_schedule_update(xw0, xw1, xw2, xw3);	\
	movdqa	xw0, xmsg;					\
	sha256_shani_round(grp)
.section .text

/*
 * sha256_block_shani - SHA-256 block function using the x86 SHA extensions.
 *
 * ABI:  System V AMD64.
 * In:   ctx (%rdi) - the eight 32-bit hash state words are read from, and
 *                    written back to, the first 32 bytes at this address
 *                    (unaligned access is used)
 *       in  (%rsi) - message data, num * 64 bytes (unaligned access is used)
 *       num (%rdx) - number of 64-byte blocks to process
 * Out:  updated state stored at ctx; no return value is set.
 *
 * When num is zero the function returns immediately without reading `in`
 * or modifying the state.
 *
 * Clobbers: %rsi, %rdx, %xmm0-%xmm4, %xmm6-%xmm12, flags.
 * %rbx and %rbp are saved and restored.
 *
 * Lane comments below list 32-bit words from the highest lane to the lowest,
 * with a..h denoting the state words in memory order starting at ctx.
 */
.align 16
.globl sha256_block_shani
.type sha256_block_shani,@function
sha256_block_shani:
	/* Provided by crypto_assembly.h; presumably the CET landing pad. */
	_CET_ENDBR

	/*
	 * The block loop below is bottom-tested and would otherwise consume
	 * one block even for num == 0, so bail out before touching anything.
	 */
	testq	num, num
	jz	.Lshani_done

	/* Save callee-saved registers used for end and k256. */
	pushq	%rbx
	pushq	%rbp

	/* end = in + num * 64. */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Address of the round constant table. */
	leaq	K256(%rip), k256

	/* Byte shuffle control applied to every loaded message chunk. */
	movdqa	shufmask(%rip), xshufmask

	/* Load the current hash state. */
	movdqu	(0*16)(ctx), xhs0		/* dcba */
	movdqu	(1*16)(ctx), xhs1		/* hgfe */

	/* Rearrange into the abef / cdgh layout used by sha256rnds2. */
	pshufd	$0xb1, xhs0, xhs0		/* cdab */
	pshufd	$0x1b, xhs1, xhs1		/* efgh */
	movdqa	xhs0, xtmp0
	palignr	$8, xhs1, xhs0			/* abef */
	pblendw	$0xf0, xtmp0, xhs1		/* cdgh */

	/* Skip the alignment padding in front of the loop head. */
	jmp	.Lshani_block_loop

.align 16
.Lshani_block_loop:
	/* Keep the incoming state so it can be added back after the rounds. */
	movdqa	xhs0, xabef
	movdqa	xhs1, xcdgh

	/* Rounds 0-15: message words come straight from the input block. */
	sha256_shani_round_load(0, in, xmsgtmp0)
	sha256_shani_round_load(1, in, xmsgtmp1)
	sha256_shani_round_load(2, in, xmsgtmp2)
	sha256_shani_round_load(3, in, xmsgtmp3)

	/*
	 * Rounds 16-63: four rounds per line, with the schedule registers
	 * rotating by one position each time.
	 */
	sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)
	sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)
	sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	/* Add the saved state back into the working state. */
	paddd	xabef, xhs0
	paddd	xcdgh, xhs1

	/* Advance to the next block; unsigned compare against the end pointer. */
	addq	$64, in
	cmpq	end, in
	jb	.Lshani_block_loop

	/* Rearrange abef / cdgh back into memory order. */
	pshufd	$0x1b, xhs0, xhs0		/* feba */
	pshufd	$0xb1, xhs1, xhs1		/* dchg */
	movdqa	xhs0, xtmp0
	pblendw	$0xf0, xhs1, xhs0		/* dcba */
	palignr	$8, xtmp0, xhs1			/* hgfe */

	/* Store the updated hash state. */
	movdqu	xhs0, (0*16)(ctx)
	movdqu	xhs1, (1*16)(ctx)

	/* Restore callee-saved registers. */
	popq	%rbp
	popq	%rbx

.Lshani_done:
	ret
.size sha256_block_shani,.-sha256_block_shani
/*
 * pshufb control mask: reverses the byte order within each 32-bit word
 * (destination bytes 0..3 take source bytes 3,2,1,0, and so on for each
 * word). Aligned to 16 bytes because it is loaded with movdqa.
 */
.section .rodata
.align 16
.type shufmask,@object
shufmask:
.octa 0x0c0d0e0f08090a0b0405060700010203
.size shufmask,.-shufmask
/*
 * SHA-256 round constants K[0..63], four per line. Line n is the 16-byte
 * group that sha256_shani_round(n) adds to the message words with paddd,
 * so the table has to stay at least 16-byte aligned.
 */
.align 64
.type K256,@object
K256:
.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size K256,.-K256