#include <linux/linkage.h>
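/*
 * BLAKE2s block compression for x86-64, in two flavors: an SSSE3
 * version that gathers message words with scalar loads through the
 * sigma table, and an AVX-512 version that keeps the whole message
 * block in two ymm registers and permutes it in place each round.
 */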
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
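/* The BLAKE2s IV (the same constants as the SHA-256 IV), as two 128-bit halves. */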
.Liv:
.octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
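/* pshufb mask that rotates each 32-bit lane right by 16 bits */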
.Lror16:
.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
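/* pshufb mask that rotates each 32-bit lane right by 8 bits */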
.Lror8:
.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
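/*
 * The ten BLAKE2s message permutations, one 16-byte row per round.
 * The rows are the standard sigma permutations, pre-shuffled into the
 * order in which the vectorized column and diagonal steps below
 * consume message words.
 */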
.Lsigma:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
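/*
 * As .Lsigma, but each row is composed with the permutations of all
 * earlier rounds: the AVX-512 code permutes the message registers in
 * place with vpermi2d every round instead of re-gathering words from
 * memory, so each round's indices must account for the shuffles that
 * came before.
 */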
.Lsigma2:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#define CTX %rdi     /* struct blake2s_state * (h, then the counter t) */
#define DATA %rsi    /* message blocks */
#define NBLOCKS %rdx /* number of 64-byte blocks */
#define INC %ecx     /* counter increment per block */
.text
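/*
 * blake2s_compress_ssse3(state, data, nblocks, inc)
 *
 * SysV arguments: %rdi = state, %rsi = message blocks, %rdx = number
 * of 64-byte blocks, %ecx = counter increment per block.  The main
 * loop is bottom-tested, so nblocks must be at least 1.
 */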
SYM_FUNC_START(blake2s_compress_ssse3)
movdqu (CTX),%xmm0            /* xmm0 = h[0..3] */
movdqu 16(CTX),%xmm1          /* xmm1 = h[4..7] */
movdqa .Lror16(%rip),%xmm12   /* rotate-right-16 pshufb mask */
movdqa .Lror8(%rip),%xmm13    /* rotate-right-8 pshufb mask */
movdqu 32(CTX),%xmm14         /* xmm14 = t[0..1] || f[0..1] */
movd INC,%xmm15               /* xmm15 = (inc, 0, 0, 0) */
leaq .Lsigma+160(%rip),%r8    /* %r8 = end of the sigma table */
jmp .Lssse3_mainloop          /* skip the alignment padding */
.align 32
.Lssse3_mainloop:
movdqa %xmm0,%xmm10           /* save h[0..3] for the feed-forward */
movdqa %xmm1,%xmm11           /* save h[4..7] */
paddq %xmm15,%xmm14           /* t += inc (64-bit counter add) */
movdqa .Liv(%rip),%xmm2       /* v[8..11] = IV[0..3] */
movdqa %xmm14,%xmm3
pxor .Liv+16(%rip),%xmm3      /* v[12..15] = IV[4..7] ^ (t, f) */
leaq .Lsigma(%rip),%rcx       /* %rcx walks one sigma row per round */
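/*
 * One iteration per round: apply G to the four columns of the state
 * using eight sigma-selected message words, rotate the lanes so the
 * diagonals line up, apply G to the diagonals with the next eight
 * words, then rotate the lanes back.
 */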
.Lssse3_roundloop:
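/* Gather m[sigma[0..3]] with scalar loads. */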
movzbl (%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 1(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 2(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 3(%rcx),%eax
movd (DATA,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4        /* xmm4 = (m[s0], m[s1], m[s2], m[s3]) */
paddd %xmm4,%xmm0             /* a += m */
paddd %xmm1,%xmm0             /* a += b */
pxor %xmm0,%xmm3              /* d ^= a */
pshufb %xmm12,%xmm3           /* d = ror32(d, 16) */
paddd %xmm3,%xmm2             /* c += d */
pxor %xmm2,%xmm1              /* b ^= c */
movdqa %xmm1,%xmm8
psrld $12,%xmm1
pslld $20,%xmm8
por %xmm8,%xmm1               /* b = ror32(b, 12) */
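/* Gather m[sigma[4..7]]. */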
movzbl 4(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 5(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 6(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 7(%rcx),%eax
movd (DATA,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5        /* xmm5 = (m[s4], m[s5], m[s6], m[s7]) */
paddd %xmm5,%xmm0             /* a += m */
paddd %xmm1,%xmm0             /* a += b */
pxor %xmm0,%xmm3              /* d ^= a */
pshufb %xmm13,%xmm3           /* d = ror32(d, 8) */
paddd %xmm3,%xmm2             /* c += d */
pxor %xmm2,%xmm1              /* b ^= c */
movdqa %xmm1,%xmm8
psrld $7,%xmm1
pslld $25,%xmm8
por %xmm8,%xmm1               /* b = ror32(b, 7) */
pshufd $0x93,%xmm0,%xmm0      /* rotate lanes: operate on the diagonals */
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
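/* Gather m[sigma[8..11]] for the diagonal step. */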
movzbl 8(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 9(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 10(%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 11(%rcx),%eax
movd (DATA,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6        /* xmm6 = (m[s8], m[s9], m[s10], m[s11]) */
paddd %xmm6,%xmm0             /* a += m */
paddd %xmm1,%xmm0             /* a += b */
pxor %xmm0,%xmm3              /* d ^= a */
pshufb %xmm12,%xmm3           /* d = ror32(d, 16) */
paddd %xmm3,%xmm2             /* c += d */
pxor %xmm2,%xmm1              /* b ^= c */
movdqa %xmm1,%xmm8
psrld $12,%xmm1
pslld $20,%xmm8
por %xmm8,%xmm1               /* b = ror32(b, 12) */
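/* Gather m[sigma[12..15]]. */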
movzbl 12(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 13(%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 14(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 15(%rcx),%eax
movd (DATA,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7        /* xmm7 = (m[s12], m[s13], m[s14], m[s15]) */
paddd %xmm7,%xmm0             /* a += m */
paddd %xmm1,%xmm0             /* a += b */
pxor %xmm0,%xmm3              /* d ^= a */
pshufb %xmm13,%xmm3           /* d = ror32(d, 8) */
paddd %xmm3,%xmm2             /* c += d */
pxor %xmm2,%xmm1              /* b ^= c */
movdqa %xmm1,%xmm8
psrld $7,%xmm1
pslld $25,%xmm8
por %xmm8,%xmm1               /* b = ror32(b, 7) */
pshufd $0x39,%xmm0,%xmm0      /* rotate lanes back to columns */
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
addq $16,%rcx                 /* next sigma row */
cmpq %r8,%rcx
jnz .Lssse3_roundloop
pxor %xmm2,%xmm0              /* v[0..3] ^= v[8..11] */
pxor %xmm3,%xmm1              /* v[4..7] ^= v[12..15] */
pxor %xmm10,%xmm0             /* xmm0 = new h[0..3] = h ^ v ^ v' */
pxor %xmm11,%xmm1             /* xmm1 = new h[4..7] */
addq $64,DATA
decq NBLOCKS
jnz .Lssse3_mainloop
movdqu %xmm0,(CTX)            /* store the updated h[0..7] */
movdqu %xmm1,16(CTX)
movq %xmm14,32(CTX)           /* store the updated 64-bit counter t */
RET
SYM_FUNC_END(blake2s_compress_ssse3)
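/*
 * blake2s_compress_avx512(state, data, nblocks, inc)
 *
 * Same contract as blake2s_compress_ssse3 (nblocks must be at least 1);
 * requires AVX-512 with VL for vprord, vpermi2d, and vpternlogd.
 */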
SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu (CTX),%xmm0           /* xmm0 = h[0..3] */
vmovdqu 16(CTX),%xmm1         /* xmm1 = h[4..7] */
vmovdqu 32(CTX),%xmm4         /* xmm4 = t[0..1] || f[0..1] */
vmovd INC,%xmm5               /* xmm5 = (inc, 0, 0, 0) */
vmovdqa .Liv(%rip),%xmm14     /* keep the IV resident across blocks */
vmovdqa .Liv+16(%rip),%xmm15
jmp .Lavx512_mainloop         /* skip the alignment padding */
.align 32
.Lavx512_mainloop:
vmovdqa %xmm0,%xmm10          /* save h[0..3] for the feed-forward */
vmovdqa %xmm1,%xmm11          /* save h[4..7] */
vpaddq %xmm5,%xmm4,%xmm4      /* t += inc (64-bit counter add) */
vmovdqa %xmm14,%xmm2          /* v[8..11] = IV[0..3] */
vpxor %xmm15,%xmm4,%xmm3      /* v[12..15] = IV[4..7] ^ (t, f) */
vmovdqu (DATA),%ymm6          /* ymm6 = m[0..7] */
vmovdqu 32(DATA),%ymm7        /* ymm7 = m[8..15] */
addq $64,DATA
leaq .Lsigma2(%rip),%rax
movb $10,%cl                  /* ten rounds */
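/*
 * Same round structure as the SSSE3 version, but the rotations are
 * single vprord instructions and the message words come from an
 * in-register permutation rather than scalar gathers.
 */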
.Lavx512_roundloop:
vpmovzxbd (%rax),%ymm8        /* load this round's first eight indices */
vpmovzxbd 8(%rax),%ymm9       /* and the second eight */
addq $16,%rax
vpermi2d %ymm7,%ymm6,%ymm8    /* ymm8 = m[s0..s7] */
vpermi2d %ymm7,%ymm6,%ymm9    /* ymm9 = m[s8..s15] */
vmovdqa %ymm8,%ymm6           /* keep the permuted message for the next round */
vmovdqa %ymm9,%ymm7
vpaddd %xmm8,%xmm0,%xmm0      /* a += m */
vpaddd %xmm1,%xmm0,%xmm0      /* a += b */
vpxor %xmm0,%xmm3,%xmm3       /* d ^= a */
vprord $16,%xmm3,%xmm3        /* d = ror32(d, 16) */
vpaddd %xmm3,%xmm2,%xmm2      /* c += d */
vpxor %xmm2,%xmm1,%xmm1       /* b ^= c */
vprord $12,%xmm1,%xmm1        /* b = ror32(b, 12) */
vextracti128 $1,%ymm8,%xmm8   /* xmm8 = m[s4..s7] */
vpaddd %xmm8,%xmm0,%xmm0      /* a += m */
vpaddd %xmm1,%xmm0,%xmm0      /* a += b */
vpxor %xmm0,%xmm3,%xmm3       /* d ^= a */
vprord $8,%xmm3,%xmm3         /* d = ror32(d, 8) */
vpaddd %xmm3,%xmm2,%xmm2      /* c += d */
vpxor %xmm2,%xmm1,%xmm1       /* b ^= c */
vprord $7,%xmm1,%xmm1         /* b = ror32(b, 7) */
vpshufd $0x93,%xmm0,%xmm0     /* rotate lanes: operate on the diagonals */
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0      /* a += m */
vpaddd %xmm1,%xmm0,%xmm0      /* a += b */
vpxor %xmm0,%xmm3,%xmm3       /* d ^= a */
vprord $16,%xmm3,%xmm3        /* d = ror32(d, 16) */
vpaddd %xmm3,%xmm2,%xmm2      /* c += d */
vpxor %xmm2,%xmm1,%xmm1       /* b ^= c */
vprord $12,%xmm1,%xmm1        /* b = ror32(b, 12) */
vextracti128 $1,%ymm9,%xmm9   /* xmm9 = m[s12..s15] */
vpaddd %xmm9,%xmm0,%xmm0      /* a += m */
vpaddd %xmm1,%xmm0,%xmm0      /* a += b */
vpxor %xmm0,%xmm3,%xmm3       /* d ^= a */
vprord $8,%xmm3,%xmm3         /* d = ror32(d, 8) */
vpaddd %xmm3,%xmm2,%xmm2      /* c += d */
vpxor %xmm2,%xmm1,%xmm1       /* b ^= c */
vprord $7,%xmm1,%xmm1         /* b = ror32(b, 7) */
vpshufd $0x39,%xmm0,%xmm0     /* rotate lanes back to columns */
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lavx512_roundloop
vpternlogd $0x96,%xmm10,%xmm2,%xmm0 /* h[0..3] = h ^ v[0..3] ^ v[8..11] (3-way XOR) */
vpternlogd $0x96,%xmm11,%xmm3,%xmm1 /* h[4..7] = h ^ v[4..7] ^ v[12..15] */
decq NBLOCKS
jne .Lavx512_mainloop
vmovdqu %xmm0,(CTX)           /* store the updated h[0..7] */
vmovdqu %xmm1,16(CTX)
vmovq %xmm4,32(CTX)           /* store the updated 64-bit counter t */
vzeroupper                    /* avoid AVX/SSE transition penalties */
RET
SYM_FUNC_END(blake2s_compress_avx512)