#include "s2n_bignum_internal.h"
.intel_syntax noprefix
S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16)
S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16)
.text
#define z rdi
#define x rsi
#define zero rbp
#define zeroe ebp
// ---------------------------------------------------------------------------
// mulpadd i, j
//
// Multiply-accumulate step for the rotating eight-register window r8..r15.
// Computes the 128-bit product rdx * x[i] and adds it in at window slot
// (i + j) mod 8:
//   - the low half (rax) is added to the slot register on the CF chain (adcx)
//   - the high half (rcx) is added to the following register, wrapping from
//     r15 back to r8, on the OF chain (adox)
// mulx leaves the flags alone, so both carry chains run through a sequence
// of these steps undisturbed.
//
// In:      rdx = multiplier, x = input pointer, CF/OF = pending carries
// Scratch: rax, rcx
// The slot is selected at assembly time; exactly one case below is emitted.
// ---------------------------------------------------------------------------
.macro mulpadd arg1,arg2
        mulx    rcx, rax, [x+8*\arg1]
.ifeq ((\arg1 + \arg2) % 8)
        adcx    r8, rax
        adox    r9, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 1)
        adcx    r9, rax
        adox    r10, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 2)
        adcx    r10, rax
        adox    r11, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 3)
        adcx    r11, rax
        adox    r12, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 4)
        adcx    r12, rax
        adox    r13, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 5)
        adcx    r13, rax
        adox    r14, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 6)
        adcx    r14, rax
        adox    r15, rcx
.endif
.ifeq (((\arg1 + \arg2) % 8) - 7)
        adcx    r15, rax
        adox    r8, rcx
.endif
.endm
// ---------------------------------------------------------------------------
// mulpade i, j
//
// "Extending" variant of the multiply-accumulate step. It adds rdx * x[i]
// into the rotating window r8..r15 at slot (i + j) mod 8, but the register
// after the slot is treated as holding nothing worth keeping:
//   - mulx writes the high half of the product straight into that next
//     register, overwriting its previous contents
//   - the low half (rax) is added to the slot register on the CF chain (adcx)
//   - the pending OF carry is folded into the fresh top word by adding the
//     zero register with adox
//
// In:      rdx = multiplier, x = input pointer, zero = register holding 0,
//          CF/OF = pending carries
// Scratch: rax
// The slot is selected at assembly time; exactly one case below is emitted.
// ---------------------------------------------------------------------------
.macro mulpade arg1,arg2
.ifeq ((\arg1 + \arg2) % 8)
        mulx    r9, rax, [x+8*\arg1]
        adcx    r8, rax
        adox    r9, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 1)
        mulx    r10, rax, [x+8*\arg1]
        adcx    r9, rax
        adox    r10, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 2)
        mulx    r11, rax, [x+8*\arg1]
        adcx    r10, rax
        adox    r11, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 3)
        mulx    r12, rax, [x+8*\arg1]
        adcx    r11, rax
        adox    r12, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 4)
        mulx    r13, rax, [x+8*\arg1]
        adcx    r12, rax
        adox    r13, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 5)
        mulx    r14, rax, [x+8*\arg1]
        adcx    r13, rax
        adox    r14, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 6)
        mulx    r15, rax, [x+8*\arg1]
        adcx    r14, rax
        adox    r15, zero
.endif
.ifeq (((\arg1 + \arg2) % 8) - 7)
        mulx    r8, rax, [x+8*\arg1]
        adcx    r15, rax
        adox    r8, zero
.endif
.endm
// ---------------------------------------------------------------------------
// diagonals
//
// The complete squaring body: reads the eight 64-bit words x[0..7] and
// writes the sixteen 64-bit words z[0..15] of their square.
//
// Method:
//   1. Accumulate the 28 off-diagonal products x[i]*x[j] with i < j, each at
//      word position i+j, in a rotating window of registers r8..r15. Words
//      at positions 1..8 are parked in z[1..8] as soon as they are final;
//      positions 9..14 remain in r9..r14.
//   2. Double that sum and add the squares x[i]^2 (low word at position 2i,
//      high word at position 2i+1), writing each result word to z.
//
// Notation in the comments below: "ij" stands for the product x[i]*x[j].
//
// Two independent carry chains are used throughout: adcx carries through CF
// and adox through OF. Every "xor zeroe, zeroe" both zeroes the zero
// register and clears CF and OF to start a fresh pair of chains.
//
// Clobbers rax, rcx, rdx, rbp (the zero register), r8-r15 and the flags.
// z[1] is stored before x[1] is last loaded, so z must not alias x.
// ---------------------------------------------------------------------------
.macro diagonals
xor zeroe, zeroe
// First row, multiplier x[0]: 01 + 02 + 03 + 04 + 05 + 06 + 07.
// The low word of each product goes into a new register and the high word
// of the previous product is added to it on the CF chain. Afterwards
// r9..r15 hold positions 1..7 and r8 holds position 8.
mov rdx, [x]
mulx rax, r9, [x+8]
// Position 1 receives no further off-diagonal terms: park it in z[1]
mov [z+8], r9
mulx rcx, r10, [x+16]
adcx r10, rax
// Position 2 is likewise final: park it in z[2]
mov [z+16], r10
mulx rax, r11, [x+24]
adcx r11, rcx
mulx rcx, r12, [x+32]
adcx r12, rax
mulx rax, r13, [x+40]
adcx r13, rcx
mulx rcx, r14, [x+48]
adcx r14, rax
mulx r8, r15, [x+56]
adcx r15, rcx
// Fold the last carry into the top word (position 8)
adcx r8, zero
// Second row, multiplier x[1]: 12 + 13 + 14 + 15 + 16 + 17, then 45.
xor zeroe, zeroe
mov rdx, [x+8]
mulpadd 2, 1
// Position 3 is now final: park it in z[3]
mov [z+24], r11
mulpadd 3, 1
// Position 4 is now final: park it in z[4]
mov [z+32], r12
mulpadd 4, 1
mulpadd 5, 1
mulpadd 6, 1
// 17 lands at position 8 and creates position 9 in r9 (freed by the z[1] store)
mulpade 7, 1
// Switch multiplier to x[4] to pick up 45 at position 9, creating position 10 in r10
mov rdx, [x+32]
mulpade 5, 4
// Absorb the remaining CF carry into the new top word
adcx r10, zero
// Third row, multiplier x[2]: 23 + 24 + 25 + 26 + 27, then 46 + 56.
xor zeroe, zeroe
mov rdx, [x+16]
mulpadd 3, 2
// Position 5 is now final: park it in z[5]
mov [z+40], r13
mulpadd 4, 2
// Position 6 is now final: park it in z[6]
mov [z+48], r14
mulpadd 5, 2
mulpadd 6, 2
mulpadd 7, 2
// Switch multiplier to x[6]: 46 at position 10 and 56 at position 11,
// creating positions 11 and 12 in r11 and r12
mov rdx, [x+48]
mulpade 4, 6
mulpade 5, 6
// Absorb the remaining CF carry into the new top word
adcx r12, zero
// Fourth row, multiplier x[3]: 34 + 35 + 36 + 37, then 47 + 57 + 67.
xor zeroe, zeroe
mov rdx, [x+24]
mulpadd 4, 3
// Position 7 is now final: park it in z[7]
mov [z+56], r15
mulpadd 5, 3
// Position 8 is now final: park it in z[8]
mov [z+64], r8
mulpadd 6, 3
mulpadd 7, 3
// Switch multiplier to x[7]: 47 at position 11, 57 at position 12 and
// 67 at position 13, creating positions 13 and 14 in r13 and r14
mov rdx, [x+56]
mulpadd 4, 7
mulpade 5, 7
mulpade 6, 7
// Absorb the remaining CF carry into the new top word
adcx r14, zero
// Doubling phase. The off-diagonal sum is now in z[1..8] (memory) and
// r9..r14 (positions 9..14). For each position the word is doubled on the
// CF chain (adcx w, w computes 2*w + CF) and the matching word of a square
// is added on the OF chain, then the result is written to z.
// "mulx rcx, rdx, rdx" squares rdx: high word to rcx, low word back to rdx.
xor zeroe, zeroe
mov rdx, [x]
mulx rcx, rax, rdx
// z[0] is just the low word of x[0]^2
mov [z], rax
// z[1] = 2 * position 1 + high word of x[0]^2
mov rax, [z+8]
adcx rax, rax
adox rax, rcx
mov [z+8], rax
// z[2], z[3]: doubled positions 2 and 3 plus low/high words of x[1]^2
mov rax, [z+16]
mov rdx, [x+8]
mulx rcx, rdx, rdx
adcx rax, rax
adox rax, rdx
mov [z+16], rax
mov rax, [z+24]
adcx rax, rax
adox rax, rcx
mov [z+24], rax
// z[4], z[5]: doubled positions 4 and 5 plus low/high words of x[2]^2
mov rax, [z+32]
mov rdx, [x+16]
mulx rcx, rdx, rdx
adcx rax, rax
adox rax, rdx
mov [z+32], rax
mov rax, [z+40]
adcx rax, rax
adox rax, rcx
mov [z+40], rax
// z[6], z[7]: doubled positions 6 and 7 plus low/high words of x[3]^2
mov rax, [z+48]
mov rdx, [x+24]
mulx rcx, rdx, rdx
adcx rax, rax
adox rax, rdx
mov [z+48], rax
mov rax, [z+56]
adcx rax, rax
adox rax, rcx
mov [z+56], rax
// z[8]: doubled position 8 (last word parked in memory) plus low word of x[4]^2
mov rax, [z+64]
mov rdx, [x+32]
mulx rcx, rdx, rdx
adcx rax, rax
adox rax, rdx
mov [z+64], rax
// From here the off-diagonal words come straight from registers r9..r14.
// z[9]: doubled position 9 plus high word of x[4]^2
adcx r9, r9
adox r9, rcx
mov [z+72], r9
// z[10], z[11]: doubled positions 10 and 11 plus low/high words of x[5]^2
mov rdx, [x+40]
mulx rcx, rdx, rdx
adcx r10, r10
adox r10, rdx
mov [z+80], r10
adcx r11, r11
adox r11, rcx
mov [z+88], r11
// z[12], z[13]: doubled positions 12 and 13 plus low/high words of x[6]^2
mov rdx, [x+48]
mulx rcx, rdx, rdx
adcx r12, r12
adox r12, rdx
mov [z+96], r12
adcx r13, r13
adox r13, rcx
mov [z+104], r13
// z[14]: doubled position 14 plus low word of x[7]^2; the high word of
// x[7]^2 is placed directly in r15 as the basis of z[15]
mov rdx, [x+56]
mulx r15, rdx, rdx
adcx r14, r14
adox r14, rdx
mov [z+112], r14
// z[15]: high word of x[7]^2 plus the final carries of both chains
adcx r15, zero
adox r15, zero
mov [z+120], r15
.endm
// ---------------------------------------------------------------------------
// bignum_sqr_8_16: square an 8-word number into a 16-word result, z := x^2
//
// In:   z = pointer to 16 words of output (stored at offsets 0..120)
//       x = pointer to 8 words of input (loaded from offsets 0..56)
//       The output must not alias the input: z[1] is written before x[1]
//       has been fully consumed.
// Out:  z[0..15] = x[0..7] squared; no value is returned in a register
//
// Standard x86-64 ABI: rdi = z, rsi = x
// Microsoft x64 ABI:   rcx = z, rdx = x (moved into rdi/rsi on entry)
// Seen from C this presumably corresponds to
//   void bignum_sqr_8_16(uint64_t z[16], const uint64_t x[8]);
// -- confirm against the project header.
//
// Uses mulx, adcx and adox, so the CPU must support BMI2 and ADX.
// Clobbers rax, rcx, rdx, r8-r11 and the flags; rbp and r12-r15 (and
// rdi/rsi under the Microsoft ABI) are saved and restored.
// ---------------------------------------------------------------------------
S2N_BN_SYMBOL(bignum_sqr_8_16):
_CET_ENDBR
#if WINDOWS_ABI
// rdi and rsi are callee-saved under the Microsoft x64 ABI, so preserve
// them before moving the arguments into the registers the body expects
push rdi
push rsi
mov rdi, rcx
mov rsi, rdx
#endif
// Save the callee-saved registers used by the body: rbp serves as the
// zero register and r12-r15 are part of the accumulation window
push rbp
push r12
push r13
push r14
push r15
// Perform the whole squaring
diagonals
// Restore the saved registers in reverse order of the pushes
pop r15
pop r14
pop r13
pop r12
pop rbp
#if WINDOWS_ABI
pop rsi
pop rdi
#endif
ret
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif