// root/lib/libcrypto/bn/arch/amd64/bignum_sqr_6_12.S
// $OpenBSD: bignum_sqr_6_12.S,v 1.4 2025/08/12 10:23:40 jsing Exp $
//
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// ----------------------------------------------------------------------------
// Square, z := x^2
// Input x[6]; output z[12]
//
//    extern void bignum_sqr_6_12(uint64_t z[static 12], const uint64_t x[static 6]);
//
// Standard x86-64 ABI: RDI = z, RSI = x
// Microsoft x64 ABI:   RCX = z, RDX = x
// ----------------------------------------------------------------------------

#include "s2n_bignum_internal.h"

        .intel_syntax noprefix
// From here on the file is in Intel syntax: destination operand first and
// bare register names (no % prefix).
//
// The two macros below are not defined in this file (presumably they come
// from s2n_bignum_internal.h); going by their names they emit the symbol
// visibility and privacy directives for the entry point -- confirm there.
        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12)
        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12)
// Everything that follows is code
        .text

// Argument registers. These are the registers the standard x86-64 ABI
// passes z and x in; under WINDOWS_ABI the function entry code copies
// rcx/rdx into them, so the names are valid for the whole function body.

#define z rdi
#define x rsi

// A zero register: rbp, with ebp as its 32-bit view. "xor zeroe, zeroe"
// is used to clear the register and the CF/OF flags in one instruction.

#define zero rbp
#define zeroe ebp

// Other registers: dk accumulates the result word of weight 2^(64*k),
// i.e. the word that is eventually stored to z[k].

#define d1 r8
#define d2 r9
#define d3 r10
#define d4 r11
#define d5 r12
#define d6 r13
#define d7 r14
#define d8 r15
#define d9 rbx

// Care is needed: d10 re-uses the zero register (rbp). Once d10 has been
// written, "zero" no longer reads as zero and must not be used as such.

#define d10 rbp


// ----------------------------------------------------------------------------
// Entry point: compute the 12-word square of the 6-word number at x and
// store it at z. The C prototype is void, so nothing is returned.
//
// Strategy, writing "ij" for the 128-bit product x[i] * x[j]:
//   1. Accumulate every off-diagonal product ij (i < j) into d1..d10,
//      where dk is the word of weight 2^(64*k).
//   2. Double that sum and add the squares ii, storing each word to z as
//      soon as it is final.
//
// Instruction notes: mulx multiplies its last operand by rdx, puts the high
// word in its first operand and the low word in its second, and does not
// touch the flags. adcx carries through CF only and adox through OF only,
// so two independent carry chains are interleaved throughout. The code
// therefore needs a CPU with both mulx and adcx/adox available.
//
// Registers: rax, rcx, rdx are scratch and r8-r11 hold d1..d4; rbp, rbx
// and r12-r15 are pushed below and popped again before returning.
// ----------------------------------------------------------------------------

S2N_BN_SYMBOL(bignum_sqr_6_12):
// _CET_ENDBR is a macro defined outside this file; presumably it emits an
// indirect-branch landing pad when CET is enabled -- confirm in the header.
        _CET_ENDBR

// Under the Microsoft x64 ABI the arguments arrive in rcx and rdx. Save
// rdi and rsi (callee-saved in that ABI; popped again just before ret) and
// move the arguments into the registers the rest of the code expects.

#if WINDOWS_ABI
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
#endif

// Save more registers to play with
// (rbp is zero/d10, rbx is d9 and r12-r15 are d5..d8)

        push    rbp
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15

// Set up an initial window [d8;...d1] = [34;05;03;01]
// After these four multiplies: d2:d1 = x[0]*x[1], d4:d3 = x[0]*x[3],
// d6:d5 = x[0]*x[5] and d8:d7 = x[3]*x[4] (high word : low word).

        mov     rdx, [x]
        mulx    d2, d1, [x+8]
        mulx    d4, d3, [x+24]
        mulx    d6, d5, [x+40]
        mov     rdx, [x+24]
        mulx    d8, d7, [x+32]

// Clear our zero register, and also initialize the flags for the carry chain
// (xor clears both CF and OF, the two carries used below)

        xor     zeroe, zeroe

// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window
// (no carry-out possible since we add it to the top of a product)
//
// For each product the low word (rax) is added to its own position through
// the CF chain (adcx) and the high word (rcx) to the next position up
// through the OF chain (adox).

        mov     rdx, [x+16]
        mulx    rcx, rax, [x]
        adcx    d2, rax
        adox    d3, rcx
        mulx    rcx, rax, [x+8]
        adcx    d3, rax
        adox    d4, rcx
        mov     rdx, [x+8]
        mulx    rcx, rax, [x+24]
        adcx    d4, rax
        adox    d5, rcx
        mulx    rcx, rax, [x+32]
        adcx    d5, rax
        adox    d6, rcx
        mulx    rcx, rax, [x+40]
        adcx    d6, rax
        adox    d7, rcx
// Flush the two pending carries into the top words of the window
        adcx    d7, zero
        adox    d8, zero
        adcx    d8, zero

// Again zero out the flags. Actually they are already cleared but it may
// help decouple these in the OOO engine not to wait for the chain above

        xor     zeroe, zeroe

// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms
// We are running out of registers and here our zero register is not zero!
//
// d9 and d10 do not exist yet: they are created directly as the high words
// of the 35 and 45 products. Writing d10 overwrites rbp, so the last use of
// "zero" is the adox into d9; after that a zero is supplied in rax instead.

        mov     rdx, [x+32]
        mulx    rcx, rax, [x]
        adcx    d4, rax
        adox    d5, rcx
        mov     rdx, [x+16]
        mulx    rcx, rax, [x+24]
        adcx    d5, rax
        adox    d6, rcx
        mulx    rcx, rax, [x+32]
        adcx    d6, rax
        adox    d7, rcx
        mulx    rcx, rax, [x+40]
        adcx    d7, rax
        adox    d8, rcx
        mov     rdx, [x+24]
        mulx    d9, rax, [x+40]
        adcx    d8, rax
        adox    d9, zero
        mov     rdx, [x+32]
        mulx    d10, rax, [x+40]
        adcx    d9, rax
// mov (not xor) is used to get a zero here because it leaves CF and OF
// intact, and both carries are still pending.
        mov     eax, 0
        adox    d10, rax
        adcx    d10, rax

// Again, just for a clear fresh start for the flags

        xor     eax, eax

// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms
//
// We could use shift-double but this seems tidier and in larger squarings
// it was actually more efficient. I haven't experimented with this small
// case to see how much that matters. Note: the writeback here is sprinkled
// into the sequence in such a way that things still work if z = x, i.e. if
// the output overwrites the input buffer and beyond.
//
// "adcx dk, dk" doubles dk, with the doubling carry travelling through CF;
// the following "adox dk, ..." adds the matching word of the square ii
// (rax = low word, rdx = high word), with that carry travelling through OF.
// Each x[i] is loaded into rdx before z[i] is stored, and x is not read
// again after the load of x[5], which is what makes z = x safe.

        mov     rdx, [x]
        mulx    rdx, rax, rdx
        mov     [z], rax
        adcx    d1, d1
        adox    d1, rdx
        mov     rdx, [x+8]
        mov     [z+8], d1
        mulx    rdx, rax, rdx
        adcx    d2, d2
        adox    d2, rax
        adcx    d3, d3
        adox    d3, rdx
        mov     rdx, [x+16]
        mov     [z+16], d2
        mulx    rdx, rax, rdx
        adcx    d4, d4
        adox    d4, rax
        adcx    d5, d5
        adox    d5, rdx
        mov     rdx, [x+24]
        mov     [z+24], d3
        mulx    rdx, rax, rdx
        adcx    d6, d6
        adox    d6, rax
        adcx    d7, d7
        adox    d7, rdx
        mov     rdx, [x+32]
        mov     [z+32], d4
        mulx    rdx, rax, rdx
        adcx    d8, d8
        adox    d8, rax
        adcx    d9, d9
        adox    d9, rdx
        mov     rdx, [x+40]
        mov     [z+40], d5
// All of x has now been read; the remaining stores are interleaved with
// the tail of the carry chains. The top word z[11] is the high word of 55
// plus the final CF and OF carries, again using a flag-preserving
// "mov eax, 0" as the zero addend.
        mulx    rdx, rax, rdx
        mov     [z+48], d6
        adcx    d10, d10
        mov     [z+56], d7
        adox    d10, rax
        mov     [z+64], d8
        mov     eax, 0
        mov     [z+72], d9
        adcx    rdx, rax
        mov     [z+80], d10
        adox    rdx, rax
        mov     [z+88], rdx

// Restore saved registers and return
// (popped in the reverse of the order they were pushed)

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        pop     rbp

#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret

// On Linux/ELF, emit an empty .note.GNU-stack section (no flags) so that
// this object file does not cause the linker to request an executable stack.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif