root/crypto/krb5/src/lib/crypto/builtin/aes/iaesx64.s
[bits 64]
[CPU intelnop]

; Copyright (c) 2010, Intel Corporation
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
;     * Redistributions of source code must retain the above copyright notice,
;       this list of conditions and the following disclaimer.
;     * Redistributions in binary form must reproduce the above copyright notice,
;       this list of conditions and the following disclaimer in the documentation
;       and/or other materials provided with the distribution.
;     * Neither the name of Intel Corporation nor the names of its contributors
;       may be used to endorse or promote products derived from this software
;       without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%define iEncExpandKey128 k5_iEncExpandKey128
%define iEncExpandKey256 k5_iEncExpandKey256
%define iDecExpandKey128 k5_iDecExpandKey128
%define iDecExpandKey256 k5_iDecExpandKey256
%define iEnc128_CBC      k5_iEnc128_CBC
%define iEnc256_CBC      k5_iEnc256_CBC
%define iDec128_CBC      k5_iDec128_CBC
%define iDec256_CBC      k5_iDec256_CBC

%macro linux_setup 0
%ifdef __linux__
        mov rcx, rdi
        mov rdx, rsi
%endif
%endmacro

%macro inversekey 1
        movdqu  xmm1,%1
        aesimc  xmm0,xmm1
        movdqu  %1,xmm0
%endmacro

%macro aesdeclast1 1
        aesdeclast      xmm0,%1
%endmacro

%macro aesenclast1 1
        aesenclast      xmm0,%1
%endmacro

%macro aesdec1 1
        aesdec  xmm0,%1
%endmacro

%macro aesenc1 1
        aesenc  xmm0,%1
%endmacro


%macro aesdeclast1_u 1
        movdqu xmm4,%1
        aesdeclast      xmm0,xmm4
%endmacro

%macro aesenclast1_u 1
        movdqu xmm4,%1
        aesenclast      xmm0,xmm4
%endmacro

%macro aesdec1_u 1
        movdqu xmm4,%1
        aesdec  xmm0,xmm4
%endmacro

%macro aesenc1_u 1
        movdqu xmm4,%1
        aesenc  xmm0,xmm4
%endmacro

%macro aesdec4 1
        movdqa  xmm4,%1

        aesdec  xmm0,xmm4
        aesdec  xmm1,xmm4
        aesdec  xmm2,xmm4
        aesdec  xmm3,xmm4

%endmacro

%macro aesdeclast4 1
        movdqa  xmm4,%1

        aesdeclast      xmm0,xmm4
        aesdeclast      xmm1,xmm4
        aesdeclast      xmm2,xmm4
        aesdeclast      xmm3,xmm4

%endmacro


%macro aesenc4 1
        movdqa  xmm4,%1

        aesenc  xmm0,xmm4
        aesenc  xmm1,xmm4
        aesenc  xmm2,xmm4
        aesenc  xmm3,xmm4

%endmacro

%macro aesenclast4 1
        movdqa  xmm4,%1

        aesenclast      xmm0,xmm4
        aesenclast      xmm1,xmm4
        aesenclast      xmm2,xmm4
        aesenclast      xmm3,xmm4

%endmacro


%macro xor_with_input4 1
        movdqu xmm4,[%1]
        pxor xmm0,xmm4
        movdqu xmm4,[%1+16]
        pxor xmm1,xmm4
        movdqu xmm4,[%1+32]
        pxor xmm2,xmm4
        movdqu xmm4,[%1+48]
        pxor xmm3,xmm4
%endmacro



%macro load_and_xor4 2
        movdqa  xmm4,%2
        movdqu  xmm0,[%1 + 0*16]
        pxor    xmm0,xmm4
        movdqu  xmm1,[%1 + 1*16]
        pxor    xmm1,xmm4
        movdqu  xmm2,[%1 + 2*16]
        pxor    xmm2,xmm4
        movdqu  xmm3,[%1 + 3*16]
        pxor    xmm3,xmm4
%endmacro

%macro store4 1
        movdqu [%1 + 0*16],xmm0
        movdqu [%1 + 1*16],xmm1
        movdqu [%1 + 2*16],xmm2
        movdqu [%1 + 3*16],xmm3
%endmacro

%macro copy_round_keys 3
        movdqu xmm4,[%2 + ((%3)*16)]
        movdqa [%1 + ((%3)*16)],xmm4
%endmacro


%macro key_expansion_1_192 1
                ;; Assumes the xmm3 includes all zeros at this point.
        pshufd xmm2, xmm2, 11111111b
        shufps xmm3, xmm1, 00010000b
        pxor xmm1, xmm3
        shufps xmm3, xmm1, 10001100b
        pxor xmm1, xmm3
                pxor xmm1, xmm2
                movdqu [rdx+%1], xmm1
%endmacro

; Calculate w10 and w11 using calculated w9 and known w4-w5
%macro key_expansion_2_192 1
                movdqa xmm5, xmm4
                pslldq xmm5, 4
                shufps xmm6, xmm1, 11110000b
                pxor xmm6, xmm5
                pxor xmm4, xmm6
                pshufd xmm7, xmm4, 00001110b
                movdqu [rdx+%1], xmm7
%endmacro


section .rodata
align 16
shuffle_mask:
DD 0FFFFFFFFh
DD 03020100h
DD 07060504h
DD 0B0A0908h



section .text

align 16
key_expansion256:

    pshufd xmm2, xmm2, 011111111b

    movdqa xmm4, xmm1
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pxor xmm1, xmm2

    movdqu [rdx], xmm1
    add rdx, 0x10

    aeskeygenassist xmm4, xmm1, 0
    pshufd xmm2, xmm4, 010101010b

    movdqa xmm4, xmm3
    pshufb xmm4, xmm5
    pxor xmm3, xmm4
    pshufb xmm4, xmm5
    pxor xmm3, xmm4
    pshufb xmm4, xmm5
    pxor xmm3, xmm4
    pxor xmm3, xmm2

    movdqu [rdx], xmm3
    add rdx, 0x10

    ret



align 16
key_expansion128:
    pshufd xmm2, xmm2, 0xFF;
    movdqa xmm3, xmm1
    pshufb xmm3, xmm5
    pxor xmm1, xmm3
    pshufb xmm3, xmm5
    pxor xmm1, xmm3
    pshufb xmm3, xmm5
    pxor xmm1, xmm3
    pxor xmm1, xmm2

    ; storing the result in the key schedule array
    movdqu [rdx], xmm1
    add rdx, 0x10
    ret






align 16
global iEncExpandKey128
iEncExpandKey128:

                linux_setup

        movdqu xmm1, [rcx]    ; loading the key

        movdqu [rdx], xmm1

        movdqa xmm5, [shuffle_mask wrt rip]

        add rdx,16

        aeskeygenassist xmm2, xmm1, 0x1     ; Generating round key 1
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x2     ; Generating round key 2
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x4     ; Generating round key 3
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x8     ; Generating round key 4
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x10    ; Generating round key 5
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x20    ; Generating round key 6
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x40    ; Generating round key 7
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x80    ; Generating round key 8
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x1b    ; Generating round key 9
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x36    ; Generating round key 10
        call key_expansion128

                ret



align 16
global iDecExpandKey128
iDecExpandKey128:

        linux_setup
        push rcx
        push rdx
        sub rsp,16+8

        call iEncExpandKey128

        add rsp,16+8
        pop rdx
        pop rcx

        inversekey [rdx + 1*16]
        inversekey [rdx + 2*16]
        inversekey [rdx + 3*16]
        inversekey [rdx + 4*16]
        inversekey [rdx + 5*16]
        inversekey [rdx + 6*16]
        inversekey [rdx + 7*16]
        inversekey [rdx + 8*16]
        inversekey [rdx + 9*16]

        ret



align 16
global iDecExpandKey256
iDecExpandKey256:

        linux_setup
        push rcx
        push rdx
        sub rsp,16+8

        call iEncExpandKey256

        add rsp,16+8
        pop rdx
        pop rcx

        inversekey [rdx + 1*16]
        inversekey [rdx + 2*16]
        inversekey [rdx + 3*16]
        inversekey [rdx + 4*16]
        inversekey [rdx + 5*16]
        inversekey [rdx + 6*16]
        inversekey [rdx + 7*16]
        inversekey [rdx + 8*16]
        inversekey [rdx + 9*16]
        inversekey [rdx + 10*16]
        inversekey [rdx + 11*16]
        inversekey [rdx + 12*16]
        inversekey [rdx + 13*16]

        ret




align 16
global iEncExpandKey256
iEncExpandKey256:

        linux_setup

    movdqu xmm1, [rcx]    ; loading the key
    movdqu xmm3, [rcx+16]
    movdqu [rdx], xmm1  ; Storing key in memory where all key schedule will be stored
    movdqu [rdx+16], xmm3

    add rdx,32

    movdqa xmm5, [shuffle_mask wrt rip]  ; this mask is used by key_expansion

    aeskeygenassist xmm2, xmm3, 0x1     ;
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x2     ;
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x4     ;
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x8     ;
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x10    ;
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x20    ;
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x40    ;
;    call key_expansion256

    pshufd xmm2, xmm2, 011111111b

    movdqa xmm4, xmm1
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pxor xmm1, xmm2

    movdqu [rdx], xmm1


        ret



align 16
global iDec128_CBC
iDec128_CBC:

        linux_setup
        sub rsp,16*16+8

        mov r9,rcx
        mov rax,[rcx+24]
        movdqu  xmm5,[rax]

        mov eax,[rcx+32] ; numblocks
        mov rdx,[rcx]
        mov r8,[rcx+8]
        mov rcx,[rcx+16]


        sub r8,rdx


        test eax,eax
        jz end_dec128_CBC

        cmp eax,4
        jl      lp128decsingle_CBC

        test    rcx,0xf
        jz              lp128decfour_CBC

        copy_round_keys rsp,rcx,0
        copy_round_keys rsp,rcx,1
        copy_round_keys rsp,rcx,2
        copy_round_keys rsp,rcx,3
        copy_round_keys rsp,rcx,4
        copy_round_keys rsp,rcx,5
        copy_round_keys rsp,rcx,6
        copy_round_keys rsp,rcx,7
        copy_round_keys rsp,rcx,8
        copy_round_keys rsp,rcx,9
        copy_round_keys rsp,rcx,10
        mov rcx,rsp


align 16
lp128decfour_CBC:

        test eax,eax
        jz end_dec128_CBC

        cmp eax,4
        jl      lp128decsingle_CBC

        load_and_xor4 rdx, [rcx+10*16]
        add rdx,16*4
        aesdec4 [rcx+9*16]
        aesdec4 [rcx+8*16]
        aesdec4 [rcx+7*16]
        aesdec4 [rcx+6*16]
        aesdec4 [rcx+5*16]
        aesdec4 [rcx+4*16]
        aesdec4 [rcx+3*16]
        aesdec4 [rcx+2*16]
        aesdec4 [rcx+1*16]
        aesdeclast4 [rcx+0*16]

        pxor    xmm0,xmm5
        movdqu  xmm4,[rdx - 16*4 + 0*16]
        pxor    xmm1,xmm4
        movdqu  xmm4,[rdx - 16*4 + 1*16]
        pxor    xmm2,xmm4
        movdqu  xmm4,[rdx - 16*4 + 2*16]
        pxor    xmm3,xmm4
        movdqu  xmm5,[rdx - 16*4 + 3*16]

        sub eax,4
        store4 r8+rdx-(16*4)
        jmp lp128decfour_CBC


        align 16
lp128decsingle_CBC:

        movdqu xmm0, [rdx]
        movdqa  xmm1,xmm0
        movdqu xmm4,[rcx+10*16]
        pxor xmm0, xmm4
        aesdec1_u [rcx+9*16]
        aesdec1_u [rcx+8*16]
        aesdec1_u [rcx+7*16]
        aesdec1_u [rcx+6*16]
        aesdec1_u [rcx+5*16]
        aesdec1_u [rcx+4*16]
        aesdec1_u [rcx+3*16]
        aesdec1_u [rcx+2*16]
        aesdec1_u [rcx+1*16]
        aesdeclast1_u [rcx+0*16]

        pxor    xmm0,xmm5
        movdqa  xmm5,xmm1
        add rdx, 16
        movdqu  [r8 + rdx - 16], xmm0
        dec eax
        jnz lp128decsingle_CBC

end_dec128_CBC:

        mov        r9,[r9+24]
        movdqu [r9],xmm5
        add rsp,16*16+8
        ret



align 16
global iDec256_CBC
iDec256_CBC:

        linux_setup
        sub rsp,16*16+8

        mov r9,rcx
        mov rax,[rcx+24]
        movdqu  xmm5,[rax]

        mov eax,[rcx+32] ; numblocks
        mov rdx,[rcx]
        mov r8,[rcx+8]
        mov rcx,[rcx+16]


        sub r8,rdx

        test eax,eax
        jz end_dec256_CBC

        cmp eax,4
        jl      lp256decsingle_CBC

        test    rcx,0xf
        jz              lp256decfour_CBC

        copy_round_keys rsp,rcx,0
        copy_round_keys rsp,rcx,1
        copy_round_keys rsp,rcx,2
        copy_round_keys rsp,rcx,3
        copy_round_keys rsp,rcx,4
        copy_round_keys rsp,rcx,5
        copy_round_keys rsp,rcx,6
        copy_round_keys rsp,rcx,7
        copy_round_keys rsp,rcx,8
        copy_round_keys rsp,rcx,9
        copy_round_keys rsp,rcx,10
        copy_round_keys rsp,rcx,11
        copy_round_keys rsp,rcx,12
        copy_round_keys rsp,rcx,13
        copy_round_keys rsp,rcx,14
        mov rcx,rsp

align 16
lp256decfour_CBC:

        test eax,eax
        jz end_dec256_CBC

        cmp eax,4
        jl      lp256decsingle_CBC

        load_and_xor4 rdx, [rcx+14*16]
        add rdx,16*4
        aesdec4 [rcx+13*16]
        aesdec4 [rcx+12*16]
        aesdec4 [rcx+11*16]
        aesdec4 [rcx+10*16]
        aesdec4 [rcx+9*16]
        aesdec4 [rcx+8*16]
        aesdec4 [rcx+7*16]
        aesdec4 [rcx+6*16]
        aesdec4 [rcx+5*16]
        aesdec4 [rcx+4*16]
        aesdec4 [rcx+3*16]
        aesdec4 [rcx+2*16]
        aesdec4 [rcx+1*16]
        aesdeclast4 [rcx+0*16]

        pxor    xmm0,xmm5
        movdqu  xmm4,[rdx - 16*4 + 0*16]
        pxor    xmm1,xmm4
        movdqu  xmm4,[rdx - 16*4 + 1*16]
        pxor    xmm2,xmm4
        movdqu  xmm4,[rdx - 16*4 + 2*16]
        pxor    xmm3,xmm4
        movdqu  xmm5,[rdx - 16*4 + 3*16]

        sub eax,4
        store4 r8+rdx-(16*4)
        jmp lp256decfour_CBC


        align 16
lp256decsingle_CBC:

        movdqu xmm0, [rdx]
        movdqu xmm4,[rcx+14*16]
        movdqa  xmm1,xmm0
        pxor xmm0, xmm4
        aesdec1_u [rcx+13*16]
        aesdec1_u [rcx+12*16]
        aesdec1_u [rcx+11*16]
        aesdec1_u [rcx+10*16]
        aesdec1_u [rcx+9*16]
        aesdec1_u [rcx+8*16]
        aesdec1_u [rcx+7*16]
        aesdec1_u [rcx+6*16]
        aesdec1_u [rcx+5*16]
        aesdec1_u [rcx+4*16]
        aesdec1_u [rcx+3*16]
        aesdec1_u [rcx+2*16]
        aesdec1_u [rcx+1*16]
        aesdeclast1_u [rcx+0*16]

        pxor    xmm0,xmm5
        movdqa  xmm5,xmm1
        add rdx, 16
        movdqu  [r8 + rdx - 16], xmm0
        dec eax
        jnz lp256decsingle_CBC

end_dec256_CBC:

        mov        r9,[r9+24]
        movdqu [r9],xmm5
        add rsp,16*16+8
        ret



align 16
global iEnc128_CBC
iEnc128_CBC:

        linux_setup
        sub rsp,16*16+8

        mov r9,rcx
        mov rax,[rcx+24]
        movdqu xmm1,[rax]

        mov eax,[rcx+32] ; numblocks
        mov rdx,[rcx]
        mov r8,[rcx+8]
        mov rcx,[rcx+16]

        sub r8,rdx


        test    rcx,0xf
        jz              lp128encsingle_CBC

        copy_round_keys rsp,rcx,0
        copy_round_keys rsp,rcx,1
        copy_round_keys rsp,rcx,2
        copy_round_keys rsp,rcx,3
        copy_round_keys rsp,rcx,4
        copy_round_keys rsp,rcx,5
        copy_round_keys rsp,rcx,6
        copy_round_keys rsp,rcx,7
        copy_round_keys rsp,rcx,8
        copy_round_keys rsp,rcx,9
        copy_round_keys rsp,rcx,10
        mov rcx,rsp


        align 16

lp128encsingle_CBC:

        movdqu xmm0, [rdx]
        movdqu xmm4,[rcx+0*16]
        add rdx, 16
        pxor xmm0, xmm1
        pxor xmm0, xmm4
        aesenc1 [rcx+1*16]
        aesenc1 [rcx+2*16]
        aesenc1 [rcx+3*16]
        aesenc1 [rcx+4*16]
        aesenc1 [rcx+5*16]
        aesenc1 [rcx+6*16]
        aesenc1 [rcx+7*16]
        aesenc1 [rcx+8*16]
        aesenc1 [rcx+9*16]
        aesenclast1 [rcx+10*16]
        movdqa xmm1,xmm0

                ; Store output encrypted data into CIPHERTEXT array
        movdqu  [r8+rdx-16], xmm0
        dec eax
        jnz lp128encsingle_CBC

        mov        r9,[r9+24]
        movdqu [r9],xmm1
        add rsp,16*16+8
        ret



align 16
global iEnc256_CBC
iEnc256_CBC:

        linux_setup
        sub rsp,16*16+8

        mov r9,rcx
        mov rax,[rcx+24]
        movdqu xmm1,[rax]

        mov eax,[rcx+32] ; numblocks
        mov rdx,[rcx]
        mov r8,[rcx+8]
        mov rcx,[rcx+16]

        sub r8,rdx

        test    rcx,0xf
        jz              lp256encsingle_CBC

        copy_round_keys rsp,rcx,0
        copy_round_keys rsp,rcx,1
        copy_round_keys rsp,rcx,2
        copy_round_keys rsp,rcx,3
        copy_round_keys rsp,rcx,4
        copy_round_keys rsp,rcx,5
        copy_round_keys rsp,rcx,6
        copy_round_keys rsp,rcx,7
        copy_round_keys rsp,rcx,8
        copy_round_keys rsp,rcx,9
        copy_round_keys rsp,rcx,10
        copy_round_keys rsp,rcx,11
        copy_round_keys rsp,rcx,12
        copy_round_keys rsp,rcx,13
        copy_round_keys rsp,rcx,14
        mov rcx,rsp

        align 16

lp256encsingle_CBC:

        movdqu xmm0, [rdx]
        movdqu xmm4, [rcx+0*16]
        add rdx, 16
        pxor xmm0, xmm1
        pxor xmm0, xmm4
        aesenc1 [rcx+1*16]
        aesenc1 [rcx+2*16]
        aesenc1 [rcx+3*16]
        aesenc1 [rcx+4*16]
        aesenc1 [rcx+5*16]
        aesenc1 [rcx+6*16]
        aesenc1 [rcx+7*16]
        aesenc1 [rcx+8*16]
        aesenc1 [rcx+9*16]
        aesenc1 [rcx+10*16]
        aesenc1 [rcx+11*16]
        aesenc1 [rcx+12*16]
        aesenc1 [rcx+13*16]
        aesenclast1 [rcx+14*16]
        movdqa xmm1,xmm0

                ; Store output encrypted data into CIPHERTEXT array
        movdqu  [r8+rdx-16], xmm0
        dec eax
        jnz lp256encsingle_CBC

        mov        r9,[r9+24]
        movdqu [r9],xmm1
        add rsp,16*16+8
        ret

; Mark this file as not needing an executable stack.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif