root/usr/src/common/crypto/aes/amd64/aes_intel.S
/*
 * ====================================================================
 * Written by Intel Corporation for the OpenSSL project to add support
 * for Intel AES-NI instructions. Rights for redistribution and usage
 * in source and binary forms are granted according to the OpenSSL
 * license.
 *
 *   Author: Huang Ying <ying.huang at intel dot com>
 *           Vinodh Gopal <vinodh.gopal at intel dot com>
 *           Kahraman Akdemir
 *
 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
 * instructions that are going to be introduced in the next generation
 * of Intel processor, as of 2009. These instructions enable fast and
 * secure data encryption and decryption, using the Advanced Encryption
 * Standard (AES), defined by FIPS Publication number 197. The
 * architecture introduces six instructions that offer full hardware
 * support for AES. Four of them support high performance data
 * encryption and decryption, and the other two instructions support
 * the AES key expansion procedure.
 * ====================================================================
 */

/*
 * ====================================================================
 * Copyright (c) 1998-2008 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
 * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
 * Huang Ying of Intel to the openssl-dev mailing list under the subject
 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, Save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored by
 * during kernel thread preemption).
 *
 * 4. Renamed functions, reordered parameters, and changed return value
 * to match OpenSolaris:
 *
 * OpenSSL interface:
 *      int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *              const int bits, AES_KEY *key);
 *      int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *              const int bits, AES_KEY *key);
 *      Return values for above are non-zero on error, 0 on success.
 *
 *      void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *              const AES_KEY *key);
 *      void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *              const AES_KEY *key);
 *      typedef struct aes_key_st {
 *              unsigned int    rd_key[4 *(AES_MAXNR + 1)];
 *              int             rounds;
 *              unsigned int    pad[3];
 *      } AES_KEY;
 * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
 * (ks32) instead of 64-bit (ks64).
 * Number of rounds (aka round count) is at offset 240 of AES_KEY.
 *
 * OpenSolaris OS interface (#ifdefs removed for readability):
 *      int rijndael_key_setup_dec_intel(uint32_t rk[],
 *              const uint32_t cipherKey[], uint64_t keyBits);
 *      int rijndael_key_setup_enc_intel(uint32_t rk[],
 *              const uint32_t cipherKey[], uint64_t keyBits);
 *      Return values for above are 0 on error, number of rounds on success.
 *
 *      void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *              const uint32_t pt[4], uint32_t ct[4]);
 *      void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *              const uint32_t pt[4], uint32_t ct[4]);
 *      typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *               uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 *
 *      typedef union {
 *              uint32_t        ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 *      } aes_ks_t;
 *      typedef struct aes_key {
 *              aes_ks_t        encr_ks, decr_ks;
 *              long double     align128;
 *              int             flags, nr, type;
 *      } aes_key_t;
 *
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 * ct is crypto text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 *
 * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
 *
 * ====================================================================
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4]) {
}
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
        return (0);
}
/* ARGSUSED */
int
rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
   uint64_t keyBits) {
        return (0);
}


#else   /* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
        /*
         * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
         * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
         * uses it to pass P2 to syscall.
         * This also occurs with the STTS macro, but we don't care if
         * P2 (%rsi) is modified just before function exit.
         * The CLTS and STTS macros push and pop P1 (%rdi) already.
         */
#ifdef __xpv
#define PROTECTED_CLTS \
        push    %rsi; \
        CLTS; \
        pop     %rsi
#else
#define PROTECTED_CLTS \
        CLTS
#endif  /* __xpv */

#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
        push    %rbp; \
        mov     %rsp, %rbp; \
        movq    %cr0, tmpreg; \
        testq   $CR0_TS, tmpreg; \
        jnz     1f; \
        and     $-XMM_ALIGN, %rsp; \
        sub     $[XMM_SIZE * 2], %rsp; \
        movaps  %xmm0, 16(%rsp); \
        movaps  %xmm1, (%rsp); \
        jmp     2f; \
1: \
        PROTECTED_CLTS; \
2:

        /*
         * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
         * otherwise set CR0_TS.
         */
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
        testq   $CR0_TS, tmpreg; \
        jnz     1f; \
        movaps  (%rsp), %xmm1; \
        movaps  16(%rsp), %xmm0; \
        jmp     2f; \
1: \
        STTS(tmpreg); \
2: \
        mov     %rbp, %rsp; \
        pop     %rbp

        /*
         * If CR0_TS is not set, align stack (with push %rbp) and push
         * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
         */
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
        push    %rbp; \
        mov     %rsp, %rbp; \
        movq    %cr0, tmpreg; \
        testq   $CR0_TS, tmpreg; \
        jnz     1f; \
        and     $-XMM_ALIGN, %rsp; \
        sub     $[XMM_SIZE * 7], %rsp; \
        movaps  %xmm0, 96(%rsp); \
        movaps  %xmm1, 80(%rsp); \
        movaps  %xmm2, 64(%rsp); \
        movaps  %xmm3, 48(%rsp); \
        movaps  %xmm4, 32(%rsp); \
        movaps  %xmm5, 16(%rsp); \
        movaps  %xmm6, (%rsp); \
        jmp     2f; \
1: \
        PROTECTED_CLTS; \
2:


        /*
         * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
         * otherwise set CR0_TS.
         */
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
        testq   $CR0_TS, tmpreg; \
        jnz     1f; \
        movaps  (%rsp), %xmm6; \
        movaps  16(%rsp), %xmm5; \
        movaps  32(%rsp), %xmm4; \
        movaps  48(%rsp), %xmm3; \
        movaps  64(%rsp), %xmm2; \
        movaps  80(%rsp), %xmm1; \
        movaps  96(%rsp), %xmm0; \
        jmp     2f; \
1: \
        STTS(tmpreg); \
2: \
        mov     %rbp, %rsp; \
        pop     %rbp


#else
#define PROTECTED_CLTS
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
#endif  /* _KERNEL */


/*
 * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
 *
 * Helper functions called by rijndael_key_setup_inc_intel().
 * Also used indirectly by rijndael_key_setup_dec_intel().
 *
 * Input:
 * %xmm0        User-provided cipher key
 * %xmm1        Round constant
 * Output:
 * (%rcx)       AES key
 */

.align  16
_key_expansion_128:
_key_expansion_256a:
        pshufd  $0b11111111, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0
        movaps  %xmm0, (%rcx)
        add     $0x10, %rcx
        ret
        SET_SIZE(_key_expansion_128)
        SET_SIZE(_key_expansion_256a)

.align 16
_key_expansion_192a:
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        movaps  %xmm2, %xmm6
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, %xmm1
        shufps  $0b01000100, %xmm0, %xmm6
        movaps  %xmm6, (%rcx)
        shufps  $0b01001110, %xmm2, %xmm1
        movaps  %xmm1, 0x10(%rcx)
        add     $0x20, %rcx
        ret
        SET_SIZE(_key_expansion_192a)

.align 16
_key_expansion_192b:
        pshufd  $0b01010101, %xmm1, %xmm1
        shufps  $0b00010000, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        shufps  $0b10001100, %xmm0, %xmm4
        pxor    %xmm4, %xmm0
        pxor    %xmm1, %xmm0

        movaps  %xmm2, %xmm5
        pslldq  $4, %xmm5
        pshufd  $0b11111111, %xmm0, %xmm3
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm2

        movaps  %xmm0, (%rcx)
        add     $0x10, %rcx
        ret
        SET_SIZE(_key_expansion_192b)

.align 16
_key_expansion_256b:
        pshufd  $0b10101010, %xmm1, %xmm1
        shufps  $0b00010000, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        shufps  $0b10001100, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        pxor    %xmm1, %xmm2
        movaps  %xmm2, (%rcx)
        add     $0x10, %rcx
        ret
        SET_SIZE(_key_expansion_256b)


/*
 * rijndael_key_setup_enc_intel()
 * Expand the cipher key into the encryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 *      uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *      const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */

#ifdef  OPENSSL_INTERFACE
#define rijndael_key_setup_enc_intel    intel_AES_set_encrypt_key
#define rijndael_key_setup_dec_intel    intel_AES_set_decrypt_key

#define USERCIPHERKEY           rdi     /* P1, 64 bits */
#define KEYSIZE32               esi     /* P2, 32 bits */
#define KEYSIZE64               rsi     /* P2, 64 bits */
#define AESKEY                  rdx     /* P3, 64 bits */

#else   /* OpenSolaris Interface */
#define AESKEY                  rdi     /* P1, 64 bits */
#define USERCIPHERKEY           rsi     /* P2, 64 bits */
#define KEYSIZE32               edx     /* P3, 32 bits */
#define KEYSIZE64               rdx     /* P3, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define ROUNDS32                KEYSIZE32       /* temp */
#define ROUNDS64                KEYSIZE64       /* temp */
#define ENDAESKEY               USERCIPHERKEY   /* temp */


ENTRY_NP(rijndael_key_setup_enc_intel)
        CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)

        / NULL pointer sanity check
        test    %USERCIPHERKEY, %USERCIPHERKEY
        jz      .Lenc_key_invalid_param
        test    %AESKEY, %AESKEY
        jz      .Lenc_key_invalid_param

        movups  (%USERCIPHERKEY), %xmm0 / user key (first 16 bytes)
        movaps  %xmm0, (%AESKEY)
        lea     0x10(%AESKEY), %rcx     / key addr
        pxor    %xmm4, %xmm4            / xmm4 is assumed 0 in _key_expansion_x

        cmp     $256, %KEYSIZE32
        jnz     .Lenc_key192

        / AES 256: 14 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov     $14, %ROUNDS32
        movl    %ROUNDS32, 240(%AESKEY)         / key.rounds = 14
#endif  /* OPENSSL_INTERFACE */

        movups  0x10(%USERCIPHERKEY), %xmm2     / other user key (2nd 16 bytes)
        movaps  %xmm2, (%rcx)
        add     $0x10, %rcx

        aeskeygenassist $0x1, %xmm2, %xmm1      / expand the key
        call    _key_expansion_256a
        aeskeygenassist $0x1, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x2, %xmm2, %xmm1      / expand the key
        call    _key_expansion_256a
        aeskeygenassist $0x2, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x4, %xmm2, %xmm1      / expand the key
        call    _key_expansion_256a
        aeskeygenassist $0x4, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x8, %xmm2, %xmm1      / expand the key
        call    _key_expansion_256a
        aeskeygenassist $0x8, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x10, %xmm2, %xmm1     / expand the key
        call    _key_expansion_256a
        aeskeygenassist $0x10, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x20, %xmm2, %xmm1     / expand the key
        call    _key_expansion_256a
        aeskeygenassist $0x20, %xmm0, %xmm1
        call    _key_expansion_256b
        aeskeygenassist $0x40, %xmm2, %xmm1     / expand the key
        call    _key_expansion_256a

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef  OPENSSL_INTERFACE
        xor     %rax, %rax                      / return 0 (OK)
#else   /* Open Solaris Interface */
        mov     $14, %rax                       / return # rounds = 14
#endif
        ret

.align 4
.Lenc_key192:
        cmp     $192, %KEYSIZE32
        jnz     .Lenc_key128

        / AES 192: 12 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov     $12, %ROUNDS32
        movl    %ROUNDS32, 240(%AESKEY) / key.rounds = 12
#endif  /* OPENSSL_INTERFACE */

        movq    0x10(%USERCIPHERKEY), %xmm2     / other user key
        aeskeygenassist $0x1, %xmm2, %xmm1      / expand the key
        call    _key_expansion_192a
        aeskeygenassist $0x2, %xmm2, %xmm1      / expand the key
        call    _key_expansion_192b
        aeskeygenassist $0x4, %xmm2, %xmm1      / expand the key
        call    _key_expansion_192a
        aeskeygenassist $0x8, %xmm2, %xmm1      / expand the key
        call    _key_expansion_192b
        aeskeygenassist $0x10, %xmm2, %xmm1     / expand the key
        call    _key_expansion_192a
        aeskeygenassist $0x20, %xmm2, %xmm1     / expand the key
        call    _key_expansion_192b
        aeskeygenassist $0x40, %xmm2, %xmm1     / expand the key
        call    _key_expansion_192a
        aeskeygenassist $0x80, %xmm2, %xmm1     / expand the key
        call    _key_expansion_192b

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef  OPENSSL_INTERFACE
        xor     %rax, %rax                      / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov     $12, %rax                       / return # rounds = 12
#endif
        ret

.align 4
.Lenc_key128:
        cmp $128, %KEYSIZE32
        jnz .Lenc_key_invalid_key_bits

        / AES 128: 10 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov     $10, %ROUNDS32
        movl    %ROUNDS32, 240(%AESKEY)         / key.rounds = 10
#endif  /* OPENSSL_INTERFACE */

        aeskeygenassist $0x1, %xmm0, %xmm1      / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x2, %xmm0, %xmm1      / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x4, %xmm0, %xmm1      / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x8, %xmm0, %xmm1      / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x10, %xmm0, %xmm1     / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x20, %xmm0, %xmm1     / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x40, %xmm0, %xmm1     / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x80, %xmm0, %xmm1     / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x1b, %xmm0, %xmm1     / expand the key
        call    _key_expansion_128
        aeskeygenassist $0x36, %xmm0, %xmm1     / expand the key
        call    _key_expansion_128

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef  OPENSSL_INTERFACE
        xor     %rax, %rax                      / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov     $10, %rax                       / return # rounds = 10
#endif
        ret

.Lenc_key_invalid_param:
#ifdef  OPENSSL_INTERFACE
        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
        mov     $-1, %rax       / user key or AES key pointer is NULL
        ret
#else
        /* FALLTHROUGH */
#endif  /* OPENSSL_INTERFACE */

.Lenc_key_invalid_key_bits:
        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef  OPENSSL_INTERFACE
        mov     $-2, %rax       / keysize is invalid
#else   /* Open Solaris Interface */
        xor     %rax, %rax      / a key pointer is NULL or invalid keysize
#endif  /* OPENSSL_INTERFACE */

        ret
        SET_SIZE(rijndael_key_setup_enc_intel)


/*
 * rijndael_key_setup_dec_intel()
 * Expand the cipher key into the decryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
 *      uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 * P1->P2, P2->P3, P3->P1
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *      const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */
ENTRY_NP(rijndael_key_setup_dec_intel)
        / Generate round keys used for encryption
        call    rijndael_key_setup_enc_intel
        test    %rax, %rax
#ifdef  OPENSSL_INTERFACE
        jnz     .Ldec_key_exit  / Failed if returned non-0
#else   /* OpenSolaris Interface */
        jz      .Ldec_key_exit  / Failed if returned 0
#endif  /* OPENSSL_INTERFACE */

        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

        /*
         * Convert round keys used for encryption
         * to a form usable for decryption
         */
#ifndef OPENSSL_INTERFACE               /* OpenSolaris Interface */
        mov     %rax, %ROUNDS64         / set # rounds (10, 12, or 14)
                                        / (already set for OpenSSL)
#endif

        lea     0x10(%AESKEY), %rcx     / key addr
        shl     $4, %ROUNDS32
        add     %AESKEY, %ROUNDS64
        mov     %ROUNDS64, %ENDAESKEY

.align 4
.Ldec_key_reorder_loop:
        movaps  (%AESKEY), %xmm0
        movaps  (%ROUNDS64), %xmm1
        movaps  %xmm0, (%ROUNDS64)
        movaps  %xmm1, (%AESKEY)
        lea     0x10(%AESKEY), %AESKEY
        lea     -0x10(%ROUNDS64), %ROUNDS64
        cmp     %AESKEY, %ROUNDS64
        ja      .Ldec_key_reorder_loop

.align 4
.Ldec_key_inv_loop:
        movaps  (%rcx), %xmm0
        / Convert an encryption round key to a form usable for decryption
        / with the "AES Inverse Mix Columns" instruction
        aesimc  %xmm0, %xmm1
        movaps  %xmm1, (%rcx)
        lea     0x10(%rcx), %rcx
        cmp     %ENDAESKEY, %rcx
        jnz     .Ldec_key_inv_loop

        SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_key_exit:
        / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
        / OpenSSL: rax = 0 for OK, or non-zero for error
        ret
        SET_SIZE(rijndael_key_setup_dec_intel)


/*
 * aes_encrypt_intel()
 * Encrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * Temporary register usage:
 * %xmm0        State
 * %xmm1        Key
 *
 * Original OpenSolaris Interface:
 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *      const uint32_t pt[4], uint32_t ct[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *      const AES_KEY *key)
 */

#ifdef  OPENSSL_INTERFACE
#define aes_encrypt_intel       intel_AES_encrypt
#define aes_decrypt_intel       intel_AES_decrypt

#define INP             rdi     /* P1, 64 bits */
#define OUTP            rsi     /* P2, 64 bits */
#define KEYP            rdx     /* P3, 64 bits */

/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx:  */
#define NROUNDS32       ecx     /* temporary, 32 bits */
#define NROUNDS         cl      /* temporary,  8 bits */

#else   /* OpenSolaris Interface */
#define KEYP            rdi     /* P1, 64 bits */
#define NROUNDS         esi     /* P2, 32 bits */
#define INP             rdx     /* P3, 64 bits */
#define OUTP            rcx     /* P4, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define STATE           xmm0    /* temporary, 128 bits */
#define KEY             xmm1    /* temporary, 128 bits */

ENTRY_NP(aes_encrypt_intel)
        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

        movups  (%INP), %STATE                  / input
        movaps  (%KEYP), %KEY                   / key
#ifdef  OPENSSL_INTERFACE
        mov     240(%KEYP), %NROUNDS32          / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor    %KEY, %STATE                    / round 0
        lea     0x30(%KEYP), %KEYP
        cmp     $12, %NROUNDS
        jb      .Lenc128
        lea     0x20(%KEYP), %KEYP
        je      .Lenc192

        / AES 256
        lea     0x20(%KEYP), %KEYP
        movaps  -0x60(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  -0x50(%KEYP), %KEY
        aesenc  %KEY, %STATE

.align 4
.Lenc192:
        / AES 192 and 256
        movaps  -0x40(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  -0x30(%KEYP), %KEY
        aesenc  %KEY, %STATE

.align 4
.Lenc128:
        / AES 128, 192, and 256
        movaps  -0x20(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  -0x10(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  (%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x10(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x20(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x30(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x40(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x50(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x60(%KEYP), %KEY
        aesenc  %KEY, %STATE
        movaps  0x70(%KEYP), %KEY
        aesenclast       %KEY, %STATE           / last round
        movups  %STATE, (%OUTP)                 / output

        SET_TS_OR_POP_XMM0_XMM1(%r10)
        ret
        SET_SIZE(aes_encrypt_intel)


/*
 * aes_decrypt_intel()
 * Decrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * Temporary register usage:
 * %xmm0        State
 * %xmm1        Key
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *      const uint32_t pt[4], uint32_t ct[4])/
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *      const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

        movups  (%INP), %STATE                  / input
        movaps  (%KEYP), %KEY                   / key
#ifdef  OPENSSL_INTERFACE
        mov     240(%KEYP), %NROUNDS32          / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor    %KEY, %STATE                    / round 0
        lea     0x30(%KEYP), %KEYP
        cmp     $12, %NROUNDS
        jb      .Ldec128
        lea     0x20(%KEYP), %KEYP
        je      .Ldec192

        / AES 256
        lea     0x20(%KEYP), %KEYP
        movaps  -0x60(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  -0x50(%KEYP), %KEY
        aesdec  %KEY, %STATE

.align 4
.Ldec192:
        / AES 192 and 256
        movaps  -0x40(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  -0x30(%KEYP), %KEY
        aesdec  %KEY, %STATE

.align 4
.Ldec128:
        / AES 128, 192, and 256
        movaps  -0x20(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  -0x10(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  (%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x10(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x20(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x30(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x40(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x50(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x60(%KEYP), %KEY
        aesdec  %KEY, %STATE
        movaps  0x70(%KEYP), %KEY
        aesdeclast      %KEY, %STATE            / last round
        movups  %STATE, (%OUTP)                 / output

        SET_TS_OR_POP_XMM0_XMM1(%r10)
        ret
        SET_SIZE(aes_decrypt_intel)

#endif  /* lint || __lint */