/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
// of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI.  Two
// implementations are provided, one that uses AVX and one that doesn't.  They
// are very similar, being generated by the same macros.  The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions.  The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones.  (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
// more thoroughly commented.  This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers.  This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512, only 16 SIMD registers are available instead of 32.  We
//      work around this by being much more careful with register allocation,
//      relying heavily on loads to fetch values as they are needed.
//
//    - Masking is not available either.  We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints.  First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput.  We therefore
//      do an 8-register wide loop.  Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions.  Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication.  This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block.  (This is without the three-argument
//      XOR support that would be provided by AVX512, which would be more
//      beneficial to schoolbook than Karatsuba.)  A concrete expansion of the
//      two methods is given at the end of this comment.
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq.  (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3.  It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side.  On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial.  On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested.  However, benchmarks on
//      available CPUs suggest that this approximation is plausible.  Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method.  We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64.  This eliminates one step of the reduction.  However,
//      this is incompatible with Karatsuba multiplication.  Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction.  For single-block processing, we use the x^64 optimization.
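//
//      To make the Karatsuba saving concrete: writing a = a_L + a_H*x^64 and
//      b = b_L + b_H*x^64 (where + denotes XOR), the middle part of the
//      256-bit product a*b is computed as
//
//          schoolbook:  MI = a_L*b_H + a_H*b_L                  (2 pclmulqdqs)
//          Karatsuba:   MI = (a_L + a_H)*(b_L + b_H) + LO + HI  (1 pclmulqdq)
//
//      with LO = a_L*b_L and HI = a_H*b_H either way, i.e. 4 pclmulqdqs per
//      block versus 3.  The (a_L + a_H) values are precomputed per key power
//      (see OFFSETOF_H_POWERS_XORED), and the 'MI += LO + HI' correction is
//      deferred to the reduction, costing just 2 pxors per 8 blocks.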

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
        .octa   0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
        .quad   0xc200000000000000
.Lone:
        .quad   1
.Lgfpoly_and_internal_carrybit:
        .octa   0xc2000000000000010000000000000001
        // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
        // 'len' 0xff bytes followed by (16 - len) zero bytes; e.g. len = 3
        // yields the mask ff ff ff 00 ... 00.
.Lzeropad_mask:
        .octa   0xffffffffffffffffffffffffffffffff
        .octa   0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN      0
#define OFFSETOF_AESROUNDKEYS   16
#define OFFSETOF_H_POWERS       272
#define OFFSETOF_H_POWERS_XORED 400
#define OFFSETOF_H_TIMES_X64    464
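
// For reference, the offsets above are consistent with a layout along the
// lines of the following C sketch.  (Illustrative only: the authoritative
// definition of struct aes_gcm_key_aesni lives in the C glue code, and the
// field names here are hypothetical.)
//
//	struct aes_gcm_key_aesni_sketch {
//		u32 aeskeylen;			// offset 0: AES key length in bytes
//		u8  pad[12];
//		u8  aesroundkeys[16 * 16];	// offset 16: the 11-15 expanded
//						// round keys, 16-byte aligned
//		u8  h_powers[8][16];		// offset 272: H^8 down to H^1
//		u64 h_powers_xored[8];		// offset 400: the two halves of
//						// each power XOR'd together
//		u8  h_times_x64[16];		// offset 464: H^1 * x^64
//	};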

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq.  The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro  _vpclmulqdq     imm, src1, src2, dst
.if USE_AVX
        vpclmulqdq      \imm, \src1, \src2, \dst
.else
        movdqa          \src2, \dst
        pclmulqdq       \imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb.  The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro  _vpshufb        src1, src2, dst
.if USE_AVX
        vpshufb         \src1, \src2, \dst
.else
        movdqa          \src2, \dst
        pshufb          \src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand.  The fallback assumes that
// all operands are distinct.
.macro  _vpand          src1, src2, dst
.if USE_AVX
        vpand           \src1, \src2, \dst
.else
        movdqu          \src1, \dst
        pand            \src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg.  \tmp must
// be a temporary xmm register.
.macro  _xor_mem_to_reg mem, reg, tmp
.if USE_AVX
        vpxor           \mem, \reg, \reg
.else
        movdqu          \mem, \tmp
        pxor            \tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg.  \tmp
// must be a temporary xmm register.
.macro  _test_mem       mem, reg, tmp
.if USE_AVX
        vptest          \mem, \reg
.else
        movdqu          \mem, \tmp
        ptest           \tmp, \reg
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
.macro  _load_partial_block     src, dst, tmp64, tmp32
        sub             $8, %ecx                // LEN - 8
        jle             .Lle8\@

        // Load 9 <= LEN <= 15 bytes.
        movq            (\src), \dst            // Load first 8 bytes
        mov             (\src, %rcx), %rax      // Load last 8 bytes
        neg             %ecx
        shl             $3, %ecx
        shr             %cl, %rax               // Discard overlapping bytes
        pinsrq          $1, %rax, \dst
        jmp             .Ldone\@

.Lle8\@:
        add             $4, %ecx                // LEN - 4
        jl              .Llt4\@

        // Load 4 <= LEN <= 8 bytes.
        mov             (\src), %eax            // Load first 4 bytes
        mov             (\src, %rcx), \tmp32    // Load last 4 bytes
        jmp             .Lcombine\@

.Llt4\@:
        // Load 1 <= LEN <= 3 bytes.
        add             $2, %ecx                // LEN - 2
        movzbl          (\src), %eax            // Load first byte
        jl              .Lmovq\@
        movzwl          (\src, %rcx), \tmp32    // Load last 2 bytes
.Lcombine\@:
        shl             $3, %ecx
        shl             %cl, \tmp64
        or              \tmp64, %rax            // Combine the two parts
.Lmovq\@:
        movq            %rax, \dst
.Ldone\@:
.endm
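
// As an illustrative C model of the trickiest case above (9 <= LEN <= 15;
// little-endian, with memcpy standing in for the unaligned loads):
//
//	static void load_partial_9_15(const u8 *src, unsigned int len,
//				      u64 out[2])
//	{
//		u64 first, last;
//
//		memcpy(&first, src, 8);			// first 8 bytes
//		memcpy(&last, src + len - 8, 8);	// last 8 bytes (overlapping)
//		last >>= 8 * (16 - len);		// discard the overlap
//		out[0] = first;				// bytes 0..7
//		out[1] = last;				// bytes 8..len-1, zero-padded
//	}
//
// Neither 8-byte load strays outside the len-byte buffer, which is what makes
// this safe without masked loads.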

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro  _store_partial_block    src, dst
        sub             $8, %ecx                // LEN - 8
        jl              .Llt8\@

        // Store 8 <= LEN <= 15 bytes.
        pextrq          $1, \src, %rax
        mov             %ecx, %esi
        shl             $3, %ecx
        ror             %cl, %rax
        mov             %rax, (\dst, %rsi)      // Store last LEN - 8 bytes
        movq            \src, (\dst)            // Store first 8 bytes
        jmp             .Ldone\@

.Llt8\@:
        add             $4, %ecx                // LEN - 4
        jl              .Llt4\@

        // Store 4 <= LEN <= 7 bytes.
        pextrd          $1, \src, %eax
        mov             %ecx, %esi
        shl             $3, %ecx
        ror             %cl, %eax
        mov             %eax, (\dst, %rsi)      // Store last LEN - 4 bytes
        movd            \src, (\dst)            // Store first 4 bytes
        jmp             .Ldone\@

.Llt4\@:
        // Store 1 <= LEN <= 3 bytes.
        pextrb          $0, \src, 0(\dst)
        cmp             $-2, %ecx               // LEN - 4 == -2, i.e. LEN == 2?
        jl              .Ldone\@
        pextrb          $1, \src, 1(\dst)
        je              .Ldone\@
        pextrb          $2, \src, 2(\dst)
.Ldone\@:
.endm
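
// Similarly, an illustrative C model of the 8 <= LEN <= 15 store path above
// (little-endian): the rotate lines the kept bytes up with dst + 8, and the
// overlapping region is then fixed up by the store of the first 8 bytes.
//
//	static void store_partial_8_15(const u8 blk[16], u8 *dst,
//				       unsigned int len)
//	{
//		unsigned int rot = 8 * (len - 8);
//		u64 lo, hi;
//
//		memcpy(&lo, blk, 8);
//		memcpy(&hi, blk + 8, 8);
//		if (rot)			// rotate right, as 'ror' does
//			hi = (hi >> rot) | (hi << (64 - rot));
//		memcpy(dst + len - 8, &hi, 8);	// store last len - 8 bytes
//		memcpy(dst, &lo, 8);		// store first 8, fixing overlap
//	}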

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b.  To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro  _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1

        // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
        _vpclmulqdq     $0x01, \a, \b, \t0
.elseif \i == 1
        _vpclmulqdq     $0x00, \a_times_x64, \b, \t1
.elseif \i == 2
        pxor            \t1, \t0

        // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
        _vpclmulqdq     $0x11, \a, \b, \t1
.elseif \i == 4
        pclmulqdq       $0x10, \a_times_x64, \b
.elseif \i == 5
        pxor            \t1, \b
.elseif \i == 6

        // Fold MI into HI.
        pshufd          $0x4e, \t0, \t1         // Swap halves of MI
.elseif \i == 7
        pclmulqdq       $0x00, \gfpoly, \t0     // MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
        pxor            \t1, \b
.elseif \i == 9
        pxor            \t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro  _ghash_mul      a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
        _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce.  On
// the first call, \lo, \mi, and \hi must be zero.  \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H.  \b is clobbered.
.macro  _ghash_mul_noreduce     a, a_xored, b, lo, mi, hi, t0

        // LO += a_L * b_L
        _vpclmulqdq     $0x00, \a, \b, \t0
        pxor            \t0, \lo

        // b_L + b_H
        pshufd          $0x4e, \b, \t0
        pxor            \b, \t0

        // HI += a_H * b_H
        pclmulqdq       $0x11, \a, \b
        pxor            \b, \hi

        // MI += (a_L + a_H) * (b_L + b_H)
        pclmulqdq       $0x00, \a_xored, \t0
        pxor            \t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro  _ghash_reduce   lo, mi, hi, dst, t0

        movq            .Lgfpoly(%rip), \t0

        // MI += LO + HI (needed because we used Karatsuba multiplication)
        pxor            \lo, \mi
        pxor            \hi, \mi

        // Fold LO into MI.
        pshufd          $0x4e, \lo, \dst
        pclmulqdq       $0x00, \t0, \lo
        pxor            \dst, \mi
        pxor            \lo, \mi

        // Fold MI into HI.
        pshufd          $0x4e, \mi, \dst
        pclmulqdq       $0x00, \t0, \mi
        pxor            \hi, \dst
        pxor            \mi, \dst
.endm
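
// The folds here and in _ghash_mul_step rely on the same identity.  With
// G = x^128 + x^127 + x^126 + x^121 + 1 (the bit-reflected GCM polynomial)
// and g = x^63 + x^62 + x^57 (the .Lgfpoly constant):
//
//	x^128 = g*x^64 + 1  (mod G),  so  x^-64 = x^64 + g  (mod G).
//
// Hence, for a 128-bit value V = V_L + V_H*x^64:
//
//	V * x^-64 = V_H + V_L*(x^64 + g)  (mod G),
//
// which is exactly one pshufd (forming V_H + V_L*x^64), one pclmulqdq by
// .Lgfpoly (forming V_L*g), and two pxors.  Each "fold" is one such
// multiplication by x^-64.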

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//      GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//                              blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI.  It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication.  See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting.  They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY.  Both macros clobber TMP[0-2].
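//
// The 8-block formula above is just Horner's rule unrolled: iterating the
// per-block update GHASH_ACC = (GHASH_ACC + blk)*H over blk0 through blk7
// expands to
//
//	(...((GHASH_ACC + blk0)*H + blk1)*H + ... + blk7)*H
//		= (blk0+GHASH_ACC)*H^8 + blk1*H^7 + ... + blk7*H^1,
//
// which is why one precomputed key power per position lets the eight
// multiplications proceed independently, with a single reduction at the end.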
.macro  _ghash_update_begin_8x  enc

        // Initialize the inner block counter.
        xor             %eax, %eax

        // Load the highest hash key power, H^8.
        movdqa          OFFSETOF_H_POWERS(KEY), TMP0

        // Load the first ciphertext block and byte-reflect it.
.if \enc
        movdqu          (DST), TMP1
.else
        movdqu          (SRC), TMP1
.endif
        pshufb          BSWAP_MASK, TMP1

        // Add the GHASH accumulator to the ciphertext block to get the block
        // 'b' that needs to be multiplied with the hash key power 'a'.
        pxor            TMP1, GHASH_ACC

        // b_L + b_H
        pshufd          $0x4e, GHASH_ACC, MI
        pxor            GHASH_ACC, MI

        // LO = a_L * b_L
        _vpclmulqdq     $0x00, TMP0, GHASH_ACC, LO

        // HI = a_H * b_H
        pclmulqdq       $0x11, TMP0, GHASH_ACC

        // MI = (a_L + a_H) * (b_L + b_H)
        pclmulqdq       $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro  _ghash_update_continue_8x enc
        add             $8, %eax

        // Load the next lowest key power.
        movdqa          OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

        // Load the next ciphertext block and byte-reflect it.
.if \enc
        movdqu          (DST,%rax,2), TMP1
.else
        movdqu          (SRC,%rax,2), TMP1
.endif
        pshufb          BSWAP_MASK, TMP1

        // LO += a_L * b_L
        _vpclmulqdq     $0x00, TMP0, TMP1, TMP2
        pxor            TMP2, LO

        // b_L + b_H
        pshufd          $0x4e, TMP1, TMP2
        pxor            TMP1, TMP2

        // HI += a_H * b_H
        pclmulqdq       $0x11, TMP0, TMP1
        pxor            TMP1, GHASH_ACC

        // MI += (a_L + a_H) * (b_L + b_H)
        movq            OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
        pclmulqdq       $0x00, TMP1, TMP2
        pxor            TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC.  This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination.  It's also divided into
// two steps.  TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI.  However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro  _ghash_update_end_8x_step       i
.if \i == 0
        movq            .Lgfpoly(%rip), TMP1
        pxor            LO, MI
        pxor            GHASH_ACC, MI
        pshufd          $0x4e, LO, TMP2
        pclmulqdq       $0x00, TMP1, LO
        pxor            TMP2, MI
        pxor            LO, MI
.elseif \i == 1
        pshufd          $0x4e, MI, TMP2
        pclmulqdq       $0x00, TMP1, MI
        pxor            TMP2, GHASH_ACC
        pxor            MI, GHASH_ACC
.endif
.endm

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro  _aes_gcm_precompute

        // Function arguments
        .set    KEY,            %rdi

        // Additional local variables.
        // %xmm0-%xmm1 and %rax are used as temporaries.
        .set    RNDKEYLAST_PTR, %rsi
        .set    H_CUR,          %xmm2
        .set    H_POW1,         %xmm3   // H^1
        .set    H_POW1_X64,     %xmm4   // H^1 * x^64
        .set    GFPOLY,         %xmm5

        // Encrypt an all-zeroes block to get the raw hash subkey.
        movl            OFFSETOF_AESKEYLEN(KEY), %eax
        lea             OFFSETOF_AESROUNDKEYS+6*16(KEY,%rax,4), RNDKEYLAST_PTR
        movdqa          OFFSETOF_AESROUNDKEYS(KEY), H_POW1
        lea             OFFSETOF_AESROUNDKEYS+16(KEY), %rax
1:
        aesenc          (%rax), H_POW1
        add             $16, %rax
        cmp             %rax, RNDKEYLAST_PTR
        jne             1b
        aesenclast      (RNDKEYLAST_PTR), H_POW1

        // Preprocess the raw hash subkey as needed to operate on GHASH's
        // bit-reflected values directly: reflect its bytes, then multiply it by
        // x^-1 (using the backwards interpretation of polynomial coefficients
        // from the GCM spec) or equivalently x^1 (using the alternative,
        // natural interpretation of polynomial coefficients).
        pshufb          .Lbswap_mask(%rip), H_POW1
        movdqa          H_POW1, %xmm0
        pshufd          $0xd3, %xmm0, %xmm0
        psrad           $31, %xmm0
        paddq           H_POW1, H_POW1
        pand            .Lgfpoly_and_internal_carrybit(%rip), %xmm0
        pxor            %xmm0, H_POW1
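
        // As an illustrative C model of the above (with the branchless masks
        // replaced by an 'if' for clarity; h[0] = low half, h[1] = high half
        // of the byte-reflected subkey):
        //
        //	static void gcm_mul_by_x(u64 h[2])
        //	{
        //		int carry = h[1] >> 63;		// x^127 coefficient
        //
        //		h[1] = (h[1] << 1) | (h[0] >> 63);	// multiply by x
        //		h[0] <<= 1;
        //		if (carry) {		// reduce x^128 mod the poly:
        //			h[1] ^= 0xc200000000000000;  // x^127+x^126+x^121
        //			h[0] ^= 1;		     // + 1
        //		}
        //	}
        //
        // The SSE code derives the carry masks branchlessly with pshufd and
        // psrad, and handles the cross-qword carry via the extra bit set in
        // .Lgfpoly_and_internal_carrybit.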

        // Store H^1.
        movdqa          H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

        // Compute and store H^1 * x^64.
        movq            .Lgfpoly(%rip), GFPOLY
        pshufd          $0x4e, H_POW1, %xmm0
        _vpclmulqdq     $0x00, H_POW1, GFPOLY, H_POW1_X64
        pxor            %xmm0, H_POW1_X64
        movdqa          H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

        // Compute and store the halves of H^1 XOR'd together.
        pxor            H_POW1, %xmm0
        movq            %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

        // Compute and store the remaining key powers H^2 through H^8.
        movdqa          H_POW1, H_CUR
        mov             $6*8, %eax
.Lprecompute_next\@:
        // Compute H^i = H^{i-1} * H^1.
        _ghash_mul      H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
        // Store H^i.
        movdqa          H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
        // Compute and store the halves of H^i XOR'd together.
        pshufd          $0x4e, H_CUR, %xmm0
        pxor            H_CUR, %xmm0
        movq            %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
        sub             $8, %eax
        jge             .Lprecompute_next\@

        RET
.endm

// void aes_gcm_aad_update_##suffix(const struct aes_gcm_key_aesni *key,
//                                  u8 ghash_acc[16], const u8 *aad,
//                                  int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
// can be any length.  The caller must do any buffering needed to ensure this.
.macro  _aes_gcm_aad_update

        // Function arguments
        .set    KEY,            %rdi
        .set    GHASH_ACC_PTR,  %rsi
        .set    AAD,            %rdx
        .set    AADLEN,         %ecx
        // Note: _load_partial_block relies on AADLEN being in %ecx.

        // Additional local variables.
        // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
        .set    BSWAP_MASK,     %xmm2
        .set    GHASH_ACC,      %xmm3
        .set    H_POW1,         %xmm4   // H^1
        .set    H_POW1_X64,     %xmm5   // H^1 * x^64
        .set    GFPOLY,         %xmm6

        movdqa          .Lbswap_mask(%rip), BSWAP_MASK
        movdqu          (GHASH_ACC_PTR), GHASH_ACC
        movdqa          OFFSETOF_H_POWERS+7*16(KEY), H_POW1
        movdqa          OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
        movq            .Lgfpoly(%rip), GFPOLY

        // Process the AAD one full block at a time.
        sub             $16, AADLEN
        jl              .Laad_loop_1x_done\@
.Laad_loop_1x\@:
        movdqu          (AAD), %xmm0
        pshufb          BSWAP_MASK, %xmm0
        pxor            %xmm0, GHASH_ACC
        _ghash_mul      H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
        add             $16, AAD
        sub             $16, AADLEN
        jge             .Laad_loop_1x\@
.Laad_loop_1x_done\@:
        // Check whether there is a partial block at the end.
        add             $16, AADLEN
        jz              .Laad_done\@

        // Process a partial block of length 1 <= AADLEN <= 15.
        // _load_partial_block assumes that %ecx contains AADLEN.
        _load_partial_block     AAD, %xmm0, %r10, %r10d
        pshufb          BSWAP_MASK, %xmm0
        pxor            %xmm0, GHASH_ACC
        _ghash_mul      H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
        movdqu          GHASH_ACC, (GHASH_ACC_PTR)
        RET
.endm

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7].  Also XOR them with
// the zero-th AES round key.  Clobbers TMP0 and TMP1.
.macro  _ctr_begin_8x
        movq            .Lone(%rip), TMP0
        movdqa          OFFSETOF_AESROUNDKEYS(KEY), TMP1 // zero-th round key
.irp i, 0,1,2,3,4,5,6,7
        _vpshufb        BSWAP_MASK, LE_CTR, AESDATA\i
        pxor            TMP1, AESDATA\i
        paddd           TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro  _aesenc_8x      round_key
.irp i, 0,1,2,3,4,5,6,7
        aesenc          \round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro  _aesenclast_8x  round_key
.irp i, 0,1,2,3,4,5,6,7
        aesenclast      \round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST.  Clobbers TMP0.
.macro  _xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
        _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
        movdqu          AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//                                        const u32 le_ctr[4], u8 ghash_acc[16],
//                                        const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length.  The caller must do any buffering needed to ensure this.  Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format.  For a new
// message, the low word of the counter must be 2.  This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|.  The caller must
// update |le_ctr| if any more data segments follow.  Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
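//
// As an illustrative caller-side sketch (C, with hypothetical names; the real
// callers live in the C glue code), this contract implies usage like:
//
//	u32 le_ctr[4] = { 2, 0, 0, 0 };	// low word = 2 for a new message;
//					// the upper words come from the IV
//	u8 ghash_acc[16] = { 0 };
//
//	// Every segment except the last must be a multiple of 16 bytes.
//	aes_gcm_enc_update_aesni(key, le_ctr, ghash_acc, src1, dst1, seg1len);
//	le_ctr[0] += seg1len / 16;	// one increment consumed per block
//	aes_gcm_enc_update_aesni(key, le_ctr, ghash_acc, src2, dst2, seg2len);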
.macro  _aes_gcm_update enc

        // Function arguments
        .set    KEY,            %rdi
        .set    LE_CTR_PTR,     %rsi    // Note: overlaps with usage as temp reg
        .set    GHASH_ACC_PTR,  %rdx
        .set    SRC,            %rcx
        .set    DST,            %r8
        .set    DATALEN,        %r9d
        .set    DATALEN64,      %r9     // Zero-extend DATALEN before using!
        // Note: the code setting up for _load_partial_block assumes that SRC is
        // in %rcx (and that DATALEN is *not* in %rcx).

        // Additional local variables

        // %rax and %rsi are used as temporary registers.  Note: %rsi overlaps
        // with LE_CTR_PTR, which is used only at the beginning.

        .set    AESKEYLEN,      %r10d   // AES key length in bytes
        .set    AESKEYLEN64,    %r10
        .set    RNDKEYLAST_PTR, %r11    // Pointer to last AES round key

        // Put the most frequently used values in %xmm0-%xmm7 to reduce code
        // size.  (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
        .set    TMP0,           %xmm0
        .set    TMP1,           %xmm1
        .set    TMP2,           %xmm2
        .set    LO,             %xmm3   // Low part of unreduced product
        .set    MI,             %xmm4   // Middle part of unreduced product
        .set    GHASH_ACC,      %xmm5   // GHASH accumulator; in main loop also
                                        // the high part of unreduced product
        .set    BSWAP_MASK,     %xmm6   // Shuffle mask for reflecting bytes
        .set    LE_CTR,         %xmm7   // Little-endian counter value
        .set    AESDATA0,       %xmm8
        .set    AESDATA1,       %xmm9
        .set    AESDATA2,       %xmm10
        .set    AESDATA3,       %xmm11
        .set    AESDATA4,       %xmm12
        .set    AESDATA5,       %xmm13
        .set    AESDATA6,       %xmm14
        .set    AESDATA7,       %xmm15

        movdqa          .Lbswap_mask(%rip), BSWAP_MASK
        movdqu          (GHASH_ACC_PTR), GHASH_ACC
        movdqu          (LE_CTR_PTR), LE_CTR

        movl            OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
        lea             OFFSETOF_AESROUNDKEYS+6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR

        // If there are at least 8*16 bytes of data, then continue into the main
        // loop, which processes 8*16 bytes of data per iteration.
        //
        // The main loop interleaves AES and GHASH to improve performance on
        // CPUs that can execute these instructions in parallel.  When
        // decrypting, the GHASH input (the ciphertext) is immediately
        // available.  When encrypting, we instead encrypt a set of 8 blocks
        // first and then GHASH those blocks while encrypting the next set of 8,
        // repeat that as needed, and finally GHASH the last set of 8 blocks.
        //
        // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
        // as this makes the immediate fit in a signed byte, saving 3 bytes.
        add             $-8*16, DATALEN
        jl              .Lcrypt_loop_8x_done\@
.if \enc
        // Encrypt the first 8 plaintext blocks.
        _ctr_begin_8x
        lea             OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
        .p2align 4
1:
        movdqa          (%rsi), TMP0
        _aesenc_8x      TMP0
        add             $16, %rsi
        cmp             %rsi, RNDKEYLAST_PTR
        jne             1b
        movdqa          (%rsi), TMP0
        _aesenclast_8x  TMP0
        _xor_data_8x
        // Don't increment DST until the ciphertext blocks have been hashed.
        sub             $-8*16, SRC
        add             $-8*16, DATALEN
        jl              .Lghash_last_ciphertext_8x\@
.endif

        .p2align 4
.Lcrypt_loop_8x\@:

        // Generate the next set of 8 counter blocks and start encrypting them.
        _ctr_begin_8x
        lea             OFFSETOF_AESROUNDKEYS+16(KEY), %rsi

        // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
        // by doing the unreduced multiplication for the first ciphertext block.
        movdqa          (%rsi), TMP0
        add             $16, %rsi
        _aesenc_8x      TMP0
        _ghash_update_begin_8x \enc

        // Do 7 more rounds of AES, and continue the GHASH update by doing the
        // unreduced multiplication for the remaining ciphertext blocks.
        .p2align 4
1:
        movdqa          (%rsi), TMP0
        add             $16, %rsi
        _aesenc_8x      TMP0
        _ghash_update_continue_8x \enc
        cmp             $7*8, %eax
        jne             1b

        // Do the remaining AES rounds.
        .p2align 4
1:
        movdqa          (%rsi), TMP0
        add             $16, %rsi
        _aesenc_8x      TMP0
        cmp             %rsi, RNDKEYLAST_PTR
        jne             1b

        // Do the GHASH reduction and the last round of AES.
        movdqa          (RNDKEYLAST_PTR), TMP0
        _ghash_update_end_8x_step       0
        _aesenclast_8x  TMP0
        _ghash_update_end_8x_step       1

        // XOR the data with the AES-CTR keystream blocks.
.if \enc
        sub             $-8*16, DST
.endif
        _xor_data_8x
        sub             $-8*16, SRC
.if !\enc
        sub             $-8*16, DST
.endif
        add             $-8*16, DATALEN
        jge             .Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
        // Update GHASH with the last set of 8 ciphertext blocks.
        _ghash_update_begin_8x          \enc
        .p2align 4
1:
        _ghash_update_continue_8x       \enc
        cmp             $7*8, %eax
        jne             1b
        _ghash_update_end_8x_step       0
        _ghash_update_end_8x_step       1
        sub             $-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

        sub             $-8*16, DATALEN
        jz              .Ldone\@

        // Handle the remainder of length 1 <= DATALEN < 8*16 bytes.  We keep
        // things simple and keep the code size down by just going one block at
        // a time, again taking advantage of hardware loop unrolling.  Since
        // there are enough key powers available for all remaining data, we do
        // the GHASH multiplications unreduced, and only reduce at the very end.

        .set    HI,             TMP2
        .set    H_POW,          AESDATA0
        .set    H_POW_XORED,    AESDATA1
        .set    ONE,            AESDATA2

        movq            .Lone(%rip), ONE

        // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
        pxor            LO, LO
        pxor            MI, MI
        pxor            HI, HI

        // Set up a block counter %rax to contain 8*(8-n), where n is the number
        // of blocks that remain, counting any partial block.  This will be used
        // to access the key powers H^n through H^1.
        mov             DATALEN, %eax
        neg             %eax
        and             $~15, %eax
        sar             $1, %eax
        add             $64, %eax
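        // For example, DATALEN = 50 (three full blocks plus a 2-byte partial
        // block, so n = 4) gives:  50 -> neg -> -50 -> and ~15 -> -64 ->
        // sar 1 -> -32 -> add 64 -> 32 = 8*(8-4).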

        sub             $16, DATALEN
        jl              .Lcrypt_loop_1x_done\@

        // Process the data one full block at a time.
.Lcrypt_loop_1x\@:

        // Encrypt the next counter block.
        _vpshufb        BSWAP_MASK, LE_CTR, TMP0
        paddd           ONE, LE_CTR
        pxor            OFFSETOF_AESROUNDKEYS(KEY), TMP0
        lea             -6*16(RNDKEYLAST_PTR), %rsi     // Reduce code size
        cmp             $24, AESKEYLEN
        jl              128f    // AES-128?
        je              192f    // AES-192?
        // AES-256
        aesenc          -7*16(%rsi), TMP0
        aesenc          -6*16(%rsi), TMP0
192:
        aesenc          -5*16(%rsi), TMP0
        aesenc          -4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
        aesenc          \i*16(%rsi), TMP0
.endr
        aesenclast      (RNDKEYLAST_PTR), TMP0

        // Load the next key power H^i.
        movdqa          OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
        movq            OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

        // XOR the keystream block that was just generated in TMP0 with the next
        // source data block and store the resulting en/decrypted data to DST.
.if \enc
        _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
        movdqu          TMP0, (DST)
.else
        movdqu          (SRC), TMP1
        pxor            TMP1, TMP0
        movdqu          TMP0, (DST)
.endif

        // Update GHASH with the ciphertext block.
.if \enc
        pshufb          BSWAP_MASK, TMP0
        pxor            TMP0, GHASH_ACC
.else
        pshufb          BSWAP_MASK, TMP1
        pxor            TMP1, GHASH_ACC
.endif
        _ghash_mul_noreduce     H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
        pxor            GHASH_ACC, GHASH_ACC

        add             $8, %eax
        add             $16, SRC
        add             $16, DST
        sub             $16, DATALEN
        jge             .Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
        // Check whether there is a partial block at the end.
        add             $16, DATALEN
        jz              .Lghash_reduce\@

        // Process a partial block of length 1 <= DATALEN <= 15.

        // Encrypt a counter block for the last time.
        pshufb          BSWAP_MASK, LE_CTR
        pxor            OFFSETOF_AESROUNDKEYS(KEY), LE_CTR
        lea             OFFSETOF_AESROUNDKEYS+16(KEY), %rsi
1:
        aesenc          (%rsi), LE_CTR
        add             $16, %rsi
        cmp             %rsi, RNDKEYLAST_PTR
        jne             1b
        aesenclast      (RNDKEYLAST_PTR), LE_CTR

        // Load the lowest key power, H^1.
        movdqa          OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
        movq            OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

        // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC.  SRC is
        // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
        // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
        mov             SRC, RNDKEYLAST_PTR
        mov             DATALEN, %ecx
        _load_partial_block     RNDKEYLAST_PTR, TMP0, %rsi, %esi

        // XOR the keystream block that was just generated in LE_CTR with the
        // source data block and store the resulting en/decrypted data to DST.
        pxor            TMP0, LE_CTR
        mov             DATALEN, %ecx
        _store_partial_block    LE_CTR, DST

        // If encrypting, zero-pad the final ciphertext block for GHASH.  (If
        // decrypting, this was already done by _load_partial_block.)
.if \enc
        lea             .Lzeropad_mask+16(%rip), %rax
        sub             DATALEN64, %rax
        _vpand          (%rax), LE_CTR, TMP0
.endif

        // Update GHASH with the final ciphertext block.
        pshufb          BSWAP_MASK, TMP0
        pxor            TMP0, GHASH_ACC
        _ghash_mul_noreduce     H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
        // Finally, do the GHASH reduction.
        _ghash_reduce   LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
        // Store the updated GHASH accumulator back to memory.
        movdqu          GHASH_ACC, (GHASH_ACC_PTR)

        RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//                                 const u32 le_ctr[4], u8 ghash_acc[16],
//                                 u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//                                 const u32 le_ctr[4], const u8 ghash_acc[16],
//                                 u64 total_aadlen, u64 total_datalen,
//                                 const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one).  Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|.  The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
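//
// As an illustrative C model of the constant-time comparison performed in the
// decryption case (the asm below instead XORs the two tags and tests the
// first TAGLEN bytes using ptest and a mask from .Lzeropad_mask):
//
//	static bool tags_equal(const u8 computed[16], const u8 transmitted[16],
//			       int taglen)
//	{
//		u8 diff = 0;
//		int i;
//
//		for (i = 0; i < taglen; i++)
//			diff |= computed[i] ^ transmitted[i];
//		return diff == 0;	// no branches on secret tag bytes
//	}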
.macro  _aes_gcm_final  enc

        // Function arguments
        .set    KEY,            %rdi
        .set    LE_CTR_PTR,     %rsi
        .set    GHASH_ACC_PTR,  %rdx
        .set    TOTAL_AADLEN,   %rcx
        .set    TOTAL_DATALEN,  %r8
        .set    TAG,            %r9
        .set    TAGLEN,         %r10d   // Originally at 8(%rsp)
        .set    TAGLEN64,       %r10

        // Additional local variables.
        // %rax and %xmm0-%xmm2 are used as temporary registers.
        .set    AESKEYLEN,      %r11d
        .set    AESKEYLEN64,    %r11
        .set    BSWAP_MASK,     %xmm3
        .set    GHASH_ACC,      %xmm4
        .set    H_POW1,         %xmm5   // H^1
        .set    H_POW1_X64,     %xmm6   // H^1 * x^64
        .set    GFPOLY,         %xmm7

        movdqa          .Lbswap_mask(%rip), BSWAP_MASK
        movl            OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

        // Set up a counter block with 1 in the low 32-bit word.  This is the
        // counter that produces the keystream block used to encrypt the auth
        // tag.
        movdqu          (LE_CTR_PTR), %xmm0
        mov             $1, %eax
        pinsrd          $0, %eax, %xmm0

        // Build the lengths block and XOR it into the GHASH accumulator.
        movq            TOTAL_DATALEN, GHASH_ACC
        pinsrq          $1, TOTAL_AADLEN, GHASH_ACC
        psllq           $3, GHASH_ACC   // Bytes to bits
        _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1

        movdqa          OFFSETOF_H_POWERS+7*16(KEY), H_POW1
        movdqa          OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
        movq            .Lgfpoly(%rip), GFPOLY

        // Make %rax point to the 6th from last AES round key.  (Using signed
        // byte offsets -7*16 through 6*16 decreases code size.)
        lea             OFFSETOF_AESROUNDKEYS(KEY,AESKEYLEN64,4), %rax

        // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
        // Interleave the AES and GHASH instructions to improve performance.
        pshufb          BSWAP_MASK, %xmm0
        pxor            OFFSETOF_AESROUNDKEYS(KEY), %xmm0
        cmp             $24, AESKEYLEN
        jl              128f    // AES-128?
        je              192f    // AES-192?
        // AES-256
        aesenc          -7*16(%rax), %xmm0
        aesenc          -6*16(%rax), %xmm0
192:
        aesenc          -5*16(%rax), %xmm0
        aesenc          -4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
        aesenc          (\i-3)*16(%rax), %xmm0
        _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
        aesenclast      6*16(%rax), %xmm0
        _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

        // Undo the byte reflection of the GHASH accumulator.
        pshufb          BSWAP_MASK, GHASH_ACC

        // Encrypt the GHASH accumulator.
        pxor            %xmm0, GHASH_ACC

.if \enc
        // Return the computed auth tag.
        movdqu          GHASH_ACC, (GHASH_ACC_PTR)
.else
        .set            ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!

        // Verify the auth tag in constant time by XOR'ing the transmitted and
        // computed auth tags together and using the ptest instruction to check
        // whether the first TAGLEN bytes of the result are zero.
        _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
        movl            8(%rsp), TAGLEN
        lea             .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
        sub             TAGLEN64, ZEROPAD_MASK_PTR
        xor             %eax, %eax
        _test_mem       (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
        sete            %al
.endif
        RET
.endm

.set    USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
        _aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
        _aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
        _aes_gcm_update 1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
        _aes_gcm_update 0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
        _aes_gcm_final  1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
        _aes_gcm_final  0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set    USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
        _aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
        _aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
        _aes_gcm_update 1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
        _aes_gcm_update 0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
        _aes_gcm_final  1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
        _aes_gcm_final  0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)