// arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

#define KEYP            a0
#define INP             a1
#define OUTP            a2
#define LEN             a3
#define TWEAKP          a4

#define LEN32           a5
#define TAIL_LEN        a6
#define VL              a7
#define VLMAX           t4

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS          v16     // LMUL=4 (most of the time)
#define TWEAKS_BREV     v20     // LMUL=4 (most of the time)
#define MULTS_BREV      v24     // LMUL=4 (most of the time)
#define TMP0            v28
#define TMP1            v29
#define TMP2            v30
#define TMP3            v31

// xts_init initializes the following values:
//
//      TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
//      TWEAKS_BREV: same as TWEAKS, but bit-reversed
//      MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte.  The zvkg extension provides the vgmul
// instruction which does multiplication in this field.  Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring.  Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
.macro  xts_init

        // Load the first tweak T.
        vsetivli        zero, 4, e32, m1, ta, ma
        vle32.v         TWEAKS, (TWEAKP)

        // If there's only one block (or no blocks at all), then skip the tweak
        // sequence computation because (at most) T itself is needed.
        li              t0, 16
        ble             LEN, t0, .Linit_single_block\@

        // Save a copy of T bit-reversed in v12.
        vbrev8.v        v12, TWEAKS

        //
        // Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
        // that N <= 128.  Though, this code actually requires N < 64 (or
        // equivalently VLEN < 2048) due to the use of 64-bit intermediate
        // values here and in the x^N computation later.
        //
        vsetvli         VL, LEN32, e32, m4, ta, ma
        srli            t0, VL, 2       // t0 = N (num blocks)
        // Generate two sequences, each with N 32-bit values:
        // v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
        vsetvli         zero, t0, e32, m1, ta, ma
        vmv.v.i         v0, 1
        vid.v           v1
        // Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
        // as two sequences, each with 2*N 32-bit values:
        // v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
        vsetvli         zero, t0, e64, m2, ta, ma
        vzext.vf2       v2, v0
        vzext.vf2       v4, v1
        slli            t1, t0, 1       // t1 = 2*N
        vsetvli         zero, t1, e32, m2, ta, ma
        // Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
        // widening to 64 bits per element.  When reinterpreted as N 128-bit
        // values, this is the needed sequence of 128-bit values 1 << i (x^i).
        vwsll.vv        v8, v2, v4

        // Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
        // multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
        vsetvli         zero, LEN32, e32, m4, ta, ma
        vmv.v.i         TWEAKS_BREV, 0
        vaesz.vs        TWEAKS_BREV, v12        // Broadcast-XOR into zeroed dest
        vbrev8.v        v8, v8
        vgmul.vv        TWEAKS_BREV, v8

        // Save a copy of the sequence T*(x^i) with the bit reversal undone.
        vbrev8.v        TWEAKS, TWEAKS_BREV

        // Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
        li              t1, 1
        sll             t1, t1, t0      // t1 = 1 << N (relies on N < 64)
        // Build the 128-bit value (1 << N) in v0: zero both 64-bit halves,
        // then (with vl=1, tail-undisturbed) write 1 << N into the low half.
        vsetivli        zero, 2, e64, m1, ta, ma
        vmv.v.i         v0, 0
        vsetivli        zero, 1, e64, m1, tu, ma
        vmv.v.x         v0, t1
        vbrev8.v        v0, v0          // Bit-reverse x^N for vgmul
        // Broadcast the bit-reversed x^N to all N elements of MULTS_BREV.
        vsetvli         zero, LEN32, e32, m4, ta, ma
        vmv.v.i         MULTS_BREV, 0
        vaesz.vs        MULTS_BREV, v0

        j               .Linit_done\@

.Linit_single_block\@:
        vbrev8.v        TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm

// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
// the multiplier required to advance the tweak by one.
.macro  load_x
        li              t0, 0x40        // 0x40 = the polynomial 'x' (0x02), bit-reversed
        vsetivli        zero, 4, e32, m1, ta, ma
        vmv.v.i         MULTS_BREV, 0   // Zero the first 128 bits.
        vsetivli        zero, 1, e8, m1, tu, ma
        vmv.v.x         MULTS_BREV, t0  // Write 0x40 into byte 0 only (tail-undisturbed).
.endm

// __aes_xts_crypt does the bulk of XTS encryption (\enc=1) or decryption
// (\enc=0) using a \keylen-bit key: the vectorized main loop over full blocks,
// followed by ciphertext stealing (CTS) if TAIL_LEN != 0.  It expects xts_init
// to have run, the AES round keys to be loaded in v1-v15 (see aes_begin), and
// the argument/state registers per the #defines at the top of this file.
// Ends with ret.
.macro  __aes_xts_crypt enc, keylen
        // With 16 < len <= 31, there's no main loop, just ciphertext stealing.
        beqz            LEN32, .Lcts_without_main_loop\@

        vsetvli         VLMAX, zero, e32, m4, ta, ma    // VLMAX = max vl for e32/m4
1:
        vsetvli         VL, LEN32, e32, m4, ta, ma
2:
        // Encrypt or decrypt VL/4 blocks.
        vle32.v         TMP0, (INP)
        vxor.vv         TMP0, TMP0, TWEAKS      // Pre-whiten with tweaks
        aes_crypt       TMP0, \enc, \keylen
        vxor.vv         TMP0, TMP0, TWEAKS      // Post-whiten with tweaks
        vse32.v         TMP0, (OUTP)

        // Update the pointers and the remaining length.
        slli            t0, VL, 2       // t0 = bytes processed this iteration
        add             INP, INP, t0
        add             OUTP, OUTP, t0
        sub             LEN32, LEN32, VL

        // Check whether more blocks remain.
        beqz            LEN32, .Lmain_loop_done\@

        // Compute the next sequence of tweaks by multiplying the previous
        // sequence by x^N.  Store the result in both bit-reversed order and
        // regular order (i.e. with the bit reversal undone).
        vgmul.vv        TWEAKS_BREV, MULTS_BREV
        vbrev8.v        TWEAKS, TWEAKS_BREV

        // Since we compute the tweak multipliers x^N in advance, we require
        // that each iteration process the same length except possibly the last.
        // This conflicts slightly with the behavior allowed by RISC-V Vector
        // Extension, where CPUs can select a lower length for both of the last
        // two iterations.  E.g., vl might take the sequence of values
        // [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
        // can use x^4 again instead of computing x^3.  Therefore, we explicitly
        // keep the vl at VLMAX if there is at least VLMAX remaining.
        bge             LEN32, VLMAX, 2b
        j               1b

.Lmain_loop_done\@:
        load_x

        // Compute the next tweak.
        addi            t0, VL, -4      // Index of the last tweak's first 32-bit word
        vsetivli        zero, 4, e32, m4, ta, ma
        vslidedown.vx   TWEAKS_BREV, TWEAKS_BREV, t0    // Extract last tweak
        vsetivli        zero, 4, e32, m1, ta, ma
        vgmul.vv        TWEAKS_BREV, MULTS_BREV         // Advance to next tweak

        bnez            TAIL_LEN, .Lcts\@

        // Update *TWEAKP to contain the next tweak.
        vbrev8.v        TWEAKS, TWEAKS_BREV
        vse32.v         TWEAKS, (TWEAKP)
        ret

.Lcts_without_main_loop\@:
        load_x
.Lcts\@:
        // TWEAKS_BREV now contains the next tweak.  Compute the one after that.
        vsetivli        zero, 4, e32, m1, ta, ma
        vmv.v.v         TMP0, TWEAKS_BREV
        vgmul.vv        TMP0, MULTS_BREV
        // Undo the bit reversal of the next two tweaks and store them in TMP1
        // and TMP2, such that TMP1 is the first needed and TMP2 the second.
        // (CTS uses the two tweaks in opposite orders for encrypt vs decrypt.)
.if \enc
        vbrev8.v        TMP1, TWEAKS_BREV
        vbrev8.v        TMP2, TMP0
.else
        vbrev8.v        TMP1, TMP0
        vbrev8.v        TMP2, TWEAKS_BREV
.endif

        // Encrypt/decrypt the last full block.
        vle32.v         TMP0, (INP)
        vxor.vv         TMP0, TMP0, TMP1
        aes_crypt       TMP0, \enc, \keylen
        vxor.vv         TMP0, TMP0, TMP1

        // Swap the first TAIL_LEN bytes of the above result with the tail.
        // Note that to support in-place encryption/decryption, the load from
        // the input tail must happen before the store to the output tail.
        addi            t0, INP, 16     // t0 = address of input tail
        addi            t1, OUTP, 16    // t1 = address of output tail
        vmv.v.v         TMP3, TMP0
        vsetvli         zero, TAIL_LEN, e8, m1, tu, ma
        vle8.v          TMP0, (t0)      // First TAIL_LEN bytes of TMP0 <- input tail
        vse8.v          TMP3, (t1)      // Output tail <- first TAIL_LEN bytes of result

        // Encrypt/decrypt again and store the last full block.
        vsetivli        zero, 4, e32, m1, ta, ma
        vxor.vv         TMP0, TMP0, TMP2
        aes_crypt       TMP0, \enc, \keylen
        vxor.vv         TMP0, TMP0, TMP2
        vse32.v         TMP0, (OUTP)

        ret
.endm

// aes_xts_crypt is the body of both exported functions: it splits off any
// ciphertext-stealing tail, initializes the tweak sequence, then dispatches to
// the key-length-specific variant of __aes_xts_crypt.
.macro  aes_xts_crypt   enc

        // Check whether the length is a multiple of the AES block size.
        andi            TAIL_LEN, LEN, 15
        beqz            TAIL_LEN, 1f

        // The length isn't a multiple of the AES block size, so ciphertext
        // stealing will be required.  Ciphertext stealing involves special
        // handling of the partial block and the last full block, so subtract
        // the length of both from the length to be processed in the main loop.
        sub             LEN, LEN, TAIL_LEN
        addi            LEN, LEN, -16
1:
        srli            LEN32, LEN, 2
        // LEN and LEN32 now contain the total length of the blocks that will be
        // processed in the main loop, in bytes and 32-bit words respectively.

        xts_init
        // aes_begin (see aes-macros.S) loads the round keys into v1-v15 and
        // branches to 128f or 192f for those key lengths; AES-256 falls
        // through.  Each __aes_xts_crypt variant returns with its own ret, so
        // there is no fallthrough between the variants below.
        aes_begin       KEYP, 128f, 192f
        __aes_xts_crypt \enc, 256
128:
        __aes_xts_crypt \enc, 128
192:
        __aes_xts_crypt \enc, 192
.endm

// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//                                       const u8 *in, u8 *out, size_t len,
//                                       u8 tweak[16]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
        // a0=key, a1=in, a2=out, a3=len, a4=tweak (see #defines above).
        aes_xts_crypt   1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
        // a0=key, a1=in, a2=out, a3=len, a4=tweak (see #defines above).
        aes_xts_crypt   0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)