root/lib/crypto/mips/chacha-core.S
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32                0x3c
#define CHACHA20_BLOCK_SIZE     64
#define STACK_SIZE              32

#define X0      $t0
#define X1      $t1
#define X2      $t2
#define X3      $t3
#define X4      $t4
#define X5      $t5
#define X6      $t6
#define X7      $t7
#define X8      $t8
#define X9      $t9
#define X10     $v1
#define X11     $s6
#define X12     $s5
#define X13     $s4
#define X14     $s3
#define X15     $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0      $s1
#define T1      $s0
#define T(n)    T ## n
#define X(n)    X ## n

/* Input arguments */
#define STATE           $a0
#define OUT             $a1
#define IN              $a2
#define BYTES           $a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0         $v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit else we don't leak clear data.
 * They are used to handling the last bytes which are not multiple of 4.
 */
#define SAVED_X         X15
#define SAVED_CA        $s7

#define IS_UNALIGNED    $s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define CPU_TO_LE32(n) \
        wsbh    n, n; \
        rotr    n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif

#define FOR_EACH_WORD(x) \
        x( 0); \
        x( 1); \
        x( 2); \
        x( 3); \
        x( 4); \
        x( 5); \
        x( 6); \
        x( 7); \
        x( 8); \
        x( 9); \
        x(10); \
        x(11); \
        x(12); \
        x(13); \
        x(14); \
        x(15);

#define FOR_EACH_WORD_REV(x) \
        x(15); \
        x(14); \
        x(13); \
        x(12); \
        x(11); \
        x(10); \
        x( 9); \
        x( 8); \
        x( 7); \
        x( 6); \
        x( 5); \
        x( 4); \
        x( 3); \
        x( 2); \
        x( 1); \
        x( 0);

#define PLUS_ONE_0       1
#define PLUS_ONE_1       2
#define PLUS_ONE_2       3
#define PLUS_ONE_3       4
#define PLUS_ONE_4       5
#define PLUS_ONE_5       6
#define PLUS_ONE_6       7
#define PLUS_ONE_7       8
#define PLUS_ONE_8       9
#define PLUS_ONE_9      10
#define PLUS_ONE_10     11
#define PLUS_ONE_11     12
#define PLUS_ONE_12     13
#define PLUS_ONE_13     14
#define PLUS_ONE_14     15
#define PLUS_ONE_15     16
#define PLUS_ONE(x)     PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c)  _CONCAT3(a,b,c)

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
                lw      T0, (x*4)(STATE); \
        .endif; \
        lwl     T1, (x*4)+MSB ## (IN); \
        lwr     T1, (x*4)+LSB ## (IN); \
        .if (x == 12); \
                addu    X ## x, NONCE_0; \
        .else; \
                addu    X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor     X ## x, T1; \
        swl     X ## x, (x*4)+MSB ## (OUT); \
        swr     X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
        .if (x != 12); \
                lw      T0, (x*4)(STATE); \
        .endif; \
        lw      T1, (x*4) ## (IN); \
        .if (x == 12); \
                addu    X ## x, NONCE_0; \
        .else; \
                addu    X ## x, T0; \
        .endif; \
        CPU_TO_LE32(X ## x); \
        xor     X ## x, T1; \
        sw      X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not multiple of 4.
 * X15 is free to store Xn
 * Every jumptable entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
        .set    noreorder; \
        b       .Lchacha_mips_xor_aligned_ ## x ## _b; \
        .if (x == 12); \
                addu    SAVED_X, X ## x, NONCE_0; \
        .else; \
                addu    SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set    reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
        .set    noreorder; \
        b       .Lchacha_mips_xor_unaligned_ ## x ## _b; \
        .if (x == 12); \
                addu    SAVED_X, X ## x, NONCE_0; \
        .else; \
                addu    SAVED_X, X ## x, SAVED_CA; \
        .endif; \
        .set    reorder

#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
        addu    X(A), X(K); \
        addu    X(B), X(L); \
        addu    X(C), X(M); \
        addu    X(D), X(N); \
        xor     X(V), X(A); \
        xor     X(W), X(B); \
        xor     X(Y), X(C); \
        xor     X(Z), X(D); \
        rotr    X(V), 32 - S; \
        rotr    X(W), 32 - S; \
        rotr    X(Y), 32 - S; \
        rotr    X(Z), 32 - S;

.text
.set    reorder
.set    noat
.globl  chacha_crypt_arch
.ent    chacha_crypt_arch
chacha_crypt_arch:
        .frame  $sp, STACK_SIZE, $ra

        /* Load number of rounds */
        lw      $at, 16($sp)

        addiu   $sp, -STACK_SIZE

        /* Return bytes = 0. */
        beqz    BYTES, .Lchacha_mips_end

        lw      NONCE_0, 48(STATE)

        /* Save s0-s7 */
        sw      $s0,  0($sp)
        sw      $s1,  4($sp)
        sw      $s2,  8($sp)
        sw      $s3, 12($sp)
        sw      $s4, 16($sp)
        sw      $s5, 20($sp)
        sw      $s6, 24($sp)
        sw      $s7, 28($sp)

        /* Test IN or OUT is unaligned.
         * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
         */
        or      IS_UNALIGNED, IN, OUT
        andi    IS_UNALIGNED, 0x3

        b       .Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
        addiu   IN,  CHACHA20_BLOCK_SIZE
        addiu   OUT, CHACHA20_BLOCK_SIZE
        addiu   NONCE_0, 1

.Lchacha_rounds_start:
        lw      X0,  0(STATE)
        lw      X1,  4(STATE)
        lw      X2,  8(STATE)
        lw      X3,  12(STATE)

        lw      X4,  16(STATE)
        lw      X5,  20(STATE)
        lw      X6,  24(STATE)
        lw      X7,  28(STATE)
        lw      X8,  32(STATE)
        lw      X9,  36(STATE)
        lw      X10, 40(STATE)
        lw      X11, 44(STATE)

        move    X12, NONCE_0
        lw      X13, 52(STATE)
        lw      X14, 56(STATE)
        lw      X15, 60(STATE)

.Loop_chacha_xor_rounds:
        addiu   $at, -2
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
        bnez    $at, .Loop_chacha_xor_rounds

        addiu   BYTES, -(CHACHA20_BLOCK_SIZE)

        /* Is data src/dst unaligned? Jump */
        bnez    IS_UNALIGNED, .Loop_chacha_unaligned

        /* Set number rounds here to fill delayslot. */
        lw      $at, (STACK_SIZE+16)($sp)

        /* BYTES < 0, it has no full block. */
        bltz    BYTES, .Lchacha_mips_no_full_block_aligned

        FOR_EACH_WORD_REV(STORE_ALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz    BYTES, .Loop_chacha_rounds

        /* Place this here to fill delay slot */
        addiu   NONCE_0, 1

        /* BYTES < 0? Handle last bytes */
        bltz    BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
        /* Restore used registers */
        lw      $s0,  0($sp)
        lw      $s1,  4($sp)
        lw      $s2,  8($sp)
        lw      $s3, 12($sp)
        lw      $s4, 16($sp)
        lw      $s5, 20($sp)
        lw      $s6, 24($sp)
        lw      $s7, 28($sp)

        /* Write NONCE_0 back to right location in state */
        sw      NONCE_0, 48(STATE)

.Lchacha_mips_end:
        addiu   $sp, STACK_SIZE
        jr      $ra

.Lchacha_mips_no_full_block_aligned:
        /* Restore the offset on BYTES */
        addiu   BYTES, CHACHA20_BLOCK_SIZE

        /* Get number of full WORDS */
        andi    $at, BYTES, MASK_U32

        /* Load upper half of jump table addr */
        lui     T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

        /* Calculate lower half jump table offset */
        ins     T0, $at, 1, 6

        /* Add offset to STATE */
        addu    T1, STATE, $at

        /* Add lower half jump table addr */
        addiu   T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

        /* Read value from STATE */
        lw      SAVED_CA, 0(T1)

        /* Store remaining bytecounter as negative value */
        subu    BYTES, $at, BYTES

        jr      T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
        /* Set number rounds here to fill delayslot. */
        lw      $at, (STACK_SIZE+16)($sp)

        /* BYTES > 0, it has no full block. */
        bltz    BYTES, .Lchacha_mips_no_full_block_unaligned

        FOR_EACH_WORD_REV(STORE_UNALIGNED)

        /* BYTES > 0? Loop again. */
        bgtz    BYTES, .Loop_chacha_rounds

        /* Write NONCE_0 back to right location in state */
        sw      NONCE_0, 48(STATE)

        .set noreorder
        /* Fall through to byte handling */
        bgez    BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
        /* Place this here to fill delay slot */
        addiu   NONCE_0, 1
        .set reorder

.Lchacha_mips_xor_bytes:
        addu    IN, $at
        addu    OUT, $at
        /* First byte */
        lbu     T1, 0(IN)
        addiu   $at, BYTES, 1
        xor     T1, SAVED_X
        sb      T1, 0(OUT)
        beqz    $at, .Lchacha_mips_xor_done
        /* Second byte */
        lbu     T1, 1(IN)
        addiu   $at, BYTES, 2
        rotr    SAVED_X, 8
        xor     T1, SAVED_X
        sb      T1, 1(OUT)
        beqz    $at, .Lchacha_mips_xor_done
        /* Third byte */
        lbu     T1, 2(IN)
        rotr    SAVED_X, 8
        xor     T1, SAVED_X
        sb      T1, 2(OUT)
        b       .Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
        /* Restore the offset on BYTES */
        addiu   BYTES, CHACHA20_BLOCK_SIZE

        /* Get number of full WORDS */
        andi    $at, BYTES, MASK_U32

        /* Load upper half of jump table addr */
        lui     T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

        /* Calculate lower half jump table offset */
        ins     T0, $at, 1, 6

        /* Add offset to STATE */
        addu    T1, STATE, $at

        /* Add lower half jump table addr */
        addiu   T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

        /* Read value from STATE */
        lw      SAVED_CA, 0(T1)

        /* Store remaining bytecounter as negative value */
        subu    BYTES, $at, BYTES

        jr      T0

        /* Jump table */
        FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE        $a0
 * OUT          $a1
 * NROUND       $a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12     $a3
#define X13     $at
#define X14     $v0
#define X15     STATE

.set noat
.globl  hchacha_block_arch
.ent    hchacha_block_arch
hchacha_block_arch:
        .frame  $sp, STACK_SIZE, $ra

        addiu   $sp, -STACK_SIZE

        /* Save X11(s6) */
        sw      X11, 0($sp)

        lw      X0,  0(STATE)
        lw      X1,  4(STATE)
        lw      X2,  8(STATE)
        lw      X3,  12(STATE)
        lw      X4,  16(STATE)
        lw      X5,  20(STATE)
        lw      X6,  24(STATE)
        lw      X7,  28(STATE)
        lw      X8,  32(STATE)
        lw      X9,  36(STATE)
        lw      X10, 40(STATE)
        lw      X11, 44(STATE)
        lw      X12, 48(STATE)
        lw      X13, 52(STATE)
        lw      X14, 56(STATE)
        lw      X15, 60(STATE)

.Loop_hchacha_xor_rounds:
        addiu   $a2, -2
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
        AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
        AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
        AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
        AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
        bnez    $a2, .Loop_hchacha_xor_rounds

        /* Restore used register */
        lw      X11, 0($sp)

        sw      X0,  0(OUT)
        sw      X1,  4(OUT)
        sw      X2,  8(OUT)
        sw      X3,  12(OUT)
        sw      X12, 16(OUT)
        sw      X13, 20(OUT)
        sw      X14, 24(OUT)
        sw      X15, 28(OUT)

        addiu   $sp, STACK_SIZE
        jr      $ra
.end hchacha_block_arch
.set at