root/sys/arch/octeon/dev/octcrypto_asm.S
/*      $OpenBSD: octcrypto_asm.S,v 1.2 2019/01/03 17:06:22 visa Exp $  */

/*
 * Copyright (c) 2018 Visa Hankala
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <machine/asm.h>

        /*
         * The assembler must not reorder instructions or fill delay slots:
         * every branch/jump delay slot in this file is filled by hand, and
         * the delay-slot instruction is written with one extra space of
         * indentation.  arch=octeon selects the OCTEON instruction set.
         */
        .set    noreorder
        .set    arch=octeon

/*
 * COP2 registers and operation codes
 *
 * These values are the selector operands of the dmtc2/dmfc2 instructions
 * below.  MT_* names are used as dmtc2 (move to coprocessor) destinations
 * and MF_* names as dmfc2 (move from coprocessor) sources.  Consecutive
 * 64-bit registers of one group are addressed as NAME+n.
 *
 * NOTE(review): every selector with nonzero high bits (0x3xxx, 0x4xxx) is
 * used in this file as the last write of an input sequence, so those writes
 * presumably load the final word and start the operation -- confirm against
 * the OCTEON hardware manual.
 */

/* AES unit */
#define MT_AES_IV               0x0102
#define MT_AES_KEY              0x0104
#define MT_AES_ENC0             0x010a
#define MT_AES_ENC1             0x310b
#define MT_AES_ENC_CBC0         0x0108
#define MT_AES_ENC_CBC1         0x3109
#define MT_AES_DEC_CBC0         0x010c
#define MT_AES_DEC_CBC1         0x310d
#define MF_AES_RESINP           0x0100
#define MT_AES_RESINP           0x0100
/* Hash unit; the *W names are used by the wide (SHA-512) routines. */
#define MT_HSH_DAT              0x0040
#define MT_HSH_DATW             0x0240
#define MT_HSH_IV               0x0048
#define MF_HSH_IV               0x0048
#define MT_HSH_IVW              0x0250
#define MF_HSH_IVW              0x0250
#define MT_HSH_STARTMD5         0x4047
#define MT_HSH_STARTSHA         0x4057
#define MT_HSH_STARTSHA256      0x404f
#define MT_HSH_STARTSHA512      0x424f

/* Galois field multiplier unit */
#define MT_GFM_MUL              0x0258
#define MT_GFM_POLY             0x025e
#define MF_GFM_RESINP           0x025a
#define MT_GFM_RESINP           0x025a
#define MT_GFM_XOR0             0x025c
#define MT_GFM_XORMUL1          0x425d

/*
 * Value written to MT_GFM_POLY by octcrypto_ghash_init().
 * NOTE(review): presumably the GCM reduction polynomial in the encoding
 * expected by the unit -- confirm against the hardware manual.
 */
#define GF128_POLY              0xe100

/*
 * void octcrypto_aes_set_key(const uint64_t *key, size_t len)
 *
 * Load an AES key to the coprocessor.
 * `len' is in bytes.
 *
 * key[0] and key[1] are always loaded and written to MT_AES_KEY+0/+1.
 * key[2] is loaded and written only when len >= 24, and key[3] only when
 * len >= 32; key registers beyond that are left untouched.
 * `key' is read with ld, i.e. as aligned 64-bit words.
 *
 * NOTE(review): no separate key-length register is written here; presumably
 * the unit derives the key size from the highest key register written --
 * confirm against the hardware manual.
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_aes_set_key, 0)
        ld      t0, (a0)
        ld      t1, 8(a0)
        dmtc2   t0, MT_AES_KEY
        /* The delay-slot write of key[1] happens on both paths. */
        bltu    a1, 24, 1f
         dmtc2  t1, MT_AES_KEY+1        /* 128-bit key */
        ld      t0, 16(a0)
        /* Reached only with len >= 24; key[2] is written on both paths. */
        bltu    a1, 32, 1f
         dmtc2  t0, MT_AES_KEY+2        /* 192-bit key */
        ld      t0, 24(a0)
        dmtc2   t0, MT_AES_KEY+3        /* 256-bit key */
1:
        jr      ra
         nop
END(octcrypto_aes_set_key)

/*
 * void octcrypto_aes_clear(void)
 *
 * Clear AES state.
 *
 * Writes zero to all four key registers (MT_AES_KEY+0..3) and to both
 * result/input registers (MT_AES_RESINP+0..1).  MT_AES_IV is not written
 * by this routine.
 */
LEAF(octcrypto_aes_clear, 0)
        dmtc2   zero, MT_AES_KEY
        dmtc2   zero, MT_AES_KEY+1
        dmtc2   zero, MT_AES_KEY+2
        dmtc2   zero, MT_AES_KEY+3
        dmtc2   zero, MT_AES_RESINP
        jr      ra
         dmtc2  zero, MT_AES_RESINP+1   /* last write, in the delay slot */
END(octcrypto_aes_clear)

/*
 * void octcrypto_aes_enc(uint64_t *buf)
 *
 * AES encrypt a single block.
 *
 * The 16-byte block at `buf' is read as two aligned 64-bit words, passed to
 * the coprocessor through MT_AES_ENC0/MT_AES_ENC1, and the two result words
 * read from MF_AES_RESINP+0..1 are stored back over the input.
 * The key must have been loaded beforehand (see octcrypto_aes_set_key).
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_aes_enc, 0)
        ld      t0, (a0)
        ld      t1, 8(a0)
        dmtc2   t0, MT_AES_ENC0
        dmtc2   t1, MT_AES_ENC1
        dmfc2   t0, MF_AES_RESINP
        dmfc2   t1, MF_AES_RESINP+1
        sd      t0, (a0)
        jr      ra
         sd     t1, 8(a0)
END(octcrypto_aes_enc)

/*
 * void octcrypto_aes_cbc_enc(void *buf, size_t size, const void *iv)
 *
 * AES CBC encrypt a sequence of blocks.
 *
 * Encrypts floor(size / 16) blocks in place.  Returns without touching
 * anything when size < 16; a trailing partial block (size % 16 bytes) is
 * left unmodified.
 *
 * `iv' is read with ld (aligned 64-bit words) and written to
 * MT_AES_IV+0..1 once, before the first block; it is not written back to
 * memory.  `buf' may have any alignment: an 8-byte aligned buffer uses
 * ld/sd, anything else uses the LDHI/LDLO and SDHI/SDLO unaligned access
 * pairs.
 *
 * Modifies: a0 (advanced past the processed data), a3 (end pointer),
 * t0-t2, and the assembler temporary used by the bltu macro.
 */
LEAF(octcrypto_aes_cbc_enc, 0)
        bltu    a1, 16, 3f
         and    t2, a1, 15
        dsubu   a3, a1, t2
        daddu   a3, a3, a0              /* Compute buffer end. */
        and     t2, a0, 7               /* Determine alignment. */
        ld      t0, (a2)
        ld      t1, 8(a2)
        dmtc2   t0, MT_AES_IV
        bne     t2, zero, 2f
         dmtc2  t1, MT_AES_IV+1

        /*
         * Aligned buffer
         */
1:
        ld      t0, (a0)
        ld      t1, 8(a0)
        /* a0 is advanced early; the stores below use negative offsets. */
        daddu   a0, a0, 16
        dmtc2   t0, MT_AES_ENC_CBC0
        dmtc2   t1, MT_AES_ENC_CBC1
        dmfc2   t0, MF_AES_RESINP
        dmfc2   t1, MF_AES_RESINP+1
        sd      t0, -16(a0)
        bltu    a0, a3, 1b
         sd     t1, -8(a0)
        b       3f
         nop

        /*
         * Unaligned buffer
         */
2:
        LDHI    t0, (a0)
        LDLO    t0, 7(a0)
        LDHI    t1, 8(a0)
        LDLO    t1, 15(a0)
        daddu   a0, a0, 16
        dmtc2   t0, MT_AES_ENC_CBC0
        dmtc2   t1, MT_AES_ENC_CBC1
        dmfc2   t0, MF_AES_RESINP
        dmfc2   t1, MF_AES_RESINP+1
        SDHI    t0, -16(a0)
        SDLO    t0, -9(a0)
        SDHI    t1, -8(a0)
        bltu    a0, a3, 2b
         SDLO   t1, -1(a0)

3:
        jr      ra
         nop
END(octcrypto_aes_cbc_enc)

/*
 * void octcrypto_aes_cbc_dec(void *buf, size_t size, const void *iv)
 *
 * AES CBC decrypt a sequence of blocks.
 *
 * Decrypts floor(size / 16) blocks in place.  Returns without touching
 * anything when size < 16; a trailing partial block (size % 16 bytes) is
 * left unmodified.
 *
 * `iv' is read with ld (aligned 64-bit words) and written to
 * MT_AES_IV+0..1 once, before the first block; it is not written back to
 * memory.  `buf' may have any alignment: an 8-byte aligned buffer uses
 * ld/sd, anything else uses the LDHI/LDLO and SDHI/SDLO unaligned access
 * pairs.
 *
 * Modifies: a0 (advanced past the processed data), a3 (end pointer),
 * t0-t2, and the assembler temporary used by the bltu macro.
 */
LEAF(octcrypto_aes_cbc_dec, 0)
        bltu    a1, 16, 3f
         and    t2, a1, 15
        dsubu   a3, a1, t2
        daddu   a3, a3, a0              /* Compute buffer end. */
        and     t2, a0, 7               /* Determine alignment. */
        ld      t0, (a2)
        ld      t1, 8(a2)
        dmtc2   t0, MT_AES_IV
        bne     t2, zero, 2f
         dmtc2  t1, MT_AES_IV+1

        /*
         * Aligned buffer
         */
1:
        ld      t0, (a0)
        ld      t1, 8(a0)
        /* a0 is advanced early; the stores below use negative offsets. */
        daddu   a0, a0, 16
        dmtc2   t0, MT_AES_DEC_CBC0
        dmtc2   t1, MT_AES_DEC_CBC1
        dmfc2   t0, MF_AES_RESINP
        dmfc2   t1, MF_AES_RESINP+1
        sd      t0, -16(a0)
        bltu    a0, a3, 1b
         sd     t1, -8(a0)
        b       3f
         nop

        /*
         * Unaligned buffer
         */
2:
        LDHI    t0, (a0)
        LDLO    t0, 7(a0)
        LDHI    t1, 8(a0)
        LDLO    t1, 15(a0)
        daddu   a0, a0, 16
        dmtc2   t0, MT_AES_DEC_CBC0
        dmtc2   t1, MT_AES_DEC_CBC1
        dmfc2   t0, MF_AES_RESINP
        dmfc2   t1, MF_AES_RESINP+1
        SDHI    t0, -16(a0)
        SDLO    t0, -9(a0)
        SDHI    t1, -8(a0)
        bltu    a0, a3, 2b
         SDLO   t1, -1(a0)

3:
        jr      ra
         nop
END(octcrypto_aes_cbc_dec)

/*
 * void octcrypto_aes_ctr_enc(void *buf, size_t size, const void *icb)
 *
 * AES CTR encrypt a sequence of blocks.
 *
 * Processes floor(size / 16) blocks in place.  Returns without touching
 * anything when size < 16; a trailing partial block (size % 16 bytes) is
 * left unmodified.
 *
 * The 16-byte counter block is read once from `icb' with ld (aligned
 * 64-bit words) into t0 (first word) and t1 (second word).  The second
 * word is incremented by one BEFORE each keystream block is computed, so
 * the first data block is combined with the encryption of icb + 1.
 * The incremented counter is kept in registers only; `icb' is not written
 * back.
 *
 * `buf' may have any alignment: an 8-byte aligned buffer uses ld/sd,
 * anything else uses the LDHI/LDLO and SDHI/SDLO unaligned access pairs.
 *
 * Modifies: a0 (advanced past the processed data), a3 (end pointer),
 * a4, a5, t0-t3, and the assembler temporary used by the bltu macro.
 */
LEAF(octcrypto_aes_ctr_enc, 0)
        bltu    a1, 16, 3f
         and    t2, a1, 15
        dsubu   a3, a1, t2
        daddu   a3, a3, a0              /* Compute buffer end. */
        and     t2, a0, 7               /* Determine alignment. */
        ld      t0, (a2)
        bne     t2, zero, 2f
         ld     t1, 8(a2)

        /*
         * Aligned buffer
         */
1:
        /*
         * Increment the counter.
         * Assume big endian byte order and no overflow.
         */
        daddu   t1, 1
        /* Compute the keystream block. */
        dmtc2   t0, MT_AES_ENC0
        dmtc2   t1, MT_AES_ENC1
        dmfc2   t2, MF_AES_RESINP
        dmfc2   t3, MF_AES_RESINP+1
        /* XOR the plaintext and the keystream. */
        ld      a4, (a0)
        ld      a5, 8(a0)
        daddu   a0, a0, 16
        xor     a4, t2
        xor     a5, t3
        sd      a4, -16(a0)
        bltu    a0, a3, 1b
         sd     a5, -8(a0)
        b       3f
         nop

        /*
         * Unaligned buffer
         */
2:
        /*
         * Increment the counter.
         * Assume big endian byte order and no overflow.
         */
        daddu   t1, 1
        /* Compute the keystream block. */
        dmtc2   t0, MT_AES_ENC0
        dmtc2   t1, MT_AES_ENC1
        dmfc2   t2, MF_AES_RESINP
        dmfc2   t3, MF_AES_RESINP+1
        /* XOR the plaintext and the keystream. */
        LDHI    a4, (a0)
        LDLO    a4, 7(a0)
        LDHI    a5, 8(a0)
        LDLO    a5, 15(a0)
        daddu   a0, a0, 16
        xor     a4, t2
        xor     a5, t3
        SDHI    a4, -16(a0)
        SDLO    a4, -9(a0)
        SDHI    a5, -8(a0)
        bltu    a0, a3, 2b
         SDLO   a5, -1(a0)

3:
        jr      ra
         nop
END(octcrypto_aes_ctr_enc)

/*
 * HASH_NARROW(name, type)
 *
 * Defines the function
 *
 *     void octcrypto_hash_<name>(const void *buf, size_t size)
 *
 * which feeds floor(size / 64) blocks of 64 bytes from `buf' to the hash
 * unit.  The function returns immediately when size < 64, and a trailing
 * partial block (size % 64 bytes) is not consumed.
 *
 * For each block, the first seven 64-bit words are written to
 * MT_HSH_DAT+0..6 and the eighth word is written to MT_HSH_START<type>
 * from the loop branch's delay slot.
 * NOTE(review): that last write presumably starts processing of the block,
 * with the running state held in the MT_HSH_IV registers -- confirm against
 * the hardware manual.
 *
 * `buf' may have any alignment: an 8-byte aligned buffer uses ld, anything
 * else uses the LDHI/LDLO unaligned load pairs.  Nothing is stored to
 * memory.
 *
 * Modifies: a0 (advanced past the consumed data), a3 (end pointer), t0-t3,
 * and the assembler temporary used by the bltu macro.
 */
#define HASH_NARROW(name, type)                                         \
LEAF(octcrypto_hash_##name, 0)                                          \
        bltu    a1, 64, 3f;                                             \
         and    t3, a1, 63;                                             \
        and     t2, a0, 7;              /* Determine alignment. */      \
        dsubu   a3, a1, t3;                                             \
        bne     t2, zero, 2f;                                           \
         daddu  a3, a3, a0;             /* Compute buffer end. */       \
                                                                        \
        /*                                                              \
         * Aligned buffer                                               \
         */                                                             \
1:                                                                      \
        ld      t0, (a0);                                               \
        ld      t1, 8(a0);                                              \
        ld      t2, 16(a0);                                             \
        ld      t3, 24(a0);                                             \
        dmtc2   t0, MT_HSH_DAT;                                         \
        dmtc2   t1, MT_HSH_DAT+1;                                       \
        dmtc2   t2, MT_HSH_DAT+2;                                       \
        dmtc2   t3, MT_HSH_DAT+3;                                       \
        ld      t0, 32(a0);                                             \
        ld      t1, 40(a0);                                             \
        ld      t2, 48(a0);                                             \
        ld      t3, 56(a0);                                             \
        daddu   a0, a0, 64;                                             \
        dmtc2   t0, MT_HSH_DAT+4;                                       \
        dmtc2   t1, MT_HSH_DAT+5;                                       \
        dmtc2   t2, MT_HSH_DAT+6;                                       \
        bltu    a0, a3, 1b;                                             \
         dmtc2  t3, MT_HSH_START##type;                                 \
        b       3f;                                                     \
         nop;                                                           \
                                                                        \
        /*                                                              \
         * Unaligned buffer                                             \
         */                                                             \
2:                                                                      \
        LDHI    t0, (a0);                                               \
        LDLO    t0, 7(a0);                                              \
        LDHI    t1, 8(a0);                                              \
        LDLO    t1, 15(a0);                                             \
        LDHI    t2, 16(a0);                                             \
        LDLO    t2, 23(a0);                                             \
        LDHI    t3, 24(a0);                                             \
        LDLO    t3, 31(a0);                                             \
        dmtc2   t0, MT_HSH_DAT;                                         \
        dmtc2   t1, MT_HSH_DAT+1;                                       \
        dmtc2   t2, MT_HSH_DAT+2;                                       \
        dmtc2   t3, MT_HSH_DAT+3;                                       \
        LDHI    t0, 32(a0);                                             \
        LDLO    t0, 39(a0);                                             \
        LDHI    t1, 40(a0);                                             \
        LDLO    t1, 47(a0);                                             \
        LDHI    t2, 48(a0);                                             \
        LDLO    t2, 55(a0);                                             \
        LDHI    t3, 56(a0);                                             \
        LDLO    t3, 63(a0);                                             \
        daddu   a0, a0, 64;                                             \
        dmtc2   t0, MT_HSH_DAT+4;                                       \
        dmtc2   t1, MT_HSH_DAT+5;                                       \
        dmtc2   t2, MT_HSH_DAT+6;                                       \
        bltu    a0, a3, 2b;                                             \
         dmtc2  t3, MT_HSH_START##type;                                 \
3:                                                                      \
        jr      ra;                                                     \
         nop;                                                           \
END(octcrypto_hash_##name)

/*
 * HASH_WIDE(name, type)
 *
 * Defines the function
 *
 *     void octcrypto_hash_<name>(const void *buf, size_t size)
 *
 * which feeds floor(size / 128) blocks of 128 bytes from `buf' to the hash
 * unit.  The function returns immediately when size < 128, and a trailing
 * partial block (size % 128 bytes) is not consumed.
 *
 * For each block, the first fifteen 64-bit words are written to
 * MT_HSH_DATW+0..14 and the sixteenth word is written to
 * MT_HSH_START<type> from the loop branch's delay slot.
 * NOTE(review): that last write presumably starts processing of the block,
 * with the running state held in the MT_HSH_IVW registers -- confirm
 * against the hardware manual.
 *
 * `buf' may have any alignment: an 8-byte aligned buffer uses ld, anything
 * else uses the LDHI/LDLO unaligned load pairs.  Nothing is stored to
 * memory.
 *
 * Modifies: a0 (advanced past the consumed data), a3 (end pointer), t0-t3,
 * and the assembler temporary used by the bltu macro.
 */
#define HASH_WIDE(name, type)                                           \
LEAF(octcrypto_hash_##name, 0)                                          \
        bltu    a1, 128, 3f;                                            \
         and    t3, a1, 127;                                            \
        and     t2, a0, 7;              /* Determine alignment. */      \
        dsubu   a3, a1, t3;                                             \
        bne     t2, zero, 2f;                                           \
         daddu  a3, a3, a0;             /* Compute buffer end. */       \
                                                                        \
        /*                                                              \
         * Aligned buffer                                               \
         */                                                             \
1:                                                                      \
        ld      t0, (a0);                                               \
        ld      t1, 8(a0);                                              \
        ld      t2, 16(a0);                                             \
        ld      t3, 24(a0);                                             \
        dmtc2   t0, MT_HSH_DATW;                                        \
        dmtc2   t1, MT_HSH_DATW+1;                                      \
        dmtc2   t2, MT_HSH_DATW+2;                                      \
        dmtc2   t3, MT_HSH_DATW+3;                                      \
        ld      t0, 32(a0);                                             \
        ld      t1, 40(a0);                                             \
        ld      t2, 48(a0);                                             \
        ld      t3, 56(a0);                                             \
        dmtc2   t0, MT_HSH_DATW+4;                                      \
        dmtc2   t1, MT_HSH_DATW+5;                                      \
        dmtc2   t2, MT_HSH_DATW+6;                                      \
        dmtc2   t3, MT_HSH_DATW+7;                                      \
        ld      t0, 64(a0);                                             \
        ld      t1, 72(a0);                                             \
        ld      t2, 80(a0);                                             \
        ld      t3, 88(a0);                                             \
        dmtc2   t0, MT_HSH_DATW+8;                                      \
        dmtc2   t1, MT_HSH_DATW+9;                                      \
        dmtc2   t2, MT_HSH_DATW+10;                                     \
        dmtc2   t3, MT_HSH_DATW+11;                                     \
        ld      t0, 96(a0);                                             \
        ld      t1, 104(a0);                                            \
        ld      t2, 112(a0);                                            \
        ld      t3, 120(a0);                                            \
        daddu   a0, a0, 128;                                            \
        dmtc2   t0, MT_HSH_DATW+12;                                     \
        dmtc2   t1, MT_HSH_DATW+13;                                     \
        dmtc2   t2, MT_HSH_DATW+14;                                     \
        bltu    a0, a3, 1b;                                             \
         dmtc2  t3, MT_HSH_START##type;                                 \
        b       3f;                                                     \
         nop;                                                           \
                                                                        \
        /*                                                              \
         * Unaligned buffer                                             \
         */                                                             \
2:                                                                      \
        LDHI    t0, (a0);                                               \
        LDLO    t0, 7(a0);                                              \
        LDHI    t1, 8(a0);                                              \
        LDLO    t1, 15(a0);                                             \
        LDHI    t2, 16(a0);                                             \
        LDLO    t2, 23(a0);                                             \
        LDHI    t3, 24(a0);                                             \
        LDLO    t3, 31(a0);                                             \
        dmtc2   t0, MT_HSH_DATW;                                        \
        dmtc2   t1, MT_HSH_DATW+1;                                      \
        dmtc2   t2, MT_HSH_DATW+2;                                      \
        dmtc2   t3, MT_HSH_DATW+3;                                      \
        LDHI    t0, 32(a0);                                             \
        LDLO    t0, 39(a0);                                             \
        LDHI    t1, 40(a0);                                             \
        LDLO    t1, 47(a0);                                             \
        LDHI    t2, 48(a0);                                             \
        LDLO    t2, 55(a0);                                             \
        LDHI    t3, 56(a0);                                             \
        LDLO    t3, 63(a0);                                             \
        dmtc2   t0, MT_HSH_DATW+4;                                      \
        dmtc2   t1, MT_HSH_DATW+5;                                      \
        dmtc2   t2, MT_HSH_DATW+6;                                      \
        dmtc2   t3, MT_HSH_DATW+7;                                      \
        LDHI    t0, 64(a0);                                             \
        LDLO    t0, 71(a0);                                             \
        LDHI    t1, 72(a0);                                             \
        LDLO    t1, 79(a0);                                             \
        LDHI    t2, 80(a0);                                             \
        LDLO    t2, 87(a0);                                             \
        LDHI    t3, 88(a0);                                             \
        LDLO    t3, 95(a0);                                             \
        dmtc2   t0, MT_HSH_DATW+8;                                      \
        dmtc2   t1, MT_HSH_DATW+9;                                      \
        dmtc2   t2, MT_HSH_DATW+10;                                     \
        dmtc2   t3, MT_HSH_DATW+11;                                     \
        LDHI    t0, 96(a0);                                             \
        LDLO    t0, 103(a0);                                            \
        LDHI    t1, 104(a0);                                            \
        LDLO    t1, 111(a0);                                            \
        LDHI    t2, 112(a0);                                            \
        LDLO    t2, 119(a0);                                            \
        LDHI    t3, 120(a0);                                            \
        LDLO    t3, 127(a0);                                            \
        daddu   a0, a0, 128;                                            \
        dmtc2   t0, MT_HSH_DATW+12;                                     \
        dmtc2   t1, MT_HSH_DATW+13;                                     \
        dmtc2   t2, MT_HSH_DATW+14;                                     \
        bltu    a0, a3, 2b;                                             \
         dmtc2  t3, MT_HSH_START##type;                                 \
3:                                                                      \
        jr      ra;                                                     \
         nop;                                                           \
END(octcrypto_hash_##name)

/*
 * void octcrypto_hash_md5(const void *buf, size_t size)
 *
 * HASH_NARROW instance: consumes whole 64-byte blocks of `buf' and ends
 * each block with a write to MT_HSH_STARTMD5.
 */
HASH_NARROW(md5, MD5)

/*
 * void octcrypto_hash_sha1(const void *buf, size_t size)
 *
 * HASH_NARROW instance: consumes whole 64-byte blocks of `buf' and ends
 * each block with a write to MT_HSH_STARTSHA.
 */
HASH_NARROW(sha1, SHA)

/*
 * void octcrypto_hash_sha256(const void *src, size_t size)
 *
 * HASH_NARROW instance: consumes whole 64-byte blocks of `src' and ends
 * each block with a write to MT_HSH_STARTSHA256.
 */
HASH_NARROW(sha256, SHA256)

/*
 * void octcrypto_hash_sha512(const void *src, size_t size)
 *
 * HASH_WIDE instance: consumes whole 128-byte blocks of `src' and ends
 * each block with a write to MT_HSH_STARTSHA512.
 */
HASH_WIDE(sha512, SHA512)

/*
 * void octcrypto_hash_set_ivn(const uint64_t *iv)
 *
 * Load the narrow hash IV: iv[0..3] are read as aligned 64-bit words and
 * written, in ascending order, to MT_HSH_IV+0..3.
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_hash_set_ivn, 0)
        /* Words 0 and 1. */
        ld      t0, (a0)
        ld      t1, 8(a0)
        dmtc2   t0, MT_HSH_IV
        dmtc2   t1, MT_HSH_IV+1
        /* Words 2 and 3; the final write sits in the return delay slot. */
        ld      t0, 16(a0)
        ld      t1, 24(a0)
        dmtc2   t0, MT_HSH_IV+2
        jr      ra
         dmtc2  t1, MT_HSH_IV+3
END(octcrypto_hash_set_ivn)

/*
 * void octcrypto_hash_set_ivw(const uint64_t *iv)
 *
 * Load the wide hash IV: iv[0..7] are read as aligned 64-bit words and
 * written, in ascending order, to MT_HSH_IVW+0..7.
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_hash_set_ivw, 0)
        /* Words 0 and 1. */
        ld      t0, (a0)
        ld      t1, 8(a0)
        dmtc2   t0, MT_HSH_IVW
        dmtc2   t1, MT_HSH_IVW+1
        /* Words 2 and 3. */
        ld      t0, 16(a0)
        ld      t1, 24(a0)
        dmtc2   t0, MT_HSH_IVW+2
        dmtc2   t1, MT_HSH_IVW+3
        /* Words 4 and 5. */
        ld      t0, 32(a0)
        ld      t1, 40(a0)
        dmtc2   t0, MT_HSH_IVW+4
        dmtc2   t1, MT_HSH_IVW+5
        /* Words 6 and 7; the final write sits in the return delay slot. */
        ld      t0, 48(a0)
        ld      t1, 56(a0)
        dmtc2   t0, MT_HSH_IVW+6
        jr      ra
         dmtc2  t1, MT_HSH_IVW+7
END(octcrypto_hash_set_ivw)

/*
 * void octcrypto_hash_get_ivn(uint64_t *iv)
 *
 * Read the narrow hash IV: MF_HSH_IV+0..3 are read in ascending order and
 * stored as aligned 64-bit words to iv[0..3].
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_hash_get_ivn, 0)
        /* Words 0 and 1. */
        dmfc2   t0, MF_HSH_IV
        dmfc2   t1, MF_HSH_IV+1
        sd      t0, (a0)
        sd      t1, 8(a0)
        /* Words 2 and 3; the final store sits in the return delay slot. */
        dmfc2   t0, MF_HSH_IV+2
        dmfc2   t1, MF_HSH_IV+3
        sd      t0, 16(a0)
        jr      ra
         sd     t1, 24(a0)
END(octcrypto_hash_get_ivn)

/*
 * void octcrypto_hash_get_ivw(uint64_t *iv)
 *
 * Read the wide hash IV: MF_HSH_IVW+0..7 are read in ascending order and
 * stored as aligned 64-bit words to iv[0..7].
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_hash_get_ivw, 0)
        /* Words 0 and 1. */
        dmfc2   t0, MF_HSH_IVW
        dmfc2   t1, MF_HSH_IVW+1
        sd      t0, (a0)
        sd      t1, 8(a0)
        /* Words 2 and 3. */
        dmfc2   t0, MF_HSH_IVW+2
        dmfc2   t1, MF_HSH_IVW+3
        sd      t0, 16(a0)
        sd      t1, 24(a0)
        /* Words 4 and 5. */
        dmfc2   t0, MF_HSH_IVW+4
        dmfc2   t1, MF_HSH_IVW+5
        sd      t0, 32(a0)
        sd      t1, 40(a0)
        /* Words 6 and 7; the final store sits in the return delay slot. */
        dmfc2   t0, MF_HSH_IVW+6
        dmfc2   t1, MF_HSH_IVW+7
        sd      t0, 48(a0)
        jr      ra
         sd     t1, 56(a0)
END(octcrypto_hash_get_ivw)

/*
 * void octcrypto_hash_clearn(void)
 *
 * Clear the narrow hash state: writes zero to MT_HSH_IV+0..3 and to
 * MT_HSH_DAT+0..6.
 *
 * NOTE(review): there is no write to an eighth data register; in this file
 * the eighth data word is only ever supplied through an MT_HSH_START*
 * write, which presumably would also start an operation -- confirm that no
 * input data remains in the unit after this call.
 */
LEAF(octcrypto_hash_clearn, 0)
        dmtc2   zero, MT_HSH_IV
        dmtc2   zero, MT_HSH_IV+1
        dmtc2   zero, MT_HSH_IV+2
        dmtc2   zero, MT_HSH_IV+3
        dmtc2   zero, MT_HSH_DAT
        dmtc2   zero, MT_HSH_DAT+1
        dmtc2   zero, MT_HSH_DAT+2
        dmtc2   zero, MT_HSH_DAT+3
        dmtc2   zero, MT_HSH_DAT+4
        dmtc2   zero, MT_HSH_DAT+5
        jr      ra
         dmtc2  zero, MT_HSH_DAT+6
END(octcrypto_hash_clearn)

/*
 * void octcrypto_hash_clearw(void)
 *
 * Clear the wide hash state: writes zero to MT_HSH_IVW+0..7 and to
 * MT_HSH_DATW+0..14.
 *
 * NOTE(review): there is no write to a sixteenth data register; in this
 * file the sixteenth data word is only ever supplied through the
 * MT_HSH_STARTSHA512 write, which presumably would also start an
 * operation -- confirm that no input data remains in the unit after this
 * call.
 */
LEAF(octcrypto_hash_clearw, 0)
        dmtc2   zero, MT_HSH_IVW
        dmtc2   zero, MT_HSH_IVW+1
        dmtc2   zero, MT_HSH_IVW+2
        dmtc2   zero, MT_HSH_IVW+3
        dmtc2   zero, MT_HSH_IVW+4
        dmtc2   zero, MT_HSH_IVW+5
        dmtc2   zero, MT_HSH_IVW+6
        dmtc2   zero, MT_HSH_IVW+7
        dmtc2   zero, MT_HSH_DATW
        dmtc2   zero, MT_HSH_DATW+1
        dmtc2   zero, MT_HSH_DATW+2
        dmtc2   zero, MT_HSH_DATW+3
        dmtc2   zero, MT_HSH_DATW+4
        dmtc2   zero, MT_HSH_DATW+5
        dmtc2   zero, MT_HSH_DATW+6
        dmtc2   zero, MT_HSH_DATW+7
        dmtc2   zero, MT_HSH_DATW+8
        dmtc2   zero, MT_HSH_DATW+9
        dmtc2   zero, MT_HSH_DATW+10
        dmtc2   zero, MT_HSH_DATW+11
        dmtc2   zero, MT_HSH_DATW+12
        dmtc2   zero, MT_HSH_DATW+13
        jr      ra
         dmtc2  zero, MT_HSH_DATW+14
END(octcrypto_hash_clearw)

/*
 * void octcrypto_ghash_init(const uint64_t *key, const uint64_t *state)
 *
 * Prepare the Galois field multiplier unit for GHASH: program the
 * GF(2^128) polynomial, load the 128-bit hash key from `key', and seed the
 * running state.  When `state' is NULL the state is set to zero; otherwise
 * it is loaded from the two 64-bit words at `state'.
 *
 * Modifies: t0-t3.
 */
LEAF(octcrypto_ghash_init, 0)
        /* Polynomial. */
        li      t2, GF128_POLY
        dmtc2   t2, MT_GFM_POLY
        /* Hash key / multiplier. */
        ld      t2, (a0)
        ld      t3, 8(a0)
        dmtc2   t2, MT_GFM_MUL
        dmtc2   t3, MT_GFM_MUL+1
        /* Start from an all-zero state; replace it if one was supplied. */
        move    t0, zero
        beq     a1, zero, 1f
         move   t1, zero
        ld      t0, (a1)
        ld      t1, 8(a1)
1:
        dmtc2   t0, MT_GFM_RESINP
        jr      ra
         dmtc2  t1, MT_GFM_RESINP+1
END(octcrypto_ghash_init)

/*
 * void octcrypto_ghash_finish(uint64_t *x)
 *
 * Store the current GHASH state into buffer `x',
 * and clear the GFM unit's state.
 *
 * The two state words are read from MF_GFM_RESINP+0..1 and stored to
 * x[0..1] as aligned 64-bit words.  Afterwards both the state registers
 * (MT_GFM_RESINP+0..1) and the hash key registers (MT_GFM_MUL+0..1) are
 * zeroed.  MT_GFM_POLY is not written by this routine.
 *
 * Modifies: t0, t1.
 */
LEAF(octcrypto_ghash_finish, 0)
        dmfc2   t0, MF_GFM_RESINP
        dmfc2   t1, MF_GFM_RESINP+1
        sd      t0, (a0)
        sd      t1, 8(a0)
        dmtc2   zero, MT_GFM_RESINP
        dmtc2   zero, MT_GFM_RESINP+1
        dmtc2   zero, MT_GFM_MUL
        jr      ra
         dmtc2  zero, MT_GFM_MUL+1
END(octcrypto_ghash_finish)

/*
 * void octcrypto_ghash_update(const void *buf, size_t len)
 *
 * Update the GHASH state.
 *
 * For each X_i in X_0 || X_1 || ... || X_{n-1} = buf:
 *     Y := (Y ^ X_i) * H
 *
 * Consumes floor(len / 16) blocks of 16 bytes.  Returns immediately when
 * len < 16; a trailing partial block (len % 16 bytes) is not consumed.
 * Each block is passed to the unit as two 64-bit words: the first through
 * MT_GFM_XOR0 and the second through MT_GFM_XORMUL1, the latter from the
 * loop branch's delay slot.
 *
 * `buf' may have any alignment: an 8-byte aligned buffer uses ld, anything
 * else uses the LDHI/LDLO unaligned load pairs.  Nothing is stored to
 * memory; the state stays in the unit (see octcrypto_ghash_finish).
 *
 * Modifies: a0 (advanced past the consumed data), a3 (end pointer),
 * t0-t2, and the assembler temporary used by the bltu macro.
 */
LEAF(octcrypto_ghash_update, 0)
        bltu    a1, 16, 3f
         and    t2, a1, 15
        dsubu   a3, a1, t2
        and     t0, a0, 7               /* Determine alignment. */
        bne     t0, zero, 2f
         daddu  a3, a3, a0              /* Compute buffer end. */
1:
        /* Aligned buffer */
        ld      t0, (a0)
        ld      t1, 8(a0)
        daddu   a0, 16
        dmtc2   t0, MT_GFM_XOR0
        bltu    a0, a3, 1b
         dmtc2  t1, MT_GFM_XORMUL1
        b       3f
         nop
2:
        /* Unaligned buffer */
        LDHI    t0, (a0)
        LDLO    t0, 7(a0)
        LDHI    t1, 8(a0)
        LDLO    t1, 15(a0)
        daddu   a0, 16
        dmtc2   t0, MT_GFM_XOR0
        bltu    a0, a3, 2b
         dmtc2  t1, MT_GFM_XORMUL1
3:
        jr      ra
         nop
END(octcrypto_ghash_update)