root/lib/libcrypto/sha/sha1_aarch64_ce.S
/* $OpenBSD: sha1_aarch64_ce.S,v 1.5 2026/01/25 08:22:17 jsing Exp $ */
/*
 * Copyright (c) 2023,2025 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * SHA-1 implementation using the ARM Cryptographic Extension (CE).
 *
 * There are six instructions for hardware acceleration of SHA-1 - the
 * documentation for these instructions is woefully inadequate:
 *
 *  sha1c:   hash update (choose)
 *  sha1h:   fixed rotate
 *  sha1m:   hash update (majority)
 *  sha1p:   hash update (parity)
 *  sha1su0: message schedule update, part 1 (xor), for four rounds
 *  sha1su1: message schedule update, part 2 (xor and rotate), for four rounds
 */

/*
 * Register aliases used throughout this file. These are C preprocessor
 * macros, so the aliased names cannot be used for anything else below.
 */

/* Function arguments: context pointer, input pointer and block count. */
#define ctx             x0
#define in              x1
#define num             x2

/* Note: the lower 64 bits of v8 through v15 are callee saved. */

/*
 * Current hash state, loaded from and stored back to the context
 * (hc0 = a:b:c:d, hc1 = e). hc1s is the 32 bit scalar view of hc1.
 */
#define hc0             v16
#define hc1             v17
#define hc1s            s17

/*
 * Working copy of the hash state for the block being processed. The q and s
 * suffixed names are the 128 bit and 32 bit scalar views of the same
 * registers - the round macros form these names via token pasting.
 */
#define hs0             v18
#define hs0q            q18
#define hs1             v19
#define hs1s            s19

/* Message schedule - each register holds four 32 bit words. */
#define w0              v20
#define w1              v21
#define w2              v22
#define w3              v23

/* Round constants - each is replicated across all four 32 bit lanes. */
#define k0              v24
#define k1              v25
#define k2              v26
#define k3              v27

/*
 * Scratch vector registers used by the round macros: tmp0 receives Wt + Kt,
 * tmp1 receives a copy of lane 0 of the hash state for use with sha1h.
 */
#define tmp0            v28
#define tmp1            s29

/* Scratch general purpose register used to build the round constants. */
#define tmp2            w11

/*
 * Advance the message schedule by four words. On entry ma holds the oldest
 * four words (W0:W1:W2:W3), followed by mb (W4:W5:W6:W7), mc (W8:W9:W10:W11)
 * and md (W12:W13:W14:W15). On exit ma has been replaced with the next four
 * words of the schedule.
 *
 * sha1su0 provides the W0 = W8 ^ W2 ^ W0 step, then sha1su1 completes the
 * update with rol(W0 ^ W13, 1).
 */
#define sha1_message_schedule_update(ma, mb, mc, md) \
        sha1su0 ma.4s, mb.4s, mc.4s;    /* ma ^= mb/mc terms */         \
        sha1su1 ma.4s, md.4s            /* fold in md and rotate */

/*
 * Four SHA-1 rounds per macro invocation. Every variant has the same shape:
 *
 *  - tmp0 = Wt + Kt for the four rounds (wt and kt are vector registers),
 *  - tmp1 = lane 0 of the a:b:c:d state, saved before it is overwritten,
 *  - sha1{c,p,m} performs the hash update for the four rounds,
 *  - sha1h (fixed rotate) of the saved value becomes the new e.
 *
 * abcd and ev must be passed as the unsuffixed alias names (for example
 * hs0 and hs1) - the q and s suffixed views are formed by token pasting.
 *
 * Clobbers tmp0 and tmp1.
 */

/* Four rounds using sha1c (choose). */
#define sha1_round1(abcd, ev, wt, kt) \
        add     tmp0.4s, wt.4s, kt.4s;          /* Tt = Wt + Kt */      \
        mov     tmp1, abcd.s[0];                /* save lane 0 */       \
        sha1c   abcd##q, ev##s, tmp0.4s;                                \
        sha1h   ev##s, tmp1

/* Four rounds using sha1p (parity). */
#define sha1_round2(abcd, ev, wt, kt) \
        add     tmp0.4s, wt.4s, kt.4s;          /* Tt = Wt + Kt */      \
        mov     tmp1, abcd.s[0];                /* save lane 0 */       \
        sha1p   abcd##q, ev##s, tmp0.4s;                                \
        sha1h   ev##s, tmp1

/* Four rounds using sha1m (majority). */
#define sha1_round3(abcd, ev, wt, kt) \
        add     tmp0.4s, wt.4s, kt.4s;          /* Tt = Wt + Kt */      \
        mov     tmp1, abcd.s[0];                /* save lane 0 */       \
        sha1m   abcd##q, ev##s, tmp0.4s;                                \
        sha1h   ev##s, tmp1

/* Four rounds using sha1p (parity) - same instructions as sha1_round2. */
#define sha1_round4(abcd, ev, wt, kt) \
        add     tmp0.4s, wt.4s, kt.4s;          /* Tt = Wt + Kt */      \
        mov     tmp1, abcd.s[0];                /* save lane 0 */       \
        sha1p   abcd##q, ev##s, tmp0.4s;                                \
        sha1h   ev##s, tmp1

.arch   armv8-a+sha2

.section .text

/*
 * void sha1_block_ce(SHA_CTX *ctx, const void *in, size_t num);
 *
 * Process num 64 byte blocks of input from in, updating the five 32 bit
 * words of SHA-1 hash state stored at the start of ctx. If num is zero the
 * function returns without touching ctx or in.
 *
 * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num
 *
 * Clobbers x1, x2, w11 and v16 through v29 - none of these are callee saved.
 * The stack is not used.
 */
.globl  sha1_block_ce
.type   sha1_block_ce,@function
sha1_block_ce:

        /*
         * The block loop below tests the count after processing a block, so
         * a count of zero would wrap around and read far beyond the input.
         */
        cbz     num, .Lblock_done

        /*
         * Load SHA-1 round constants - each constant is built in a general
         * purpose register and then replicated across all four lanes.
         */

        /* Round 1 - 0x5a827999 */
        movz    tmp2, #0x5a82, lsl #16
        movk    tmp2, #0x7999
        dup     k0.4s, tmp2

        /* Round 2 - 0x6ed9eba1 */
        movz    tmp2, #0x6ed9, lsl #16
        movk    tmp2, #0xeba1
        dup     k1.4s, tmp2

        /* Round 3 - 0x8f1bbcdc */
        movz    tmp2, #0x8f1b, lsl #16
        movk    tmp2, #0xbcdc
        dup     k2.4s, tmp2

        /* Round 4 - 0xca62c1d6 */
        movz    tmp2, #0xca62, lsl #16
        movk    tmp2, #0xc1d6
        dup     k3.4s, tmp2

        /* Load current hash state from context (hc0 = a:b:c:d, hc1 = e). */
        ld1     {hc0.4s}, [ctx]
        ldr     hc1s, [ctx, #(4*4)]

.Lblock_loop:
        /* Copy current hash state. */
        mov     hs0.16b, hc0.16b
        mov     hs1s, hc1.s[0]

        /* Load and byte swap message schedule (in advances by 64 bytes). */
        ld1     {w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
        rev32   w0.16b, w0.16b
        rev32   w1.16b, w1.16b
        rev32   w2.16b, w2.16b
        rev32   w3.16b, w3.16b

        /* Rounds 0 through 15 (four rounds at a time). */
        sha1_round1(hs0, hs1, w0, k0)
        sha1_round1(hs0, hs1, w1, k0)
        sha1_round1(hs0, hs1, w2, k0)
        sha1_round1(hs0, hs1, w3, k0)

        /*
         * Rounds 16 through 31 (four rounds at a time) - the switch from
         * round1/k0 to round2/k1 occurs at round 20.
         */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round1(hs0, hs1, w0, k0)
        sha1_round2(hs0, hs1, w1, k1)
        sha1_round2(hs0, hs1, w2, k1)
        sha1_round2(hs0, hs1, w3, k1)

        /*
         * Rounds 32 through 47 (four rounds at a time) - the switch from
         * round2/k1 to round3/k2 occurs at round 40.
         */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round2(hs0, hs1, w0, k1)
        sha1_round2(hs0, hs1, w1, k1)
        sha1_round3(hs0, hs1, w2, k2)
        sha1_round3(hs0, hs1, w3, k2)

        /*
         * Rounds 48 through 63 (four rounds at a time) - the switch from
         * round3/k2 to round4/k3 occurs at round 60.
         */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round3(hs0, hs1, w0, k2)
        sha1_round3(hs0, hs1, w1, k2)
        sha1_round3(hs0, hs1, w2, k2)
        sha1_round4(hs0, hs1, w3, k3)

        /* Rounds 64 through 79 (four rounds at a time). */
        sha1_message_schedule_update(w0, w1, w2, w3)
        sha1_message_schedule_update(w1, w2, w3, w0)
        sha1_message_schedule_update(w2, w3, w0, w1)
        sha1_message_schedule_update(w3, w0, w1, w2)

        sha1_round4(hs0, hs1, w0, k3)
        sha1_round4(hs0, hs1, w1, k3)
        sha1_round4(hs0, hs1, w2, k3)
        sha1_round4(hs0, hs1, w3, k3)

        /* Add intermediate state to hash state. */
        add     hc0.4s, hc0.4s, hs0.4s
        add     hc1.4s, hc1.4s, hs1.4s

        /* Continue while blocks remain (num is at least one on loop entry). */
        sub     num, num, #1
        cbnz    num, .Lblock_loop

        /* Store hash state to context. */
        st1     {hc0.4s}, [ctx]
        str     hc1s, [ctx, #(4*4)]

.Lblock_done:
        ret