root/lib/crypto/powerpc/sha1-spe-asm.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

/*
 * Register aliases used throughout this file.
 *
 * r3..r5 are set up by the caller. r5 is first moved to CTR as the loop
 * counter and afterwards reused as pointer to the round constants (rKP).
 * r14..r23 are saved by INITIALIZE and restored by FINALIZE as full 64 bit
 * registers; all other aliases are used without being saved.
 */
#define rHP     r3      /* pointer to hash value                        */
#define rWP     r4      /* pointer to input                             */
#define rKP     r5      /* pointer to constants (block count on entry)  */

#define rW0     r14     /* 64 bit round words                           */
#define rW1     r15
#define rW2     r16
#define rW3     r17
#define rW4     r18
#define rW5     r19
#define rW6     r20
#define rW7     r21

#define rH0     r6      /* 32 bit hash values                           */
#define rH1     r7
#define rH2     r8
#define rH3     r9
#define rH4     r10

#define rT0     r22     /* 64 bit temporary                             */
#define rT1     r0      /* 32 bit temporaries                           */
#define rT2     r11
#define rT3     r12

#define rK      r23     /* 64 bit constant in non volatile register     */

/*
 * Round constant selection. The round macros paste their k argument into
 * LOAD_K##k##1:
 *   k = 0     expands to nothing, rK keeps its current value
 *   k = 1..4  load the 32 bit word at byte offset 0/4/8/12 from rKP into
 *             both halves of rK (evlwwsplat)
 */
#define LOAD_K01

#define LOAD_K11 \
        evlwwsplat      rK,0(rKP);

#define LOAD_K21 \
        evlwwsplat      rK,4(rKP);

#define LOAD_K31 \
        evlwwsplat      rK,8(rKP);

#define LOAD_K41 \
        evlwwsplat      rK,12(rKP);

/*
 * Function prologue: allocate a 128 byte stack frame and store r14..r23 as
 * 64 bit values in ten 8 byte slots at offsets 8..80 from the new stack
 * pointer. FINALIZE is the matching epilogue and uses the same offsets.
 */
#define INITIALIZE \
        stwu            r1,-128(r1);    /* create stack frame           */ \
        evstdw          r14,8(r1);      /* We must save non volatile    */ \
        evstdw          r15,16(r1);     /* registers. Take the chance   */ \
        evstdw          r16,24(r1);     /* and save the SPE part too    */ \
        evstdw          r17,32(r1);                                        \
        evstdw          r18,40(r1);                                        \
        evstdw          r19,48(r1);                                        \
        evstdw          r20,56(r1);                                        \
        evstdw          r21,64(r1);                                        \
        evstdw          r22,72(r1);                                        \
        evstdw          r23,80(r1);


/*
 * Function epilogue: reload r14..r23 from the slots written by INITIALIZE,
 * then overwrite the first 32 bit word of each 8 byte slot with zero and
 * release the 128 byte frame. r0 is zeroed and left clobbered.
 *
 * Only one word per slot is cleared. According to the comments below that
 * word is the saved upper (SPE) half; the other word, holding the lower half
 * of the GPR, is deliberately left in place.
 */
#define FINALIZE \
        evldw           r14,8(r1);      /* restore SPE registers        */ \
        evldw           r15,16(r1);                                        \
        evldw           r16,24(r1);                                        \
        evldw           r17,32(r1);                                        \
        evldw           r18,40(r1);                                        \
        evldw           r19,48(r1);                                        \
        evldw           r20,56(r1);                                        \
        evldw           r21,64(r1);                                        \
        evldw           r22,72(r1);                                        \
        evldw           r23,80(r1);                                        \
        xor             r0,r0,r0;                                          \
        stw             r0,8(r1);       /* Delete sensitive data        */ \
        stw             r0,16(r1);      /* that we might have pushed    */ \
        stw             r0,24(r1);      /* from other context that runs */ \
        stw             r0,32(r1);      /* the same code. Assume that   */ \
        stw             r0,40(r1);      /* the lower part of the GPRs   */ \
        stw             r0,48(r1);      /* were already overwritten on  */ \
        stw             r0,56(r1);      /* the way down to here         */ \
        stw             r0,64(r1);                                         \
        stw             r0,72(r1);                                         \
        stw             r0,80(r1);                                         \
        addi            r1,r1,128;      /* cleanup stack frame          */

/*
 * Input access helpers.
 *
 * LOAD_DATA(reg, off) loads one 32 bit message word into reg.
 * NEXT_BLOCK advances rWP to the next 64 byte block where needed.
 *
 * Big endian:    plain load at byte offset off from rWP; rWP stays fixed
 *                within a block and NEXT_BLOCK adds 64.
 * Little endian: byte reversed load at rWP followed by rWP += 4. The off
 *                argument is not used and NEXT_BLOCK is empty, because the
 *                16 loads of a block already advanced rWP by 64.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
        lwz             reg,off(rWP);   /* load data                    */
#define NEXT_BLOCK \
        addi            rWP,rWP,64;     /* increment per block          */
#else
#define LOAD_DATA(reg, off) \
        lwbrx           reg,0,rWP;      /* load data                    */ \
        addi            rWP,rWP,4;      /* increment per word           */
#define NEXT_BLOCK                      /* nothing to do                */
#endif

/*
 * Two interleaved rounds out of rounds 0..15, message words taken directly
 * from the input.
 *
 * a..e    state registers in role order A..E for the first round. The second
 *         round uses the same registers with roles shifted by one, i.e.
 *         (e, a, b, c, d) act as (A, B, C, D, E).
 * w0      receives the first message word (LOAD_DATA at off)
 * w1      receives the second message word (LOAD_DATA at off+4); the final
 *         evmergelo leaves the second word in its upper half and the first
 *         word in its lower half for later use by the message schedule
 * k       constant selector, pasted into LOAD_K##k##1
 * off     byte offset of the first word within the block
 *
 * Round 1: e += rotl(a,5) + ((b & c) | (~b & d)) + W + K;  b = rotl(b,30)
 * Round 2: d += rotl(e,5) + ((a & b) | (~a & c)) + W' + K; a = rotl(a,30)
 *
 * The "1:" / "2:" prefixes in the line comments name the round an
 * instruction belongs to; the register roles in those comments are the
 * roles of that round. Clobbers rT0, rT1, rT2.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
        LOAD_DATA(w0, off)              /* 1: W                         */ \
        and             rT2,b,c;        /* 1: F' = B and C              */ \
        LOAD_K##k##1                                                       \
        andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
        rotrwi          rT0,a,27;       /* 1: A' = A rotl 5             */ \
        or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
        add             e,e,rT0;        /* 1: E = E + A'                */ \
        rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
        add             e,e,w0;         /* 1: E = E + W                 */ \
        LOAD_DATA(w1, off+4)            /* 2: W                         */ \
        add             e,e,rT2;        /* 1: E = E + F                 */ \
        and             rT1,a,b;        /* 2: F' = B and C              */ \
        add             e,e,rK;         /* 1: E = E + K                 */ \
        andc            rT2,c,a;        /* 2: F" = ~B and D             */ \
        add             d,d,rK;         /* 2: E = E + K                 */ \
        or              rT2,rT2,rT1;    /* 2: F = F' or F"              */ \
        rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
        add             d,d,w1;         /* 2: E = E + W                 */ \
        rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
        add             d,d,rT0;        /* 2: E = E + A'                */ \
        evmergelo       w1,w1,w0;       /*    mix W[0]/W[1]             */ \
        add             d,d,rT2         /* 2: E = E + F                 */

/*
 * Two interleaved rounds out of rounds 16..19. Same round function as
 * R_00_15, F = (B and C) or (~B and D), but the two message words are
 * computed with 64 bit SPE operations instead of being loaded.
 *
 * a..e    state registers, roles as in R_00_15
 * w0      in: word pair W[-16]/W[-15], out: the two new message words
 * w1      word pair holding W[-14]
 * w4      word pair holding W[-8]
 * w6, w7  the two most recent word pairs; evmergelohi combines the lower
 *         half of w7 with the upper half of w6 to form the W[-3] operand
 * k       constant selector, pasted into LOAD_K##k##1
 *
 * Both new words get K added at once (evaddw). The first round consumes the
 * lower half of that sum, evmergehi moves the upper half into the lower
 * half of rT1 for the second round.
 *
 * LOAD_K##k##1 sits after the evaddw, so a constant selected here is first
 * used by the next macro invocation. Clobbers rT0, rT1, rT2.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
        and             rT2,b,c;        /* 1: F' = B and C              */ \
        evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
        andc            rT1,d,b;        /* 1: F" = ~B and D             */ \
        evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
        or              rT1,rT1,rT2;    /* 1: F = F' or F"              */ \
        evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
        add             e,e,rT1;        /* 1: E = E + F                 */ \
        evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
        rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
        evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
        add             e,e,rT2;        /* 1: E = E + A'                */ \
        evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
        rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
        LOAD_K##k##1                                                       \
        evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
        add             e,e,rT0;        /* 1: E = E + WK                */ \
        add             d,d,rT1;        /* 2: E = E + WK                */ \
        and             rT2,a,b;        /* 2: F' = B and C              */ \
        andc            rT1,c,a;        /* 2: F" = ~B and D             */ \
        rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
        or              rT1,rT1,rT2;    /* 2: F = F' or F"              */ \
        add             d,d,rT0;        /* 2: E = E + A'                */ \
        rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
        add             d,d,rT1         /* 2: E = E + F                 */

/*
 * Two interleaved rounds out of rounds 20..39 with F = B xor C xor D.
 * Also used for rounds 60..79 through R_60_79.
 *
 * a..e    state registers in role order A..E for the first round; the
 *         second round uses (e, a, b, c, d) as (A, B, C, D, E)
 * w0      in: word pair W[-16]/W[-15], out: the two new message words
 * w1      word pair holding W[-14]
 * w4      word pair holding W[-8]
 * w6, w7  the two most recent word pairs, merged to form the W[-3] operand
 * k       constant selector, pasted into LOAD_K##k##1
 *
 * The message words are computed and combined with K the same way as in
 * R_16_19. LOAD_K##k##1 follows the evaddw, so a newly selected constant is
 * first used by the next macro invocation. Clobbers rT0, rT1, rT2.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
        evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
        xor             rT2,b,c;        /* 1: F' = B xor C              */ \
        evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
        xor             rT2,rT2,d;      /* 1: F = F' xor D              */ \
        evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
        add             e,e,rT2;        /* 1: E = E + F                 */ \
        evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
        rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
        evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
        add             e,e,rT2;        /* 1: E = E + A'                */ \
        evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
        rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
        LOAD_K##k##1                                                       \
        evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
        add             e,e,rT0;        /* 1: E = E + WK                */ \
        xor             rT2,a,b;        /* 2: F' = B xor C              */ \
        add             d,d,rT1;        /* 2: E = E + WK                */ \
        xor             rT2,rT2,c;      /* 2: F = F' xor D              */ \
        rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
        add             d,d,rT2;        /* 2: E = E + F                 */ \
        rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
        add             d,d,rT0         /* 2: E = E + A'                */

/*
 * Two interleaved rounds out of rounds 40..59 with
 * F = (B and C) or (D and (B or C)).
 *
 * a..e    state registers in role order A..E for the first round; the
 *         second round uses (e, a, b, c, d) as (A, B, C, D, E)
 * w0      in: word pair W[-16]/W[-15], out: the two new message words
 * w1      word pair holding W[-14]
 * w4      word pair holding W[-8]
 * w6, w7  the two most recent word pairs, merged to form the W[-3] operand
 * k       constant selector, pasted into LOAD_K##k##1
 *
 * The message words are computed and combined with K the same way as in
 * R_16_19. In the second round rT0 is reused as a 32 bit temporary for F"
 * once the lower half of WK has been added to e. LOAD_K##k##1 follows the
 * evaddw, so a newly selected constant is first used by the next macro
 * invocation. Clobbers rT0, rT1, rT2.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
        and             rT2,b,c;        /* 1: F' = B and C              */ \
        evmergelohi     rT0,w7,w6;      /*    W[-3]                     */ \
        or              rT1,b,c;        /* 1: F" = B or C               */ \
        evxor           w0,w0,rT0;      /*    W = W[-16] xor W[-3]      */ \
        and             rT1,d,rT1;      /* 1: F" = F" and D             */ \
        evxor           w0,w0,w4;       /*    W = W xor W[-8]           */ \
        or              rT2,rT2,rT1;    /* 1: F = F' or F"              */ \
        evxor           w0,w0,w1;       /*    W = W xor W[-14]          */ \
        add             e,e,rT2;        /* 1: E = E + F                 */ \
        evrlwi          w0,w0,1;        /*    W = W rotl 1              */ \
        rotrwi          rT2,a,27;       /* 1: A' = A rotl 5             */ \
        evaddw          rT0,w0,rK;      /*    WK = W + K                */ \
        add             e,e,rT2;        /* 1: E = E + A'                */ \
        LOAD_K##k##1                                                       \
        evmergehi       rT1,rT1,rT0;    /*    WK1/WK2                   */ \
        rotrwi          b,b,2;          /* 1: B = B rotl 30             */ \
        add             e,e,rT0;        /* 1: E = E + WK                */ \
        and             rT2,a,b;        /* 2: F' = B and C              */ \
        or              rT0,a,b;        /* 2: F" = B or C               */ \
        add             d,d,rT1;        /* 2: E = E + WK                */ \
        and             rT0,c,rT0;      /* 2: F" = F" and D             */ \
        rotrwi          a,a,2;          /* 2: B = B rotl 30             */ \
        or              rT2,rT2,rT0;    /* 2: F = F' or F"              */ \
        rotrwi          rT0,e,27;       /* 2: A' = A rotl 5             */ \
        add             d,d,rT2;        /* 2: E = E + F                 */ \
        add             d,d,rT0         /* 2: E = E + A'                */

/*
 * Rounds 60..79 use the same instruction sequence as rounds 20..39
 * (F = B xor C xor D); only the constant held in rK differs.
 */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
        R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)

/*
 * ppc_spe_sha1_transform - SHA-1 block function, processes a number of
 * consecutive 64 byte input blocks.
 *
 * In:     r3 (rHP)  pointer to the five 32 bit hash words at offsets 0..16;
 *                   they are read and written back once per block
 *         r4 (rWP)  pointer to the input data
 *         r5        number of blocks, moved to CTR; r5 is then reused as
 *                   rKP
 * Out:    hash words at rHP updated in place
 * Clobb:  r0, r4..r12, CTR. r14..r23 are used but preserved through
 *         INITIALIZE/FINALIZE.
 * Stack:  128 byte frame, released before return
 *
 * NOTE(review): the loop body runs before bdnz tests CTR, so a block count
 * of 0 is not handled here - presumably callers always pass at least one
 * block; confirm against the C glue code.
 */
_GLOBAL(ppc_spe_sha1_transform)
        INITIALIZE

        /* load the hash words, set up loop counter and constant pointer */
        lwz             rH0,0(rHP)
        lwz             rH1,4(rHP)
        mtctr           r5
        lwz             rH2,8(rHP)
        lis             rKP,PPC_SPE_SHA1_K@h
        lwz             rH3,12(rHP)
        ori             rKP,rKP,PPC_SPE_SHA1_K@l
        lwz             rH4,16(rHP)

        /*
         * One loop iteration per 64 byte block. Every macro invocation
         * performs two rounds, so the state register roles rotate by two
         * positions from line to line and are back in their initial order
         * after five lines. The first invocation loads the first constant.
         */
ppc_spe_sha1_main:
        R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
        R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
        R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
        R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
        R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
        R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
        R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
        R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

        /* k = 2: second constant is in rK from round 20 on */
        R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
        R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

        /* k = 3: third constant is in rK from round 40 on */
        R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
        R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
        R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
        R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
        R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
        R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
        R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
        R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
        R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
        R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

        /* k = 4: fourth constant is in rK from round 60 on */
        R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
        R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
        R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
        R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
        R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
        R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
        R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
        R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
        R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
        R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

        /*
         * The previous hash words are reloaded in between the last rounds.
         * Each target register (rT3, rW1..rW4) is not referenced by any of
         * the round macros that follow its load.
         */
        R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
        R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
        R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
        R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
        R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
        R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
        R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
        lwz             rT3,0(rHP)
        R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
        lwz             rW1,4(rHP)
        R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
        lwz             rW2,8(rHP)
        R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
        lwz             rW3,12(rHP)
        NEXT_BLOCK
        lwz             rW4,16(rHP)

        /* add the previous hash to the new state and store the result */
        add             rH0,rH0,rT3
        stw             rH0,0(rHP)
        add             rH1,rH1,rW1
        stw             rH1,4(rHP)
        add             rH2,rH2,rW2
        stw             rH2,8(rHP)
        add             rH3,rH3,rW3
        stw             rH3,12(rHP)
        add             rH4,rH4,rW4
        stw             rH4,16(rHP)

        /* rH0..rH4 stay live as input state for the next block */
        bdnz            ppc_spe_sha1_main

        FINALIZE
        blr

/*
 * SHA-1 round constants for rounds 0-19, 20-39, 40-59 and 60-79. They are
 * fetched by LOAD_K11..LOAD_K41 at byte offsets 0, 4, 8 and 12.
 *
 * The table is only ever read, so it is placed in .rodata rather than in
 * the writable .data section.
 */
.section .rodata
.align 4
PPC_SPE_SHA1_K:
        .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6