root/lib/crypto/arm/sha1-armv4-large.S
#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0

@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.

@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl          size in bytes   comp cycles[*]  measured performance
@ ====================================================================
@ thumb         304             3212            4420
@ armv4-small   392/+29%        1958/+64%       2250/+96%
@ armv4-compact 740/+89%        1552/+26%       1840/+22%
@ armv4-large   1420/+92%       1307/+19%       1370/+34%[***]
@ full unroll   ~5100/+260%     ~1260/+4%       ~1300/+5%
@ ====================================================================
@ thumb         = same as 'small' but in Thumb instructions[**] and
@                 with recurring code in two private functions;
@ small         = detached Xload/update, loops are folded;
@ compact       = detached Xload/update, 5x unroll;
@ large         = interleaved Xload/update, 5x unroll;
@ full unroll   = interleaved Xload/update, full unroll, estimated[!];
@
@ [*]   Manually counted instructions in "grand" loop body. Measured
@       performance is affected by prologue and epilogue overhead,
@       i-cache availability, branch penalties, etc.
@ [**]  While each Thumb instruction is half the size, they are not as
@       diverse as ARM ones: e.g., there are only two arithmetic
@       instructions with 3 arguments, no [fixed] rotate, addressing
@       modes are limited. As a result it takes more instructions to do
@       the same job in Thumb, therefore the code is never twice as
@       small and always slower.
@ [***] which is also ~35% better than compiler generated code. Dual-
@       issue Cortex A8 core was measured to process input block in
@       ~990 cycles.

@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.

#include <linux/linkage.h>

.text

.align  2
@ sha1_block_data_order: SHA-1 transform over a run of 64-byte blocks.
@
@ Register use on entry, as read by the code below:
@   r0 = address of five consecutive 32-bit state words; they are loaded
@        into r3-r7 up front and updated in memory after every block.
@   r1 = address of the input data; advanced by 4 for each word consumed.
@   r2 = number of 64-byte blocks; immediately turned into the end
@        address r1 + r2*64 that terminates the outer loop.
@ (presumably these are the first three arguments of a C caller -
@ confirm against the prototype used by the glue code)
@
@ Working registers:
@   r3-r7   A,B,C,D,E. The roles move by one register per round, so each
@           group of five unrolled rounds ends on the starting
@           assignment again. C, D and E are held rotated (see .Lloop).
@   r8      round constant K of the current 20-round stage.
@   r9-r12  scratch; r9 carries X[i] and r10 the value of F.
@   r14     descending pointer into the X[] schedule kept on the stack.
@
@ Stack: r4-r12 and lr are saved on entry; each block additionally uses
@ an 80-word X[] area. sp is lowered in steps and doubles as the
@ sentinel that r14 is compared with to end each stage.
@
@ NOTE(review): the end-of-input test sits at the bottom of .Lloop, so
@ one block is always processed; a zero count in r2 is not handled here.
ENTRY(sha1_block_data_order)
        stmdb   sp!,{r4-r12,lr}
        add     r2,r1,r2,lsl#6  @ r2 = r1 + r2*64: end of the input
        ldmia   r0,{r3,r4,r5,r6,r7}
.Lloop:
        @ Per-block setup. r14 starts at the top of the X[] area and words
        @ are pushed downwards with pre-decrement stores. Lowering sp by
        @ 15 words makes the r14 == sp test fire after 15 rounds.
        ldr     r8,.LK_00_19
        mov     r14,sp
        sub     sp,sp,#15*4
        @ C, D and E are kept rotated right by 30. Every use applies
        @ ror#2, which completes a full 32-bit turn and yields the plain
        @ value. The SHA-1 step C = ROL(B,30) then costs no instruction:
        @ the register leaving the B role is already in rotated form.
        mov     r5,r5,ror#30
        mov     r6,r6,ror#30
        mov     r7,r7,ror#30            @ [6]
.L_00_15:
        @ Rounds 0..14: five unrolled rounds per pass, three passes.
        @ Each round reads one input word as big-endian into r9, stores
        @ it as X[i] and adds K + ROR(A,27) + X[i] + F to E, with
        @ F = ((C ^ D) & B) ^ D.
        @ -- round +0: A=r3 B=r4 C=r5 D=r6 E=r7
#if __ARM_ARCH__<7
        @ Byte loads: r9 = b0<<24 | b1<<16 | b2<<8 | b3, and the last
        @ ldrb post-increments r1 by 4.
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r5,r6                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        eor     r10,r5,r6                       @ F_xx_xx
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r4,r10,ror#2
        add     r7,r7,r9                        @ E+=X[i]
        eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
        @ -- round +1: A=r7 B=r3 C=r4 D=r5 E=r6
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r6,r8,r6,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r4,r5                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r6,r8,r6,ror#2                  @ E+=K_00_19
        eor     r10,r4,r5                       @ F_xx_xx
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r3,r10,ror#2
        add     r6,r6,r9                        @ E+=X[i]
        eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
        @ -- round +2: A=r6 B=r7 C=r3 D=r4 E=r5
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r5,r8,r5,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r3,r4                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r5,r8,r5,ror#2                  @ E+=K_00_19
        eor     r10,r3,r4                       @ F_xx_xx
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r7,r10,ror#2
        add     r5,r5,r9                        @ E+=X[i]
        eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
        @ -- round +3: A=r5 B=r6 C=r7 D=r3 E=r4
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r4,r8,r4,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r7,r3                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r4,r8,r4,ror#2                  @ E+=K_00_19
        eor     r10,r7,r3                       @ F_xx_xx
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r6,r10,ror#2
        add     r4,r4,r9                        @ E+=X[i]
        eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
        @ -- round +4: A=r4 B=r5 C=r6 D=r7 E=r3
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r3,r8,r3,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r6,r7                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r3,r8,r3,ror#2                  @ E+=K_00_19
        eor     r10,r6,r7                       @ F_xx_xx
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r5,r10,ror#2
        add     r3,r3,r9                        @ E+=X[i]
        eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
        cmp     r14,sp
        bne     .L_00_15                @ [((11+4)*5+2)*3]
        @ 15 words stored. Move the sentinel to 40 words below the top of
        @ the X[] area; it ends the first run of .L_20_39_or_60_79.
        sub     sp,sp,#25*4
        @ Round 15: the last round that consumes an input word.
        @ A=r3 B=r4 C=r5 D=r6 E=r7
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r5,r6                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        eor     r10,r5,r6                       @ F_xx_xx
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r4,r10,ror#2
        add     r7,r7,r9                        @ E+=X[i]
        eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
        @ Rounds 16..19: same K and F, but X[i] is now derived from
        @ earlier schedule words:
        @   X[i] = ROL(X[i-16] ^ X[i-14] ^ X[i-8] ^ X[i-3], 1)
        @ With r14 at X[i-1] and the area growing downwards, those words
        @ sit at offsets 15, 13, 7 and 2; ror#31 is the rotate left by 1.
        @ -- round 16: A=r7 B=r3 C=r4 D=r5 E=r6
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r4,r5                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r3,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r6,r6,r9                        @ E+=X[i]
        eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
        add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
        @ -- round 17: A=r6 B=r7 C=r3 D=r4 E=r5
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r3,r4                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r7,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r5,r5,r9                        @ E+=X[i]
        eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
        add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
        @ -- round 18: A=r5 B=r6 C=r7 D=r3 E=r4
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r7,r3                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r6,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r4,r4,r9                        @ E+=X[i]
        eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
        add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
        @ -- round 19: A=r4 B=r5 C=r6 D=r7 E=r3
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r6,r7                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r5,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
        add     r3,r3,r10                       @ E+=F_00_19(B,C,D)

        @ The loop body below serves both rounds 20..39 and 60..79; they
        @ share F = B ^ C ^ D and differ only in the constant in r8. The
        @ carry flag records which stage is running and is tested by the
        @ bcs that follows the loop.
        ldr     r8,.LK_20_39            @ [+15+16*4]
        cmn     sp,#0                   @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
        @ Five unrolled rounds per pass; four passes until r14 == sp.
        @ -- round +0: A=r3 B=r4 C=r5 D=r6 E=r7
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r5,r6                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor r10,r4,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r7,r7,r9                        @ E+=X[i]
        add     r7,r7,r10                       @ E+=F_20_39(B,C,D)
        @ -- round +1: A=r7 B=r3 C=r4 D=r5 E=r6
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r4,r5                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor r10,r3,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r6,r6,r9                        @ E+=X[i]
        add     r6,r6,r10                       @ E+=F_20_39(B,C,D)
        @ -- round +2: A=r6 B=r7 C=r3 D=r4 E=r5
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r3,r4                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor r10,r7,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r5,r5,r9                        @ E+=X[i]
        add     r5,r5,r10                       @ E+=F_20_39(B,C,D)
        @ -- round +3: A=r5 B=r6 C=r7 D=r3 E=r4
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r7,r3                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor r10,r6,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r4,r4,r9                        @ E+=X[i]
        add     r4,r4,r10                       @ E+=F_20_39(B,C,D)
        @ -- round +4: A=r4 B=r5 C=r6 D=r7 E=r3
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r6,r7                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor r10,r5,r10,ror#2                                    @ F_xx_xx
                                                @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        add     r3,r3,r10                       @ E+=F_20_39(B,C,D)
        @ End-of-stage test. teq is used rather than cmp so the carry flag
        @ set up before the loop survives. The Thumb build first copies sp
        @ into r11 (presumably because teq cannot take sp there - confirm).
 ARM(   teq     r14,sp          )       @ preserve carry
 THUMB( mov     r11,sp          )
 THUMB( teq     r14,r11         )       @ preserve carry
        bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
        bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes

        @ Carry clear: rounds 20..39 just finished. Continue with 40..59;
        @ the sentinel moves to 60 words below the top of the X[] area.
        ldr     r8,.LK_40_59
        sub     sp,sp,#20*4             @ [+2]
.L_40_59:
        @ Rounds 40..59, five per pass, four passes.
        @ F = (B & (C ^ D)) + (C & D). The two terms never have a set bit
        @ in common, so the addition cannot carry between bit positions.
        @ r11 is reused for the C & D term once the schedule word is
        @ stored.
        @ -- round +0: A=r3 B=r4 C=r5 D=r6 E=r7
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r5,r6                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r4,r10,ror#2                                    @ F_xx_xx
        and r11,r5,r6                                   @ F_xx_xx
        add     r7,r7,r9                        @ E+=X[i]
        add     r7,r7,r10                       @ E+=F_40_59(B,C,D)
        add     r7,r7,r11,ror#2
        @ -- round +1: A=r7 B=r3 C=r4 D=r5 E=r6
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r4,r5                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r3,r10,ror#2                                    @ F_xx_xx
        and r11,r4,r5                                   @ F_xx_xx
        add     r6,r6,r9                        @ E+=X[i]
        add     r6,r6,r10                       @ E+=F_40_59(B,C,D)
        add     r6,r6,r11,ror#2
        @ -- round +2: A=r6 B=r7 C=r3 D=r4 E=r5
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r3,r4                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r7,r10,ror#2                                    @ F_xx_xx
        and r11,r3,r4                                   @ F_xx_xx
        add     r5,r5,r9                        @ E+=X[i]
        add     r5,r5,r10                       @ E+=F_40_59(B,C,D)
        add     r5,r5,r11,ror#2
        @ -- round +3: A=r5 B=r6 C=r7 D=r3 E=r4
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r7,r3                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r6,r10,ror#2                                    @ F_xx_xx
        and r11,r7,r3                                   @ F_xx_xx
        add     r4,r4,r9                        @ E+=X[i]
        add     r4,r4,r10                       @ E+=F_40_59(B,C,D)
        add     r4,r4,r11,ror#2
        @ -- round +4: A=r4 B=r5 C=r6 D=r7 E=r3
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r6,r7                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and r10,r5,r10,ror#2                                    @ F_xx_xx
        and r11,r6,r7                                   @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        add     r3,r3,r10                       @ E+=F_40_59(B,C,D)
        add     r3,r3,r11,ror#2
        cmp     r14,sp
        bne     .L_40_59                @ [+((12+5)*5+2)*4]

        @ Rounds 60..79 reuse the 20..39 loop with a different K. The
        @ sentinel moves to 80 words below the top of the X[] area and
        @ carry is set so the bcs after that loop exits to .L_done.
        ldr     r8,.LK_60_79
        sub     sp,sp,#20*4
        cmp     sp,#0                   @ set carry to denote 60_79
        b       .L_20_39_or_60_79       @ [+4], spare 300 bytes
.L_done:
        @ Block finished: release the 80-word X[] area, then add the
        @ working values into the state words at r0. r5-r7 (C, D, E) get
        @ ror#2 to undo the rotated representation; r3 and r4 (A, B) are
        @ added as they are.
        add     sp,sp,#80*4             @ "deallocate" stack frame
        ldmia   r0,{r8,r9,r10,r11,r12}
        add     r3,r8,r3
        add     r4,r9,r4
        add     r5,r10,r5,ror#2
        add     r6,r11,r6,ror#2
        add     r7,r12,r7,ror#2
        stmia   r0,{r3,r4,r5,r6,r7}
        teq     r1,r2                   @ input pointer at the end address?
        bne     .Lloop                  @ [+18], total 1307

        @ Restore the registers saved on entry; loading pc returns.
        ldmia   sp!,{r4-r12,pc}
.align  2
        @ Round constants, one per 20-round stage, fetched with
        @ pc-relative ldr from the code above.
.LK_00_19:      .word   0x5a827999
.LK_20_39:      .word   0x6ed9eba1
.LK_40_59:      .word   0x8f1bbcdc
.LK_60_79:      .word   0xca62c1d6
ENDPROC(sha1_block_data_order)
@ Identification string emitted after the routine.
.asciz  "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align  2