root/arch/arm64/lib/strcmp.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

/* Build a local label name by pasting the .L prefix onto LABEL.  */
#define L(label) .L ## label

/* The byte values 0x01 and 0x7f replicated into all eight bytes of a
   64-bit word; both are operands of the word-at-a-time NUL test.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Arguments and return value.  Note that result shares x0 with src1, so
   src1 is dead once result has been written.  */
#define src1            x0
#define src2            x1
#define result          x0

/* Working registers.  Several names deliberately share a register:
     diff     and off1 are both x5,
     syndrome and tmp  are both x6.
   Writing one of a pair destroys the other.

   data1/data2  - the words (or, via the w views, bytes) being compared.
   has_nul      - result of the NUL test; non-zero if a NUL byte was found.
   diff         - data1 XOR data2.
   off1         - offset from src1 to the aligned SRC2 word (unaligned loop).
   syndrome     - diff OR'ed with the NUL markers.
   tmp          - short-lived scratch.
   data3        - aligned word of SRC2 used by the misaligned path.
   zeroones     - holds REP8_01.
   shift        - bit shift count.
   off2         - src2 - src1, computed once on entry.  */
#define data1           x2
#define data1w          w2
#define data2           x3
#define data2w          w3
#define has_nul         x4
#define diff            x5
#define off1            x5
#define syndrome        x6
#define tmp             x6
#define data3           x7
#define zeroones        x8
#define shift           x9
#define off2            x10

/* On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.  */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   Since carry propagation makes 0x1 bytes before a NUL byte appear
   NUL too in big-endian, byte-reverse the data before the NUL check.  */


/*
 * __pi_strcmp / strcmp - compare two NUL-terminated strings, eight bytes
 * at a time where possible.
 *
 * In:   src1 (x0), src2 (x1) - addresses of the two strings.
 * Out:  result (x0) - zero if the strings compare equal; otherwise a
 *       non-zero value whose sign gives the order of the first pair of
 *       differing bytes, compared as unsigned values.
 * Uses: x1-x10 and the condition flags as scratch.  The stack pointer is
 *       not referenced.
 *
 * Three paths are chosen on entry from the low three address bits:
 *   - both pointers 8-byte aligned:          L(loop_aligned);
 *   - equal, non-zero offset within a word:  L(mutual_align), which
 *     rounds down and then joins the aligned loop;
 *   - different offsets within a word:       L(misaligned8).
 */
SYM_FUNC_START(__pi_strcmp)
        /* off2 is the distance from SRC1 to SRC2, so [src1, off2] reads the
           SRC2 bytes that correspond to the current SRC1 position.  */
        sub     off2, src2, src1
        mov     zeroones, REP8_01
        and     tmp, src1, 7
        /* If the low three bits of the two pointers differ, take the
           misaligned path.  If they match but are non-zero, realign first.
           Otherwise fall through into the aligned loop.  */
        tst     off2, 7
        b.ne    L(misaligned8)
        cbnz    tmp, L(mutual_align)

        .p2align 4

L(loop_aligned):
        /* Fetch the next eight bytes of each string; src1 is advanced by
           the post-indexed load.  */
        ldr     data2, [src1, off2]
        ldr     data1, [src1], 8
L(start_realigned):
        /* has_nul = (data1 - REP8_01) & ~(data1 | REP8_7f), completed by the
           bics below.  On big-endian the test is applied to a byte-reversed
           copy of data1.  */
#ifdef __AARCH64EB__
        rev     tmp, data1
        sub     has_nul, tmp, zeroones
        orr     tmp, tmp, REP8_7f
#else
        sub     has_nul, data1, zeroones
        orr     tmp, data1, REP8_7f
#endif
        bics    has_nul, has_nul, tmp   /* Non-zero if NUL terminator.  */
        /* If no NUL was found (Z set by bics), compare the two words;
           otherwise force NZCV to 0 so the b.eq below falls through.  */
        ccmp    data1, data2, 0, eq
        b.eq    L(loop_aligned)
        /* Here data1 contains a NUL, or the two words differ, or both.  */
#ifdef __AARCH64EB__
        /* Undo the byte reversal so the markers line up with data1.  */
        rev     has_nul, has_nul
#endif
        /* syndrome is non-zero at every differing bit and at every NUL
           marker.  */
        eor     diff, data1, data2
        orr     syndrome, diff, has_nul
L(end):
        /* Common exit for the word-based paths.  On little-endian,
           byte-reverse so that the earliest string byte becomes the most
           significant one, as it already is on big-endian.  */
#ifndef __AARCH64EB__
        rev     syndrome, syndrome
        rev     data1, data1
        rev     data2, data2
#endif
        clz     shift, syndrome
        /* The most-significant-non-zero bit of the syndrome marks either the
           first bit that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits.  */
        lsl     data1, data1, shift
        lsl     data2, data2, shift
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction.  */
        /* Both operands are reduced to at most eight bits by the right
           shifts of 56 before being subtracted.  */
        lsr     data1, data1, 56
        sub     result, data1, data2, lsr 56
        ret

        .p2align 4

L(mutual_align):
        /* Sources are mutually aligned, but are not currently at an
           alignment boundary.  Round down the addresses and then mask off
           the bytes that precede the start point.  */
        bic     src1, src1, 7
        /* off2 is a multiple of eight on this path, so this load is from the
           rounded-down SRC2 address as well.  */
        ldr     data2, [src1, off2]
        ldr     data1, [src1], 8
        /* src2 is still the caller's pointer and has the same low three
           bits as the original src1.  The shift by register below uses the
           count modulo 64.  */
        neg     shift, src2, lsl 3      /* Bits to alignment -64.  */
        mov     tmp, -1
        LS_FW   tmp, tmp, shift
        /* tmp now has all-ones in the byte positions that precede the start
           of the strings.  Setting those bytes in both words makes them
           equal and non-NUL.  */
        orr     data1, data1, tmp
        orr     data2, data2, tmp
        b       L(start_realigned)

L(misaligned8):
        /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
           checking to make sure that we don't access beyond the end of SRC2.  */
        /* tmp still holds src1 & 7 from the function entry.  */
        cbz     tmp, L(src1_aligned)
L(do_misaligned):
        /* Compare one byte at a time until src1 reaches an 8-byte
           boundary.  Both pointers advance together, so off2 stays valid.  */
        ldrb    data1w, [src1], 1
        ldrb    data2w, [src2], 1
        cmp     data1w, 0
        ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
        /* Leave if the SRC1 byte is NUL or the two bytes differ.  */
        b.ne    L(done)
        tst     src1, 7
        b.ne    L(do_misaligned)

L(src1_aligned):
        /* src1 is 8-byte aligned and src2 is not.  Remember the bit offset
           of src2 within its word (negated) in shift, then load the aligned
           word containing the current SRC2 byte and step src2 to the next
           aligned word.  */
        neg     shift, src2, lsl 3
        bic     src2, src2, 7
        ldr     data3, [src2], 8
#ifdef __AARCH64EB__
        rev     data3, data3
#endif
        /* Set the low bit of every byte of data3 that precedes the current
           SRC2 position, so that those bytes cannot be mistaken for a NUL.  */
        lsr     tmp, zeroones, shift
        orr     data3, data3, tmp
        sub     has_nul, data3, zeroones
        orr     tmp, data3, REP8_7f
        bics    has_nul, has_nul, tmp
        /* A NUL in the remainder of this SRC2 word: finish in L(tail)
           without reading any further SRC2 word.  */
        b.ne    L(tail)

        /* off1 addresses the aligned SRC2 word relative to src1.  It shares
           x5 with diff and is only valid inside the loop below.  */
        sub     off1, src2, src1

        .p2align 4

L(loop_unaligned):
        /* data3: next aligned SRC2 word, used only for NUL detection.
           data2: unaligned eight bytes of SRC2 matching data1.
           data1: aligned eight bytes of SRC1; src1 is post-incremented.  */
        ldr     data3, [src1, off1]
        ldr     data2, [src1, off2]
#ifdef __AARCH64EB__
        rev     data3, data3
#endif
        sub     has_nul, data3, zeroones
        orr     tmp, data3, REP8_7f
        ldr     data1, [src1], 8
        bics    has_nul, has_nul, tmp
        /* Continue while data3 has no NUL and data1 equals data2.  */
        ccmp    data1, data2, 0, eq
        b.eq    L(loop_unaligned)

        /* Keep only the NUL markers for the data3 bytes that are also part
           of data2, moved to the positions those bytes occupy in data2.  */
        lsl     tmp, has_nul, shift
#ifdef __AARCH64EB__
        rev     tmp, tmp
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, tmp
        /* A difference, or a NUL within data2, settles the result now.  */
        cbnz    syndrome, L(end)
        /* Otherwise data1 == data2 and the NUL lies in the part of data3
           that follows the bytes already compared.  */
L(tail):
        /* Final compare against what is left of data3.  src1 points at the
           SRC1 word that corresponds to those remaining bytes.  */
        ldr     data1, [src1]
        neg     shift, shift
        /* Drop the data3 bytes that lie before the current SRC2 position so
           that the remainder, and its NUL markers, line up with data1.  */
        lsr     data2, data3, shift
        lsr     has_nul, has_nul, shift
#ifdef __AARCH64EB__
        rev     data2, data2
        rev     has_nul, has_nul
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, has_nul
        b       L(end)

L(done):
        /* Exit from the byte loop: data1 and data2 hold the two bytes that
           ended the comparison, as loaded by ldrb.  */
        sub     result, data1, data2
        ret
SYM_FUNC_END(__pi_strcmp)
/* NOTE(review): going by the macro names, strcmp is emitted as a weak alias
   of __pi_strcmp and exported; the macros are defined outside this file.  */
SYM_FUNC_ALIAS_WEAK(strcmp, __pi_strcmp)
EXPORT_SYMBOL_NOKASAN(strcmp)