root/lib/libc/aarch64/string/strlcpy.S
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>

        /*
         * Export the implementation under its public name: strlcpy is
         * declared weak and is set to the same value as __strlcpy, the
         * entry point defined below.
         */
        .weak strlcpy
        .set strlcpy, __strlcpy
        .text

/*
 * size_t __strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * Copy src into dst, writing at most dstsize - 1 characters followed by
 * a NUL terminator, and return the length of src.  If dstsize is 0
 * nothing is written and the call is forwarded to strlen(src).
 *
 * In:    x0 = dst, x1 = src, x2 = dstsize
 * Out:   x0 = length of src
 * Clobb: x1, x2, x4-x12, x16, x17, v1-v3, NZCV
 *
 * Register roles after the prologue:
 *   x2  = characters that still fit into dst (NUL terminator excluded)
 *   x9  = original dst
 *   x10 = src rounded down to 16 bytes, advanced chunk by chunk
 *   x11 = src & 0xf (offset of src within its first aligned chunk)
 *
 * NUL detection: cmeq turns every NUL byte of a 16-byte chunk into 0xff,
 * shrn #4 narrows that to a 64-bit mask holding 4 bits per byte, and
 * rbit + clz + lsr #2 converts the mask into the index of the first
 * NUL (16 if the mask is zero).
 */
ENTRY(__strlcpy)
        subs    x2, x2, #1              // do not count the NUL terminator
        b.lo    .L0                     // dstsize was 0: only compute length

        mov     x9, x0                  // stash copy of dst pointer
        bic     x10, x1, #0xf           // src aligned
        and     x11, x1, #0xf           // src offset

        ldr     q1, [x10]
        cmeq    v1.16b, v1.16b, #0      // NUL found in head?

        mov     x8, #-1                 // fill register with 0xfff..fff
        lsl     x12, x11, #2            // 4 mask bits per byte
        lsl     x8, x8, x12             // mask of bytes in the string

        shrn    v1.8b, v1.8h, #4
        fmov    x5, d1

        ands    x5, x5, x8              // drop matches in front of src
        b.ne    .Lhead_nul

        ldr     q3, [x10, #16]          // load second string chunk
        ldr     q2, [x1]                // load true head
        mov     x8, #32
        sub     x8, x8, x11             // head length + second chunk length

        cmeq    v1.16b, v3.16b, #0      // NUL found in second chunk?

        subs    x2, x2, x8              // room left beyond the second chunk?
        b.ls    .Lhead_buf_end

        /* process second chunk */
        shrn    v1.8b, v1.8h, #4
        fmov    x5, d1
        cbnz    x5, .Lsecond_nul

        /* string didn't end in second chunk and neither did buffer */
        ldr     q1, [x10, #32]          // load next string chunk
        str     q2, [x0]                // deposit head into buffer
        sub     x0, x0, x11             // adjust x0 to correspond to x10
        str     q3, [x0, #16]           // deposit second chunk
        add     x10, x10, #32           // advance src
        add     x0, x0, #32             // advance dst
        subs    x2, x2, #16             // enough left for another round?
        b.ls    1f

        /*
         * main loop unrolled twice
         * On entry to each round q1 holds the chunk at x10, x0 is the
         * matching dst position and x2 is the room left after storing q1.
         */
        .p2align 4
0:
        cmeq    v2.16b, v1.16b, #0      // NUL found in this chunk?
        shrn    v2.8b, v2.8h, #4
        fmov    x5, d2

        cbnz    x5, 3f

        str     q1, [x0]
        ldr     q1, [x10, #16]          // load next chunk

        cmp     x2, #16                 // more than a full chunk left?
        b.ls    2f

        add     x10, x10, #32           // advance pointers
        add     x0, x0, #32

        cmeq    v2.16b, v1.16b, #0      // NUL found in this chunk?
        shrn    v2.8b, v2.8h, #4
        fmov    x5, d2
        cbnz    x5, 4f                  // process chunk if match

        str     q1, [x0, #-16]
        ldr     q1, [x10]               // load next chunk

        subs    x2, x2, #32
        b.hi    0b

1:
        sub     x10, x10, #16           // undo second advancement
        add     x2, x2, #16
        sub     x0, x0, #16

        /*
         * 1--16 bytes left in the buffer but string has not ended yet
         * q1 holds the chunk at x10 + 16, x0 + 16 is its place in dst.
         */
2:
        cmeq    v2.16b, v1.16b, #0      // NUL found in this chunk?
        shrn    v2.8b, v2.8h, #4
        fmov    x4, d2

        mov     x6, #0xf
        mov     x7, x4                  // keep the real NUL mask for later

        lsl     x5, x2, #2              // shift 0xf to the limits position
        lsl     x5, x6, x5
        cmp     x2, #16                 // dont induce match if limit >=16
        csel    x5, x5, xzr, lo
        orr     x8, x4, x5              // treat limit as if terminator present

        rbit    x8, x8                  // simulate x86 tzcnt
        clz     x8, x8                  // index of mismatch
        lsr     x8, x8, #2

        add     x0, x0, x8

        ldr     q1, [x10, x8]           // load tail
        str     q1, [x0]                // store tail
        strb    wzr, [x0, #16]          // NUL terminate

        /* continue to find the end of the string */
        cbnz    x7, 1f                  // NUL already seen in this chunk

        /* we opt for a simpler strlen than the one in libc as the
         * cmeq, shrn approach is faster for shorter strings.
         */
        .p2align 4
0:
        ldr     q1, [x10, #32]
        cmeq    v1.16b, v1.16b, #0      // bytewise compare against NUL
        shrn    v1.8b, v1.8h, #4
        fmov    x7, d1
        cbnz    x7, 2f

        ldr     q1, [x10, #48]
        cmeq    v1.16b, v1.16b, #0      // bytewise compare against NUL
        shrn    v1.8b, v1.8h, #4
        fmov    x7, d1
        add     x10, x10, #32
        cbz     x7, 0b

1:      sub     x10, x10, #16           // matching chunk is 16 bytes back
2:      rbit    x8, x7
        clz     x8, x8                  // index of NUL within the chunk
        lsr     x8, x8, #2

        sub     x10, x10, x1
        add     x0, x10, #32
        add     x0, x0, x8              // x0 = address of NUL - src

        ret

4:
        sub     x10, x10, #16           // undo second advancement
        sub     x0, x0, #16             // undo second advancement

        /* string has ended but buffer has not */
3:
        rbit    x8, x5
        clz     x8, x8                  // index of NUL within the chunk
        lsr     x8, x8, #2

        add     x0, x0, x8              // dst position of the terminator
        add     x10, x10, x8            // src position of the terminator

        ldr     q1, [x10, #-15]         // last 16 bytes, terminator included
        str     q1, [x0, #-15]
        sub     x0, x10, x1             // return the string length

        ret

        /* buffer ends within the head or the second chunk */
.Lhead_buf_end:
        shrn    v1.8b, v1.8h, #4
        fmov    x8, d1

        add     x2, x2, #32             // restore limit, counted from x10

        mov     x7, x8                  // keep second chunk NUL mask

        cmp     x2, #16                 // should we induce a match or not
        b.lo    0f

        rbit    x8, x8
        clz     x8, x8                  // index of NUL in second chunk
        lsr     x8, x8, #2
        add     x8, x8, #16             // make it relative to x10

        cmp     x8, x2
        csel    x8, x8, x2, lo          // copy min(buflen, srclen) bytes
        b       1f
0:
        mov     x8, x2                  // limit lies within the head chunk
1:

        sub     x8, x8, x11             // x8 = number of bytes to copy
        strb    wzr, [x9, x8]           // NUL terminate

        /* continue to find the end of the string */
        cbnz    x7, 1f                  // NUL already seen in second chunk

        /* we opt for a simpler strlen than the one in libc as the
         * cmeq, shrn approach is faster for shorter strings.
         */
        .p2align 4
0:
        ldr     q1, [x10, #32]
        cmeq    v1.16b, v1.16b, #0      // bytewise compare against NUL
        shrn    v1.8b, v1.8h, #4
        fmov    x7, d1
        cbnz    x7, 2f

        ldr     q1, [x10, #48]
        cmeq    v1.16b, v1.16b, #0      // bytewise compare against NUL
        shrn    v1.8b, v1.8h, #4
        fmov    x7, d1
        add     x10, x10, #32
        cbz     x7, 0b

1:      sub     x10, x10, #16           // matching chunk is 16 bytes back
2:      rbit    x6, x7
        clz     x6, x6                  // index of NUL within the chunk
        lsr     x6, x6, #2

        sub     x10, x10, x1
        add     x0, x10, #32
        add     x0, x0, x6              // x0 = address of NUL - src

        add     x4, x9, x8              // dst + cnt
        add     x5, x1, x8              // src + cnt

        b       .L1732

.Lsecond_nul:
        add     x2, x2, x8              // restore limit

        rbit    x8, x5
        clz     x8, x8                  // index of NUL in second chunk
        lsr     x5, x8, #2

        sub     x8, x11, #16
        sub     x0, x5, x8              // string length

        cmp     x0, x2                  // did we match or hit limit first?
        csel    x8, x2, x0, hi

        add     x4, x9, x8              // dst + cnt
        add     x5, x1, x8              // src + cnt

        strb    wzr, [x4]               // NUL terminate

        /*
         * Short copies: x8 = cnt, x1 = src, x9 = dst, x5 = src + cnt,
         * x4 = dst + cnt.  Each size class copies two possibly
         * overlapping pieces, one from the start and one ending at cnt.
         */

        /* copy 16-32 bytes */
.L1732:
        cmp     x8, #16
        b.lo    .L0816
        ldp     x16, x17, [x1]
        ldp     x12, x1, [x5, #-16]     // x1 is no longer needed
        stp     x16, x17, [x9]
        stp     x12, x1, [x4, #-16]
        ret

.Lhead_nul:
        rbit    x8, x5
        clz     x8, x8                  // index of NUL in head chunk
        lsr     x8, x8, #2

        sub     x0, x8, x11             // string length
        cmp     x0, x2                  // did we match or hit limit first?
        csel    x8, x2, x0, hi

        add     x4, x9, x8              // dst + cnt
        add     x5, x1, x8              // src + cnt
        strb    wzr, [x4]               // NUL terminate

        /* Copy 8-15 bytes */
.L0816:
        tbz     x8, #3, .L0407
        ldr     x16, [x1]
        ldr     x17, [x5, #-8]
        str     x16, [x9]
        str     x17, [x4, #-8]
        ret

        /* Copy 4-7 bytes */
        .p2align 4
.L0407:
        cmp     x8, #3
        b.ls    .L0203
        ldr     w16, [x1]
        ldr     w17, [x5, #-4]          // x17, not the platform register x18
        str     w16, [x9]
        str     w17, [x4, #-4]
        ret

        /* Copy 2-3 bytes */
.L0203:
        tbz     x8, 1, .L0001
        ldrh    w16, [x1]
        ldrh    w17, [x5, #-2]
        strh    w16, [x9]
        strh    w17, [x4, #-2]
        ret

        /* Copy 0-1 bytes */
.L0001:
        ldrb    w16, [x1]
        strb    w16, [x9]
        strb    wzr, [x4]               // rewrite NUL in case cnt was 0
        ret

        /* dstsize was 0: nothing may be written, return strlen(src) */
.L0:
        mov     x0, x1
        b       strlen                  // tail call, strlen returns for us
END(__strlcpy)