/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
#define BLOCK_SIZE_BITS                 6
#define BLOCK_SIZE                      (1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK                 (BLOCK_SIZE - 1)

/* Minimum 8-byte alignment, to avoid cache-inhibited alignment faults. */
#ifndef ALIGN_MASK
#define ALIGN_MASK                      0x7
#endif

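/*
 * Copies of at least this many bytes take the three-phase path below;
 * anything shorter stays on the single-phase path.
 */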
#define MULTI_PHASE_THRESHOLD           512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME                         __memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME                         __bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * r3: dst
 * r4: src
 * r5: len
 */

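/*
 * If src and dst share the same alignment mod 8, copies shorter than
 * MULTI_PHASE_THRESHOLD take the single-phase path and longer ones the
 * three-phase path; otherwise a byte-at-a-time loop is used.  Every
 * path compares src with dst and copies backward when dst > src, so
 * overlapping regions are handled.
 */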
ENTRY(FN_NAME)
        cmpld   %r3, %r4                /* src == dst? nothing to do */
        beqlr-
        cmpdi   %r5, 0                  /* len == 0? nothing to do */
        beqlr-

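/*
 * No stack frame is set up: scratch values live at negative offsets
 * from %r1, within the 288 bytes below the stack pointer that the
 * 64-bit ELF ABIs leave as volatile storage for leaf code.
 */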
#ifdef MEMMOVE
        std     %r3, -8(%r1)            /* save dst */
#else   /* bcopy: swap src/dst */
        mr      %r0, %r3
        mr      %r3, %r4
        mr      %r4, %r0
#endif

        /*
         * Check relative alignment first: if src and dst are misaligned
         * with respect to each other, copy one byte at a time.
         */
        andi.   %r8, %r3, ALIGN_MASK
        andi.   %r7, %r4, ALIGN_MASK
        cmpd    %r7, %r8
        bne     .Lunaligned

        cmpldi  %r5, MULTI_PHASE_THRESHOLD      /* len >= threshold? */
        bge     .Lmulti_phase
        b       .Lfast_copy

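/*
 * Doubleword accesses here could take alignment faults on
 * cache-inhibited memory, so copy one byte per iteration, in the
 * direction that is safe for overlapping buffers.
 */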
.Lunaligned:
        /* forward or backward copy? */
        cmpd    %r4, %r3
        blt     .Lbackward_unaligned

        /* Just need to set up the increment and jump to the copy loop */
        li      %r0, 1
        mtctr   %r5
        b       .Lsingle_1_loop

.Lbackward_unaligned:
        /* advance src and dst to last byte, set decrement and jump to copy */
        add     %r3, %r3, %r5
        addi    %r3, %r3, -1
        add     %r4, %r4, %r5
        addi    %r4, %r4, -1
        li      %r0, -1
        mtctr   %r5
        b       .Lsingle_1_loop

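/*
 * Single-phase copy: advance one byte at a time until src is 16-byte
 * aligned, then move 16 bytes per iteration, finishing the remainder
 * byte by byte.
 */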
.Lfast_copy:
        /* align src */
        cmpd    %r4, %r3                /* forward or backward copy? */
        blt     .Lbackward_align

        .align 5
.Lalign:
        andi.   %r0, %r4, 15
        beq     .Lsingle_copy
        lbz     %r0, 0(%r4)
        addi    %r4, %r4, 1
        stb     %r0, 0(%r3)
        addi    %r3, %r3, 1
        addi    %r5, %r5, -1
        cmpdi   %r5, 0
        beq-    .Ldone
        b       .Lalign

.Lbackward_align:
        /* advance src and dst to end (past last byte) */
        add     %r3, %r3, %r5
        add     %r4, %r4, %r5
        .align 5
.Lbackward_align_loop:
        andi.   %r0, %r4, 15
        beq     .Lbackward_single_copy
        lbzu    %r0, -1(%r4)
        addi    %r5, %r5, -1
        stbu    %r0, -1(%r3)
        cmpdi   %r5, 0
        beq-    .Ldone
        b       .Lbackward_align_loop

.Lsingle_copy:
        /* forward copy */
        li      %r0, 1
        li      %r8, 16
        li      %r9, 0
        b       .Lsingle_phase

.Lbackward_single_copy:
        /* backward copy */
        li      %r0, -1
        li      %r8, -16
        li      %r9, -15
        /* point src and dst to last byte */
        addi    %r3, %r3, -1
        addi    %r4, %r4, -1

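/*
 * Entry contract for the single phase (also entered from phase 3):
 * r0 is the byte step (+/-1), r8 the 16-byte step (+/-16), and r9 the
 * pre/post adjustment that, for backward copies, repoints src/dst at
 * the low byte of each 16-byte chunk.
 */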
.Lsingle_phase:
        srdi.   %r6, %r5, 4             /* number of 16-byte chunks */
        beq     .Lsingle_1

        /* pre-adjustment */
        add     %r3, %r3, %r9
        add     %r4, %r4, %r9

        mtctr   %r6
        .align 5
.Lsingle_16_loop:
        ld      %r6, 0(%r4)
        ld      %r7, 8(%r4)
        add     %r4, %r4, %r8
        std     %r6, 0(%r3)
        std     %r7, 8(%r3)
        add     %r3, %r3, %r8
        bdnz    .Lsingle_16_loop

        /* post-adjustment */
        sub     %r3, %r3, %r9
        sub     %r4, %r4, %r9

.Lsingle_1:
        andi.   %r6, %r5, 0x0f          /* remaining bytes (len % 16) */
        beq     .Ldone                  /* none left? done */

        mtctr   %r6
        .align 5
.Lsingle_1_loop:
        lbz     %r6, 0(%r4)
        add     %r4, %r4, %r0           /* increment */
        stb     %r6, 0(%r3)
        add     %r3, %r3, %r0           /* increment */
        bdnz    .Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
        ld      %r3, -8(%r1)            /* restore dst */
#endif
        blr
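/*
 * Three-phase copy:
 *   phase 1: byte copies until src is 16-byte aligned
 *   phase 2: 64-byte blocks, eight doublewords per iteration
 *   phase 3: leftover bytes, via the single-phase code
 *
 * For example, len = 600 with src % 16 == 5 gives phase 1 = 11 bytes,
 * phase 2 = (600 - 11) >> 6 == 9 blocks (576 bytes), and
 * phase 3 = (600 - 11) & 63 == 13 bytes.  (When src is already 16-byte
 * aligned, phase 1 still copies 16 bytes.)
 */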
.Lmulti_phase:
        /* set up multi-phase copy parameters */

        /* r7 = bytes before the aligned section of the buffer */
        andi.   %r6, %r4, 15
        subfic  %r7, %r6, 16
        /* r8 = bytes in and after the aligned section of the buffer */
        sub     %r8, %r5, %r7
        /* r9 = bytes after the aligned section of the buffer */
        andi.   %r9, %r8, BLOCK_SIZE_MASK
        /* r10 = BLOCKS in the aligned section of the buffer */
        srdi    %r10, %r8, BLOCK_SIZE_BITS

        /* forward or backward copy? */
        cmpd    %r4, %r3
        blt     .Lbackward_multi_copy

        /* set up forward copy parameters */
        std     %r7,  -32(%r1)          /* bytes to copy in phase 1 */
        std     %r10, -40(%r1)          /* BLOCKS to copy in phase 2 */
        std     %r9,  -48(%r1)          /* bytes to copy in phase 3 */

        li      %r0, 1                  /* increment for phases 1 and 3 */
        li      %r5, BLOCK_SIZE         /* increment for phase 2 */

        /* op offsets for phase 2 */
        li      %r7,  0
        li      %r8,  16
        li      %r9,  32
        li      %r10, 48
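        /*
         * Combined with the +8 variants built in phase 2 (r18-r21),
         * these offsets reach all eight doublewords of a 64-byte block.
         */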

        std     %r8, -16(%r1)           /* 16-byte increment (16) */
        std     %r7, -24(%r1)           /* 16-byte pre/post adjustment (0) */

        b       .Lphase1

.Lbackward_multi_copy:
        /*
         * Set up backward copy parameters.  Note the swap relative to
         * the forward case: walking from the end, the unaligned tail
         * bytes (r9) are copied in phase 1 and the head bytes (r7) in
         * phase 3.
         */
        std     %r9,  -32(%r1)          /* bytes to copy in phase 1 */
        std     %r10, -40(%r1)          /* BLOCKS to copy in phase 2 */
        std     %r7,  -48(%r1)          /* bytes to copy in phase 3 */

        li      %r0, -1                 /* increment for phases 1 and 3 */
        add     %r6, %r5, %r0           /* r6 = len - 1 */
        li      %r5, -BLOCK_SIZE        /* increment for phase 2 */
        /* advance src and dst to the last position */
        add     %r3, %r3, %r6
        add     %r4, %r4, %r6

        /* op offsets for phase 2 */
        li      %r7,  -15
        li      %r8,  -31
        li      %r9,  -47
        li      %r10, -63

        add     %r6, %r7, %r0           /* r6 = -16 */
        std     %r6, -16(%r1)           /* 16-byte increment (-16) */
        std     %r7, -24(%r1)           /* 16-byte pre/post adjustment (-15) */

.Lphase1:
        ld      %r6, -32(%r1)           /* bytes to copy in phase 1 */
        cmpldi  %r6, 0                  /* r6 == 0? skip phase 1 */
        beq+    .Lphase2

        mtctr   %r6
        .align 5
.Lphase1_loop:
        lbz     %r6, 0(%r4)
        add     %r4, %r4, %r0           /* phase 1 increment */
        stb     %r6, 0(%r3)
        add     %r3, %r3, %r0           /* phase 1 increment */
        bdnz    .Lphase1_loop

.Lphase2:
        ld      %r6, -40(%r1)           /* BLOCKS to copy in phase 2 */
        cmpldi  %r6, 0                  /* r6 == 0? skip phase 2 */
        beq     .Lphase3

#ifdef FN_PHASE2
FN_PHASE2
#else
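        /*
         * Default phase 2: one 64-byte block per iteration via indexed
         * loads/stores.  r7-r10 hold the offsets set up above, and
         * r18-r21 below get the same offsets plus 8, so the eight
         * ldx/stdx pairs touch every doubleword of the block.
         */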
        /* save registers */
        std     %r14, -56(%r1)
        std     %r15, -64(%r1)
        std     %r16, -72(%r1)
        std     %r17, -80(%r1)
        std     %r18, -88(%r1)
        std     %r19, -96(%r1)
        std     %r20, -104(%r1)
        std     %r21, -112(%r1)

        addi    %r18, %r7, 8
        addi    %r19, %r8, 8
        addi    %r20, %r9, 8
        addi    %r21, %r10, 8

        mtctr   %r6
        .align 5
.Lphase2_loop:
        ldx     %r14, %r7,  %r4
        ldx     %r15, %r18, %r4
        ldx     %r16, %r8,  %r4
        ldx     %r17, %r19, %r4
        stdx    %r14, %r7,  %r3
        stdx    %r15, %r18, %r3
        stdx    %r16, %r8,  %r3
        stdx    %r17, %r19, %r3

        ldx     %r14, %r9,  %r4
        ldx     %r15, %r20, %r4
        ldx     %r16, %r10, %r4
        ldx     %r17, %r21, %r4
        stdx    %r14, %r9,  %r3
        stdx    %r15, %r20, %r3
        stdx    %r16, %r10, %r3
        stdx    %r17, %r21, %r3

        add     %r4, %r4, %r5           /* phase 2 increment */
        add     %r3, %r3, %r5           /* phase 2 increment */

        bdnz    .Lphase2_loop

        /* restore registers */
        ld      %r14, -56(%r1)
        ld      %r15, -64(%r1)
        ld      %r16, -72(%r1)
        ld      %r17, -80(%r1)
        ld      %r18, -88(%r1)
        ld      %r19, -96(%r1)
        ld      %r20, -104(%r1)
        ld      %r21, -112(%r1)
#endif

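/*
 * Phase 3 reuses the single-phase code: r5 takes the leftover byte
 * count, and r8/r9 the saved 16-byte step and adjustment, so forward
 * and backward tails share the same loops.
 */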
.Lphase3:
        /* load registers for transitioning into the single-phase logic */
        ld      %r5, -48(%r1)           /* bytes to copy in phase 3 */
        ld      %r8, -16(%r1)           /* 16-byte increment */
        ld      %r9, -24(%r1)           /* 16-byte pre/post adjustment */
        b       .Lsingle_phase

END(FN_NAME)

        .section .note.GNU-stack,"",%progbits