root/lib/libc/arch/m88k/string/memcpy.S
/*
 * Copyright (c) 2025, Miodrag Vallat.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * This tries to be as efficient as possible, by performing 32-bit loads and
 * stores, while keeping the logic deciding what to do as short as possible,
 * in order to penalize as little as possible small copies, and those which,
 * due to alignment constraints, have to be performed byte by byte anyway.
 *
 * The general logic is as follows:
 * - check the low two bits of the exclusive or of src and dst. If bit 0 is
 *   set, perform byte-by-byte copies; if bit 1 is set, perform
 *   halfword-by-halfword copies; otherwise, perform word-by-word copies.
 * - assume no overlap, copy forward.
 * - copy the necessary byte and/or halfword to reach copy unit alignment,
 *   do the bulk of the copy, and copy any possibly remaining byte and/or
 *   halfword to complete.
 */

#include "DEFS.h"

        ENTRY(memcpy)

/*
 * Register roles.  The names below reflect how the code uses the registers:
 * %r2, %r3 and %r4 are read as dst, src and len, and every exit path jumps
 * through %r1.
 */
#define SAVE_DST        %r2     /* never modified */
#define SRC             %r3     /* read pointer, advanced as data is copied */
#define LEN             %r4     /* bytes left; untouched by the word and halfword loops */
#define UNIT            %r5     /* words or halfwords left in the bulk loop */
#define TMP             %r6     /* src ^ dst at first, then the datum being copied */
#define DST             %r8     /* write pointer, working copy of SAVE_DST */

        /*
         * Leave at once if len is zero or if src and dst are the same
         * address.  The instruction following a ".n" branch sits in its
         * delay slot and is executed whether the branch is taken or not,
         * so TMP = src ^ dst and DST = dst get set up along the way.
         */
        bcnd.n  eq0, LEN, .L_done       /* nothing to do! */
         xor    TMP, SAVE_DST, SRC
        bcnd.n  eq0, TMP, .L_done       /* nothing to do! */
         or     DST, SAVE_DST, %r0

        /*
         * Pick the copy unit from the low bits of src ^ dst: if the
         * addresses disagree in bit 0 copy bytes, if they only disagree
         * in bit 1 copy halfwords, otherwise fall through to words.
         */
        bb1     0, TMP, .L_byte
        bb1     1, TMP, .L_half

        /* word copy */

        /* compute the number of words to copy */
        extu    UNIT, LEN, 0<2>
        bcnd    eq0, UNIT, .L_byte      /* less than one word: use the byte loop */

        /* align to word boundary */
        /*
         * src and dst agree in their low two bits on this path, so only
         * src is tested; dst reaches the same alignment at the same time.
         */
        bb0     0, SRC, .L_word_maybe_early_half
        ld.b    TMP, SRC, 0
        addu    SRC, SRC, 1
        st.b    TMP, DST, 0
        subu    LEN, LEN, 1
        addu    DST, DST, 1
.L_word_maybe_early_half:
        /* tests the updated src, i.e. after the byte above (if any) was copied */
        bb0     1, SRC, .L_word_copy
        ld.h    TMP, SRC, 0
        addu    SRC, SRC, 2
        st.h    TMP, DST, 0
        subu    LEN, LEN, 2
        addu    DST, DST, 2

.L_word_copy:
        /* worst case of 4 <= initial len < 7 and src and dst not aligned */
        /* in this case len may be < 4 at this point */
        /*
         * At most 3 bytes were consumed out of at least 4, so len is still
         * nonzero here, which is what the byte loop needs.
         */
        extu    UNIT, LEN, 0<2>
        bcnd    eq0, UNIT, .L_byte

.L_word_loop:
        /*
         * Only UNIT counts down here; LEN keeps its value so that its low
         * two bits can describe the tail once the loop is over.
         */
        ld      TMP, SRC, 0
        subu    UNIT, UNIT, 1
        st      TMP, DST, 0
        addu    SRC, SRC, 4
        bcnd.n  ne0, UNIT, .L_word_loop
         addu   DST, DST, 4             /* delay slot: runs on every iteration */
.L_word_remainder:
        /* bit 1 of len set: one trailing halfword */
        bb0     1, LEN, .L_word_maybe_byte
        ld.h    TMP, SRC, 0
        st.h    TMP, DST, 0
        addu    SRC, SRC, 2
        addu    DST, DST, 2
.L_word_maybe_byte:
        /* bit 0 of len set: one trailing byte */
        bb0     0, LEN, .L_done
        ld.b    TMP, SRC, 0
        jmp.n   %r1
         st.b   TMP, DST, 0             /* last store done in the return delay slot */
        
        /* halfword copy */

.L_half:
        /* compute the number of halfwords to copy */
        extu    UNIT, LEN, 0<1>
        bcnd    eq0, UNIT, .L_byte      /* len is 1: use the byte loop */

        /* align to halfword boundary */
        /* src and dst agree in bit 0 on this path, so testing src is enough */
        bb0     0, SRC, .L_half_copy
        ld.b    TMP, SRC, 0
        addu    SRC, SRC, 1
        st.b    TMP, DST, 0
        subu    LEN, LEN, 1
        addu    DST, DST, 1

        /* worst case of initial len == 2 and src and dst not aligned */
        /* in this case len may be < 2 at this point */
        /* (it is then exactly 1, so the byte loop can be entered safely) */
        extu    UNIT, LEN, 0<1>
        bcnd    eq0, UNIT, .L_byte

.L_half_copy:
.L_half_loop:
        /* as in the word loop, LEN is preserved and only UNIT counts down */
        ld.h    TMP, SRC, 0
        subu    UNIT, UNIT, 1
        st.h    TMP, DST, 0
        addu    SRC, SRC, 2
        bcnd.n  ne0, UNIT, .L_half_loop
         addu   DST, DST, 2             /* delay slot: runs on every iteration */
.L_half_remainder:
        /* bit 0 of len set: one trailing byte */
        bb0     0, LEN, .L_done
        ld.b    TMP, SRC, 0
        jmp.n   %r1
         st.b   TMP, DST, 0             /* last store done in the return delay slot */

        /* byte copy */
        /* UNIT is not needed from here on; LEN itself is the loop counter */
#undef UNIT
.L_byte:
.L_byte_loop:
        /*
         * The count is only tested after a byte has been copied, so this
         * loop must be entered with len != 0; every branch to .L_byte
         * above satisfies this.
         */
        ld.b    TMP, SRC, 0
        subu    LEN, LEN, 1
        st.b    TMP, DST, 0
        addu    SRC, SRC, 1
        bcnd.n  ne0, LEN, .L_byte_loop
         addu   DST, DST, 1             /* delay slot: runs on every iteration */
.L_done:
        /* SAVE_DST (%r2) still holds the dst value received on entry */
        jmp     %r1

        END_STRONG(memcpy)