root/lib/libc/arch/m88k/string/memmove.S
/*
 * Copyright (c) 2025, Miodrag Vallat.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * This tries to be as efficient as possible, by performing 32-bit loads and
 * stores, while keeping the logic deciding what to do as short as possible,
 * in order to penalize as little as possible both small copies and those
 * which, due to alignment constraints, have to be performed byte by byte
 * anyway.
 *
 * The general logic is as follows:
 * - check the low two bits of the difference between src and dst. If bit 0 is
 *   set, perform byte-by-byte copies; if bit 1 is set, perform
 *   halfword-by-halfword copies; otherwise, perform word-by-word copies.
 * - an overlap requiring a backward copy occurs only if
 *   src < dst < src + len, which, in unsigned arithmetic, is equivalent to
 *   (dst - src) < len (the dst == src case is handled separately).
 * - if there is no such overlap, copy forward by jumping into memcpy;
 *   otherwise, copy backwards.
 * - copy the necessary byte and/or halfword to reach copy unit alignment, do
 *   the bulk of the copy, and copy any possibly remaining byte and/or halfword
 *   to complete.
 */

#include "DEFS.h"

        ENTRY(memmove)

        /*
         * memmove(dst, src, len): copy len bytes from src to dst, going
         * backwards (from the end of both areas towards their start) when
         * src < dst < src + len, and handing the work over to memcpy in every
         * other case.
         *
         * Register usage in this routine:
         *   %r2 (SAVE_DST)  dst as received; never written here, so it still
         *                   holds dst when the routine returns through %r1
         *   %r3 (SRC)       src as received, then the descending read pointer
         *   %r4 (LEN)       byte count; decremented by the alignment steps and
         *                   by the byte loop, but not by the word and halfword
         *                   bulk loops
         *   %r5 (UNIT)      result of the overlap compare, then the number of
         *                   words or halfwords left to copy
         *   %r6 (TMP)       dst ^ src until the copy unit has been chosen, then
         *                   the datum being copied
         *   %r7 (TMP2)      dst - src
         *   %r8 (DST)       working copy of dst, the descending write pointer
         *
         * The instruction following a ".n" branch sits in its delay slot and
         * is executed whether or not the branch is taken; such instructions
         * are indented by one extra space.
         */

#define SAVE_DST        %r2     /* never modified */
#define SRC             %r3
#define LEN             %r4
#define UNIT            %r5
#define TMP             %r6
#define TMP2            %r7
#define DST             %r8

        /* len == 0: return at once; TMP = dst ^ src is set in the delay slot */
        bcnd.n  eq0, LEN, .L_done       /* nothing to do! */
         xor    TMP, SAVE_DST, SRC
        /* dst == src: return at once; TMP2 = dst - src is set in the delay slot */
        bcnd.n  eq0, TMP, .L_done       /* nothing to do! */
         subu   TMP2, SAVE_DST, SRC
        /*
         * Unsigned (dst - src) < len, with dst != src already established,
         * means src < dst < src + len: a backward copy is needed.
         * DST receives its working copy of dst in the delay slot.
         */
        cmp     UNIT, TMP2, LEN
        bb1.n   lo, UNIT, .L_overlap
         or     DST, SAVE_DST, %r0

        /* no overlap, delegate operation to memcpy() */
        /*
         * %r2, %r3 and %r4 have not been written at this point, and this is a
         * plain branch rather than a subroutine call, so %r1 is untouched too.
         * NOTE(review): a dst < src copy normally ends up here even when both
         * areas overlap; this presumably relies on memcpy copying in ascending
         * address order - confirm against the memcpy implementation.
         */
#ifdef __PIC__
        br      _HIDDEN(memcpy)#plt
#else
        br      _HIDDEN(memcpy)
#endif

.L_overlap:
        /*
         * Move both pointers one past the end of their area; every copy below
         * decrements the pointers first, then loads and stores at offset 0.
         * The first addu is not in a delay slot despite its indentation (no
         * ".n" branch precedes it); the second one is.
         */
         addu   SRC, SRC, LEN
        /* bit 0 of dst ^ src set: odd relative alignment, bytes only */
        bb1.n   0, TMP, .L_byte
         addu   DST, DST, LEN
        /* bit 1 of dst ^ src set: halfwords are the largest usable unit */
        bb1     1, TMP, .L_half

        /* word copy, backwards */
        /*
         * Bits 0 and 1 of dst ^ src are both clear here, so SRC and DST always
         * share the same two low address bits: aligning one aligns the other.
         */

        /* compute the number of words to copy */
        extu    UNIT, LEN, 0<2>
        /* less than one word in total: let the byte loop do everything */
        bcnd    eq0, UNIT, .L_byte

        /* align to word boundary */
        /* odd end address: copy one trailing byte */
        bb0     0, SRC, .L_word_maybe_half
        subu    SRC, SRC, 1
        subu    DST, DST, 1
        subu    LEN, LEN, 1
        ld.b    TMP, SRC, 0
        st.b    TMP, DST, 0
.L_word_maybe_half:
        /* end address now even; if bit 1 is set, copy one trailing halfword */
        bb0     1, SRC, .L_word_copy
        subu    SRC, SRC, 2
        subu    DST, DST, 2
        subu    LEN, LEN, 2
        ld.h    TMP, SRC, 0
        st.h    TMP, DST, 0

.L_word_copy:
        /* worst case of 4 <= initial len < 7 and src and dst not aligned */
        /* in this case len may be < 4 at this point */
        /* recompute the word count from the updated LEN */
        extu    UNIT, LEN, 0<2>
        bcnd    eq0, UNIT, .L_byte

.L_word_loop:
        /* UNIT = words still to copy; test at the top, loop back at the bottom */
        bcnd    eq0, UNIT, .L_word_done
        subu    SRC, SRC, 4
        subu    DST, DST, 4
        subu    UNIT, UNIT, 1
        ld      TMP, SRC, 0
        /* the store is done in the delay slot of the loop-back branch */
        br.n    .L_word_loop
         st     TMP, DST, 0
.L_word_done:
        /*
         * LEN was not updated by the loop above, but only its two low bits are
         * looked at from here on: they give the 0 to 3 bytes left below the
         * last word copied.
         */
        bb0     1, LEN, .L_word_maybe_final_byte
        subu    SRC, SRC, 2
        subu    DST, DST, 2
        ld.h    TMP, SRC, 0
        st.h    TMP, DST, 0
.L_word_maybe_final_byte:
        bb0     0, LEN, .L_done
        subu    SRC, SRC, 1
        subu    DST, DST, 1
        ld.b    TMP, SRC, 0
        /* return to the caller; the last store is done in the delay slot */
        jmp.n   %r1
         st.b   TMP, DST, 0
        
        /* halfword copy, backwards */
        /*
         * Bit 0 of dst ^ src is clear here, so SRC and DST are either both
         * even or both odd.
         */

.L_half:
        /* compute the number of halfwords to copy */
        extu    UNIT, LEN, 0<1>
        /* less than one halfword in total: let the byte loop do everything */
        bcnd    eq0, UNIT, .L_byte

        /* align to halfword boundary */
        /* odd end address: copy one trailing byte */
        bb0     0, SRC, .L_half_copy
        subu    SRC, SRC, 1
        subu    DST, DST, 1
        subu    LEN, LEN, 1
        ld.b    TMP, SRC, 0
        st.b    TMP, DST, 0

        /* worst case of initial len == 2 and src and dst not aligned */
        /* in this case len may be < 2 at this point */
        /* recompute the halfword count from the updated LEN */
        extu    UNIT, LEN, 0<1>
        bcnd    eq0, UNIT, .L_byte

.L_half_copy:
.L_half_loop:
        /* UNIT = halfwords still to copy */
        bcnd    eq0, UNIT, .L_half_done
        subu    SRC, SRC, 2
        subu    DST, DST, 2
        subu    UNIT, UNIT, 1
        ld.h    TMP, SRC, 0
        /* the store is done in the delay slot of the loop-back branch */
        br.n    .L_half_loop
         st.h   TMP, DST, 0
.L_half_done:
        /* LEN was not updated by the loop; its bit 0 tells if one byte is left */
        bb0     0, LEN, .L_done
        subu    SRC, SRC, 1
        subu    DST, DST, 1
        ld.b    TMP, SRC, 0
        /* return to the caller; the last store is done in the delay slot */
        jmp.n   %r1
         st.b   TMP, DST, 0

        /* byte copy, backwards */
        /* UNIT plays no part in the byte loop, which counts LEN down to zero */
#undef UNIT
.L_byte:
.L_byte_loop:
        bcnd    eq0, LEN, .L_done
        subu    SRC, SRC, 1
        subu    DST, DST, 1
        subu    LEN, LEN, 1
        ld.b    TMP, SRC, 0
        /* the store is done in the delay slot of the loop-back branch */
        br.n    .L_byte_loop
         st.b   TMP, DST, 0
.L_done:
        /* common exit: %r2 has never been written, so it still holds dst */
        jmp     %r1

        END_STRONG(memmove)