root/usr/src/lib/libc/sparcv9/gen/memcpy.S
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

        .file   "memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy len bytes from s2 to s1; exactly len bytes are always copied.
 * Note: this does not work for overlapped copies; bcopy() does.
 *
 * The additional entry point __align_cpy_1 is generally for use by the
 * compilers.
 *
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *      void *
 *      memcpy(void *s, const void *s0, size_t n)
 *      {
 *              if (n != 0) {
 *                      char *s1 = s;
 *                      const char *s2 = s0;
 *                      do {
 *                              *s1++ = *s2++;
 *                      } while (--n != 0);
 *              }
 *              return (s);
 *      }
 */

#include <sys/asm_linkage.h>

        ANSI_PRAGMA_WEAK(memcpy,function)

/*
 * void *memcpy(void *s1, const void *s2, size_t len)
 * (the same code is also entered through the label __align_cpy_1)
 *
 * In:    %o0 = s1 (destination)
 *        %o1 = s2 (source)
 *        %o2 = len (compared as a 64-bit unsigned value via %xcc)
 * Out:   %o0 = s1.  %o0 is never written in this routine, so the
 *        caller gets the original destination pointer back.
 * Uses:  %o1-%o5, %g1, %g5 and the integer condition codes.
 *
 * Leaf routine: it returns with retl and executes no save/restore.
 *
 * Register roles:
 *   %g5  running destination pointer (working copy of %o0)
 *   %o1  source pointer.  Before each copy loop it is replaced by
 *        (source - destination) so that [%o1 + %g5] addresses the
 *        source while only %g5 has to be advanced.
 *   %o2  bytes remaining
 *   %o3  scratch; byte count of whole words inside the word loops
 *   %o4  scratch; most recently loaded source word inside the word loops
 *   %o5  alignment bits / scratch
 *   %g1  word being assembled from two source words before the store
 *
 * Strategy:
 *   len <= 17   plain byte loop (.dbytecp).
 *   len >  17   copy 1-3 bytes so the source is word aligned, then pick
 *               a loop by (destination & 3):
 *                 0  .w4cp  straight word copy
 *                 2  .w2cp  one halfword, then shift/merge by 16 bits
 *                 1  .w1cp  byte + halfword, then shift/merge by 24/8
 *                 3  .w3cp  one byte, then shift/merge by 8/24
 *               Whatever is left over is finished by the byte loop at 7.
 *
 * Invariant behind the threshold: with len >= 18, at most 3 bytes go to
 * source alignment and at most 3 to destination alignment, so every word
 * loop starts with a count of at least 8.  The shifted loops are
 * do-while loops ending in bnz and rely on that count being nonzero.
 *
 * Most branches below are followed by a delay-slot instruction that is
 * executed whether or not the branch is taken; those are marked
 * "(delay slot)".  Several branches test condition codes that were set
 * a few instructions earlier; the intervening ld/st/inc/dec/srl
 * instructions do not modify the condition codes.
 *
 * NOTE(review): ANSI_PRAGMA_WEAK, ENTRY and SET_SIZE are presumably the
 * linkage macros from <sys/asm_linkage.h>; their expansion is not visible
 * in this file.
 */
        ENTRY(memcpy)
        ENTRY(__align_cpy_1)
        mov     %o0, %g5                ! %g5 = working dst pointer; %o0 stays intact as return val
        cmp     %o2, 17                 ! for small counts copy bytes
        bleu,pn %xcc, .dbytecp          ! len <= 17: byte loop only
        andcc   %o1, 3, %o5             ! (delay slot) is src word aligned
        bz,pn   %icc, .aldst            ! yes: go straight to dst alignment
        cmp     %o5, 2                  ! (delay slot) is src half-word aligned
        be,pt   %xcc, .s2algn           ! src & 3 == 2: copy two bytes
        cmp     %o5, 3                  ! (delay slot) src is byte aligned; tested by bne below
.s1algn:ldub    [%o1], %o3              ! move 1 or 3 bytes to align it
        inc     1, %o1
        stb     %o3, [%g5]              ! move a byte to align src
        inc     1, %g5
        bne,pt  %icc, .s2algn           ! src & 3 was 1: two more bytes are needed
        dec     %o2                     ! (delay slot) account for the byte just copied
        b       .ald                    ! now go align dest
        andcc   %g5, 3, %o5             ! (delay slot) dst alignment bits for .ald

.s2algn:lduh    [%o1], %o3              ! know src is 2 byte aligned
        inc     2, %o1
        srl     %o3, 8, %o4             ! %o4 = high byte of the halfword
        stb     %o4, [%g5]              ! have to do bytes,
        stb     %o3, [%g5 + 1]          ! dst alignment is not known here
        inc     2, %g5
        dec     2, %o2

        ! src is word aligned from here on
.aldst: andcc   %g5, 3, %o5             ! align the destination address
.ald:   bz,pn   %icc, .w4cp             ! dst & 3 == 0: plain word copy
        cmp     %o5, 2                  ! (delay slot)
        bz,pn   %icc, .w2cp             ! dst & 3 == 2
        cmp     %o5, 3                  ! (delay slot) tested by bne in .w3cp
        ! dst & 3 is 1 or 3: store the top byte of the first src word
.w3cp:  lduw    [%o1], %o4
        inc     4, %o1
        srl     %o4, 24, %o5            ! most significant byte of the word
        stb     %o5, [%g5]
        bne,pt  %icc, .w1cp             ! dst & 3 was 1: halfword store follows
        inc     %g5                     ! (delay slot)
        dec     1, %o2                  ! dst & 3 was 3: one byte stored, dst now aligned
        andn    %o2, 3, %o3             ! o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %g5, %o1           ! o1 gets the difference

        ! each stored word = 3 leftover bytes of previous src word + 1 new byte
1:      sll     %o4, 8, %g1             ! save residual bytes
        lduw    [%o1+%g5], %o4
        deccc   4, %o3
        srl     %o4, 24, %o5            ! merge with residual
        or      %o5, %g1, %g1
        st      %g1, [%g5]
        bnz,pt  %xcc, 1b                ! loop until the word byte count reaches zero
        inc     4, %g5                  ! (delay slot)
        sub     %o1, 3, %o1             ! used one byte of last word read
        and     %o2, 3, %o2             ! tail = (len & 3) ...
        b       7f
        inc     4, %o2                  ! (delay slot) ... plus the word held back above

        ! dst & 3 was 1: top byte already stored, now store the middle two
.w1cp:  srl     %o4, 8, %o5
        sth     %o5, [%g5]
        inc     2, %g5
        dec     3, %o2                  ! three bytes stored in total (byte + halfword)
        andn    %o2, 3, %o3             ! o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %g5, %o1           ! o1 gets the difference

        ! each stored word = 1 leftover byte of previous src word + 3 new bytes
2:      sll     %o4, 24, %g1            ! save residual bytes
        lduw    [%o1+%g5], %o4
        deccc   4, %o3
        srl     %o4, 8, %o5             ! merge with residual
        or      %o5, %g1, %g1
        st      %g1, [%g5]
        bnz,pt  %xcc, 2b                ! loop until the word byte count reaches zero
        inc     4, %g5                  ! (delay slot)
        sub     %o1, 1, %o1             ! used three bytes of last word read
        and     %o2, 3, %o2             ! tail = (len & 3) ...
        b       7f
        inc     4, %o2                  ! (delay slot) ... plus the word held back above

        ! dst & 3 == 2: store the top halfword of the first src word
.w2cp:  lduw    [%o1], %o4
        inc     4, %o1
        srl     %o4, 16, %o5
        sth     %o5, [%g5]
        inc     2, %g5
        dec     2, %o2
        andn    %o2, 3, %o3             ! o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %g5, %o1           ! o1 gets the difference

        ! each stored word = 2 leftover bytes of previous src word + 2 new bytes
3:      sll     %o4, 16, %g1            ! save residual bytes
        lduw    [%o1+%g5], %o4
        deccc   4, %o3
        srl     %o4, 16, %o5            ! merge with residual
        or      %o5, %g1, %g1
        st      %g1, [%g5]
        bnz,pt  %xcc, 3b                ! loop until the word byte count reaches zero
        inc     4, %g5                  ! (delay slot)
        sub     %o1, 2, %o1             ! used two bytes of last word read
        and     %o2, 3, %o2             ! tail = (len & 3) ...
        b       7f
        inc     4, %o2                  ! (delay slot) ... plus the word held back above

        ! src and dst both word aligned: copy whole words, no merging
.w4cp:  andn    %o2, 3, %o3             ! o3 is aligned word count
        sub     %o1, %g5, %o1           ! o1 gets the difference

1:      lduw    [%o1+%g5], %o4          ! read from address
        deccc   4, %o3                  ! decrement count
        st      %o4, [%g5]              ! write at destination address
        bgu,pt  %xcc, 1b                ! loop while the count is still above zero
        inc     4, %g5                  ! increment to address
        b       7f
        and     %o2, 3, %o2             ! number of leftover bytes, if any

        !
        ! differenced byte copy, works with any alignment
        !
        ! Entered at 7 with %o2 = bytes left and %o1 = (src - dst).
        ! The word paths above also finish here to copy their tail bytes.
        !
.dbytecp:
        b       7f
        sub     %o1, %g5, %o1           ! o1 gets the difference

4:      stb     %o4, [%g5]              ! write to address
        inc     %g5                     ! inc to address
7:      deccc   %o2                     ! decrement count
        bgeu,a,pt %xcc,4b               ! loop till done
        ldub    [%o1+%g5], %o4          ! read from address (annulled when the loop exits)
        retl                            ! leaf return; %o0 still holds s1
        nop                             ! (delay slot)

        SET_SIZE(memcpy)
        SET_SIZE(__align_cpy_1)