/* usr/src/lib/libc/capabilities/sun4v/common/memcpy.S */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

        .file   "memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Added entry __align_cpy_1 is generally for use of the compilers.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *      void *
 *      memcpy(void *s, const void *s0, size_t n)
 *      {
 *              if (n != 0) {
 *                  char *s1 = s;
 *                  const char *s2 = s0;
 *                  do {
 *                      *s1++ = *s2++;
 *                  } while (--n != 0);
 *              }
 *              return (s);
 *      }
 *
 *
 * N1 Flow :
 *
 * if (count < 17) {
 *      Do the byte copy
 *      Return destination address
 * }
 * if (count < 128) {
 *      Is source aligned on word boundary
 *      If no then align source on word boundary then goto .ald
 *      If yes goto .ald
 *      .ald:
 *              Is destination aligned on word boundary
 *              Depending on destination offset (last 2 bits of destination)
 *              copy data by shifting and merging.
 *              Copy residue bytes as byte copy
 *              Return destination address
 * } else {
 *      Align destination on block boundary
 *      Depending on the source offset (last 4 bits of source address) align
 *      the data and store to destination. Both the load and store are done
 *      using ASI_BLK_INIT_ST_QUAD_LDD_P.
 *      For remaining count copy as much data in 8-byte chunk from source to
 *      destination.
 *      Followed by trailing copy using byte copy.
 *      Return saved destination address
 * }
 *
 *
 * N2 Flow :
 *
 * if (count < 128) {
 *   if count < 3
 *      copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= 14
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than 128 bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < 512) {
 * finish_long:                                    src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                                 src/dst aligned; count > 512
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < 512
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < 512
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop to use for
 *       block load, falign, fmovd, block-init-store, block-store loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 * Comment on N2 memmove and memcpy common code and block-store-init:
 *   In the man page for memmove, it specifies that copying will take place
 *   correctly between objects that overlap.  For memcpy, behavior is
 *   undefined for objects that overlap.
 *
 *   In rare cases, some multi-threaded applications may attempt to examine
 *   the copy destination buffer during the copy. Using the block-store-init
 *   instruction allows those applications to observe zeros in some
 *   cache lines of the destination buffer for narrow windows. But
 *   the block-store-init provides memory throughput advantages for many
 *   common applications. To meet both needs, those applications which need
 *   the destination buffer to retain meaning during the copy should use
 *   memmove instead of memcpy.  The memmove version duplicates the memcpy
 *   algorithms except the memmove version does not use block-store-init
 *   in those cases where memcpy does use block-store-init. Otherwise, when
 *   memmove can determine the source and destination do not overlap,
 *   memmove shares the memcpy code.
 */

#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>
#include <sys/trap.h>

/* documented name for primary block initializing store */
#define ASI_STBI_P      ASI_BLK_INIT_ST_QUAD_LDD_P

#define BLOCK_SIZE      64
#define FPRS_FEF        0x4

#define SHORTCOPY       3
#define SHORTCHECK      14
#define SHORT_LONG      64      /* max copy for short longword-aligned case */
                                /* must be at least 32 */
#define SMALL_MAX       128
#define MED_UMAX        512     /* max copy for medium un-aligned case */
#define MED_WMAX        512     /* max copy for medium word-aligned case */
#define MED_MAX         512     /* max copy for medium longword-aligned case */

#ifdef NIAGARA2_IMPL
#include <sys/sun4asi.h>

#else   /* NIAGARA2_IMPL */
/*
 * This define aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
 */
#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)    \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1                               ;\
        sllx    data2, lshift, data2                            ;\
        srlx    data3, rshift, tmp                              ;\
        or      data2, tmp, data2
/*
 * Align the data. Merge the data1 and data2 into data1.
 */
#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)        \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1
#endif  /* NIAGARA2_IMPL */


        ANSI_PRAGMA_WEAK(memmove,function)
        ANSI_PRAGMA_WEAK(memcpy,function)

        ENTRY(memmove)
        ! void *memmove(void *dst /* %o0 */, const void *src /* %o1 */,
        !               size_t n /* %o2 */) -- returns dst.
        ! If src >= dst, or the dst-src gap is at least n, a forward copy
        ! cannot clobber unread source bytes, so share the memcpy path.
        cmp     %o1, %o0        ! src >= dst: forward copy is safe
        bgeu,pn %ncc, .forcpy   ! use the common forward-copy code
        sub     %o0, %o1, %o4   ! (delay slot, always runs) %o4 = dst - src
        cmp     %o2, %o4        ! compare count against the dst-src gap
        bleu,pn %ncc, .forcpy   ! count <= gap: no destructive overlap
        add     %o1, %o2, %o5   ! (delay slot) %o5 = one past end of source

        !
        ! an overlapped copy that must be done "backwards"
        ! Invariants: %o4 = dst - src, %o5 = current src pointer (walking
        ! down from the end); every store goes to [%o5 + %o4].
        !
.chksize:
        cmp     %o2, 8                  ! less than 8 bytes: do byte copy
        blu,pt %ncc, 2f                 ! else continue
                                        ! (delay slot is the add below; its
                                        ! result %g1 is unused if we branch)
        ! Now size is bigger than 8
.dbalign:
        add     %o0, %o2, %g1           ! get to end of dest space
        andcc   %g1, 7, %o3             ! %o3 = bytes till dst end 8-byte aligned
        bz,a,pn %ncc, .dbbck            ! already aligned: skip align loop
        andn    %o2, 7, %o3             ! (annulled: only if taken) %o3 = count & ~7
        sub     %o2, %o3, %o2           ! update o2 with new count

1:      dec     %o5                     ! decrement source
        ldub    [%o5], %g1              ! load one byte
        deccc   %o3                     ! decrement count
        bgu,pt  %ncc, 1b                ! if not done keep copying
        stb     %g1, [%o5+%o4]          ! (delay slot) store one byte into dest
        andncc  %o2, 7, %o3             ! %o3 = remaining count rounded down to 8
        bz,pn   %ncc, 2f                ! if size < 8, move to byte copy

        ! Now destination (end) is 8 byte aligned
.dbbck:
        andcc   %o5, 7, %o0             ! %o0 = src misalignment within 8 bytes
        bz,a,pn %ncc, .dbcopybc         ! src aligned too: fast 8-byte moves
        sub     %o2, %o3, %o2           ! (annulled: only if taken) residue in %o2

.cpy_dbwdbc:                            ! alignment of src is needed
        ! Shift-and-merge loop: read aligned 8-byte words below the current
        ! position and combine adjacent words to synthesize each misaligned
        ! doubleword.  %g1 = left shift, %g5 = 64 - %g1 = right shift.
        sub     %o2, 8, %o2             ! set size one loop ahead
        sll     %o0, 3, %g1             ! %g1 = misalignment * 8 = left shift
        mov     64, %g5                 ! init %g5 to be 64
        sub     %g5, %g1, %g5           ! %g5 right shift = (64 - left shift)
        sub     %o5, %o0, %o5           ! align the src at 8 bytes.
        add     %o4, %o0, %o4           ! increase difference between src & dst
        ldx     [%o5], %o1              ! load first 8 bytes
        srlx    %o1, %g5, %o1           ! keep only the bytes we still need
1:      sub     %o5, 8, %o5             ! subtract 8 from src
        ldx     [%o5], %o0              ! load 8 byte
        sllx    %o0, %g1, %o3           ! shift loaded 8 bytes left into tmp reg
        or      %o1, %o3, %o3           ! merge with carry-over from prior word
        stx     %o3, [%o5+%o4]          ! store 8 byte
        subcc   %o2, 8, %o2             ! subtract 8 byte from size
        bg,pt   %ncc, 1b                ! if size > 0 continue
        srlx    %o0, %g5, %o1           ! (delay slot) carry bytes to next pass

        srl     %g1, 3, %o0             ! restore %o0 value for alignment
        add     %o5, %o0, %o5           ! restore src alignment
        sub     %o4, %o0, %o4           ! restore difference between src & dest

        ba      2f                      ! branch to the trailing byte copy
        add     %o2, 8, %o2             ! (delay slot) restore size value

.dbcopybc:                              ! alignment of src is not needed
1:      sub     %o5, 8, %o5             ! subtract from src
        ldx     [%o5], %g1              ! load 8 bytes
        subcc   %o3, 8, %o3             ! subtract from size
        bgu,pt  %ncc, 1b                ! if size is bigger than 0 continue
        stx     %g1, [%o5+%o4]          ! (delay slot) store 8 bytes to dest

        ba      2f
        nop

.bcbyte:
        ! Trailing/short byte-at-a-time backward copy; %o2 counts down.
1:      ldub    [%o5], %g1              ! load one byte
        stb     %g1, [%o5+%o4]          ! store one byte
2:      deccc   %o2                     ! decrement size
        bgeu,a,pt %ncc, 1b              ! if size is >= 0 continue
        dec     %o5                     ! (annulled: only if taken) step src down

.exitbc:                                ! exit from backward copy
        retl
        add     %o5, %o4, %o0           ! (delay slot) recompute dst for return

#ifdef NIAGARA2_IMPL
        !
        ! Check to see if memmove is large aligned copy
        ! If so, use special version of copy that avoids
        ! use of block store init
        !
/*
 * Forward copy for memmove when count >= SMALL_MAX (small counts merge
 * with memcpy at .mv_short).  %o0 = dst, %o1 = src, %o2 = count;
 * %g1 preserves the original dst so it can be returned.
 * Step 1: byte-copy until dst is 8-byte aligned.
 * Step 2: dispatch on whether src shares the 8-byte alignment.
 */
.forcpy:
        cmp     %o2, SMALL_MAX          ! check for not small case
        blt,pn  %ncc, .mv_short         ! merge with memcpy
        mov     %o0, %g1                ! (delay slot) save %o0 for return
        neg     %o0, %o5
        andcc   %o5, 7, %o5             ! bytes till DST 8 byte aligned
        brz,pt  %o5, .mv_dst_aligned_on_8

        ! %o5 has the bytes to be written in partial store.
        sub     %o2, %o5, %o2           ! (delay slot of brz: no-op when %o5==0)
        sub     %o1, %o0, %o1           ! %o1 gets the difference (src - dst)
7:                                      ! dst aligning loop
        ldub    [%o1+%o0], %o4          ! load one byte
        subcc   %o5, 1, %o5
        stb     %o4, [%o0]
        bgu,pt  %ncc, 7b
        add     %o0, 1, %o0             ! (delay slot) advance dst
        add     %o1, %o0, %o1           ! restore %o1 to a real src pointer
.mv_dst_aligned_on_8:
        andcc   %o1, 7, %o5             ! does src share 8-byte alignment?
        brnz,pt %o5, .src_dst_unaligned_on_8
        prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.mv_src_dst_aligned_on_8:
        ! check if we are copying MED_MAX or more bytes
        cmp     %o2, MED_MAX            ! limit to store buffer size
        bleu,pt %ncc, .medlong
        prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * The following memmove code mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization. See memmove rationale section in documentation
 */
.mv_large_align8_copy:                  ! Src and dst share 8 byte alignment
        rd      %fprs, %g5              ! check for unused fp
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g5, FPRS_FEF, %g5      ! test FEF, fprs.du = fprs.dl = 0
        bz,a    %ncc, 1f
        wr      %g0, FPRS_FEF, %fprs    ! (annulled: only if FEF was clear)
1:
        ! align dst to 64 byte boundary; dst is already 8-byte aligned,
        ! so %o3 below is a multiple of 8 and the ldx/stx loop is safe
        andcc   %o0, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .mv_aligned_on_64
        sub     %o3, 64, %o3            ! (delay slot) %o3 = negative bytes to move
        add     %o2, %o3, %o2           ! adjust remaining count
.mv_align_to_64:
        ldx     [%o1], %o4
        add     %o1, 8, %o1             ! increment src ptr
        addcc   %o3, 8, %o3
        stx     %o4, [%o0]
        brnz,pt %o3, .mv_align_to_64
        add     %o0, 8, %o0             ! (delay slot) increment dst ptr

/*
 * Dst is now 64-byte aligned and src is 8-byte aligned.  Decode src
 * address bits 0x20/0x10/0x08 (its 8-byte-granular offset within a
 * 64-byte block) with a three-level test to select one of the eight
 * unrolled loops .mv_align_<bit5><bit4><bit3>.  %o4 saves the caller's
 * %asi; %asi is switched to ASI_BLK_P for the block load/store loops.
 */
.mv_aligned_on_64:
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
        mov     %asi,%o4                ! save %asi
        ! Determine source alignment
        ! to correct 8 byte offset
        andcc   %o1, 0x20, %o3          ! test src bit 5
        brnz,pn %o3, .mv_align_1
        mov     ASI_BLK_P, %asi         ! (delay slot) %asi for block load/store
        andcc   %o1, 0x10, %o3          ! test src bit 4
        brnz,pn %o3, .mv_align_01
        nop
        andcc   %o1, 0x08, %o3          ! test src bit 3
        brz,pn  %o3, .mv_align_000
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .mv_align_001
        nop
.mv_align_01:
        andcc   %o1, 0x08, %o3          ! test src bit 3
        brnz,pn %o3, .mv_align_011
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .mv_align_010
        nop
.mv_align_1:
        andcc   %o1, 0x10, %o3          ! test src bit 4
        brnz,pn %o3, .mv_align_11
        nop
        andcc   %o1, 0x08, %o3          ! test src bit 3
        brnz,pn %o3, .mv_align_101
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .mv_align_100
        nop
.mv_align_11:
        andcc   %o1, 0x08, %o3          ! test src bit 3
        brz,pn  %o3, .mv_align_110
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.mv_align_111:
! Alignment off by 8 bytes
! Src is 8 bytes before a 64-byte boundary: stage those 8 bytes in %d0,
! then software-pipeline 128 bytes per iteration -- block load 64 bytes
! into %d16-%d30, shift them down into %d2-%d14 with fmovd, and block
! store %d0-%d14 (previous tail word + new data).
        ldd     [%o1], %d0
        add     %o1, 8, %o1
        sub     %o2, 8, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_111_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */
        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d0               ! carry tail word to next line

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d30, %d0               ! carry tail word to next iteration
        bgt,pt  %ncc, .mv_align_111_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 8 staged bytes
        ba      .remain_stuff
        add     %o0, 8, %o0
        ! END OF mv_align_111

.mv_align_110:
! Alignment off by 16 bytes
! Src is 16 bytes before a 64-byte boundary: stage them in %d0/%d2, then
! pipeline as in .mv_align_111 with a two-register carry.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        add     %o1, 16, %o1
        sub     %o2, 16, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_110_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d28, %d0               ! carry tail words to next line
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d2

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d28, %d0
        fmovd   %d30, %d2
        bgt,pt  %ncc, .mv_align_110_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 16 staged bytes
        std     %d2, [%o0+8]
        ba      .remain_stuff
        add     %o0, 16, %o0
        ! END OF mv_align_110

.mv_align_101:
! Alignment off by 24 bytes
! Src is 24 bytes before a 64-byte boundary: stage them in %d0-%d4, then
! pipeline as in .mv_align_111 with a three-register carry.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        add     %o1, 24, %o1
        sub     %o2, 24, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_101_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d26, %d0               ! carry tail words to next line
        fmovd   %d28, %d2
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d4

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d26, %d0
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .mv_align_101_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 24 staged bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        ba      .remain_stuff
        add     %o0, 24, %o0
        ! END OF mv_align_101

.mv_align_100:
! Alignment off by 32 bytes
! Src is 32 bytes before a 64-byte boundary: stage them in %d0-%d6, then
! pipeline as in .mv_align_111 with a four-register carry.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16],%d4
        ldd     [%o1+24],%d6
        add     %o1, 32, %o1
        sub     %o2, 32, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_100_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */
        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d24, %d0               ! carry tail words to next line
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d6

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d24, %d0
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .mv_align_100_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 32 staged bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        ba      .remain_stuff
        add     %o0, 32, %o0
        ! END OF mv_align_100

.mv_align_011:
! Alignment off by 40 bytes
! Src is 40 bytes before a 64-byte boundary: stage them in %d0-%d8, then
! pipeline as in .mv_align_111 with a five-register carry.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        ldd     [%o1+24], %d6
        ldd     [%o1+32], %d8
        add     %o1, 40, %o1
        sub     %o2, 40, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_011_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d22, %d0               ! carry tail words to next line
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d8

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d22, %d0
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        fmovd   %d30, %d8
        bgt,pt  %ncc, .mv_align_011_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 40 staged bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        std     %d8, [%o0+32]
        ba      .remain_stuff
        add     %o0, 40, %o0
        ! END OF mv_align_011

.mv_align_010:
! Alignment off by 48 bytes
! Src is 48 bytes before a 64-byte boundary: stage them in %d0-%d10, then
! pipeline as in .mv_align_111 with a six-register carry.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        ldd     [%o1+24], %d6
        ldd     [%o1+32], %d8
        ldd     [%o1+40], %d10
        add     %o1, 48, %o1
        sub     %o2, 48, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_010_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d20, %d0               ! carry tail words to next line
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d10

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        add     %o1, 128, %o1   ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d20, %d0
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        fmovd   %d30, %d10
        bgt,pt  %ncc, .mv_align_010_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 48 staged bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        std     %d8, [%o0+32]
        std     %d10, [%o0+40]
        ba      .remain_stuff
        add     %o0, 48, %o0
        ! END OF mv_align_010

.mv_align_001:
! Alignment off by 56 bytes
! Src is 56 bytes before a 64-byte boundary: stage them in %d0-%d12, then
! pipeline as in .mv_align_111 with a seven-register carry.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        ldd     [%o1+24], %d6
        ldd     [%o1+32], %d8
        ldd     [%o1+40], %d10
        ldd     [%o1+48], %d12
        add     %o1, 56, %o1
        sub     %o2, 56, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_001_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d14
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d18, %d0               ! carry tail words to next line
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d12

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d14
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d18, %d0
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        fmovd   %d30, %d12
        bgt,pt  %ncc, .mv_align_001_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! drain the 56 staged bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        std     %d8, [%o0+32]
        std     %d10, [%o0+40]
        std     %d12, [%o0+48]
        ba      .remain_stuff
        add     %o0, 56, %o0
        ! END OF mv_align_001

.mv_align_000:
! Src already 64-byte aligned: no staging registers needed -- plain
! block-load / block-store of 128 bytes per iteration.
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.mv_align_000_loop:
        /* ---- copy line 1 of 2. ---- */
        subcc   %o5, 128, %o5
        ldda    [%o1]%asi,%d0           ! block load 64 bytes
        stda    %d0,[%o0]%asi           ! block store 64 bytes
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

        /* ---- copy line 2 of 2. ---- */
        add     %o0, 64, %o0
        ldda    [%o1+64]%asi,%d0
        add     %o1, 128, %o1           ! increment src
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! increment dst
        bgt,pt  %ncc, .mv_align_000_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .remain_stuff
        nop

        ! END OF mv_align_000
#else   /* NIAGARA2_IMPL */
#endif  /* NIAGARA2_IMPL */

        SET_SIZE(memmove)

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * In:   %o0 = dst, %o1 = src, %o2 = len
 * Out:  %o0 = original dst
 *
 * %g1 holds the saved dst pointer; every exit path ends with
 * "retl; mov %g1, %o0".  SPARC branches have a delay slot whose
 * instruction executes whether or not the branch is taken (unless
 * the branch is annulled with ",a").
 */
        ENTRY(memcpy)
        ENTRY(__align_cpy_1)
#ifdef NIAGARA2_IMPL
        cmp     %o2, SMALL_MAX          ! check for not small case
        bgeu,pn %ncc, .medium           ! go to larger cases
        mov     %o0, %g1                ! save %o0 (delay slot, always runs)
.mv_short:
        cmp     %o2, SHORTCOPY          ! check for really short case
        ble,pt  %ncc, .smallfin
        or      %o0, %o1, %o4           ! prepare alignment check
        andcc   %o4, 0x3, %o5           ! test for alignment
        bz,pt   %ncc, .smallword        ! branch to word aligned case
        cmp     %o2, SHORTCHECK
        ble,pt  %ncc, .smallrest        ! too short to be worth aligning
        andcc   %o1, 0x3, %o5           ! is src word aligned
        bz,pn   %ncc, .aldst
        cmp     %o5, 2                  ! is src half-word aligned
        be,pt   %ncc, .s2algn
        cmp     %o5, 3                  ! src is byte aligned
        ! src & 3 is 1 or 3: move one byte; if it was 1 (cc from the
        ! cmp above is "not equal") a halfword move is still required.
.s1algn:ldub    [%o1], %o3              ! move 1 or 3 bytes to align it
        inc     1, %o1
        stb     %o3, [%o0]              ! move a byte to align src
        inc     1, %o0
        bne,pt  %ncc, .s2algn           ! src & 3 was 1: now halfword aligned
        dec     %o2
        b       .ald                    ! now go align dest
        andcc   %o0, 0x3, %o5

.s2algn:lduh    [%o1], %o3              ! know src is 2 byte aligned
        inc     2, %o1
        srl     %o3, 8, %o4
        stb     %o4, [%o0]              ! have to do bytes,
        stb     %o3, [%o0 + 1]          ! don't know dst alignment
        inc     2, %o0
        dec     2, %o2

/*
 * Src is now word aligned.  Dispatch on dst & 3: 0 -> .w4cp (both
 * aligned), 2 -> .w2cp, 1/3 -> fall into .w3cp.  The .wNcp loops copy a
 * word per iteration, merging bytes of two consecutive src words to
 * form each aligned dst word; during the loop %o1 holds (src - dst) so
 * a single induction variable (%o0) drives both streams.
 */
.aldst: andcc   %o0, 0x3, %o5           ! align the destination address
.ald:   bz,pn   %ncc, .w4cp
        cmp     %o5, 2
        be,pn   %ncc, .w2cp
        cmp     %o5, 3                  ! cc consumed by the bne below
.w3cp:  lduw    [%o1], %o4
        inc     4, %o1
        srl     %o4, 24, %o5
        stb     %o5, [%o0]              ! store high byte of src word
        bne,pt  %ncc, .w1cp             ! dst & 3 was 1, not 3
        inc     %o0
        dec     1, %o2
        andn    %o2, 3, %o3             ! %o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %o0, %o1           ! %o1 gets the difference

        ! each store = low 3 bytes of previous src word + top byte of next
1:      sll     %o4, 8, %g5             ! save residual bytes
        lduw    [%o1+%o0], %o4
        deccc   4, %o3
        srl     %o4, 24, %o5            ! merge with residual
        or      %o5, %g5, %g5
        st      %g5, [%o0]
        bnz,pt  %ncc, 1b
        inc     4, %o0
        sub     %o1, 3, %o1             ! used one byte of last word read
        and     %o2, 3, %o2
        b       7f
        inc     4, %o2                  ! re-add the word held back above

/*
 * dst & 3 == 1: one byte already stored by .w3cp; store the middle
 * halfword here, then loop.  Each store = low byte of previous src
 * word + high 3 bytes of the next one.
 */
.w1cp:  srl     %o4, 8, %o5
        sth     %o5, [%o0]              ! bytes 2 and 3 of first src word
        inc     2, %o0
        dec     3, %o2
        andn    %o2, 3, %o3             ! %o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %o0, %o1           ! %o1 gets the difference

2:      sll     %o4, 24, %g5            ! save residual bytes
        lduw    [%o1+%o0], %o4
        deccc   4, %o3
        srl     %o4, 8, %o5             ! merge with residual
        or      %o5, %g5, %g5
        st      %g5, [%o0]
        bnz,pt  %ncc, 2b
        inc     4, %o0
        sub     %o1, 1, %o1             ! used three bytes of last word read
        and     %o2, 3, %o2
        b       7f
        inc     4, %o2                  ! re-add the word held back above

/*
 * dst & 3 == 2: store the leading halfword, then loop.  Each store =
 * low halfword of previous src word + high halfword of the next one.
 */
.w2cp:  lduw    [%o1], %o4
        inc     4, %o1
        srl     %o4, 16, %o5
        sth     %o5, [%o0]              ! high halfword of first src word
        inc     2, %o0
        dec     2, %o2
        andn    %o2, 3, %o3             ! %o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %o0, %o1           ! %o1 gets the difference

3:      sll     %o4, 16, %g5            ! save residual bytes
        lduw    [%o1+%o0], %o4
        deccc   4, %o3
        srl     %o4, 16, %o5            ! merge with residual
        or      %o5, %g5, %g5
        st      %g5, [%o0]
        bnz,pt  %ncc, 3b
        inc     4, %o0
        sub     %o1, 2, %o1             ! used two bytes of last word read
        and     %o2, 3, %o2
        b       7f
        inc     4, %o2                  ! re-add the word held back above

/*
 * Both src and dst are word aligned: plain word-copy loop.  %o1 again
 * holds (src - dst) so %o0 is the only induction variable.  Label 7:
 * is the common rejoin point for all .wNcp loops; it restores %o1 to a
 * real pointer before the byte-granularity tail copy.
 */
.w4cp:  andn    %o2, 3, %o3             ! %o3 is aligned word count
        sub     %o1, %o0, %o1           ! %o1 gets the difference

1:      lduw    [%o1+%o0], %o4          ! read from address
        deccc   4, %o3                  ! decrement count
        st      %o4, [%o0]              ! write at destination address
        bgu,pt  %ncc, 1b
        inc     4, %o0                  ! increment to address
        and     %o2, 3, %o2             ! number of leftover bytes, if any

        ! simple finish up byte copy, works with any alignment
7:
        add     %o1, %o0, %o1           ! restore %o1
/*
 * Byte-granularity tail copy for any alignment: 4 bytes per iteration
 * while more than 3 remain, then up to 3 single bytes in .smallleft3.
 * %o2 = bytes remaining; exits through .smallx (return dst from %g1).
 */
.smallrest:
        tst     %o2
        bz,pt   %ncc, .smallx           ! nothing left to do
        cmp     %o2, 4
        blt,pt  %ncc, .smallleft3
        nop
        sub     %o2, 3, %o2             ! bias count so bgu exits with <= 3 left
.smallnotalign4:
        ldub    [%o1], %o3              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o3, [%o0]              ! write byte
        ldub    [%o1+1], %o3            ! repeat for total of 4 bytes
        add     %o1, 4, %o1             ! advance SRC by 4
        stb     %o3, [%o0+1]
        ldub    [%o1-2], %o3
        add     %o0, 4, %o0             ! advance DST by 4
        stb     %o3, [%o0-2]
        ldub    [%o1-1], %o3
        bgu,pt  %ncc, .smallnotalign4   ! loop til 3 or fewer bytes remain
        stb     %o3, [%o0-1]
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %ncc, .smallx
.smallleft3:                            ! 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2             ! (delay slot of the bz above)
        ldub    [%o1], %o3              ! load one byte
        bz,pt   %ncc, .smallx
        stb     %o3, [%o0]              ! store one byte
        ldub    [%o1+1], %o3            ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %ncc, .smallx
        stb     %o3, [%o0+1]            ! store second byte
        ldub    [%o1+2], %o3            ! load third byte
        stb     %o3, [%o0+2]            ! store third byte
.smallx:
        retl
        mov     %g1, %o0                ! restore %o0

/*
 * Really short copy (len <= SHORTCOPY): 0 bytes returns immediately,
 * 1-3 bytes are handled by .smallleft3.
 */
.smallfin:
        tst     %o2
        bnz,pt  %ncc, .smallleft3
        nop
        retl
        mov     %g1, %o0                ! restore %o0

        .align 16
/*
 * Word-aligned small copy: two 4-byte words (8 bytes) per iteration.
 * .smallwordx is the entry used by .smalllong when the first word has
 * already been loaded into %o3.  Count is biased by -7 so bgu exits
 * with 0-7 bytes outstanding.
 */
.smallwords:
        lduw    [%o1], %o3              ! read word
.smallwordx:
        subcc   %o2, 8, %o2             ! update count
        stw     %o3, [%o0]              ! write word
        add     %o1, 8, %o1             ! update SRC
        lduw    [%o1-4], %o3            ! read word
        add     %o0, 8, %o0             ! update DST
        bgu,pt  %ncc, .smallwords       ! loop until done
        stw     %o3, [%o0-4]            ! write word
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .smallexit        ! check for completion
        cmp     %o2, 4                  ! check for 4 or more bytes left
        blt     %ncc, .smallleft3       ! if not, go to finish up
        nop
        lduw    [%o1], %o3              ! copy one more word
        add     %o1, 4, %o1
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        bnz,pt  %ncc, .smallleft3       ! 1-3 bytes still remain
        stw     %o3, [%o0-4]
        retl
        mov     %g1, %o0                ! restore %o0

! 8 or more bytes, src and dest start on word boundary
! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
! On entry %o2 has already had 7 subtracted by .smallword.
.smalllong:
        andcc   %o4, 0x7, %o5           ! test for long alignment
        bnz,pt  %ncc, .smallwordx       ! branch to word aligned case
        cmp     %o2, SHORT_LONG-7       ! i.e. len >= SHORT_LONG?
        bge,a   %ncc, .medl64           ! if we branch
        sub     %o2,56,%o2              ! adjust %o2 to -63 off count
                                        ! (7 already subtracted; .medl64
                                        ! expects count biased by -63)
        sub     %o1, %o0, %o1           ! %o1 gets the difference
.small_long_l:                          ! 8 bytes (one long word) per pass
        ldx     [%o1+%o0], %o3
        subcc   %o2, 8, %o2
        add     %o0, 8, %o0
        bgu,pt  %ncc, .small_long_l     ! loop until done
        stx     %o3, [%o0-8]            ! write word
        add     %o1, %o0, %o1           ! restore %o1
        addcc   %o2, 7, %o2             ! restore %o2 to correct count
        bz,pt   %ncc, .smallexit        ! check for completion
        cmp     %o2, 4                  ! check for 4 or more bytes left
        blt,pt  %ncc, .smallleft3       ! if not, go to finish up
        nop
        lduw    [%o1], %o3              ! copy one more word
        add     %o1, 4, %o1
        subcc   %o2, 4, %o2
        stw     %o3, [%o0]
        add     %o0, 4, %o0
        bnz,pt  %ncc, .smallleft3       ! 1-3 bytes still remain
        nop
        retl
        mov     %g1, %o0                ! restore %o0

        .align 16
! src and dest start on word boundary; len > SHORTCOPY so the first
! word load in the delay slot below is safe.
.smallword:
        subcc   %o2, 7, %o2             ! adjust count
        bgu,pt  %ncc, .smalllong        ! 8 or more bytes: long-word path
        lduw    [%o1], %o3              ! read word (delay slot, always runs)
        addcc   %o2, 3, %o2             ! restore count (now len - 4)
        bz,pt   %ncc, .smallexit
        stw     %o3, [%o0]              ! write word
        deccc   %o2                     ! reduce count for cc test
        ldub    [%o1+4], %o3            ! load one byte
        bz,pt   %ncc, .smallexit
        stb     %o3, [%o0+4]            ! store one byte
        ldub    [%o1+5], %o3            ! load second byte
        deccc   %o2
        bz,pt   %ncc, .smallexit
        stb     %o3, [%o0+5]            ! store second byte
        ldub    [%o1+6], %o3            ! load third byte
        stb     %o3, [%o0+6]            ! store third byte
.smallexit:
        retl
        mov     %g1, %o0                ! restore %o0

        .align 16
/*
 * Medium/large copy (len >= SMALL_MAX).  First align dst to an 8-byte
 * boundary with a byte loop, then dispatch on src alignment.
 */
.medium:
        neg     %o0, %o5
        andcc   %o5, 7, %o5             ! bytes till DST 8 byte aligned
        brz,pt  %o5, .dst_aligned_on_8

        ! %o5 has the bytes to be written in partial store.
        ! (the sub below is the delay slot of brz: when %o5 == 0 it
        ! subtracts 0 and is harmless)
        sub     %o2, %o5, %o2
        sub     %o1, %o0, %o1           ! %o1 gets the difference
7:                                      ! dst aligning loop
        ldub    [%o1+%o0], %o4          ! load one byte
        subcc   %o5, 1, %o5
        stb     %o4, [%o0]
        bgu,pt  %ncc, 7b
        add     %o0, 1, %o0             ! advance dst
        add     %o1, %o0, %o1           ! restore %o1
.dst_aligned_on_8:
        andcc   %o1, 7, %o5             ! is src 8-byte aligned too?
        brnz,pt %o5, .src_dst_unaligned_on_8
        prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.src_dst_aligned_on_8:
        ! check if we are copying MED_MAX or more bytes
        cmp     %o2, MED_MAX            ! limit to store buffer size
        bgu,pt  %ncc, .large_align8_copy
        prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read ! delay slot, always runs
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
/*
 * 8-byte-aligned medium copy: unrolled ldx/stx in decreasing chunks of
 * 64, 32, 16 and 8 bytes, then .medw7 for the final 0-7 bytes.  Also
 * the rejoin point after the block-copy paths (.remain_stuff).
 */
.medlong:
        subcc   %o2, 63, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .medl63           ! skip big loop if less than 64 bytes
.medl64:
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
        ldx     [%o1], %o4              ! load
        subcc   %o2, 64, %o2            ! decrement length count
        stx     %o4, [%o0]              ! and store
        ldx     [%o1+8], %o3            ! a block of 64 bytes
        stx     %o3, [%o0+8]
        ldx     [%o1+16], %o4
        stx     %o4, [%o0+16]
        ldx     [%o1+24], %o3
        stx     %o3, [%o0+24]
        ldx     [%o1+32], %o4           ! load
        stx     %o4, [%o0+32]           ! and store
        ldx     [%o1+40], %o3           ! a block of 64 bytes
        add     %o1, 64, %o1            ! increase src ptr by 64
        stx     %o3, [%o0+40]
        ldx     [%o1-16], %o4
        add     %o0, 64, %o0            ! increase dst ptr by 64
        stx     %o4, [%o0-16]
        ldx     [%o1-8], %o3
        bgu,pt  %ncc, .medl64           ! repeat if at least 64 bytes left
        stx     %o3, [%o0-8]
.medl63:
        addcc   %o2, 32, %o2            ! adjust remaining count
        ble,pt  %ncc, .medl31           ! to skip if 31 or fewer bytes left
        nop
        ldx     [%o1], %o4              ! load
        sub     %o2, 32, %o2            ! decrement length count
        stx     %o4, [%o0]              ! and store
        ldx     [%o1+8], %o3            ! a block of 32 bytes
        add     %o1, 32, %o1            ! increase src ptr by 32
        stx     %o3, [%o0+8]
        ldx     [%o1-16], %o4
        add     %o0, 32, %o0            ! increase dst ptr by 32
        stx     %o4, [%o0-16]
        ldx     [%o1-8], %o3
        stx     %o3, [%o0-8]
.medl31:
        addcc   %o2, 16, %o2            ! adjust remaining count
        ble,pt  %ncc, .medl15           ! skip if 15 or fewer bytes left
        nop                             !
        ldx     [%o1], %o4              ! load and store 16 bytes
        add     %o1, 16, %o1            ! increase src ptr by 16
        stx     %o4, [%o0]              !
        sub     %o2, 16, %o2            ! decrease count by 16
        ldx     [%o1-8], %o3            !
        add     %o0, 16, %o0            ! increase dst ptr by 16
        stx     %o3, [%o0-8]
.medl15:
        addcc   %o2, 15, %o2            ! restore count
        bz,pt   %ncc, .smallexit        ! exit if finished
        cmp     %o2, 8
        blt,pt  %ncc, .medw7            ! skip if 7 or fewer bytes left
        tst     %o2
        ldx     [%o1], %o4              ! load 8 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        add     %o0, 8, %o0             ! increase dst ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        bnz,pt  %ncc, .medw7
        stx     %o4, [%o0-8]            ! and store 8 bytes
        retl
        mov     %g1, %o0                ! restore %o0

        .align 16
.src_dst_unaligned_on_8:
        ! DST is 8-byte aligned, src is not
2:
        andcc   %o1, 0x3, %o5           ! test word alignment
        bnz,pt  %ncc, .unalignsetup     ! branch to skip if not word aligned
        prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination is in cache for medium
 * to short data moves.
 */
        cmp     %o2, MED_WMAX           ! limit to store buffer size
        bge,pt  %ncc, .unalignrejoin    ! otherwise rejoin main loop
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read

        subcc   %o2, 31, %o2            ! adjust length to allow cc test
                                        ! for end of loop
        ble,pt  %ncc, .medw31           ! skip big loop if 31 or fewer bytes
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.medw32:
        ld      [%o1], %o4              ! move a block of 32 bytes
        stw     %o4, [%o0]
        ld      [%o1+4], %o3
        stw     %o3, [%o0+4]
        ld      [%o1+8], %o4
        stw     %o4, [%o0+8]
        ld      [%o1+12], %o3
        stw     %o3, [%o0+12]
        ld      [%o1+16], %o4
        subcc   %o2, 32, %o2            ! decrement length count
        stw     %o4, [%o0+16]
        ld      [%o1+20], %o3
        add     %o1, 32, %o1            ! increase src ptr by 32
        stw     %o3, [%o0+20]
        ld      [%o1-8], %o4
        add     %o0, 32, %o0            ! increase dst ptr by 32
        stw     %o4, [%o0-8]
        ld      [%o1-4], %o3
        bgu,pt  %ncc, .medw32           ! repeat if at least 32 bytes left
        stw     %o3, [%o0-4]
.medw31:
        addcc   %o2, 31, %o2            ! restore count

        bz,pt   %ncc, .smallexit        ! exit if finished
        nop
        cmp     %o2, 16
        blt,pt  %ncc, .medw15
        nop
        ld      [%o1], %o4              ! move a block of 16 bytes
        subcc   %o2, 16, %o2            ! decrement length count
        stw     %o4, [%o0]
        ld      [%o1+4], %o3
        add     %o1, 16, %o1            ! increase src ptr by 16
        stw     %o3, [%o0+4]
        ld      [%o1-8], %o4
        add     %o0, 16, %o0            ! increase dst ptr by 16
        stw     %o4, [%o0-8]
        ld      [%o1-4], %o3
        stw     %o3, [%o0-4]
.medw15:
        bz,pt   %ncc, .smallexit        ! exit if finished
        cmp     %o2, 8
        blt,pt  %ncc, .medw7            ! skip if 7 or fewer bytes left
        tst     %o2
        ld      [%o1], %o4              ! load 4 bytes
        subcc   %o2, 8, %o2             ! decrease count by 8
        stw     %o4, [%o0]              ! and store 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        ld      [%o1-4], %o3            ! load 4 bytes
        add     %o0, 8, %o0             ! increase dst ptr by 8
        stw     %o3, [%o0-4]            ! and store 4 bytes
        bz,pt   %ncc, .smallexit        ! exit if finished
.medw7:                                 ! count is ge 1, less than 8
        cmp     %o2, 4                  ! check for 4 bytes left
                                        ! (also delay slot of bz above)
        blt,pt  %ncc, .smallleft3       ! skip if 3 or fewer bytes left
        nop                             !
        ld      [%o1], %o4              ! load 4 bytes
        add     %o1, 4, %o1             ! increase src ptr by 4
        add     %o0, 4, %o0             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .smallleft3             ! 1-3 bytes still remain
        stw     %o4, [%o0-4]            ! and store 4 bytes
        retl
        mov     %g1, %o0                ! restore %o0

        .align  16
/*
 * Large copy (>= MED_MAX), src and dst share 8-byte alignment.  Enable
 * the FPU if needed (%g5 remembers the original fprs.fef bit so
 * .remain_stuff can restore it), then align dst up to a 64-byte
 * boundary by moving 8/16/32-byte pieces as indicated by dst bits 3-5.
 */
.large_align8_copy:                     ! Src and dst share 8 byte alignment
        rd      %fprs, %g5              ! check for unused fp
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g5, FPRS_FEF, %g5      ! test FEF, fprs.du = fprs.dl = 0
        bz,a    %ncc, 1f                ! annulled: wr only done if FEF was 0
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
1:
        ! align dst to 64 byte boundary
        andcc   %o0, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .aligned_to_64
        andcc   %o0, 8, %o3             ! odd long words to move?
        brz,pt  %o3, .aligned_to_16
        nop
        ldx     [%o1], %o4              ! move 8 bytes
        sub     %o2, 8, %o2
        add     %o1, 8, %o1             ! increment src ptr
        add     %o0, 8, %o0             ! increment dst ptr
        stx     %o4, [%o0-8]
.aligned_to_16:
        andcc   %o0, 16, %o3            ! pair of long words to move?
        brz,pt  %o3, .aligned_to_32
        nop
        ldx     [%o1], %o4              ! move 16 bytes
        sub     %o2, 16, %o2
        stx     %o4, [%o0]
        add     %o1, 16, %o1            ! increment src ptr
        ldx     [%o1-8], %o4
        add     %o0, 16, %o0            ! increment dst ptr
        stx     %o4, [%o0-8]
.aligned_to_32:
        andcc   %o0, 32, %o3            ! four long words to move?
        brz,pt  %o3, .aligned_to_64
        nop
        ldx     [%o1], %o4              ! move 32 bytes
        sub     %o2, 32, %o2
        stx     %o4, [%o0]
        ldx     [%o1+8], %o4
        stx     %o4, [%o0+8]
        ldx     [%o1+16], %o4
        stx     %o4, [%o0+16]
        add     %o1, 32, %o1            ! increment src ptr
        ldx     [%o1-8], %o4
        add     %o0, 32, %o0            ! increment dst ptr
        stx     %o4, [%o0-8]
/*
 * Dst is now 64-byte aligned.  Save %asi (restored in .remain_stuff)
 * and select ASI_BLK_P for 64-byte block loads/stores.  Dispatch on
 * src address bits 5,4,3 to .align_xyz (x=bit5, y=bit4, z=bit3): each
 * section first copies enough leading doublewords to bring src to a
 * 64-byte boundary, then runs a realigning block-copy loop.
 * .align_000 means src is already 64-byte aligned.
 */
.aligned_to_64:
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
        mov     %asi,%o4                ! save %asi
        ! Determine source alignment
        ! to correct 8 byte offset
        andcc   %o1, 0x20, %o3
        brnz,pn %o3, .align_1
        mov     ASI_BLK_P, %asi         ! setup %asi for block load/store
        andcc   %o1, 0x10, %o3
        brnz,pn %o3, .align_01
        nop
        andcc   %o1, 0x08, %o3
        brz,pn  %o3, .align_000
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .align_001
        nop
.align_01:
        andcc   %o1, 0x08, %o3
        brnz,pn %o3, .align_011
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .align_010
        nop
.align_1:
        andcc   %o1, 0x10, %o3
        brnz,pn %o3, .align_11
        nop
        andcc   %o1, 0x08, %o3
        brnz,pn %o3, .align_101
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ba      .align_100
        nop
.align_11:
        andcc   %o1, 0x08, %o3
        brz,pn  %o3, .align_110
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.align_111:
! Alignment off by 8 bytes: src % 64 == 56.  Pre-load the 8 leading
! bytes into %d0, then copy 128 bytes per pass.  Each 64-byte block
! load lands in %d16-%d30; fmovd shifts it down to follow the carried
! doubleword(s), and the last doubleword (%d30) is carried over in %d0
! for the next block store.  The stxa to ASI_STBI_P zeroes the dst
! cache line first so it is not read from memory before being written.
        ldd     [%o1], %d0
        add     %o1, 8, %o1
        sub     %o2, 8, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_111_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */
        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d0               ! carry tail dword to next pass

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d30, %d0               ! carry tail dword to next pass
        bgt,pt  %ncc, .align_111_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 8 bytes
        ba      .remain_stuff
        add     %o0, 8, %o0
        ! END OF align_111

.align_110:
! Alignment off by 16 bytes: src % 64 == 48.  Pre-load 16 bytes into
! %d0/%d2; each pass carries the last two doublewords of the block
! load (%d28/%d30) forward in %d0/%d2 for the next block store.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        add     %o1, 16, %o1
        sub     %o2, 16, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_110_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d28, %d0               ! carry tail dwords to next pass
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d2

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d28, %d0               ! carry tail dwords to next pass
        fmovd   %d30, %d2
        bgt,pt  %ncc, .align_110_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 16 bytes
        std     %d2, [%o0+8]
        ba      .remain_stuff
        add     %o0, 16, %o0
        ! END OF align_110

.align_101:
! Alignment off by 24 bytes: src % 64 == 40.  Pre-load 24 bytes into
! %d0-%d4; each pass carries the last three doublewords of the block
! load (%d26-%d30) forward in %d0-%d4 for the next block store.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        add     %o1, 24, %o1
        sub     %o2, 24, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_101_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d26, %d0               ! carry tail dwords to next pass
        fmovd   %d28, %d2
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d4

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d26, %d0               ! carry tail dwords to next pass
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .align_101_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 24 bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        ba      .remain_stuff
        add     %o0, 24, %o0
        ! END OF align_101

.align_100:
! Alignment off by 32 bytes: src % 64 == 32.  Pre-load 32 bytes into
! %d0-%d6; each pass carries the last four doublewords of the block
! load (%d24-%d30) forward in %d0-%d6 for the next block store.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16],%d4
        ldd     [%o1+24],%d6
        add     %o1, 32, %o1
        sub     %o2, 32, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_100_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */
        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d24, %d0               ! carry tail dwords to next pass
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d6

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d24, %d0               ! carry tail dwords to next pass
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .align_100_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 32 bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        ba      .remain_stuff
        add     %o0, 32, %o0
        ! END OF align_100

.align_011:
! Alignment off by 40 bytes: src % 64 == 24.  Pre-load 40 bytes into
! %d0-%d8; each pass carries the last five doublewords of the block
! load (%d22-%d30) forward in %d0-%d8 for the next block store.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        ldd     [%o1+24], %d6
        ldd     [%o1+32], %d8
        add     %o1, 40, %o1
        sub     %o2, 40, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_011_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d22, %d0               ! carry tail dwords to next pass
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d8

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d22, %d0               ! carry tail dwords to next pass
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        fmovd   %d30, %d8
        bgt,pt  %ncc, .align_011_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 40 bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        std     %d8, [%o0+32]
        ba      .remain_stuff
        add     %o0, 40, %o0
        ! END OF align_011

.align_010:
! Alignment off by 48 bytes: src % 64 == 16.  Pre-load 48 bytes into
! %d0-%d10; each pass carries the last six doublewords of the block
! load (%d20-%d30) forward in %d0-%d10 for the next block store.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        ldd     [%o1+24], %d6
        ldd     [%o1+32], %d8
        ldd     [%o1+40], %d10
        add     %o1, 48, %o1
        sub     %o2, 48, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_010_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d20, %d0               ! carry tail dwords to next pass
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d10

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        add     %o1, 128, %o1   ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d20, %d0               ! carry tail dwords to next pass
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        fmovd   %d30, %d10
        bgt,pt  %ncc, .align_010_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 48 bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        std     %d8, [%o0+32]
        std     %d10, [%o0+40]
        ba      .remain_stuff
        add     %o0, 48, %o0
        ! END OF align_010

.align_001:
! Alignment off by 56 bytes: src % 64 == 8.  Pre-load 56 bytes into
! %d0-%d12; each pass carries the last seven doublewords of the block
! load (%d18-%d30) forward in %d0-%d12 for the next block store.
        ldd     [%o1], %d0
        ldd     [%o1+8], %d2
        ldd     [%o1+16], %d4
        ldd     [%o1+24], %d6
        ldd     [%o1+32], %d8
        ldd     [%o1+40], %d10
        ldd     [%o1+48], %d12
        add     %o1, 56, %o1
        sub     %o2, 56, %o2
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_001_loop:
        subcc   %o5, 128, %o5
        /* ---- copy line 1 of 2. ---- */

        ldda    [%o1]%asi,%d16          ! block load
        fmovd   %d16, %d14
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d18, %d0               ! carry tail dwords to next pass
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
        fmovd   %d30, %d12

        /* ---- copy line 2 of 2. ---- */
        ldda    [%o1+64]%asi,%d16
        fmovd   %d16, %d14
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! advance dst
        fmovd   %d18, %d0               ! carry tail dwords to next pass
        fmovd   %d20, %d2
        fmovd   %d22, %d4
        fmovd   %d24, %d6
        fmovd   %d26, %d8
        fmovd   %d28, %d10
        fmovd   %d30, %d12
        bgt,pt  %ncc, .align_001_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

        std     %d0, [%o0]              ! flush the carried 56 bytes
        std     %d2, [%o0+8]
        std     %d4, [%o0+16]
        std     %d6, [%o0+24]
        std     %d8, [%o0+32]
        std     %d10, [%o0+40]
        std     %d12, [%o0+48]
        ba      .remain_stuff
        add     %o0, 56, %o0
        ! END OF align_001

.align_000:
! No leading doubles to carry: copy two 64-byte blocks straight through
! per iteration with block load / block-initializing store.
        andn    %o2, 0x7f, %o5          ! %o5 is multiple of 2*block size
        and     %o2, 0x7f, %o2          ! residue bytes in %o2
.align_000_loop:
        /* ---- copy line 1 of 2. ---- */
        subcc   %o5, 128, %o5
        ldda    [%o1]%asi,%d0
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

        /* ---- copy line 2 of 2. ---- */
        add     %o0, 64, %o0
        ldda    [%o1+64]%asi,%d0
        add     %o1, 128, %o1           ! increment src
        stxa    %g0,[%o0]ASI_STBI_P     ! block initializing store
        stda    %d0,[%o0]%asi
        add     %o0, 64, %o0            ! increment dst
        bgt,pt  %ncc, .align_000_loop
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read    ! delay slot

        ! END OF align_000 -- falls through to .remain_stuff

.remain_stuff:
! Common exit for the aligned block-copy variants: restore %asi and
! fall into .medlong to copy the residue.  %g5 presumably holds the
! %fprs value saved before the FP registers were taken (set earlier,
! outside this section -- see the analogous check at .unalignrejoin);
! if it was zero the caller had no live FP state, so clear %fprs again.
        mov     %o4, %asi               ! restore %asi
        brnz    %g5, .medlong           ! caller FP state live: leave %fprs set
        membar  #Sync                   ! delay slot: complete block stores
        ba      .medlong
        wr      %g5, %g0, %fprs         ! delay slot: %g5==0 => fprs.fef = 0

        .align 16
        ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.unalignsetup:
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.unalignrejoin:
        rd      %fprs, %g5              ! check for unused fp
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g5, FPRS_FEF, %g5      ! test FEF, fprs.du = fprs.dl = 0
        bz,a    %ncc, 1f
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
1:
        cmp     %o2, MED_UMAX           ! check for medium unaligned limit
        bge,pt  %ncc,.unalign_large
        nop
        ! Medium unaligned copy: 64 bytes per pass, long-word loads from
        ! the aligned address in %o4 merged through faligndata (%gsr set
        ! by alignaddr below) into aligned 8-byte stores.
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
        cmp     %o2, 8                  ! Insure we don't load beyond
        bgt     .unalign_adjust         ! end of source buffer
        andn    %o1, 0x7, %o4           ! %o4 has long word aligned src address
        add     %o2, 64, %o2            ! adjust to leave loop
        sub     %o5, 64, %o5            ! early if necessary
.unalign_adjust:
        alignaddr %o1, %g0, %g0         ! generate %gsr
        add     %o1, %o5, %o1           ! advance %o1 to after blocks
        ldd     [%o4], %d0              ! prime the faligndata pipeline
.unalign_loop:
        ldd     [%o4+8], %d2
        faligndata %d0, %d2, %d16
        ldd     [%o4+16], %d4
        std     %d16, [%o0]
        faligndata %d2, %d4, %d18
        ldd     [%o4+24], %d6
        std     %d18, [%o0+8]
        faligndata %d4, %d6, %d20
        ldd     [%o4+32], %d8
        std     %d20, [%o0+16]
        faligndata %d6, %d8, %d22
        ldd     [%o4+40], %d10
        std     %d22, [%o0+24]
        faligndata %d8, %d10, %d24
        ldd     [%o4+48], %d12
        std     %d24, [%o0+32]
        faligndata %d10, %d12, %d26
        ldd     [%o4+56], %d14
        std     %d26, [%o0+40]
        faligndata %d12, %d14, %d28
        ldd     [%o4+64], %d0
        std     %d28, [%o0+48]
        faligndata %d14, %d0, %d30
        add     %o4, BLOCK_SIZE, %o4
        std     %d30, [%o0+56]
        add     %o0, BLOCK_SIZE, %o0
        subcc   %o5, BLOCK_SIZE, %o5
        bgu,pt  %ncc, .unalign_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read    ! delay slot
        ba      .unalign_done
        nop

.unalign_large:
! Large unaligned copy: first move %o3 = (64 - dst%64) bytes so dst
! becomes 64-byte block aligned, using the widest transfer the source
! alignment allows (8B / merged-8B / merged-byte).
        andcc   %o0, 0x3f, %o3          ! is dst 64-byte block aligned?
        bz      %ncc, .unalignsrc
        sub     %o3, 64, %o3            ! %o3 will be multiple of 8
                                        ! (delay slot: runs even when taken;
                                        ! %o3 is dead at .unalignsrc)
        neg     %o3                     ! bytes until dest is 64 byte aligned
        sub     %o2, %o3, %o2           ! update cnt with bytes to be moved
        ! Move bytes according to source alignment
        andcc   %o1, 0x1, %o5
        bnz     %ncc, .unalignbyte      ! check for byte alignment
        nop
        andcc   %o1, 2, %o5             ! check for half word alignment
        bnz     %ncc, .unalignhalf
        nop
        ! Src is word aligned
.unalignword:
        ld      [%o1], %o4              ! load 4 bytes
        stw     %o4, [%o0]              ! and store 4 bytes
        ld      [%o1+4], %o4            ! load 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        stw     %o4, [%o0+4]            ! and store 4 bytes
        subcc   %o3, 8, %o3             ! decrease count by 8
        bnz     %ncc, .unalignword
        add     %o0, 8, %o0             ! increase dst ptr by 8
        ba      .unalignsrc
        nop

        ! Src is half-word aligned: merge 2+4+2 bytes into one 8B store
.unalignhalf:
        lduh    [%o1], %o4              ! load 2 bytes
        sllx    %o4, 32, %o5            ! shift left
        lduw    [%o1+2], %o4
        or      %o4, %o5, %o5
        sllx    %o5, 16, %o5
        lduh    [%o1+6], %o4
        or      %o4, %o5, %o5
        stx     %o5, [%o0]
        add     %o1, 8, %o1
        subcc   %o3, 8, %o3
        bnz     %ncc, .unalignhalf
        add     %o0, 8, %o0
        ba      .unalignsrc
        nop

        ! Src is Byte aligned: assemble 1+2+2+2+1 bytes per 8B store.
        ! dst is kept as (dst - src) so only %o1 advances in the loop.
.unalignbyte:
        sub     %o0, %o1, %o0           ! share pointer advance
.unalignbyte_loop:
        ldub    [%o1], %o4
        sllx    %o4, 56, %o5
        lduh    [%o1+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o5, %o5
        lduh    [%o1+3], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o5, %o5
        lduh    [%o1+5], %o4
        sllx    %o4,  8, %o4
        or      %o4, %o5, %o5
        ldub    [%o1+7], %o4
        or      %o4, %o5, %o5
        stx     %o5, [%o0+%o1]
        subcc   %o3, 8, %o3
        bnz     %ncc, .unalignbyte_loop
        add     %o1, 8, %o1
        add     %o0,%o1, %o0            ! restore pointer

        ! Destination is now block (64 byte aligned)
.unalignsrc:
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
        add     %o2, 64, %o2            ! Insure we don't load beyond
        sub     %o5, 64, %o5            ! end of source buffer

        andn    %o1, 0x3f, %o4          ! %o4 has block aligned src address
        prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
        alignaddr %o1, %g0, %g0         ! generate %gsr
        add     %o1, %o5, %o1           ! advance %o1 to after blocks
        !
        ! Determine source alignment to correct 8 byte offset:
        ! binary-decode address bits <5:3> of src into one of the eight
        ! .unalign_XXX variants below.  The annulled (,a) branches issue
        ! the next prefetch only on the taken path.
        andcc   %o1, 0x20, %o3
        brnz,pn %o3, .unalign_1
        nop
        andcc   %o1, 0x10, %o3
        brz,a   %o3, .unalign_000
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_001
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_01:
        andcc   %o1, 0x08, %o3
        brnz,a  %o3, .unalign_011
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_010
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_1:
        andcc   %o1, 0x10, %o3
        brnz,pn %o3, .unalign_11
        nop
        andcc   %o1, 0x08, %o3
        brnz,a  %o3, .unalign_101
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_100
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_11:
        andcc   %o1, 0x08, %o3
        brz,pn  %o3, .unalign_110
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read

! The eight .unalign_XXX loops below differ only in the source's 8-byte
! offset within its first 64-byte block: the label suffix encodes
! address bits <5:3> of src.  Each variant pre-loads the doubles from
! that offset to the end of the first block, then every iteration
! block-loads the next 64 source bytes, shifts through faligndata (per
! the %gsr programmed by alignaddr above), block-stores 64 aligned
! bytes, and carries the trailing doubles forward with fmovd.
.unalign_111:
        ! src offset 0x38: carry one double (%d14)
        ldd     [%o4+56], %d14
.unalign_111_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d14, %d16, %d48
        faligndata %d16, %d18, %d50
        faligndata %d18, %d20, %d52
        faligndata %d20, %d22, %d54
        faligndata %d22, %d24, %d56
        faligndata %d24, %d26, %d58
        faligndata %d26, %d28, %d60
        faligndata %d28, %d30, %d62
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_111_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_110:
        ! src offset 0x30: carry two doubles (%d12-%d14)
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.unalign_110_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d12, %d14, %d48
        faligndata %d14, %d16, %d50
        faligndata %d16, %d18, %d52
        faligndata %d18, %d20, %d54
        faligndata %d20, %d22, %d56
        faligndata %d22, %d24, %d58
        faligndata %d24, %d26, %d60
        faligndata %d26, %d28, %d62
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_110_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_101:
        ! src offset 0x28: carry three doubles (%d10-%d14)
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.unalign_101_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d10, %d12, %d48
        faligndata %d12, %d14, %d50
        faligndata %d14, %d16, %d52
        faligndata %d16, %d18, %d54
        faligndata %d18, %d20, %d56
        faligndata %d20, %d22, %d58
        faligndata %d22, %d24, %d60
        faligndata %d24, %d26, %d62
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_101_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_100:
        ! src offset 0x20: carry four doubles (%d8-%d14)
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.unalign_100_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d8, %d10, %d48
        faligndata %d10, %d12, %d50
        faligndata %d12, %d14, %d52
        faligndata %d14, %d16, %d54
        faligndata %d16, %d18, %d56
        faligndata %d18, %d20, %d58
        faligndata %d20, %d22, %d60
        faligndata %d22, %d24, %d62
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_100_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_011:
        ! src offset 0x18: carry five doubles (%d6-%d14)
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.unalign_011_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d6, %d8, %d48
        faligndata %d8, %d10, %d50
        faligndata %d10, %d12, %d52
        faligndata %d12, %d14, %d54
        faligndata %d14, %d16, %d56
        faligndata %d16, %d18, %d58
        faligndata %d18, %d20, %d60
        faligndata %d20, %d22, %d62
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_011_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_010:
        ! src offset 0x10: carry six doubles (%d4-%d14)
        ldd     [%o4+16], %d4
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.unalign_010_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d4, %d6, %d48
        faligndata %d6, %d8, %d50
        faligndata %d8, %d10, %d52
        faligndata %d10, %d12, %d54
        faligndata %d12, %d14, %d56
        faligndata %d14, %d16, %d58
        faligndata %d16, %d18, %d60
        faligndata %d18, %d20, %d62
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_010_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_001:
        ! src offset 0x08: carry seven doubles (%d2-%d14)
        ldd     [%o4+8], %d2
        ldd     [%o4+16], %d4
        ldd     [%o4+24], %d6
        ldd     [%o4+32], %d8
        ldd     [%o4+40], %d10
        ldd     [%o4+48], %d12
        ldd     [%o4+56], %d14
.unalign_001_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d2, %d4, %d48
        faligndata %d4, %d6, %d50
        faligndata %d6, %d8, %d52
        faligndata %d8, %d10, %d54
        faligndata %d10, %d12, %d56
        faligndata %d12, %d14, %d58
        faligndata %d14, %d16, %d60
        faligndata %d16, %d18, %d62
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_001_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        ba      .unalign_done
        membar  #Sync

.unalign_000:
        ! src offset 0x00: carry a full block (%d0-%d14)
        ldda    [%o4]ASI_BLK_P, %d0
.unalign_000_loop:
        add     %o4, 64, %o4
        ldda    [%o4]ASI_BLK_P, %d16
        faligndata %d0, %d2, %d48
        faligndata %d2, %d4, %d50
        faligndata %d4, %d6, %d52
        faligndata %d6, %d8, %d54
        faligndata %d8, %d10, %d56
        faligndata %d10, %d12, %d58
        faligndata %d12, %d14, %d60
        faligndata %d14, %d16, %d62
        fmovd   %d16, %d0
        fmovd   %d18, %d2
        fmovd   %d20, %d4
        fmovd   %d22, %d6
        fmovd   %d24, %d8
        fmovd   %d26, %d10
        fmovd   %d28, %d12
        fmovd   %d30, %d14
        stda    %d48, [%o0]ASI_BLK_P
        subcc   %o5, 64, %o5
        add     %o0, 64, %o0
        bgu,pt  %ncc, .unalign_000_loop
        prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
        membar  #Sync
        ! last variant: falls through to .unalign_done

.unalign_done:
        ! Handle trailing bytes, 64 to 127
        ! Dest long word aligned, Src not long word aligned
        cmp     %o2, 15
        bleu    %ncc, .unalign_short

        ! NOTE(review): the andn below sits in the bleu delay slot, so
        ! %o5 is clobbered even on the taken path -- appears harmless
        ! since .unalign_short only reads %g5, but confirm .smallrest
        ! does not consume %o5.
        andn    %o2, 0x7, %o5           ! %o5 is multiple of 8
        and     %o2, 0x7, %o2           ! residue bytes in %o2
        add     %o2, 8, %o2
        sub     %o5, 8, %o5             ! insure we don't load past end of src
        andn    %o1, 0x7, %o4           ! %o4 has long word aligned src address
        add     %o1, %o5, %o1           ! advance %o1 to after multiple of 8
        ldd     [%o4], %d0              ! fetch partial word
.unalign_by8:
        ! 8 bytes per pass: merge two aligned doubles through faligndata
        ldd     [%o4+8], %d2
        add     %o4, 8, %o4
        faligndata %d0, %d2, %d16
        subcc   %o5, 8, %o5
        std     %d16, [%o0]
        fmovd   %d2, %d0
        bgu,pt  %ncc, .unalign_by8
        add     %o0, 8, %o0

.unalign_short:
        ! Restore %fprs (if the caller had no live FP state) and finish
        ! the last few bytes in .smallrest.
        brnz    %g5, .smallrest
        nop
        ba      .smallrest
        wr      %g5, %g0, %fprs         ! delay slot: fprs.fef = 0
#else   /* NIAGARA2_IMPL */
.forcpy:
! Pre-Niagara2 (N1) implementation.  %g5 tracks the destination
! through the byte/word paths; %o0 is preserved as the return value.
        mov     %o0, %g5                ! save des address for return val
        cmp     %o2, 17                 ! for small counts copy bytes
        bleu,pt %ncc, .dbytecp
        nop

        cmp     %o2, 0x80               ! For lengths less than 128 bytes no
        bleu,pn %ncc, .no_blkcpy        ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P

        /*
         * Make sure that source and destination buffers are 64 bytes apart.
         * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy
         * the data.
         */
        subcc   %o1, %o0, %o3           ! (delay slot of the bleu above)
        blu     %ncc, .blkalgndst       ! dst > src: distance check not needed
        cmp     %o3, 0x40               ! if src - dst >= 0x40
        bgeu,pt %ncc, .blkalgndst       ! then use ASI_BLK_INIT_ST_QUAD_LDD_P
.no_blkcpy:
        andcc   %o1, 3, %o5             ! is src word aligned
        bz,pn   %ncc, .aldst
        cmp     %o5, 2                  ! is src half-word aligned
        be,pt   %ncc, .s2algn
        cmp     %o5, 3                  ! src is byte aligned
        ! Move 1 or 3 bytes so src becomes half-word aligned.
.s1algn:ldub    [%o1], %o3              ! move 1 or 3 bytes to align it
        inc     1, %o1
        stb     %o3, [%g5]              ! move a byte to align src
        inc     1, %g5
        bne,pt  %ncc, .s2algn           ! offset was 1: one more halfword needed
        dec     %o2
        b       .ald                    ! now go align dest
        andcc   %g5, 3, %o5

.s2algn:lduh    [%o1], %o3              ! know src is 2 byte aligned
        inc     2, %o1
        srl     %o3, 8, %o4
        stb     %o4, [%g5]              ! have to do bytes,
        stb     %o3, [%g5 + 1]          ! don't know dst alignment
        inc     2, %g5
        dec     2, %o2

! Src is now word aligned; dispatch on dst alignment (0 -> .w4cp,
! 2 -> .w2cp, 1 or 3 fall into .w3cp which peels one byte and then
! either continues here (offset 3) or jumps to .w1cp (offset 1)).
.aldst: andcc   %g5, 3, %o5             ! align the destination address
.ald:   bz,pn   %ncc, .w4cp
        cmp     %o5, 2
        bz,pn   %ncc, .w2cp
        cmp     %o5, 3
.w3cp:  lduw    [%o1], %o4
        inc     4, %o1
        srl     %o4, 24, %o5
        stb     %o5, [%g5]              ! store top byte of the loaded word
        bne,pt  %ncc, .w1cp             ! dst offset was 1, not 3
        inc     %g5
        dec     1, %o2
        andn    %o2, 3, %o3             ! o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %g5, %o1           ! o1 gets the difference

        ! dst offset 3: shift residual byte up, merge next word's top
        ! 3 bytes, store one aligned word per pass.
1:      sll     %o4, 8, %g1             ! save residual bytes
        lduw    [%o1+%g5], %o4
        deccc   4, %o3
        srl     %o4, 24, %o5            ! merge with residual
        or      %o5, %g1, %g1
        st      %g1, [%g5]
        bnz,pt  %ncc, 1b
        inc     4, %g5
        sub     %o1, 3, %o1             ! used one byte of last word read
        and     %o2, 3, %o2
        b       7f
        inc     4, %o2

! dst offset 1: after the peeled byte in .w3cp, store a halfword, then
! carry 3 residual bytes per aligned-word store.
.w1cp:  srl     %o4, 8, %o5
        sth     %o5, [%g5]
        inc     2, %g5
        dec     3, %o2
        andn    %o2, 3, %o3             ! o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %g5, %o1           ! o1 gets the difference

2:      sll     %o4, 24, %g1            ! save residual bytes
        lduw    [%o1+%g5], %o4
        deccc   4, %o3
        srl     %o4, 8, %o5             ! merge with residual
        or      %o5, %g1, %g1
        st      %g1, [%g5]
        bnz,pt  %ncc, 2b
        inc     4, %g5
        sub     %o1, 1, %o1             ! used three bytes of last word read
        and     %o2, 3, %o2
        b       7f
        inc     4, %o2

! dst offset 2: peel a halfword, then carry 2 residual bytes per
! aligned-word store.
.w2cp:  lduw    [%o1], %o4
        inc     4, %o1
        srl     %o4, 16, %o5
        sth     %o5, [%g5]
        inc     2, %g5
        dec     2, %o2
        andn    %o2, 3, %o3             ! o3 is aligned word count
        dec     4, %o3                  ! avoid reading beyond tail of src
        sub     %o1, %g5, %o1           ! o1 gets the difference

3:      sll     %o4, 16, %g1            ! save residual bytes
        lduw    [%o1+%g5], %o4
        deccc   4, %o3
        srl     %o4, 16, %o5            ! merge with residual
        or      %o5, %g1, %g1
        st      %g1, [%g5]
        bnz,pt  %ncc, 3b
        inc     4, %g5
        sub     %o1, 2, %o1             ! used two bytes of last word read
        and     %o2, 3, %o2
        b       7f
        inc     4, %o2

! dst and src both word aligned: plain word-copy loop.
.w4cp:  andn    %o2, 3, %o3             ! o3 is aligned word count
        sub     %o1, %g5, %o1           ! o1 gets the difference

1:      lduw    [%o1+%g5], %o4          ! read from address
        deccc   4, %o3                  ! decrement count
        st      %o4, [%g5]              ! write at destination address
        bgu,pt  %ncc, 1b
        inc     4, %g5                  ! increment to address
        b       7f
        and     %o2, 3, %o2             ! number of leftover bytes, if any

        !
        ! differenced byte copy, works with any alignment
        ! (%o1 holds src-dst so a single pointer, %g5, advances)
        !
.dbytecp:
        b       7f
        sub     %o1, %g5, %o1           ! o1 gets the difference

4:      stb     %o4, [%g5]              ! write to address
        inc     %g5                     ! inc to address
7:      deccc   %o2                     ! decrement count
        bgeu,a,pt %ncc,4b               ! loop till done
        ldub    [%o1+%g5], %o4          ! read from address (annulled if done)
        retl                            ! %o0 was preserved
        nop

.blkalgndst:
        save    %sp, -SA(MINFRAME), %sp

        ! Block (64 bytes) align the destination.
        andcc   %i0, 0x3f, %i3          ! is dst block aligned
        bz      %ncc, .chksrc           ! dst already block aligned
        sub     %i3, 0x40, %i3
        neg     %i3                     ! bytes till dst 64 bytes aligned
        sub     %i2, %i3, %i2           ! update i2 with new count

        ! Based on source and destination alignment do
        ! either 8 bytes, 4 bytes, 2 bytes or byte copy.

        ! Is dst & src 8B aligned
        or      %i0, %i1, %o2
        andcc   %o2, 0x7, %g0
        bz      %ncc, .alewdcp
        nop

        ! Is dst & src 4B aligned
        andcc   %o2, 0x3, %g0
        bz      %ncc, .alwdcp
        nop

        ! Is dst & src 2B aligned
        andcc   %o2, 0x1, %g0
        bz      %ncc, .alhlfwdcp
        nop

        ! 1B aligned
1:      ldub    [%i1], %o2
        stb     %o2, [%i0]
        inc     %i1
        deccc   %i3
        bgu,pt  %ncc, 1b
        inc     %i0

        ba      .chksrc
        nop

        ! dst & src 4B aligned
.alwdcp:
        ld      [%i1], %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        subcc   %i3, 0x4, %i3
        bgu,pt  %ncc, .alwdcp
        add     %i0, 0x4, %i0

        ba      .chksrc
        nop

        ! dst & src 2B aligned
.alhlfwdcp:
        lduh    [%i1], %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        subcc   %i3, 0x2, %i3
        bgu,pt  %ncc, .alhlfwdcp
        add     %i0, 0x2, %i0

        ba      .chksrc
        nop

        ! dst & src 8B aligned
.alewdcp:
        ldx     [%i1], %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        subcc   %i3, 0x8, %i3
        bgu,pt  %ncc, .alewdcp
        add     %i0, 0x8, %i0

        ! Now Destination is block (64 bytes) aligned
.chksrc:
        andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
        sub     %i2, %i3, %i2           ! Residue bytes in %i2
        mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
        andcc   %i1, 0xf, %l1           ! is src quadword aligned
        bz,pn   %ncc, .blkcpy           ! src offset in %l1
        nop
        cmp     %l1, 0x8
        bgu     %ncc, .cpy_upper_double
        nop
        blu     %ncc, .cpy_lower_double
        nop

        ! Falls through when source offset is equal to 8 i.e.
        ! source is double word aligned.
        ! In this case no shift/merge of data is required.
        ! Each quad ldda below fills an even/odd register pair
        ! (%o2/%o3 or %o4/%o5) with 16 bytes.
        sub     %i1, %l1, %i1           ! align the src at 16 bytes.
        andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
        prefetch [%o0+0x0], #one_read
        ldda    [%i1+0x0]%asi, %o2      ! only %o3 (second half) is wanted
loop0:
        ldda    [%i1+0x10]%asi, %o4
        prefetch [%o0+0x40], #one_read

        stxa    %o3, [%i0+0x0]%asi
        stxa    %o4, [%i0+0x8]%asi

        ldda    [%i1+0x20]%asi, %o2
        stxa    %o5, [%i0+0x10]%asi
        stxa    %o2, [%i0+0x18]%asi

        ldda    [%i1+0x30]%asi, %o4
        stxa    %o3, [%i0+0x20]%asi
        stxa    %o4, [%i0+0x28]%asi

        ldda    [%i1+0x40]%asi, %o2
        stxa    %o5, [%i0+0x30]%asi
        stxa    %o2, [%i0+0x38]%asi

        add     %o0, 0x40, %o0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %ncc, loop0
        add     %i0, 0x40, %i0
        ba      .blkdone
        add     %i1, %l1, %i1           ! increment the source by src offset

! Source offset < 8 within the quadword: each 8B output is built by
! left-shifting the carried data by %l2 bits and merging in the next
! quad load right-shifted by %l3 (ALIGN_DATA macro).
.cpy_lower_double:
        sub     %i1, %l1, %i1           ! align the src at 16 bytes.
        sll     %l1, 3, %l2             ! %l2 left shift
        mov     0x40, %l3
        sub     %l3, %l2, %l3           ! %l3 right shift = (64 - left shift)
        andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
        prefetch [%o0+0x0], #one_read
        ldda    [%i1+0x0]%asi, %o2      ! partial data in %o2 and %o3 has
                                        ! complete data
loop1:
        ldda    [%i1+0x10]%asi, %o4     ! %o4 has partial data for this read.
        ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)        ! merge %o2, %o3 and %o4
                                                        ! into %o2 and %o3
        prefetch [%o0+0x40], #one_read
        stxa    %o2, [%i0+0x0]%asi
        stxa    %o3, [%i0+0x8]%asi

        ldda    [%i1+0x20]%asi, %o2
        ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)        ! merge %o2 with %o5 and
        stxa    %o4, [%i0+0x10]%asi                     ! %o4 from previous read
        stxa    %o5, [%i0+0x18]%asi                     ! into %o4 and %o5

        ! Repeat the same for next 32 bytes.

        ldda    [%i1+0x30]%asi, %o4
        ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
        stxa    %o2, [%i0+0x20]%asi
        stxa    %o3, [%i0+0x28]%asi

        ldda    [%i1+0x40]%asi, %o2
        ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
        stxa    %o4, [%i0+0x30]%asi
        stxa    %o5, [%i0+0x38]%asi

        add     %o0, 0x40, %o0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %ncc, loop1
        add     %i0, 0x40, %i0
        ba      .blkdone
        add     %i1, %l1, %i1           ! increment the source by src offset

! Source offset > 8 within the quadword: shift amount is relative to
! the second half of the quad (offset - 8), otherwise same merge scheme
! as .cpy_lower_double.
.cpy_upper_double:
        sub     %i1, %l1, %i1           ! align the src at 16 bytes.
        mov     0x8, %l2
        sub     %l1, %l2, %l2
        sll     %l2, 3, %l2             ! %l2 left shift
        mov     0x40, %l3
        sub     %l3, %l2, %l3           ! %l3 right shift = (64 - left shift)
        andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
        prefetch [%o0+0x0], #one_read
        ldda    [%i1+0x0]%asi, %o2      ! partial data in %o3 for this read and
                                        ! no data in %o2
loop2:
        ldda    [%i1+0x10]%asi, %o4     ! %o4 has complete data and %o5 has
                                        ! partial
        ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)        ! merge %o3, %o4 and %o5
                                                        ! into %o3 and %o4
        prefetch [%o0+0x40], #one_read
        stxa    %o3, [%i0+0x0]%asi
        stxa    %o4, [%i0+0x8]%asi

        ldda    [%i1+0x20]%asi, %o2
        ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)        ! merge %o2 and %o3 with
        stxa    %o5, [%i0+0x10]%asi                     ! %o5 from previous read
        stxa    %o2, [%i0+0x18]%asi                     ! into %o5 and %o2

        ! Repeat the same for next 32 bytes.

        ldda    [%i1+0x30]%asi, %o4
        ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
        stxa    %o3, [%i0+0x20]%asi
        stxa    %o4, [%i0+0x28]%asi

        ldda    [%i1+0x40]%asi, %o2
        ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
        stxa    %o5, [%i0+0x30]%asi
        stxa    %o2, [%i0+0x38]%asi

        add     %o0, 0x40, %o0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %ncc, loop2
        add     %i0, 0x40, %i0
        ba      .blkdone
        add     %i1, %l1, %i1           ! increment the source by src offset

        ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
        ! (src quadword aligned: no shift/merge, 64 bytes per pass)
.blkcpy:
        andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
        prefetch [%o0+0x0], #one_read
1:
        prefetch [%o0+0x40], #one_read

        ldda    [%i1+0x0]%asi, %o2      ! quad load: 16B into %o2/%o3
        ldda    [%i1+0x10]%asi, %o4     ! quad load: 16B into %o4/%o5

        stxa    %o2, [%i0+0x0]%asi
        stxa    %o3, [%i0+0x8]%asi
        stxa    %o4, [%i0+0x10]%asi
        stxa    %o5, [%i0+0x18]%asi

        ldda    [%i1+0x20]%asi, %o2
        ldda    [%i1+0x30]%asi, %o4

        stxa    %o2, [%i0+0x20]%asi
        stxa    %o3, [%i0+0x28]%asi
        stxa    %o4, [%i0+0x30]%asi
        stxa    %o5, [%i0+0x38]%asi

        add     %o0, 0x40, %o0
        add     %i1, 0x40, %i1
        subcc   %i3, 0x40, %i3
        bgu,pt  %ncc, 1b
        add     %i0, 0x40, %i0

.blkdone:
! Block copy complete: order the stores, restore %asi, then copy the
! residue (%i2 < 64 bytes) with the widest ops the remaining src/dst
! alignment allows, stepping down 8B -> 4B -> 2B -> 1B.
        membar  #Sync

        mov     ASI_PNF, %asi           ! restore %asi to default
                                        ! ASI_PRIMARY_NOFAULT value
        tst     %i2
        bz,pt   %ncc, .blkexit
        nop

        ! Handle trailing bytes
        cmp     %i2, 0x8
        blu,pt  %ncc, .residue
        nop

        ! Can we do some 8B ops
        or      %i1, %i0, %o2
        andcc   %o2, 0x7, %g0
        bnz     %ncc, .last4
        nop

        ! Do 8byte ops as long as possible
.last8:
        ldx     [%i1], %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        sub     %i2, 0x8, %i2
        cmp     %i2, 0x8
        bgu,pt  %ncc, .last8
        add     %i0, 0x8, %i0

        tst     %i2
        bz,pt   %ncc, .blkexit
        nop

        ba      .residue
        nop

.last4:
        ! Can we do 4B ops
        andcc   %o2, 0x3, %g0
        bnz     %ncc, .last2
        nop
1:
        ld      [%i1], %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        sub     %i2, 0x4, %i2
        cmp     %i2, 0x4
        bgu,pt  %ncc, 1b
        add     %i0, 0x4, %i0

        cmp     %i2, 0
        bz,pt   %ncc, .blkexit
        nop

        ba      .residue
        nop

.last2:
        ! Can we do 2B ops
        andcc   %o2, 0x1, %g0
        bnz     %ncc, .residue
        nop

1:
        lduh    [%i1], %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        sub     %i2, 0x2, %i2
        cmp     %i2, 0x2
        bgu,pt  %ncc, 1b
        add     %i0, 0x2, %i0

        cmp     %i2, 0
        bz,pt   %ncc, .blkexit
        nop

.residue:
        ! Final byte-at-a-time loop
        ldub    [%i1], %o2
        stb     %o2, [%i0]
        inc     %i1
        deccc   %i2
        bgu,pt  %ncc, .residue
        inc     %i0

.blkexit:

        ret
        restore %g5, %g0, %o0           ! return saved dst address

#endif  /* NIAGARA2_IMPL */
        SET_SIZE(memcpy)
        SET_SIZE(__align_cpy_1)